Repository: yichen928/SparseFusion
Branch: main
Commit: 22537781e033
Files: 516
Total size: 3.1 MB
Directory structure:
gitextract_232uyltz/
├── .gitignore
├── LICENSE
├── MANIFEST.in
├── README.md
├── README_zh-CN.md
├── configs/
│ ├── 3dssd/
│ │ ├── 3dssd_kitti-3d-car.py
│ │ └── README.md
│ ├── _base_/
│ │ ├── datasets/
│ │ │ ├── coco_instance.py
│ │ │ ├── kitti-3d-3class.py
│ │ │ ├── kitti-3d-car.py
│ │ │ ├── lyft-3d.py
│ │ │ ├── nuim_instance.py
│ │ │ ├── nus-3d.py
│ │ │ ├── range100_lyft-3d.py
│ │ │ ├── scannet-3d-18class.py
│ │ │ ├── sunrgbd-3d-10class.py
│ │ │ ├── waymoD5-3d-3class.py
│ │ │ └── waymoD5-3d-car.py
│ │ ├── default_runtime.py
│ │ ├── models/
│ │ │ ├── 3dssd.py
│ │ │ ├── cascade_mask_rcnn_r50_fpn.py
│ │ │ ├── centerpoint_01voxel_second_secfpn_nus.py
│ │ │ ├── centerpoint_02pillar_second_secfpn_nus.py
│ │ │ ├── h3dnet.py
│ │ │ ├── hv_pointpillars_fpn_lyft.py
│ │ │ ├── hv_pointpillars_fpn_nus.py
│ │ │ ├── hv_pointpillars_fpn_range100_lyft.py
│ │ │ ├── hv_pointpillars_secfpn_kitti.py
│ │ │ ├── hv_pointpillars_secfpn_waymo.py
│ │ │ ├── hv_second_secfpn_kitti.py
│ │ │ ├── hv_second_secfpn_waymo.py
│ │ │ ├── imvotenet_image.py
│ │ │ ├── mask_rcnn_r50_fpn.py
│ │ │ └── votenet.py
│ │ └── schedules/
│ │ ├── cyclic_20e.py
│ │ ├── cyclic_40e.py
│ │ ├── mmdet_schedule_1x.py
│ │ ├── schedule_2x.py
│ │ └── schedule_3x.py
│ ├── benchmark/
│ │ ├── hv_PartA2_secfpn_4x8_cyclic_80e_pcdet_kitti-3d-3class.py
│ │ ├── hv_pointpillars_secfpn_3x8_100e_det3d_kitti-3d-car.py
│ │ ├── hv_pointpillars_secfpn_4x8_80e_pcdet_kitti-3d-3class.py
│ │ └── hv_second_secfpn_4x8_80e_pcdet_kitti-3d-3class.py
│ ├── centerpoint/
│ │ ├── README.md
│ │ ├── centerpoint_0075voxel_second_secfpn_4x8_cyclic_20e_nus.py
│ │ ├── centerpoint_0075voxel_second_secfpn_circlenms_4x8_cyclic_20e_nus.py
│ │ ├── centerpoint_0075voxel_second_secfpn_dcn_4x8_cyclic_20e_nus.py
│ │ ├── centerpoint_0075voxel_second_secfpn_dcn_4x8_cyclic_flip-tta_20e_nus.py
│ │ ├── centerpoint_0075voxel_second_secfpn_dcn_4x8_cyclic_tta_20e_nus.py
│ │ ├── centerpoint_0075voxel_second_secfpn_dcn_circlenms_4x8_cyclic_20e_nus.py
│ │ ├── centerpoint_0075voxel_second_secfpn_dcn_circlenms_4x8_cyclic_flip-tta_20e_nus.py
│ │ ├── centerpoint_01voxel_second_secfpn_4x8_cyclic_20e_nus.py
│ │ ├── centerpoint_01voxel_second_secfpn_circlenms_4x8_cyclic_20e_nus.py
│ │ ├── centerpoint_01voxel_second_secfpn_dcn_4x8_cyclic_20e_nus.py
│ │ ├── centerpoint_01voxel_second_secfpn_dcn_circlenms_4x8_cyclic_20e_nus.py
│ │ ├── centerpoint_02pillar_second_secfpn_4x8_cyclic_20e_nus.py
│ │ ├── centerpoint_02pillar_second_secfpn_circlenms_4x8_cyclic_20e_nus.py
│ │ ├── centerpoint_02pillar_second_secfpn_dcn_4x8_cyclic_20e_nus.py
│ │ └── centerpoint_02pillar_second_secfpn_dcn_circlenms_4x8_cyclic_20e_nus.py
│ ├── dynamic_voxelization/
│ │ ├── README.md
│ │ ├── dv_pointpillars_secfpn_6x8_160e_kitti-3d-car.py
│ │ ├── dv_second_secfpn_2x8_cosine_80e_kitti-3d-3class.py
│ │ └── dv_second_secfpn_6x8_80e_kitti-3d-car.py
│ ├── fp16/
│ │ ├── README.md
│ │ ├── hv_pointpillars_fpn_sbn-all_fp16_2x8_2x_nus-3d.py
│ │ ├── hv_pointpillars_regnet-400mf_fpn_sbn-all_fp16_2x8_2x_nus-3d.py
│ │ ├── hv_pointpillars_secfpn_sbn-all_fp16_2x8_2x_nus-3d.py
│ │ ├── hv_second_secfpn_fp16_6x8_80e_kitti-3d-3class.py
│ │ └── hv_second_secfpn_fp16_6x8_80e_kitti-3d-car.py
│ ├── free_anchor/
│ │ ├── README.md
│ │ ├── hv_pointpillars_fpn_sbn-all_free-anchor_4x8_2x_nus-3d.py
│ │ ├── hv_pointpillars_regnet-1.6gf_fpn_sbn-all_free-anchor_4x8_2x_nus-3d.py
│ │ ├── hv_pointpillars_regnet-1.6gf_fpn_sbn-all_free-anchor_strong-aug_4x8_3x_nus-3d.py
│ │ ├── hv_pointpillars_regnet-3.2gf_fpn_sbn-all_free-anchor_4x8_2x_nus-3d.py
│ │ ├── hv_pointpillars_regnet-3.2gf_fpn_sbn-all_free-anchor_strong-aug_4x8_3x_nus-3d.py
│ │ └── hv_pointpillars_regnet-400mf_fpn_sbn-all_free-anchor_4x8_2x_nus-3d.py
│ ├── h3dnet/
│ │ ├── README.md
│ │ └── h3dnet_3x8_scannet-3d-18class.py
│ ├── imvotenet/
│ │ ├── README.md
│ │ ├── imvotenet_faster_rcnn_r50_fpn_2x4_sunrgbd-3d-10class.py
│ │ └── imvotenet_stage2_16x8_sunrgbd-3d-10class.py
│ ├── mvxnet/
│ │ ├── README.md
│ │ └── dv_mvx-fpn_second_secfpn_adamw_2x8_80e_kitti-3d-3class.py
│ ├── nuimages/
│ │ ├── README.md
│ │ ├── cascade_mask_rcnn_r101_fpn_1x_nuim.py
│ │ ├── cascade_mask_rcnn_r50_fpn_1x_nuim.py
│ │ ├── cascade_mask_rcnn_r50_fpn_coco-20e_1x_nuim.py
│ │ ├── cascade_mask_rcnn_r50_fpn_coco-20e_20e_nuim.py
│ │ ├── cascade_mask_rcnn_x101_32x4d_fpn_1x_nuim.py
│ │ ├── htc_r50_fpn_1x_nuim.py
│ │ ├── htc_r50_fpn_coco-20e_1x_nuim.py
│ │ ├── htc_r50_fpn_coco-20e_20e_nuim.py
│ │ ├── htc_without_semantic_r50_fpn_1x_nuim.py
│ │ ├── htc_x101_64x4d_fpn_dconv_c3-c5_coco-20e_16x1_20e_nuim.py
│ │ ├── mask_rcnn_r101_fpn_1x_nuim.py
│ │ ├── mask_rcnn_r50_caffe_fpn_1x_nuim.py
│ │ ├── mask_rcnn_r50_caffe_fpn_coco-3x_1x_nuim.py
│ │ ├── mask_rcnn_r50_caffe_fpn_coco-3x_20e_nuim.py
│ │ ├── mask_rcnn_r50_fpn_1x_nuim.py
│ │ ├── mask_rcnn_r50_fpn_coco-2x_1x_nuim.py
│ │ ├── mask_rcnn_r50_fpn_coco-2x_1x_nus-2d.py
│ │ ├── mask_rcnn_swinT_coco-2x_1x_nuim.py
│ │ └── mask_rcnn_x101_32x4d_fpn_1x_nuim.py
│ ├── nuscenes.md
│ ├── parta2/
│ │ ├── README.md
│ │ ├── hv_PartA2_secfpn_2x8_cyclic_80e_kitti-3d-3class.py
│ │ └── hv_PartA2_secfpn_2x8_cyclic_80e_kitti-3d-car.py
│ ├── pointpillars/
│ │ ├── README.md
│ │ ├── hv_pointpillars_fpn_sbn-all_2x8_2x_lyft-3d.py
│ │ ├── hv_pointpillars_fpn_sbn-all_4x8_2x_nus-3d.py
│ │ ├── hv_pointpillars_fpn_sbn-all_range100_2x8_2x_lyft-3d.py
│ │ ├── hv_pointpillars_secfpn_6x8_160e_kitti-3d-3class.py
│ │ ├── hv_pointpillars_secfpn_6x8_160e_kitti-3d-car.py
│ │ ├── hv_pointpillars_secfpn_sbn-all_2x8_2x_lyft-3d.py
│ │ ├── hv_pointpillars_secfpn_sbn-all_4x8_2x_nus-3d.py
│ │ ├── hv_pointpillars_secfpn_sbn-all_range100_2x8_2x_lyft-3d.py
│ │ ├── hv_pointpillars_secfpn_sbn_2x16_2x_waymo-3d-3class.py
│ │ ├── hv_pointpillars_secfpn_sbn_2x16_2x_waymo-3d-car.py
│ │ ├── hv_pointpillars_secfpn_sbn_2x16_2x_waymoD5-3d-3class.py
│ │ └── hv_pointpillars_secfpn_sbn_2x16_2x_waymoD5-3d-car.py
│ ├── regnet/
│ │ ├── README.md
│ │ ├── hv_pointpillars_regnet-1.6gf_fpn_sbn-all_4x8_2x_nus-3d.py
│ │ ├── hv_pointpillars_regnet-400mf_fpn_sbn-all_2x8_2x_lyft-3d.py
│ │ ├── hv_pointpillars_regnet-400mf_fpn_sbn-all_4x8_2x_nus-3d.py
│ │ ├── hv_pointpillars_regnet-400mf_fpn_sbn-all_range100_2x8_2x_lyft-3d.py
│ │ ├── hv_pointpillars_regnet-400mf_secfpn_sbn-all_2x8_2x_lyft-3d.py
│ │ ├── hv_pointpillars_regnet-400mf_secfpn_sbn-all_4x8_2x_nus-3d.py
│ │ └── hv_pointpillars_regnet-400mf_secfpn_sbn-all_range100_2x8_2x_lyft-3d.py
│ ├── second/
│ │ ├── README.md
│ │ ├── hv_second_secfpn_6x8_80e_kitti-3d-3class.py
│ │ ├── hv_second_secfpn_6x8_80e_kitti-3d-car.py
│ │ └── hv_second_secfpn_sbn_2x16_2x_waymoD5-3d-3class.py
│ ├── sparsefusion_nusc_voxel_LC_SwinT.py
│ ├── sparsefusion_nusc_voxel_LC_r50.py
│ ├── ssn/
│ │ ├── README.md
│ │ ├── hv_ssn_regnet-400mf_secfpn_sbn-all_1x16_2x_lyft-3d.py
│ │ ├── hv_ssn_regnet-400mf_secfpn_sbn-all_2x16_2x_nus-3d.py
│ │ ├── hv_ssn_secfpn_sbn-all_2x16_2x_lyft-3d.py
│ │ └── hv_ssn_secfpn_sbn-all_2x16_2x_nus-3d.py
│ ├── transfusion_nusc_pillar_L.py
│ ├── transfusion_nusc_pillar_LC.py
│ ├── transfusion_nusc_voxel_L.py
│ ├── transfusion_nusc_voxel_LC.py
│ ├── transfusion_waymo_voxel_L.py
│ ├── transfusion_waymo_voxel_LC.py
│ ├── votenet/
│ │ ├── README.md
│ │ ├── votenet_16x8_sunrgbd-3d-10class.py
│ │ ├── votenet_8x8_scannet-3d-18class.py
│ │ └── votenet_iouloss_8x8_scannet-3d-18class.py
│ └── waymo.md
├── demo/
│ └── pcd_demo.py
├── docker/
│ └── Dockerfile
├── mmdet3d/
│ ├── __init__.py
│ ├── apis/
│ │ ├── __init__.py
│ │ ├── inference.py
│ │ └── test.py
│ ├── core/
│ │ ├── __init__.py
│ │ ├── anchor/
│ │ │ ├── __init__.py
│ │ │ └── anchor_3d_generator.py
│ │ ├── bbox/
│ │ │ ├── __init__.py
│ │ │ ├── assigners/
│ │ │ │ ├── __init__.py
│ │ │ │ └── hungarian_assigner.py
│ │ │ ├── box_np_ops.py
│ │ │ ├── coders/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── anchor_free_bbox_coder.py
│ │ │ │ ├── camera_bbox_coder.py
│ │ │ │ ├── centerpoint_bbox_coders.py
│ │ │ │ ├── delta_xyzwhlr_bbox_coder.py
│ │ │ │ ├── partial_bin_based_bbox_coder.py
│ │ │ │ └── transfusion_bbox_coder.py
│ │ │ ├── iou_calculators/
│ │ │ │ ├── __init__.py
│ │ │ │ └── iou3d_calculator.py
│ │ │ ├── samplers/
│ │ │ │ ├── __init__.py
│ │ │ │ └── iou_neg_piecewise_sampler.py
│ │ │ ├── structures/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── base_box3d.py
│ │ │ │ ├── box_3d_mode.py
│ │ │ │ ├── cam_box3d.py
│ │ │ │ ├── coord_3d_mode.py
│ │ │ │ ├── depth_box3d.py
│ │ │ │ ├── lidar_box3d.py
│ │ │ │ └── utils.py
│ │ │ └── transforms.py
│ │ ├── evaluation/
│ │ │ ├── __init__.py
│ │ │ ├── indoor_eval.py
│ │ │ ├── kitti_utils/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── eval.py
│ │ │ │ └── rotate_iou.py
│ │ │ ├── lyft_eval.py
│ │ │ ├── seg_eval.py
│ │ │ └── waymo_utils/
│ │ │ └── prediction_kitti_to_waymo.py
│ │ ├── points/
│ │ │ ├── __init__.py
│ │ │ ├── base_points.py
│ │ │ ├── cam_points.py
│ │ │ ├── depth_points.py
│ │ │ └── lidar_points.py
│ │ ├── post_processing/
│ │ │ ├── __init__.py
│ │ │ ├── box3d_nms.py
│ │ │ └── merge_augs.py
│ │ ├── utils/
│ │ │ ├── __init__.py
│ │ │ └── gaussian.py
│ │ ├── visualizer/
│ │ │ ├── __init__.py
│ │ │ ├── open3d_vis.py
│ │ │ └── show_result.py
│ │ └── voxel/
│ │ ├── __init__.py
│ │ ├── builder.py
│ │ └── voxel_generator.py
│ ├── datasets/
│ │ ├── __init__.py
│ │ ├── builder.py
│ │ ├── custom_3d.py
│ │ ├── dataset_wrappers.py
│ │ ├── kitti2d_dataset.py
│ │ ├── kitti_dataset.py
│ │ ├── lyft_dataset.py
│ │ ├── nuscenes_dataset.py
│ │ ├── nuscenes_dataset_viewInfo.py
│ │ ├── pipelines/
│ │ │ ├── __init__.py
│ │ │ ├── data_augment_utils.py
│ │ │ ├── dbsampler.py
│ │ │ ├── formating.py
│ │ │ ├── loading.py
│ │ │ ├── test_time_aug.py
│ │ │ ├── transforms_2d.py
│ │ │ └── transforms_3d.py
│ │ ├── registry.py
│ │ ├── scannet_dataset.py
│ │ ├── semantickitti_dataset.py
│ │ ├── sunrgbd_dataset.py
│ │ └── waymo_dataset.py
│ ├── models/
│ │ ├── __init__.py
│ │ ├── backbones/
│ │ │ ├── DLA.py
│ │ │ ├── __init__.py
│ │ │ ├── base_pointnet.py
│ │ │ ├── multi_backbone.py
│ │ │ ├── nostem_regnet.py
│ │ │ ├── pointnet2_sa_msg.py
│ │ │ ├── pointnet2_sa_ssg.py
│ │ │ ├── second.py
│ │ │ └── swin.py
│ │ ├── builder.py
│ │ ├── dense_heads/
│ │ │ ├── __init__.py
│ │ │ ├── anchor3d_head.py
│ │ │ ├── base_conv_bbox_head.py
│ │ │ ├── centerpoint_head.py
│ │ │ ├── free_anchor3d_head.py
│ │ │ ├── parta2_rpn_head.py
│ │ │ ├── shape_aware_head.py
│ │ │ ├── sparsefusion_head_deform.py
│ │ │ ├── ssd_3d_head.py
│ │ │ ├── train_mixins.py
│ │ │ ├── transfusion_head.py
│ │ │ └── vote_head.py
│ │ ├── detectors/
│ │ │ ├── __init__.py
│ │ │ ├── base.py
│ │ │ ├── centerpoint.py
│ │ │ ├── dynamic_voxelnet.py
│ │ │ ├── h3dnet.py
│ │ │ ├── imvotenet.py
│ │ │ ├── mvx_faster_rcnn.py
│ │ │ ├── mvx_two_stage.py
│ │ │ ├── parta2.py
│ │ │ ├── single_stage.py
│ │ │ ├── sparsefusion.py
│ │ │ ├── ssd3dnet.py
│ │ │ ├── transfusion.py
│ │ │ ├── two_stage.py
│ │ │ ├── votenet.py
│ │ │ └── voxelnet.py
│ │ ├── fusion_layers/
│ │ │ ├── __init__.py
│ │ │ ├── coord_transform.py
│ │ │ ├── point_fusion.py
│ │ │ └── vote_fusion.py
│ │ ├── losses/
│ │ │ ├── __init__.py
│ │ │ ├── axis_aligned_iou_loss.py
│ │ │ ├── chamfer_distance.py
│ │ │ └── uncertainty_loss.py
│ │ ├── middle_encoders/
│ │ │ ├── __init__.py
│ │ │ ├── pillar_scatter.py
│ │ │ ├── sparse_encoder.py
│ │ │ └── sparse_unet.py
│ │ ├── model_utils/
│ │ │ ├── __init__.py
│ │ │ └── vote_module.py
│ │ ├── necks/
│ │ │ ├── __init__.py
│ │ │ └── second_fpn.py
│ │ ├── registry.py
│ │ ├── roi_heads/
│ │ │ ├── __init__.py
│ │ │ ├── base_3droi_head.py
│ │ │ ├── bbox_heads/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── h3d_bbox_head.py
│ │ │ │ └── parta2_bbox_head.py
│ │ │ ├── h3d_roi_head.py
│ │ │ ├── mask_heads/
│ │ │ │ ├── __init__.py
│ │ │ │ ├── pointwise_semantic_head.py
│ │ │ │ └── primitive_head.py
│ │ │ ├── part_aggregation_roi_head.py
│ │ │ └── roi_extractors/
│ │ │ ├── __init__.py
│ │ │ └── single_roiaware_extractor.py
│ │ ├── utils/
│ │ │ ├── __init__.py
│ │ │ ├── clip_sigmoid.py
│ │ │ ├── deformable_decoder.py
│ │ │ ├── depth_encoder.py
│ │ │ ├── drop.py
│ │ │ ├── ffn.py
│ │ │ ├── inverse_sigmoid.py
│ │ │ ├── mlp.py
│ │ │ ├── network_modules.py
│ │ │ ├── ops/
│ │ │ │ ├── functions/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ └── ms_deform_attn_func.py
│ │ │ │ ├── make.sh
│ │ │ │ ├── modules/
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ └── ms_deform_attn.py
│ │ │ │ ├── setup.py
│ │ │ │ ├── src/
│ │ │ │ │ ├── cpu/
│ │ │ │ │ │ ├── ms_deform_attn_cpu.cpp
│ │ │ │ │ │ └── ms_deform_attn_cpu.h
│ │ │ │ │ ├── cuda/
│ │ │ │ │ │ ├── ms_deform_attn_cuda.cu
│ │ │ │ │ │ ├── ms_deform_attn_cuda.h
│ │ │ │ │ │ └── ms_deform_im2col_cuda.cuh
│ │ │ │ │ ├── ms_deform_attn.h
│ │ │ │ │ └── vision.cpp
│ │ │ │ └── test.py
│ │ │ ├── projection.py
│ │ │ ├── sparsefusion_models.py
│ │ │ ├── transformer.py
│ │ │ └── transformerdecoder.py
│ │ └── voxel_encoders/
│ │ ├── __init__.py
│ │ ├── pillar_encoder.py
│ │ ├── utils.py
│ │ └── voxel_encoder.py
│ ├── ops/
│ │ ├── __init__.py
│ │ ├── ball_query/
│ │ │ ├── __init__.py
│ │ │ ├── ball_query.py
│ │ │ └── src/
│ │ │ ├── ball_query.cpp
│ │ │ └── ball_query_cuda.cu
│ │ ├── furthest_point_sample/
│ │ │ ├── __init__.py
│ │ │ ├── furthest_point_sample.py
│ │ │ ├── points_sampler.py
│ │ │ ├── src/
│ │ │ │ ├── furthest_point_sample.cpp
│ │ │ │ └── furthest_point_sample_cuda.cu
│ │ │ └── utils.py
│ │ ├── gather_points/
│ │ │ ├── __init__.py
│ │ │ ├── gather_points.py
│ │ │ └── src/
│ │ │ ├── gather_points.cpp
│ │ │ └── gather_points_cuda.cu
│ │ ├── group_points/
│ │ │ ├── __init__.py
│ │ │ ├── group_points.py
│ │ │ └── src/
│ │ │ ├── group_points.cpp
│ │ │ └── group_points_cuda.cu
│ │ ├── interpolate/
│ │ │ ├── __init__.py
│ │ │ ├── src/
│ │ │ │ ├── interpolate.cpp
│ │ │ │ ├── three_interpolate_cuda.cu
│ │ │ │ └── three_nn_cuda.cu
│ │ │ ├── three_interpolate.py
│ │ │ └── three_nn.py
│ │ ├── iou3d/
│ │ │ ├── __init__.py
│ │ │ ├── iou3d_utils.py
│ │ │ └── src/
│ │ │ ├── iou3d.cpp
│ │ │ └── iou3d_kernel.cu
│ │ ├── knn/
│ │ │ ├── __init__.py
│ │ │ ├── knn.py
│ │ │ └── src/
│ │ │ ├── knn.cpp
│ │ │ └── knn_cuda.cu
│ │ ├── norm.py
│ │ ├── pointnet_modules/
│ │ │ ├── __init__.py
│ │ │ ├── builder.py
│ │ │ ├── point_fp_module.py
│ │ │ ├── point_sa_module.py
│ │ │ └── registry.py
│ │ ├── roiaware_pool3d/
│ │ │ ├── __init__.py
│ │ │ ├── points_in_boxes.py
│ │ │ ├── roiaware_pool3d.py
│ │ │ └── src/
│ │ │ ├── points_in_boxes_cpu.cpp
│ │ │ ├── points_in_boxes_cuda.cu
│ │ │ ├── roiaware_pool3d.cpp
│ │ │ └── roiaware_pool3d_kernel.cu
│ │ ├── sparse_block.py
│ │ ├── spconv/
│ │ │ ├── __init__.py
│ │ │ ├── conv.py
│ │ │ ├── functional.py
│ │ │ ├── include/
│ │ │ │ ├── paramsgrid.h
│ │ │ │ ├── prettyprint.h
│ │ │ │ ├── pybind11_utils.h
│ │ │ │ ├── spconv/
│ │ │ │ │ ├── fused_spconv_ops.h
│ │ │ │ │ ├── geometry.h
│ │ │ │ │ ├── indice.cu.h
│ │ │ │ │ ├── indice.h
│ │ │ │ │ ├── maxpool.h
│ │ │ │ │ ├── mp_helper.h
│ │ │ │ │ ├── point2voxel.h
│ │ │ │ │ ├── pool_ops.h
│ │ │ │ │ ├── reordering.cu.h
│ │ │ │ │ ├── reordering.h
│ │ │ │ │ └── spconv_ops.h
│ │ │ │ ├── tensorview/
│ │ │ │ │ ├── helper_kernel.cu.h
│ │ │ │ │ ├── helper_launch.h
│ │ │ │ │ └── tensorview.h
│ │ │ │ ├── torch_utils.h
│ │ │ │ └── utility/
│ │ │ │ └── timer.h
│ │ │ ├── modules.py
│ │ │ ├── ops.py
│ │ │ ├── overwrite_spconv/
│ │ │ │ └── write_spconv2.py
│ │ │ ├── pool.py
│ │ │ ├── src/
│ │ │ │ ├── all.cc
│ │ │ │ ├── indice.cc
│ │ │ │ ├── indice_cuda.cu
│ │ │ │ ├── maxpool.cc
│ │ │ │ ├── maxpool_cuda.cu
│ │ │ │ ├── reordering.cc
│ │ │ │ └── reordering_cuda.cu
│ │ │ ├── structure.py
│ │ │ └── test_utils.py
│ │ └── voxel/
│ │ ├── __init__.py
│ │ ├── scatter_points.py
│ │ ├── src/
│ │ │ ├── scatter_points_cpu.cpp
│ │ │ ├── scatter_points_cuda.cu
│ │ │ ├── voxelization.cpp
│ │ │ ├── voxelization.h
│ │ │ ├── voxelization_cpu.cpp
│ │ │ └── voxelization_cuda.cu
│ │ └── voxelize.py
│ ├── utils/
│ │ ├── __init__.py
│ │ └── collect_env.py
│ └── version.py
├── requirements/
│ ├── build.txt
│ ├── docs.txt
│ ├── optional.txt
│ ├── readthedocs.txt
│ ├── runtime.txt
│ └── tests.txt
├── requirements.txt
├── setup.cfg
├── setup.py
├── tests/
│ ├── test_data/
│ │ ├── test_datasets/
│ │ │ ├── test_dataset_wrappers.py
│ │ │ ├── test_kitti_dataset.py
│ │ │ ├── test_lyft_dataset.py
│ │ │ ├── test_nuscene_dataset.py
│ │ │ ├── test_scannet_dataset.py
│ │ │ ├── test_semantickitti_dataset.py
│ │ │ └── test_sunrgbd_dataset.py
│ │ └── test_pipelines/
│ │ ├── test_augmentations/
│ │ │ ├── test_data_augment_utils.py
│ │ │ ├── test_test_augment_utils.py
│ │ │ └── test_transforms_3d.py
│ │ ├── test_indoor_pipeline.py
│ │ ├── test_indoor_sample.py
│ │ ├── test_loadings/
│ │ │ ├── test_load_points_from_multi_sweeps.py
│ │ │ └── test_loading.py
│ │ └── test_outdoor_pipeline.py
│ ├── test_metrics/
│ │ ├── test_indoor_eval.py
│ │ ├── test_kitti_eval.py
│ │ ├── test_losses.py
│ │ └── test_seg_eval.py
│ ├── test_models/
│ │ ├── test_backbones.py
│ │ ├── test_common_modules/
│ │ │ ├── test_middle_encoders.py
│ │ │ ├── test_pointnet_modules.py
│ │ │ ├── test_pointnet_ops.py
│ │ │ ├── test_roiaware_pool3d.py
│ │ │ ├── test_sparse_unet.py
│ │ │ └── test_vote_module.py
│ │ ├── test_detectors.py
│ │ ├── test_forward.py
│ │ ├── test_fusion/
│ │ │ ├── test_fusion_coord_trans.py
│ │ │ ├── test_point_fusion.py
│ │ │ └── test_vote_fusion.py
│ │ ├── test_heads/
│ │ │ ├── test_heads.py
│ │ │ ├── test_parta2_bbox_head.py
│ │ │ ├── test_roi_extractors.py
│ │ │ └── test_semantic_heads.py
│ │ ├── test_necks/
│ │ │ ├── test_fpn.py
│ │ │ └── test_necks.py
│ │ └── test_voxel_encoder/
│ │ ├── test_dynamic_scatter.py
│ │ ├── test_voxel_encoders.py
│ │ ├── test_voxel_generator.py
│ │ └── test_voxelize.py
│ ├── test_runtime/
│ │ ├── test_apis.py
│ │ └── test_config.py
│ ├── test_samples/
│ │ └── parta2_roihead_inputs.npz
│ └── test_utils/
│ ├── test_anchors.py
│ ├── test_assigners.py
│ ├── test_bbox_coders.py
│ ├── test_box3d.py
│ ├── test_box_np_ops.py
│ ├── test_coord_3d_mode.py
│ ├── test_merge_augs.py
│ ├── test_nms.py
│ ├── test_points.py
│ ├── test_samplers.py
│ └── test_utils.py
├── tools/
│ ├── analysis_tools/
│ │ ├── analyze_logs.py
│ │ ├── benchmark.py
│ │ └── get_flops.py
│ ├── combine_view_info.py
│ ├── create_data.py
│ ├── create_data.sh
│ ├── data_converter/
│ │ ├── __init__.py
│ │ ├── create_gt_database.py
│ │ ├── indoor_converter.py
│ │ ├── kitti_converter.py
│ │ ├── kitti_data_utils.py
│ │ ├── lyft_converter.py
│ │ ├── nuimage_converter.py
│ │ ├── nuscenes_converter.py
│ │ ├── scannet_data_utils.py
│ │ ├── sunrgbd_data_utils.py
│ │ └── waymo_converter.py
│ ├── dist_test.sh
│ ├── dist_train.sh
│ ├── misc/
│ │ ├── fuse_conv_bn.py
│ │ ├── print_config.py
│ │ └── visualize_results.py
│ ├── model_converters/
│ │ ├── convert_votenet_checkpoints.py
│ │ ├── publish_model.py
│ │ └── regnet2mmdet.py
│ ├── slurm_test.sh
│ ├── slurm_train.sh
│ ├── test.py
│ └── train.py
└── train.sh
================================================
FILE CONTENTS
================================================
================================================
FILE: .gitignore
================================================
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
*.ipynb
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# pyenv
.python-version
# celery beat schedule file
celerybeat-schedule
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
# cython generated cpp
data
.vscode
.idea
# custom
*.pkl
*.pkl.json
*.log.json
work_dirs/
exps/
*~
# Pytorch
*.pth
# demo
*.jpg
*.png
/data/scannet/scans/
/data/sunrgbd/OFFICIAL_SUNRGBD/
*.obj
*.ply
================================================
FILE: LICENSE
================================================
Copyright 2018-2019 Open-MMLab. All rights reserved.
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright 2018-2019 Open-MMLab.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
================================================
FILE: MANIFEST.in
================================================
include requirements/*.txt
include mmdet3d/ops/**/*.cpp mmdet3d/ops/**/*.cu
include mmdet3d/ops/**/*.h mmdet3d/ops/**/*.cc
include mmdet3d/VERSION
================================================
FILE: README.md
================================================
# [ICCV 2023] SparseFusion: Fusing Multi-Modal Sparse Representations for Multi-Sensor 3D Object Detection

## Abstract
We propose SparseFusion, a novel multi-sensor 3D detection method that exclusively uses sparse candidates and sparse representations. Specifically, SparseFusion utilizes the outputs of parallel detectors in the LiDAR and camera modalities as sparse candidates for fusion. We transform the camera candidates into the LiDAR coordinate space by disentangling the object representations. Then, we can fuse the multi-modality candidates in a unified 3D space by a lightweight self-attention module. To mitigate negative transfer between modalities, we propose novel semantic and geometric cross-modality transfer modules that are applied prior to the modality-specific detectors. SparseFusion achieves state-of-the-art performance on the nuScenes benchmark while also running at the fastest speed.
[[paper link]](https://openaccess.thecvf.com/content/ICCV2023/papers/Xie_SparseFusion_Fusing_Multi-Modal_Sparse_Representations_for_Multi-Sensor_3D_Object_Detection_ICCV_2023_paper.pdf) [[Chinese summary (自动驾驶之心)]](https://zhuanlan.zhihu.com/p/671293323)
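To make the pipeline above concrete, here is a minimal, hypothetical sketch of the fusion step: candidates from both modalities (with the camera candidates already lifted into the LiDAR space) are concatenated and mixed by a lightweight self-attention layer. The module and tensor names below are illustrative assumptions, not the actual classes in this repository.
```python
import torch
import torch.nn as nn

class SparseCandidateFusion(nn.Module):
    """Illustrative only: fuse per-instance candidate features with self-attention."""

    def __init__(self, dim=128, num_heads=8):
        super().__init__()
        self.attn = nn.MultiheadAttention(dim, num_heads)

    def forward(self, lidar_feats, camera_feats):
        # lidar_feats:  (B, N_l, C) candidates from the LiDAR branch
        # camera_feats: (B, N_c, C) camera candidates transformed into LiDAR space
        cands = torch.cat([lidar_feats, camera_feats], dim=1)  # (B, N_l + N_c, C)
        cands = cands.transpose(0, 1)                          # (N, B, C) for nn.MultiheadAttention
        fused, _ = self.attn(cands, cands, cands)              # lightweight self-attention fusion
        return fused.transpose(0, 1)                           # back to (B, N, C)

fused = SparseCandidateFusion()(torch.randn(2, 200, 128), torch.randn(2, 200, 128))
print(fused.shape)  # torch.Size([2, 400, 128])
```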
## Updates
[2023-8-21] Greatly improved training GPU memory efficiency (45 GB -> 29 GB) with no loss in performance or speed!
[2023-7-13] 🔥SparseFusion has been accepted to ICCV 2023!🔥
[2023-3-21] We release the first version code of SparseFusion.
## Overview

Compared to existing fusion algorithms, SparseFusion achieves state-of-the-art performance as well as the fastest inference speed on the nuScenes test set. †: The official [repository](https://github.com/zehuichen123/AutoAlignV2) of AutoAlignV2 uses flipping as test-time augmentation. ‡: We use the BEVFusion-base results from the official [repository](https://github.com/mit-han-lab/bevfusion) of BEVFusion to match the input resolutions of the other methods. §: Swin-T is adopted as the image backbone.
## nuScenes Performance
We do not use any test-time augmentation or model ensembling to obtain these results. We have released the configuration files and pretrained checkpoints to reproduce our results.
#### Validation Set
| Image Backbone | Point Cloud Backbone | mAP | NDS | Link |
| --------- | ------ | ------ | --------- | --------- |
| ResNet50 | VoxelNet | 70.5 | 72.8 | [config](configs/sparsefusion_nusc_voxel_LC_r50.py)/[ckpt](https://drive.google.com/file/d/1NZIrg7s-VwxkwuPHTTWSQQO7T7IILBGC/view?usp=share_link) |
| Swin-T | VoxelNet | 71.0 | 73.1 | [config](configs/sparsefusion_nusc_voxel_LC_SwinT.py)/[ckpt](https://drive.google.com/file/d/1dAhOKtbLd1e3I5jwk_3E1gzbl61P24qy/view?usp=share_link) |
#### Test Set
| Image Backbone | Point Cloud Backbone | mAP | NDS |
| --------- | ------ | ------ | --------- |
| ResNet50 | VoxelNet | 72.0 | 73.8 |
## Usage
#### Installation
+ We test our code in an environment with CUDA 11.5, Python 3.7, PyTorch 1.7.1, TorchVision 0.8.2, NumPy 1.20.0, and Numba 0.48.0.
+ We use `mmdet==2.10.0` and `mmcv==1.2.7`. Please refer to their official instructions for installation.
+ You can install `mmdet3d==0.11.0` directly from our repo by running:
```
cd SparseFusion
pip install -e .
```
+ We use `spconv==2.3.3`. Please follow the [official instruction](https://github.com/traveller59/spconv) to install it based on your CUDA version.
```
pip install spconv-cuxxx
# e.g. pip install spconv-cu114
```
+ You also need to install the deformable attention module with the following command.
```
pip install ./mmdet3d/models/utils/ops
```
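After installation, a quick sanity check (a sketch; package names as above) can confirm that the core dependencies import correctly and report their versions:
```python
# Minimal environment check: verify the core packages import and print versions.
import torch, mmcv, mmdet, mmdet3d

print("CUDA available:", torch.cuda.is_available())
for name, mod in [("torch", torch), ("mmcv", mmcv), ("mmdet", mmdet), ("mmdet3d", mmdet3d)]:
    print(f"{name}: {mod.__version__}")

# spconv 2.x is installed as spconv-cuXXX but imported as `spconv`.
try:
    import spconv
    print("spconv:", getattr(spconv, "__version__", "unknown"))
except ImportError:
    print("spconv is not installed; see https://github.com/traveller59/spconv")
```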
#### Data Preparation
Download nuScenes full dataset from the [official website](https://www.nuscenes.org/download). You should have a folder structure like this:
```
SparseFusion
├── mmdet3d
├── tools
├── configs
├── data
│ ├── nuscenes
│ │ ├── maps
│ │ ├── samples
│ │ ├── sweeps
│ │ ├── v1.0-test
│ │ ├── v1.0-trainval
```
Then, you can select **either** of the two ways to preprocess the data.
1. Run the following two commands sequentially.
```
python tools/create_data.py nuscenes --root-path ./data/nuscenes --out-dir ./data/nuscenes --extra-tag nuscenes
python tools/combine_view_info.py
```
2. Alternatively, you may directly download our preprocessed data from [Google Drive](https://drive.google.com/drive/folders/1L5lvLsNWBA0vfTlNSMa4OXXBLoZgJbg4?usp=share_link), and put these files in `data/nuscenes`.
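Either way, you can quickly verify the resulting layout before training. The snippet below is only a sketch; the exact info-file names follow the usual mmdet3d `--extra-tag nuscenes` convention and may differ from the files shipped in our preprocessed archive.
```python
from pathlib import Path

root = Path("data/nuscenes")
raw_dirs = ["maps", "samples", "sweeps", "v1.0-trainval"]
# Typical mmdet3d info files generated by tools/create_data.py (names assumed).
info_files = ["nuscenes_infos_train.pkl", "nuscenes_infos_val.pkl"]

for d in raw_dirs:
    print(f"{d:<20}", "ok" if (root / d).is_dir() else "missing")
for f in info_files:
    print(f"{f:<30}", "ok" if (root / f).is_file() else "missing")
```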
#### Initial Weights
Please download the [initial weights](https://drive.google.com/drive/folders/1wmYBi3PBprdcegF843AU-22q2OwDgoZk?usp=share_link) for model training, and put them in `checkpoints/`.
#### Train & Test
In our default setting, we train the model with 4 GPUs.
```
# training
bash tools/dist_train.sh configs/sparsefusion_nusc_voxel_LC_r50.py 4 --work-dir work_dirs/sparsefusion_nusc_voxel_LC_r50
# test
bash tools/dist_test.sh configs/sparsefusion_nusc_voxel_LC_r50.py ${CHECKPOINT_FILE} 4 --eval=bbox
```
Note: We use A6000 GPUs (48 GB of per-GPU memory) for model training. Training the SparseFusion model (ResNet50 backbone) requires ~29 GB of per-GPU memory.
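For debugging outside the distributed launchers, the config can also be loaded and the detector built directly in Python (a sketch using the mmcv/mmdet3d APIs available in this codebase):
```python
# Build the SparseFusion detector from its config without starting training.
from mmcv import Config
from mmdet3d.models import build_detector

cfg = Config.fromfile("configs/sparsefusion_nusc_voxel_LC_r50.py")
model = build_detector(
    cfg.model, train_cfg=cfg.get("train_cfg"), test_cfg=cfg.get("test_cfg"))
print(type(model).__name__)  # detector class registered in mmdet3d
print(sum(p.numel() for p in model.parameters()) / 1e6, "M parameters")
```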
## Contact
If you have any questions, feel free to open an issue or contact us at yichen_xie@berkeley.edu.
## Acknowledgments
We sincerely thank the authors of [mmdetection3d](https://github.com/open-mmlab/mmdetection3d), [TransFusion](https://github.com/XuyangBai/TransFusion), [BEVFusion](https://github.com/mit-han-lab/bevfusion), [MSMDFusion](https://github.com/SxJyJay/MSMDFusion), and [DeepInteraction](https://github.com/fudan-zvg/DeepInteraction) for providing their codes or pretrained weights.
## Reference
If you find our work useful, please consider citing the following paper:
```
@article{xie2023sparsefusion,
    title={SparseFusion: Fusing Multi-Modal Sparse Representations for Multi-Sensor 3D Object Detection},
    author={Xie, Yichen and Xu, Chenfeng and Rakotosaona, Marie-Julie and Rim, Patrick and Tombari, Federico and Keutzer, Kurt and Tomizuka, Masayoshi and Zhan, Wei},
    journal={arXiv preprint arXiv:2304.14340},
    year={2023}
}
```
================================================
FILE: README_zh-CN.md
================================================
[Documentation](https://mmdetection3d.readthedocs.io/en/latest/) | [Build Status](https://github.com/open-mmlab/mmdetection3d/actions) | [Codecov](https://codecov.io/gh/open-mmlab/mmdetection3d) | [License](https://github.com/open-mmlab/mmdetection3d/blob/master/LICENSE)
**News**: We released v0.11.0.
In the 3rd [nuScenes 3D detection challenge](https://www.nuscenes.org/object-detection?externalData=all&mapData=all&modalities=Any) (the 5th AI Driving Olympics, NeurIPS 2020), we obtained the best PKL award, third place, and the best vision-only results. The related code and models will be released shortly.
Documentation: https://mmdetection3d.readthedocs.io/
## Introduction
[English](README.md) | 简体中文
The master branch works with PyTorch 1.3+.
MMDetection3D is an open-source, PyTorch-based object detection toolbox and the next-generation platform for general 3D detection. It is part of the OpenMMLab project jointly launched by the Multimedia Laboratory of The Chinese University of Hong Kong and SenseTime.

### Major Features
- **Support for multi-modality/single-modality detectors**
  It supports multi-modality/single-modality detectors, including MVXNet, VoteNet, PointPillars, etc.
- **Support for indoor/outdoor datasets**
  It supports indoor/outdoor 3D detection datasets, including ScanNet, SUNRGB-D, Waymo, nuScenes, Lyft, and KITTI.
  For the nuScenes dataset, we also support the [nuImages dataset](https://github.com/open-mmlab/mmdetection3d/tree/master/configs/nuimages).
- **Natural integration with 2D detection**
  All of the **300+ models and 40+ algorithms** supported by [MMDetection](https://github.com/open-mmlab/mmdetection/blob/master/docs/model_zoo.md), together with the related modules, can be trained or used in this codebase.
- **High efficiency**
  It trains faster than other codebases. The main comparison results are shown in the table below; more details can be found in the [benchmark documentation](./docs/benchmarks.md). We compare the number of training samples per second (higher is better). Models not supported by a codebase are marked with `×`.
| Methods | MMDetection3D | [OpenPCDet](https://github.com/open-mmlab/OpenPCDet) |[votenet](https://github.com/facebookresearch/votenet)| [Det3D](https://github.com/poodarchu/Det3D) |
|:-------:|:-------------:|:---------:|:-----:|:-----:|
| VoteNet | 358 | × | 77 | × |
| PointPillars-car| 141 | × | × | 140 |
| PointPillars-3class| 107 |44 | × | × |
| SECOND| 40 |30 | × | × |
| Part-A2| 17 |14 | × | × |
Like [MMDetection](https://github.com/open-mmlab/mmdetection) and [MMCV](https://github.com/open-mmlab/mmcv), MMDetection3D can also be used as a library to support various projects built on top of it.
## License
This project is released under the [Apache 2.0 license](LICENSE).
## Changelog
The latest version, v0.11.0, was released on 2021-03-01.
Please refer to the [changelog](docs/changelog.md) for details and release history.
## Benchmark and Model Zoo
Results and models are available in the [model zoo](docs/model_zoo.md).
Supported backbones:
- [x] PointNet (CVPR'2017)
- [x] PointNet++ (NeurIPS'2017)
- [x] RegNet (CVPR'2020)
Supported methods:
- [x] [SECOND (Sensor'2018)](configs/second/README.md)
- [x] [PointPillars (CVPR'2019)](configs/pointpillars/README.md)
- [x] [FreeAnchor (NeurIPS'2019)](configs/free_anchor/README.md)
- [x] [VoteNet (ICCV'2019)](configs/votenet/README.md)
- [x] [H3DNet (ECCV'2020)](configs/h3dnet/README.md)
- [x] [3DSSD (CVPR'2020)](configs/3dssd/README.md)
- [x] [Part-A2 (TPAMI'2020)](configs/parta2/README.md)
- [x] [MVXNet (ICRA'2019)](configs/mvxnet/README.md)
- [x] [CenterPoint (CVPR'2021)](configs/centerpoint/README.md)
- [x] [SSN (ECCV'2020)](configs/ssn/README.md)
- [x] [ImVoteNet (CVPR'2020)](configs/imvotenet/README.md)
| | ResNet | ResNeXt | SENet |PointNet++ | HRNet | RegNetX | Res2Net |
|--------------------|:--------:|:--------:|:--------:|:---------:|:-----:|:--------:|:-----:|
| SECOND | ☐ | ☐ | ☐ | ✗ | ☐ | ✓ | ☐ |
| PointPillars | ☐ | ☐ | ☐ | ✗ | ☐ | ✓ | ☐ |
| FreeAnchor | ☐ | ☐ | ☐ | ✗ | ☐ | ✓ | ☐ |
| VoteNet | ✗ | ✗ | ✗ | ✓ | ✗ | ✗ | ✗ |
| H3DNet | ✗ | ✗ | ✗ | ✓ | ✗ | ✗ | ✗ |
| 3DSSD | ✗ | ✗ | ✗ | ✓ | ✗ | ✗ | ✗ |
| Part-A2 | ☐ | ☐ | ☐ | ✗ | ☐ | ✓ | ☐ |
| MVXNet | ☐ | ☐ | ☐ | ✗ | ☐ | ✓ | ☐ |
| CenterPoint | ☐ | ☐ | ☐ | ✗ | ☐ | ✓ | ☐ |
| SSN | ☐ | ☐ | ☐ | ✗ | ☐ | ✓ | ☐ |
| ImVoteNet | ✗ | ✗ | ✗ | ✓ | ✗ | ✗ | ✗ |
Other features
- [x] [Dynamic Voxelization](configs/dynamic_voxelization/README.md)
**Note:** All of the **300+ models and 40+ algorithms** for 2D detection supported by [MMDetection](https://github.com/open-mmlab/mmdetection/blob/master/docs/model_zoo.md) can be trained or used in MMDetection3D.
## Installation
Please refer to the [getting started documentation](docs/get_started.md) for installation.
## Getting Started
Please see the [getting started documentation](docs/get_started.md) for the basic usage of MMDetection3D. We provide guides for new users on using [existing datasets](docs/1_exist_data_model.md) and [new datasets](docs/2_new_data_model.md). There are also advanced tutorials on [learning configurations](docs/tutorials/config.md), [adding dataset support](docs/tutorials/customize_dataset.md), [designing new data pipelines](docs/tutorials/data_pipeline.md), [adding customized models](docs/tutorials/customize_models.md), [customizing runtime settings](docs/tutorials/customize_runtime.md), and the [Waymo dataset](docs/tutorials/waymo.md).
## Citation
If you find this project useful in your research, please consider citing MMDetection3D with the following BibTeX entry:
```latex
@misc{mmdet3d2020,
    title={{MMDetection3D: OpenMMLab} next-generation platform for general {3D} object detection},
    author={MMDetection3D Contributors},
    howpublished = {\url{https://github.com/open-mmlab/mmdetection3d}},
    year={2020}
}
```
## Contributing
We appreciate all contributions to improve MMDetection3D. Please refer to the [contributing guideline](.github/CONTRIBUTING.md) for how to participate in the project.
## Acknowledgement
MMDetection3D is an open-source project contributed by researchers and engineers from various universities and companies. We appreciate all the contributors who implement new methods and add new features, as well as the users who provide valuable feedback. We hope this toolbox and benchmark can serve the community by offering a flexible toolkit to reimplement existing methods and develop new 3D detection models.
## Projects in OpenMMLab
- [MMCV](https://github.com/open-mmlab/mmcv): OpenMMLab foundational library for computer vision
- [MMClassification](https://github.com/open-mmlab/mmclassification): OpenMMLab image classification toolbox
- [MMDetection](https://github.com/open-mmlab/mmdetection): OpenMMLab object detection toolbox
- [MMDetection3D](https://github.com/open-mmlab/mmdetection3d): OpenMMLab's next-generation platform for general 3D object detection
- [MMSegmentation](https://github.com/open-mmlab/mmsegmentation): OpenMMLab semantic segmentation toolbox
- [MMAction2](https://github.com/open-mmlab/mmaction2): OpenMMLab's next-generation video understanding toolbox
- [MMTracking](https://github.com/open-mmlab/mmtracking): OpenMMLab unified video object perception platform
- [MMPose](https://github.com/open-mmlab/mmpose): OpenMMLab pose estimation toolbox
- [MMEditing](https://github.com/open-mmlab/mmediting): OpenMMLab image and video editing toolbox
================================================
FILE: configs/3dssd/3dssd_kitti-3d-car.py
================================================
_base_ = [
'../_base_/models/3dssd.py', '../_base_/datasets/kitti-3d-car.py',
'../_base_/default_runtime.py'
]
# dataset settings
dataset_type = 'KittiDataset'
data_root = 'data/kitti/'
class_names = ['Car']
point_cloud_range = [0, -40, -5, 70, 40, 3]
input_modality = dict(use_lidar=True, use_camera=False)
db_sampler = dict(
data_root=data_root,
info_path=data_root + 'kitti_dbinfos_train.pkl',
rate=1.0,
prepare=dict(filter_by_difficulty=[-1], filter_by_min_points=dict(Car=5)),
classes=class_names,
sample_groups=dict(Car=15))
file_client_args = dict(backend='disk')
# Uncomment the following if use ceph or other file clients.
# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient
# for more details.
# file_client_args = dict(
# backend='petrel', path_mapping=dict(data='s3://kitti_data/'))
train_pipeline = [
dict(
type='LoadPointsFromFile',
coord_type='LIDAR',
load_dim=4,
use_dim=4,
file_client_args=file_client_args),
dict(
type='LoadAnnotations3D',
with_bbox_3d=True,
with_label_3d=True,
file_client_args=file_client_args),
dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
dict(type='ObjectSample', db_sampler=db_sampler),
dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5),
dict(
type='ObjectNoise',
num_try=100,
translation_std=[1.0, 1.0, 0],
global_rot_range=[0.0, 0.0],
rot_range=[-1.0471975511965976, 1.0471975511965976]),
dict(
type='GlobalRotScaleTrans',
rot_range=[-0.78539816, 0.78539816],
scale_ratio_range=[0.9, 1.1]),
dict(type='BackgroundPointsFilter', bbox_enlarge_range=(0.5, 2.0, 0.5)),
dict(type='IndoorPointSample', num_points=16384),
dict(type='DefaultFormatBundle3D', class_names=class_names),
dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
]
test_pipeline = [
dict(
type='LoadPointsFromFile',
coord_type='LIDAR',
load_dim=4,
use_dim=4,
file_client_args=file_client_args),
dict(
type='MultiScaleFlipAug3D',
img_scale=(1333, 800),
pts_scale_ratio=1,
flip=False,
transforms=[
dict(
type='GlobalRotScaleTrans',
rot_range=[0, 0],
scale_ratio_range=[1., 1.],
translation_std=[0, 0, 0]),
dict(type='RandomFlip3D'),
dict(
type='PointsRangeFilter', point_cloud_range=point_cloud_range),
dict(type='IndoorPointSample', num_points=16384),
dict(
type='DefaultFormatBundle3D',
class_names=class_names,
with_label=False),
dict(type='Collect3D', keys=['points'])
])
]
data = dict(
samples_per_gpu=4,
workers_per_gpu=4,
train=dict(dataset=dict(pipeline=train_pipeline)),
val=dict(pipeline=test_pipeline),
test=dict(pipeline=test_pipeline))
evaluation = dict(interval=2)
# model settings
model = dict(
bbox_head=dict(
num_classes=1,
bbox_coder=dict(
type='AnchorFreeBBoxCoder', num_dir_bins=12, with_rot=True)))
# optimizer
lr = 0.002 # max learning rate
optimizer = dict(type='AdamW', lr=lr, weight_decay=0)
optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
lr_config = dict(policy='step', warmup=None, step=[80, 120])
# runtime settings
total_epochs = 150
# yapf:disable
log_config = dict(
interval=30,
hooks=[
dict(type='TextLoggerHook'),
dict(type='TensorboardLoggerHook')
])
# yapf:enable
================================================
FILE: configs/3dssd/README.md
================================================
# 3DSSD: Point-based 3D Single Stage Object Detector
## Introduction
[ALGORITHM]
We implement 3DSSD and provide the results and checkpoints on KITTI datasets.
```
@inproceedings{yang20203dssd,
author = {Zetong Yang and Yanan Sun and Shu Liu and Jiaya Jia},
title = {3DSSD: Point-based 3D Single Stage Object Detector},
booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
year = {2020}
}
```
### Experiment details on KITTI datasets
Some settings in our implementation are different from the [official implementation](https://github.com/Jia-Research-Lab/3DSSD), which brings marginal differences to the performance on KITTI datasets in our experiments. To keep our models simple and unified, we do not adopt these settings in our implementation. The differences are listed below:
1. We keep the scenes without any object, while the official code skips these scenes during training. In the official implementation, only 3229 and 3394 samples are used as the training and validation sets, respectively. In our implementation, we keep using 3712 and 3769 samples as the training and validation sets, respectively, the same as those used for all the other models in our implementation on KITTI datasets.
2. We do not modify the decay of `batch normalization` during training.
3. While using [`DataBaseSampler`](https://github.com/open-mmlab/mmdetection3d/blob/master/mmdet3d/datasets/pipelines/dbsampler.py#L80) for data augmentation, the official code uses road planes as reference to place the sampled objects while we do not.
4. We perform detection using LIDAR coordinates while the official code uses camera coordinates.
## Results
### KITTI
| Backbone |Class| Lr schd | Mem (GB) | Inf time (fps) | mAP |Download |
| :---------: | :-----: | :------: | :------------: | :----: |:----: | :------: |
| [PointNet2SAMSG](./3dssd_kitti-3d-car.py)| Car |72e|4.7||78.39 (81.00)[1]|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/3dssd/3dssd_kitti-3d-car_20210324_122002-07e9a19b.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/3dssd/3dssd_kitti-3d-car_20210324_122002.log.json)|
[1]: We report two different 3D object detection results here. 78.39 mAP is evaluated by our evaluation code, and 81.00 mAP is evaluated by the official development kit (the same as that used in the paper and official code of 3DSSD). We found that the commonly used Python implementation of [`rotate_iou`](https://github.com/traveller59/second.pytorch/blob/e42e4a0e17262ab7d180ee96a0a36427f2c20a44/second/core/non_max_suppression/nms_gpu.py#L605), which is used in our KITTI dataset evaluation, differs from the official implementation in the [KITTI benchmark](http://www.cvlibs.net/datasets/kitti/eval_object.php?obj_benchmark=3d).
================================================
FILE: configs/_base_/datasets/coco_instance.py
================================================
dataset_type = 'CocoDataset'
data_root = 'data/coco/'
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
dict(type='Resize', img_scale=(1333, 800), keep_ratio=True),
dict(type='RandomFlip', flip_ratio=0.5),
dict(type='Normalize', **img_norm_cfg),
dict(type='Pad', size_divisor=32),
dict(type='DefaultFormatBundle'),
dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']),
]
test_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='MultiScaleFlipAug',
img_scale=(1333, 800),
flip=False,
transforms=[
dict(type='Resize', keep_ratio=True),
dict(type='RandomFlip'),
dict(type='Normalize', **img_norm_cfg),
dict(type='Pad', size_divisor=32),
dict(type='ImageToTensor', keys=['img']),
dict(type='Collect', keys=['img']),
])
]
data = dict(
samples_per_gpu=2,
workers_per_gpu=2,
train=dict(
type=dataset_type,
ann_file=data_root + 'annotations/instances_train2017.json',
img_prefix=data_root + 'train2017/',
pipeline=train_pipeline),
val=dict(
type=dataset_type,
ann_file=data_root + 'annotations/instances_val2017.json',
img_prefix=data_root + 'val2017/',
pipeline=test_pipeline),
test=dict(
type=dataset_type,
ann_file=data_root + 'annotations/instances_val2017.json',
img_prefix=data_root + 'val2017/',
pipeline=test_pipeline))
evaluation = dict(metric=['bbox', 'segm'])
================================================
FILE: configs/_base_/datasets/kitti-3d-3class.py
================================================
# dataset settings
dataset_type = 'KittiDataset'
data_root = 'data/kitti/'
class_names = ['Pedestrian', 'Cyclist', 'Car']
point_cloud_range = [0, -40, -3, 70.4, 40, 1]
input_modality = dict(use_lidar=True, use_camera=False)
db_sampler = dict(
data_root=data_root,
info_path=data_root + 'kitti_dbinfos_train.pkl',
rate=1.0,
prepare=dict(
filter_by_difficulty=[-1],
filter_by_min_points=dict(Car=5, Pedestrian=10, Cyclist=10)),
classes=class_names,
sample_groups=dict(Car=12, Pedestrian=6, Cyclist=6))
file_client_args = dict(backend='disk')
# Uncomment the following if use ceph or other file clients.
# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient
# for more details.
# file_client_args = dict(
# backend='petrel', path_mapping=dict(data='s3://kitti_data/'))
train_pipeline = [
dict(
type='LoadPointsFromFile',
coord_type='LIDAR',
load_dim=4,
use_dim=4,
file_client_args=file_client_args),
dict(
type='LoadAnnotations3D',
with_bbox_3d=True,
with_label_3d=True,
file_client_args=file_client_args),
dict(type='ObjectSample', db_sampler=db_sampler),
dict(
type='ObjectNoise',
num_try=100,
translation_std=[1.0, 1.0, 0.5],
global_rot_range=[0.0, 0.0],
rot_range=[-0.78539816, 0.78539816]),
dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5),
dict(
type='GlobalRotScaleTrans',
rot_range=[-0.78539816, 0.78539816],
scale_ratio_range=[0.95, 1.05]),
dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
dict(type='PointShuffle'),
dict(type='DefaultFormatBundle3D', class_names=class_names),
dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
]
test_pipeline = [
dict(
type='LoadPointsFromFile',
coord_type='LIDAR',
load_dim=4,
use_dim=4,
file_client_args=file_client_args),
dict(
type='MultiScaleFlipAug3D',
img_scale=(1333, 800),
pts_scale_ratio=1,
flip=False,
transforms=[
dict(
type='GlobalRotScaleTrans',
rot_range=[0, 0],
scale_ratio_range=[1., 1.],
translation_std=[0, 0, 0]),
dict(type='RandomFlip3D'),
dict(
type='PointsRangeFilter', point_cloud_range=point_cloud_range),
dict(
type='DefaultFormatBundle3D',
class_names=class_names,
with_label=False),
dict(type='Collect3D', keys=['points'])
])
]
data = dict(
samples_per_gpu=6,
workers_per_gpu=4,
train=dict(
type='RepeatDataset',
times=2,
dataset=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + 'kitti_infos_train.pkl',
split='training',
pts_prefix='velodyne_reduced',
pipeline=train_pipeline,
modality=input_modality,
classes=class_names,
test_mode=False,
# we use box_type_3d='LiDAR' in kitti and nuscenes dataset
# and box_type_3d='Depth' in sunrgbd and scannet dataset.
box_type_3d='LiDAR')),
val=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + 'kitti_infos_val.pkl',
split='training',
pts_prefix='velodyne_reduced',
pipeline=test_pipeline,
modality=input_modality,
classes=class_names,
test_mode=True,
box_type_3d='LiDAR'),
test=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + 'kitti_infos_val.pkl',
split='training',
pts_prefix='velodyne_reduced',
pipeline=test_pipeline,
modality=input_modality,
classes=class_names,
test_mode=True,
box_type_3d='LiDAR'))
evaluation = dict(interval=1)
================================================
FILE: configs/_base_/datasets/kitti-3d-car.py
================================================
# dataset settings
dataset_type = 'KittiDataset'
data_root = 'data/kitti/'
class_names = ['Car']
point_cloud_range = [0, -40, -3, 70.4, 40, 1]
input_modality = dict(use_lidar=True, use_camera=False)
db_sampler = dict(
data_root=data_root,
info_path=data_root + 'kitti_dbinfos_train.pkl',
rate=1.0,
prepare=dict(filter_by_difficulty=[-1], filter_by_min_points=dict(Car=5)),
classes=class_names,
sample_groups=dict(Car=15))
file_client_args = dict(backend='disk')
# Uncomment the following if use ceph or other file clients.
# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient
# for more details.
# file_client_args = dict(
# backend='petrel', path_mapping=dict(data='s3://kitti_data/'))
train_pipeline = [
dict(
type='LoadPointsFromFile',
coord_type='LIDAR',
load_dim=4,
use_dim=4,
file_client_args=file_client_args),
dict(
type='LoadAnnotations3D',
with_bbox_3d=True,
with_label_3d=True,
file_client_args=file_client_args),
dict(type='ObjectSample', db_sampler=db_sampler),
dict(
type='ObjectNoise',
num_try=100,
translation_std=[1.0, 1.0, 0.5],
global_rot_range=[0.0, 0.0],
rot_range=[-0.78539816, 0.78539816]),
dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5),
dict(
type='GlobalRotScaleTrans',
rot_range=[-0.78539816, 0.78539816],
scale_ratio_range=[0.95, 1.05]),
dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
dict(type='PointShuffle'),
dict(type='DefaultFormatBundle3D', class_names=class_names),
dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
]
test_pipeline = [
dict(
type='LoadPointsFromFile',
coord_type='LIDAR',
load_dim=4,
use_dim=4,
file_client_args=file_client_args),
dict(
type='MultiScaleFlipAug3D',
img_scale=(1333, 800),
pts_scale_ratio=1,
flip=False,
transforms=[
dict(
type='GlobalRotScaleTrans',
rot_range=[0, 0],
scale_ratio_range=[1., 1.],
translation_std=[0, 0, 0]),
dict(type='RandomFlip3D'),
dict(
type='PointsRangeFilter', point_cloud_range=point_cloud_range),
dict(
type='DefaultFormatBundle3D',
class_names=class_names,
with_label=False),
dict(type='Collect3D', keys=['points'])
])
]
data = dict(
samples_per_gpu=6,
workers_per_gpu=4,
train=dict(
type='RepeatDataset',
times=2,
dataset=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + 'kitti_infos_train.pkl',
split='training',
pts_prefix='velodyne_reduced',
pipeline=train_pipeline,
modality=input_modality,
classes=class_names,
test_mode=False,
# we use box_type_3d='LiDAR' in kitti and nuscenes dataset
# and box_type_3d='Depth' in sunrgbd and scannet dataset.
box_type_3d='LiDAR')),
val=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + 'kitti_infos_val.pkl',
split='training',
pts_prefix='velodyne_reduced',
pipeline=test_pipeline,
modality=input_modality,
classes=class_names,
test_mode=True,
box_type_3d='LiDAR'),
test=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + 'kitti_infos_val.pkl',
split='training',
pts_prefix='velodyne_reduced',
pipeline=test_pipeline,
modality=input_modality,
classes=class_names,
test_mode=True,
box_type_3d='LiDAR'))
evaluation = dict(interval=1)
================================================
FILE: configs/_base_/datasets/lyft-3d.py
================================================
# If point cloud range is changed, the models should also change their point
# cloud range accordingly
point_cloud_range = [-80, -80, -5, 80, 80, 3]
# For Lyft we usually do 9-class detection
class_names = [
'car', 'truck', 'bus', 'emergency_vehicle', 'other_vehicle', 'motorcycle',
'bicycle', 'pedestrian', 'animal'
]
dataset_type = 'LyftDataset'
data_root = 'data/lyft/'
# Input modality for the Lyft dataset. This is consistent with the submission
# format, which requires the information in input_modality.
input_modality = dict(
use_lidar=True,
use_camera=False,
use_radar=False,
use_map=False,
use_external=False)
file_client_args = dict(backend='disk')
# Uncomment the following if you use ceph or other file clients.
# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient
# for more details.
# file_client_args = dict(
# backend='petrel',
# path_mapping=dict({
# './data/lyft/': 's3://lyft/lyft/',
# 'data/lyft/': 's3://lyft/lyft/'
# }))
train_pipeline = [
dict(
type='LoadPointsFromFile',
coord_type='LIDAR',
load_dim=5,
use_dim=5,
file_client_args=file_client_args),
dict(
type='LoadPointsFromMultiSweeps',
sweeps_num=10,
file_client_args=file_client_args),
dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True),
dict(
type='GlobalRotScaleTrans',
rot_range=[-0.3925, 0.3925],
scale_ratio_range=[0.95, 1.05],
translation_std=[0, 0, 0]),
dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5),
dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
dict(type='PointShuffle'),
dict(type='DefaultFormatBundle3D', class_names=class_names),
dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
]
test_pipeline = [
dict(
type='LoadPointsFromFile',
coord_type='LIDAR',
load_dim=5,
use_dim=5,
file_client_args=file_client_args),
dict(
type='LoadPointsFromMultiSweeps',
sweeps_num=10,
file_client_args=file_client_args),
dict(
type='MultiScaleFlipAug3D',
img_scale=(1333, 800),
pts_scale_ratio=1,
flip=False,
transforms=[
dict(
type='GlobalRotScaleTrans',
rot_range=[0, 0],
scale_ratio_range=[1., 1.],
translation_std=[0, 0, 0]),
dict(type='RandomFlip3D'),
dict(
type='PointsRangeFilter', point_cloud_range=point_cloud_range),
dict(
type='DefaultFormatBundle3D',
class_names=class_names,
with_label=False),
dict(type='Collect3D', keys=['points'])
])
]
data = dict(
samples_per_gpu=2,
workers_per_gpu=2,
train=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + 'lyft_infos_train.pkl',
pipeline=train_pipeline,
classes=class_names,
modality=input_modality,
test_mode=False),
val=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + 'lyft_infos_val.pkl',
pipeline=test_pipeline,
classes=class_names,
modality=input_modality,
test_mode=True),
test=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + 'lyft_infos_test.pkl',
pipeline=test_pipeline,
classes=class_names,
modality=input_modality,
test_mode=True))
# For the Lyft dataset, we usually evaluate the model at the end of training.
# Since the models are trained for 24 epochs by default, we set the evaluation
# interval to 24. Please change the interval accordingly if you do not
# use a default schedule.
evaluation = dict(interval=24)
================================================
FILE: configs/_base_/datasets/nuim_instance.py
================================================
dataset_type = 'CocoDataset'
data_root = 'data/nuimages/'
class_names = [
'car', 'truck', 'trailer', 'bus', 'construction_vehicle', 'bicycle',
'motorcycle', 'pedestrian', 'traffic_cone', 'barrier'
]
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
dict(
type='Resize',
img_scale=[(1280, 720), (1920, 1080)],
multiscale_mode='range',
keep_ratio=True),
dict(type='RandomFlip', flip_ratio=0.5),
dict(type='Normalize', **img_norm_cfg),
dict(type='Pad', size_divisor=32),
dict(type='DefaultFormatBundle'),
dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']),
]
test_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='MultiScaleFlipAug',
img_scale=(1600, 900),
flip=False,
transforms=[
dict(type='Resize', keep_ratio=True),
dict(type='RandomFlip'),
dict(type='Normalize', **img_norm_cfg),
dict(type='Pad', size_divisor=32),
dict(type='ImageToTensor', keys=['img']),
dict(type='Collect', keys=['img']),
])
]
data = dict(
samples_per_gpu=2,
workers_per_gpu=2,
train=dict(
type=dataset_type,
ann_file=data_root + 'annotations/nuimages_v1.0-train.json',
img_prefix=data_root,
classes=class_names,
pipeline=train_pipeline),
val=dict(
type=dataset_type,
ann_file=data_root + 'annotations/nuimages_v1.0-val.json',
img_prefix=data_root,
classes=class_names,
pipeline=test_pipeline),
test=dict(
type=dataset_type,
ann_file=data_root + 'annotations/nuimages_v1.0-val.json',
img_prefix=data_root,
classes=class_names,
pipeline=test_pipeline))
evaluation = dict(metric=['bbox', 'segm'])
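# A minimal sketch of what the Normalize step above does to each image: the
# mean/std values are the standard ImageNet statistics in 0-255 scale, and
# to_rgb=True converts the BGR image loaded by LoadImageFromFile to RGB first.
#
# import numpy as np
# mean = np.array([123.675, 116.28, 103.53], dtype=np.float32)
# std = np.array([58.395, 57.12, 57.375], dtype=np.float32)
# def normalize(img_bgr):
#     img = img_bgr[..., ::-1].astype(np.float32)  # BGR -> RGB (to_rgb=True)
#     return (img - mean) / std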
================================================
FILE: configs/_base_/datasets/nus-3d.py
================================================
# If point cloud range is changed, the models should also change their point
# cloud range accordingly
point_cloud_range = [-50, -50, -5, 50, 50, 3]
# For nuScenes we usually do 10-class detection
class_names = [
'car', 'truck', 'trailer', 'bus', 'construction_vehicle', 'bicycle',
'motorcycle', 'pedestrian', 'traffic_cone', 'barrier'
]
dataset_type = 'NuScenesDataset'
data_root = 'data/nuscenes/'
# Input modality for the nuScenes dataset. This is consistent with the submission
# format, which requires the information in input_modality.
input_modality = dict(
use_lidar=True,
use_camera=False,
use_radar=False,
use_map=False,
use_external=False)
file_client_args = dict(backend='disk')
# Uncomment the following if you use ceph or other file clients.
# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient
# for more details.
# file_client_args = dict(
# backend='petrel',
# path_mapping=dict({
# './data/nuscenes/': 's3://nuscenes/nuscenes/',
# 'data/nuscenes/': 's3://nuscenes/nuscenes/'
# }))
train_pipeline = [
dict(
type='LoadPointsFromFile',
coord_type='LIDAR',
load_dim=5,
use_dim=5,
file_client_args=file_client_args),
dict(
type='LoadPointsFromMultiSweeps',
sweeps_num=10,
file_client_args=file_client_args),
dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True),
dict(
type='GlobalRotScaleTrans',
rot_range=[-0.3925, 0.3925],
scale_ratio_range=[0.95, 1.05],
translation_std=[0, 0, 0]),
dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5),
dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
dict(type='ObjectNameFilter', classes=class_names),
dict(type='PointShuffle'),
dict(type='DefaultFormatBundle3D', class_names=class_names),
dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
]
test_pipeline = [
dict(
type='LoadPointsFromFile',
coord_type='LIDAR',
load_dim=5,
use_dim=5,
file_client_args=file_client_args),
dict(
type='LoadPointsFromMultiSweeps',
sweeps_num=10,
file_client_args=file_client_args),
dict(
type='MultiScaleFlipAug3D',
img_scale=(1333, 800),
pts_scale_ratio=1,
flip=False,
transforms=[
dict(
type='GlobalRotScaleTrans',
rot_range=[0, 0],
scale_ratio_range=[1., 1.],
translation_std=[0, 0, 0]),
dict(type='RandomFlip3D'),
dict(
type='PointsRangeFilter', point_cloud_range=point_cloud_range),
dict(
type='DefaultFormatBundle3D',
class_names=class_names,
with_label=False),
dict(type='Collect3D', keys=['points'])
])
]
data = dict(
samples_per_gpu=4,
workers_per_gpu=4,
train=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + 'nuscenes_infos_train.pkl',
pipeline=train_pipeline,
classes=class_names,
modality=input_modality,
test_mode=False,
# we use box_type_3d='LiDAR' in kitti and nuscenes dataset
# and box_type_3d='Depth' in sunrgbd and scannet dataset.
box_type_3d='LiDAR'),
val=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + 'nuscenes_infos_val.pkl',
pipeline=test_pipeline,
classes=class_names,
modality=input_modality,
test_mode=True,
box_type_3d='LiDAR'),
test=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + 'nuscenes_infos_val.pkl',
pipeline=test_pipeline,
classes=class_names,
modality=input_modality,
test_mode=True,
box_type_3d='LiDAR'))
# For the nuScenes dataset, we usually evaluate the model at the end of training.
# Since the models are trained for 24 epochs by default, we set the evaluation
# interval to 24. Please change the interval accordingly if you do not
# use a default schedule.
evaluation = dict(interval=24)
================================================
FILE: configs/_base_/datasets/range100_lyft-3d.py
================================================
# If point cloud range is changed, the models should also change their point
# cloud range accordingly
point_cloud_range = [-100, -100, -5, 100, 100, 3]
# For Lyft we usually do 9-class detection
class_names = [
'car', 'truck', 'bus', 'emergency_vehicle', 'other_vehicle', 'motorcycle',
'bicycle', 'pedestrian', 'animal'
]
dataset_type = 'LyftDataset'
data_root = 'data/lyft/'
# Input modality for the Lyft dataset. This is consistent with the submission
# format, which requires the information in input_modality.
input_modality = dict(
use_lidar=True,
use_camera=False,
use_radar=False,
use_map=False,
use_external=False)
file_client_args = dict(backend='disk')
# Uncomment the following if you use ceph or other file clients.
# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient
# for more details.
# file_client_args = dict(
# backend='petrel',
# path_mapping=dict({
# './data/lyft/': 's3://lyft/lyft/',
# 'data/lyft/': 's3://lyft/lyft/'
# }))
train_pipeline = [
dict(
type='LoadPointsFromFile',
coord_type='LIDAR',
load_dim=5,
use_dim=5,
file_client_args=file_client_args),
dict(
type='LoadPointsFromMultiSweeps',
sweeps_num=10,
file_client_args=file_client_args),
dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True),
dict(
type='GlobalRotScaleTrans',
rot_range=[-0.3925, 0.3925],
scale_ratio_range=[0.95, 1.05],
translation_std=[0, 0, 0]),
dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5),
dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
dict(type='PointShuffle'),
dict(type='DefaultFormatBundle3D', class_names=class_names),
dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
]
test_pipeline = [
dict(
type='LoadPointsFromFile',
coord_type='LIDAR',
load_dim=5,
use_dim=5,
file_client_args=file_client_args),
dict(
type='LoadPointsFromMultiSweeps',
sweeps_num=10,
file_client_args=file_client_args),
dict(
type='MultiScaleFlipAug3D',
img_scale=(1333, 800),
pts_scale_ratio=1,
flip=False,
transforms=[
dict(
type='GlobalRotScaleTrans',
rot_range=[0, 0],
scale_ratio_range=[1., 1.],
translation_std=[0, 0, 0]),
dict(type='RandomFlip3D'),
dict(
type='PointsRangeFilter', point_cloud_range=point_cloud_range),
dict(
type='DefaultFormatBundle3D',
class_names=class_names,
with_label=False),
dict(type='Collect3D', keys=['points'])
])
]
data = dict(
samples_per_gpu=2,
workers_per_gpu=2,
train=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + 'lyft_infos_train.pkl',
pipeline=train_pipeline,
classes=class_names,
modality=input_modality,
test_mode=False),
val=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + 'lyft_infos_val.pkl',
pipeline=test_pipeline,
classes=class_names,
modality=input_modality,
test_mode=True),
test=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + 'lyft_infos_test.pkl',
pipeline=test_pipeline,
classes=class_names,
modality=input_modality,
test_mode=True))
# For the Lyft dataset, we usually evaluate the model at the end of training.
# Since the models are trained for 24 epochs by default, we set the evaluation
# interval to 24. Please change the interval accordingly if you do not
# use a default schedule.
evaluation = dict(interval=24)
================================================
FILE: configs/_base_/datasets/scannet-3d-18class.py
================================================
# dataset settings
dataset_type = 'ScanNetDataset'
data_root = './data/scannet/'
class_names = ('cabinet', 'bed', 'chair', 'sofa', 'table', 'door', 'window',
'bookshelf', 'picture', 'counter', 'desk', 'curtain',
'refrigerator', 'showercurtrain', 'toilet', 'sink', 'bathtub',
'garbagebin')
train_pipeline = [
dict(
type='LoadPointsFromFile',
coord_type='DEPTH',
shift_height=True,
load_dim=6,
use_dim=[0, 1, 2]),
dict(
type='LoadAnnotations3D',
with_bbox_3d=True,
with_label_3d=True,
with_mask_3d=True,
with_seg_3d=True),
dict(
type='PointSegClassMapping',
valid_cat_ids=(3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 16, 24, 28, 33, 34,
36, 39)),
dict(type='IndoorPointSample', num_points=40000),
dict(
type='RandomFlip3D',
sync_2d=False,
flip_ratio_bev_horizontal=0.5,
flip_ratio_bev_vertical=0.5),
dict(
type='GlobalRotScaleTrans',
rot_range=[-0.087266, 0.087266],
scale_ratio_range=[1.0, 1.0],
shift_height=True),
dict(type='DefaultFormatBundle3D', class_names=class_names),
dict(
type='Collect3D',
keys=[
'points', 'gt_bboxes_3d', 'gt_labels_3d', 'pts_semantic_mask',
'pts_instance_mask'
])
]
test_pipeline = [
dict(
type='LoadPointsFromFile',
coord_type='DEPTH',
shift_height=True,
load_dim=6,
use_dim=[0, 1, 2]),
dict(
type='MultiScaleFlipAug3D',
img_scale=(1333, 800),
pts_scale_ratio=1,
flip=False,
transforms=[
dict(
type='GlobalRotScaleTrans',
rot_range=[0, 0],
scale_ratio_range=[1., 1.],
translation_std=[0, 0, 0]),
dict(
type='RandomFlip3D',
sync_2d=False,
flip_ratio_bev_horizontal=0.5,
flip_ratio_bev_vertical=0.5),
dict(type='IndoorPointSample', num_points=40000),
dict(
type='DefaultFormatBundle3D',
class_names=class_names,
with_label=False),
dict(type='Collect3D', keys=['points'])
])
]
data = dict(
samples_per_gpu=8,
workers_per_gpu=4,
train=dict(
type='RepeatDataset',
times=5,
dataset=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + 'scannet_infos_train.pkl',
pipeline=train_pipeline,
filter_empty_gt=False,
classes=class_names,
# we use box_type_3d='LiDAR' in kitti and nuscenes dataset
# and box_type_3d='Depth' in sunrgbd and scannet dataset.
box_type_3d='Depth')),
val=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + 'scannet_infos_val.pkl',
pipeline=test_pipeline,
classes=class_names,
test_mode=True,
box_type_3d='Depth'),
test=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + 'scannet_infos_val.pkl',
pipeline=test_pipeline,
classes=class_names,
test_mode=True,
box_type_3d='Depth'))
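# A minimal sketch of the remapping done by PointSegClassMapping above: the
# i-th entry of valid_cat_ids is mapped to train id i, matching the order of
# class_names; raw ids outside the tuple are treated as unannotated.
#
# valid_cat_ids = (3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 16, 24, 28, 33, 34,
#                  36, 39)
# cat_id2class = {cat_id: i for i, cat_id in enumerate(valid_cat_ids)}
# # e.g. raw ScanNet id 3 -> 0 ('cabinet'), raw id 39 -> 17 ('garbagebin')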
================================================
FILE: configs/_base_/datasets/sunrgbd-3d-10class.py
================================================
dataset_type = 'SUNRGBDDataset'
data_root = 'data/sunrgbd/'
class_names = ('bed', 'table', 'sofa', 'chair', 'toilet', 'desk', 'dresser',
'night_stand', 'bookshelf', 'bathtub')
train_pipeline = [
dict(
type='LoadPointsFromFile',
coord_type='DEPTH',
shift_height=True,
load_dim=6,
use_dim=[0, 1, 2]),
dict(type='LoadAnnotations3D'),
dict(
type='RandomFlip3D',
sync_2d=False,
flip_ratio_bev_horizontal=0.5,
),
dict(
type='GlobalRotScaleTrans',
rot_range=[-0.523599, 0.523599],
scale_ratio_range=[0.85, 1.15],
shift_height=True),
dict(type='IndoorPointSample', num_points=20000),
dict(type='DefaultFormatBundle3D', class_names=class_names),
dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
]
test_pipeline = [
dict(
type='LoadPointsFromFile',
coord_type='DEPTH',
shift_height=True,
load_dim=6,
use_dim=[0, 1, 2]),
dict(
type='MultiScaleFlipAug3D',
img_scale=(1333, 800),
pts_scale_ratio=1,
flip=False,
transforms=[
dict(
type='GlobalRotScaleTrans',
rot_range=[0, 0],
scale_ratio_range=[1., 1.],
translation_std=[0, 0, 0]),
dict(
type='RandomFlip3D',
sync_2d=False,
flip_ratio_bev_horizontal=0.5,
),
dict(type='IndoorPointSample', num_points=20000),
dict(
type='DefaultFormatBundle3D',
class_names=class_names,
with_label=False),
dict(type='Collect3D', keys=['points'])
])
]
data = dict(
samples_per_gpu=16,
workers_per_gpu=4,
train=dict(
type='RepeatDataset',
times=5,
dataset=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + 'sunrgbd_infos_train.pkl',
pipeline=train_pipeline,
classes=class_names,
filter_empty_gt=False,
# we use box_type_3d='LiDAR' in kitti and nuscenes dataset
# and box_type_3d='Depth' in sunrgbd and scannet dataset.
box_type_3d='Depth')),
val=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + 'sunrgbd_infos_val.pkl',
pipeline=test_pipeline,
classes=class_names,
test_mode=True,
box_type_3d='Depth'),
test=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + 'sunrgbd_infos_val.pkl',
pipeline=test_pipeline,
classes=class_names,
test_mode=True,
box_type_3d='Depth'))
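# A minimal sketch of the RepeatDataset wrapper used above (times=5): it only
# repeats the index space of the inner dataset, so one training "epoch" runs
# over the SUNRGBD split five times and per-epoch overhead (evaluation,
# checkpointing) is amortized.
#
# class RepeatDatasetSketch:
#     def __init__(self, dataset, times):
#         self.dataset, self.times = dataset, times
#     def __len__(self):
#         return self.times * len(self.dataset)
#     def __getitem__(self, idx):
#         return self.dataset[idx % len(self.dataset)]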
================================================
FILE: configs/_base_/datasets/waymoD5-3d-3class.py
================================================
# dataset settings
# D5 in the config name means the whole dataset is divided into 5 folds
# We only use one fold for efficient experiments
dataset_type = 'WaymoDataset'
data_root = 'data/waymo/kitti_format/'
file_client_args = dict(backend='disk')
# Uncomment the following if you use ceph or other file clients.
# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient
# for more details.
# file_client_args = dict(
# backend='petrel', path_mapping=dict(data='s3://waymo_data/'))
class_names = ['Car', 'Pedestrian', 'Cyclist']
point_cloud_range = [-74.88, -74.88, -2, 74.88, 74.88, 4]
input_modality = dict(use_lidar=True, use_camera=False)
db_sampler = dict(
data_root=data_root,
info_path=data_root + 'waymo_dbinfos_train.pkl',
rate=1.0,
prepare=dict(
filter_by_difficulty=[-1],
filter_by_min_points=dict(Car=5, Pedestrian=10, Cyclist=10)),
classes=class_names,
sample_groups=dict(Car=15, Pedestrian=10, Cyclist=10),
points_loader=dict(
        type='LoadPointsFromFile',
        coord_type='LIDAR',
load_dim=5,
use_dim=[0, 1, 2, 3, 4],
file_client_args=file_client_args))
train_pipeline = [
dict(
type='LoadPointsFromFile',
coord_type='LIDAR',
load_dim=6,
use_dim=5,
file_client_args=file_client_args),
dict(
type='LoadAnnotations3D',
with_bbox_3d=True,
with_label_3d=True,
file_client_args=file_client_args),
dict(type='ObjectSample', db_sampler=db_sampler),
dict(
type='RandomFlip3D',
sync_2d=False,
flip_ratio_bev_horizontal=0.5,
flip_ratio_bev_vertical=0.5),
dict(
type='GlobalRotScaleTrans',
rot_range=[-0.78539816, 0.78539816],
scale_ratio_range=[0.95, 1.05]),
dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
dict(type='PointShuffle'),
dict(type='DefaultFormatBundle3D', class_names=class_names),
dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
]
test_pipeline = [
dict(
type='LoadPointsFromFile',
coord_type='LIDAR',
load_dim=6,
use_dim=5,
file_client_args=file_client_args),
dict(
type='MultiScaleFlipAug3D',
img_scale=(1333, 800),
pts_scale_ratio=1,
flip=False,
transforms=[
dict(
type='GlobalRotScaleTrans',
rot_range=[0, 0],
scale_ratio_range=[1., 1.],
translation_std=[0, 0, 0]),
dict(type='RandomFlip3D'),
dict(
type='PointsRangeFilter', point_cloud_range=point_cloud_range),
dict(
type='DefaultFormatBundle3D',
class_names=class_names,
with_label=False),
dict(type='Collect3D', keys=['points'])
])
]
data = dict(
samples_per_gpu=2,
workers_per_gpu=4,
train=dict(
type='RepeatDataset',
times=2,
dataset=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + 'waymo_infos_train.pkl',
split='training',
pipeline=train_pipeline,
modality=input_modality,
classes=class_names,
test_mode=False,
# we use box_type_3d='LiDAR' in kitti and nuscenes dataset
# and box_type_3d='Depth' in sunrgbd and scannet dataset.
box_type_3d='LiDAR',
# load one frame every five frames
load_interval=5)),
val=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + 'waymo_infos_val.pkl',
split='training',
pipeline=test_pipeline,
modality=input_modality,
classes=class_names,
test_mode=True,
box_type_3d='LiDAR'),
test=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + 'waymo_infos_val.pkl',
split='training',
pipeline=test_pipeline,
modality=input_modality,
classes=class_names,
test_mode=True,
box_type_3d='LiDAR'))
evaluation = dict(interval=24)
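# A minimal sketch of the effect of load_interval=5 above (the "D5" in the
# config name): only every 5th entry of the training infos is used, i.e.
# roughly one fifth of the annotated frames.
#
# import pickle
# with open('data/waymo/kitti_format/waymo_infos_train.pkl', 'rb') as f:
#     infos = pickle.load(f)
# infos_used = infos[::5]  # the subset kept when load_interval=5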
================================================
FILE: configs/_base_/datasets/waymoD5-3d-car.py
================================================
# dataset settings
# D5 in the config name means the whole dataset is divided into 5 folds
# We only use one fold for efficient experiments
dataset_type = 'WaymoDataset'
data_root = 'data/waymo/kitti_format/'
file_client_args = dict(backend='disk')
# Uncomment the following if you use ceph or other file clients.
# See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient
# for more details.
# file_client_args = dict(
# backend='petrel', path_mapping=dict(data='s3://waymo_data/'))
class_names = ['Car']
point_cloud_range = [-74.88, -74.88, -2, 74.88, 74.88, 4]
input_modality = dict(use_lidar=True, use_camera=False)
db_sampler = dict(
data_root=data_root,
info_path=data_root + 'waymo_dbinfos_train.pkl',
rate=1.0,
prepare=dict(filter_by_difficulty=[-1], filter_by_min_points=dict(Car=5)),
classes=class_names,
sample_groups=dict(Car=15),
points_loader=dict(
type='LoadPointsFromFile',
coord_type='LIDAR',
load_dim=5,
use_dim=[0, 1, 2, 3, 4],
file_client_args=file_client_args))
train_pipeline = [
dict(
type='LoadPointsFromFile',
coord_type='LIDAR',
load_dim=6,
use_dim=5,
file_client_args=file_client_args),
dict(
type='LoadAnnotations3D',
with_bbox_3d=True,
with_label_3d=True,
file_client_args=file_client_args),
dict(type='ObjectSample', db_sampler=db_sampler),
dict(
type='RandomFlip3D',
sync_2d=False,
flip_ratio_bev_horizontal=0.5,
flip_ratio_bev_vertical=0.5),
dict(
type='GlobalRotScaleTrans',
rot_range=[-0.78539816, 0.78539816],
scale_ratio_range=[0.95, 1.05]),
dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
dict(type='PointShuffle'),
dict(type='DefaultFormatBundle3D', class_names=class_names),
dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
]
test_pipeline = [
dict(
type='LoadPointsFromFile',
coord_type='LIDAR',
load_dim=6,
use_dim=5,
file_client_args=file_client_args),
dict(
type='MultiScaleFlipAug3D',
img_scale=(1333, 800),
pts_scale_ratio=1,
flip=False,
transforms=[
dict(
type='GlobalRotScaleTrans',
rot_range=[0, 0],
scale_ratio_range=[1., 1.],
translation_std=[0, 0, 0]),
dict(type='RandomFlip3D'),
dict(
type='PointsRangeFilter', point_cloud_range=point_cloud_range),
dict(
type='DefaultFormatBundle3D',
class_names=class_names,
with_label=False),
dict(type='Collect3D', keys=['points'])
])
]
data = dict(
samples_per_gpu=2,
workers_per_gpu=4,
train=dict(
type='RepeatDataset',
times=2,
dataset=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + 'waymo_infos_train.pkl',
split='training',
pipeline=train_pipeline,
modality=input_modality,
classes=class_names,
test_mode=False,
# we use box_type_3d='LiDAR' in kitti and nuscenes dataset
# and box_type_3d='Depth' in sunrgbd and scannet dataset.
box_type_3d='LiDAR',
# load one frame every five frames
load_interval=5)),
val=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + 'waymo_infos_val.pkl',
split='training',
pipeline=test_pipeline,
modality=input_modality,
classes=class_names,
test_mode=True,
box_type_3d='LiDAR'),
test=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + 'waymo_infos_val.pkl',
split='training',
pipeline=test_pipeline,
modality=input_modality,
classes=class_names,
test_mode=True,
box_type_3d='LiDAR'))
evaluation = dict(interval=24)
================================================
FILE: configs/_base_/default_runtime.py
================================================
checkpoint_config = dict(interval=1)
# yapf:disable
# By default we use textlogger hook and tensorboard
# For more loggers see
# https://mmcv.readthedocs.io/en/latest/api.html#mmcv.runner.LoggerHook
log_config = dict(
interval=50,
hooks=[
dict(type='TextLoggerHook'),
dict(type='TensorboardLoggerHook')
])
# yapf:enable
dist_params = dict(backend='nccl')
log_level = 'INFO'
work_dir = None
load_from = None
resume_from = None
workflow = [('train', 1)]
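# A minimal sketch (assuming mmcv is installed) of how these runtime defaults
# are consumed; the work_dir value below is hypothetical and normally set by
# the training tools:
#
# from mmcv import Config
# cfg = Config.fromfile('configs/_base_/default_runtime.py')
# cfg.work_dir = './work_dirs/my_experiment'
# cfg.load_from = None  # or a checkpoint path to fine-tune from
# print(cfg.pretty_text)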
================================================
FILE: configs/_base_/models/3dssd.py
================================================
model = dict(
type='SSD3DNet',
backbone=dict(
type='PointNet2SAMSG',
in_channels=4,
num_points=(4096, 512, (256, 256)),
radii=((0.2, 0.4, 0.8), (0.4, 0.8, 1.6), (1.6, 3.2, 4.8)),
num_samples=((32, 32, 64), (32, 32, 64), (32, 32, 32)),
sa_channels=(((16, 16, 32), (16, 16, 32), (32, 32, 64)),
((64, 64, 128), (64, 64, 128), (64, 96, 128)),
((128, 128, 256), (128, 192, 256), (128, 256, 256))),
aggregation_channels=(64, 128, 256),
fps_mods=(('D-FPS'), ('FS'), ('F-FPS', 'D-FPS')),
fps_sample_range_lists=((-1), (-1), (512, -1)),
norm_cfg=dict(type='BN2d', eps=1e-3, momentum=0.1),
sa_cfg=dict(
type='PointSAModuleMSG',
pool_mod='max',
use_xyz=True,
normalize_xyz=False)),
bbox_head=dict(
type='SSD3DHead',
in_channels=256,
vote_module_cfg=dict(
in_channels=256,
num_points=256,
gt_per_seed=1,
conv_channels=(128, ),
conv_cfg=dict(type='Conv1d'),
norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.1),
with_res_feat=False,
vote_xyz_range=(3.0, 3.0, 2.0)),
vote_aggregation_cfg=dict(
type='PointSAModuleMSG',
num_point=256,
radii=(4.8, 6.4),
sample_nums=(16, 32),
mlp_channels=((256, 256, 256, 512), (256, 256, 512, 1024)),
norm_cfg=dict(type='BN2d', eps=1e-3, momentum=0.1),
use_xyz=True,
normalize_xyz=False,
bias=True),
pred_layer_cfg=dict(
in_channels=1536,
shared_conv_channels=(512, 128),
cls_conv_channels=(128, ),
reg_conv_channels=(128, ),
conv_cfg=dict(type='Conv1d'),
norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.1),
bias=True),
conv_cfg=dict(type='Conv1d'),
norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.1),
objectness_loss=dict(
type='CrossEntropyLoss',
use_sigmoid=True,
reduction='sum',
loss_weight=1.0),
center_loss=dict(
type='SmoothL1Loss', reduction='sum', loss_weight=1.0),
dir_class_loss=dict(
type='CrossEntropyLoss', reduction='sum', loss_weight=1.0),
dir_res_loss=dict(
type='SmoothL1Loss', reduction='sum', loss_weight=1.0),
size_res_loss=dict(
type='SmoothL1Loss', reduction='sum', loss_weight=1.0),
corner_loss=dict(
type='SmoothL1Loss', reduction='sum', loss_weight=1.0),
vote_loss=dict(type='SmoothL1Loss', reduction='sum', loss_weight=1.0)),
# model training and testing settings
train_cfg=dict(
sample_mod='spec', pos_distance_thr=10.0, expand_dims_length=0.05),
test_cfg=dict(
nms_cfg=dict(type='nms', iou_thr=0.1),
sample_mod='spec',
score_thr=0.0,
per_class_proposal=True,
max_output_num=100))
# optimizer
# This schedule is mainly used by models on indoor datasets,
# e.g., VoteNet on SUNRGBD and ScanNet
lr = 0.002 # max learning rate
optimizer = dict(type='AdamW', lr=lr, weight_decay=0)
optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
lr_config = dict(policy='step', warmup=None, step=[80, 120])
# runtime settings
total_epochs = 150
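# A minimal sketch of the step policy above: with warmup disabled and mmcv's
# default gamma of 0.1, the learning rate stays at 0.002 until epoch 80, drops
# to 2e-4 at epoch 80 and to 2e-5 at epoch 120, for 150 epochs in total.
#
# lr_at_epoch = lambda e: 0.002 * (0.1 ** sum(e >= s for s in [80, 120]))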
================================================
FILE: configs/_base_/models/cascade_mask_rcnn_r50_fpn.py
================================================
# model settings
model = dict(
type='CascadeRCNN',
pretrained='torchvision://resnet50',
backbone=dict(
type='ResNet',
depth=50,
num_stages=4,
out_indices=(0, 1, 2, 3),
frozen_stages=1,
norm_cfg=dict(type='BN', requires_grad=True),
norm_eval=True,
style='pytorch'),
neck=dict(
type='FPN',
in_channels=[256, 512, 1024, 2048],
out_channels=256,
num_outs=5),
rpn_head=dict(
type='RPNHead',
in_channels=256,
feat_channels=256,
anchor_generator=dict(
type='AnchorGenerator',
scales=[8],
ratios=[0.5, 1.0, 2.0],
strides=[4, 8, 16, 32, 64]),
bbox_coder=dict(
type='DeltaXYWHBBoxCoder',
target_means=[.0, .0, .0, .0],
target_stds=[1.0, 1.0, 1.0, 1.0]),
loss_cls=dict(
type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0)),
roi_head=dict(
type='CascadeRoIHead',
num_stages=3,
stage_loss_weights=[1, 0.5, 0.25],
bbox_roi_extractor=dict(
type='SingleRoIExtractor',
roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
out_channels=256,
featmap_strides=[4, 8, 16, 32]),
bbox_head=[
dict(
type='Shared2FCBBoxHead',
in_channels=256,
fc_out_channels=1024,
roi_feat_size=7,
num_classes=80,
bbox_coder=dict(
type='DeltaXYWHBBoxCoder',
target_means=[0., 0., 0., 0.],
target_stds=[0.1, 0.1, 0.2, 0.2]),
reg_class_agnostic=True,
loss_cls=dict(
type='CrossEntropyLoss',
use_sigmoid=False,
loss_weight=1.0),
loss_bbox=dict(type='SmoothL1Loss', beta=1.0,
loss_weight=1.0)),
dict(
type='Shared2FCBBoxHead',
in_channels=256,
fc_out_channels=1024,
roi_feat_size=7,
num_classes=80,
bbox_coder=dict(
type='DeltaXYWHBBoxCoder',
target_means=[0., 0., 0., 0.],
target_stds=[0.05, 0.05, 0.1, 0.1]),
reg_class_agnostic=True,
loss_cls=dict(
type='CrossEntropyLoss',
use_sigmoid=False,
loss_weight=1.0),
loss_bbox=dict(type='SmoothL1Loss', beta=1.0,
loss_weight=1.0)),
dict(
type='Shared2FCBBoxHead',
in_channels=256,
fc_out_channels=1024,
roi_feat_size=7,
num_classes=80,
bbox_coder=dict(
type='DeltaXYWHBBoxCoder',
target_means=[0., 0., 0., 0.],
target_stds=[0.033, 0.033, 0.067, 0.067]),
reg_class_agnostic=True,
loss_cls=dict(
type='CrossEntropyLoss',
use_sigmoid=False,
loss_weight=1.0),
loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0))
],
mask_roi_extractor=dict(
type='SingleRoIExtractor',
roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=0),
out_channels=256,
featmap_strides=[4, 8, 16, 32]),
mask_head=dict(
type='FCNMaskHead',
num_convs=4,
in_channels=256,
conv_out_channels=256,
num_classes=80,
loss_mask=dict(
type='CrossEntropyLoss', use_mask=True, loss_weight=1.0))),
# model training and testing settings
train_cfg=dict(
rpn=dict(
assigner=dict(
type='MaxIoUAssigner',
pos_iou_thr=0.7,
neg_iou_thr=0.3,
min_pos_iou=0.3,
match_low_quality=True,
ignore_iof_thr=-1),
sampler=dict(
type='RandomSampler',
num=256,
pos_fraction=0.5,
neg_pos_ub=-1,
add_gt_as_proposals=False),
allowed_border=0,
pos_weight=-1,
debug=False),
rpn_proposal=dict(
nms_across_levels=False,
nms_pre=2000,
nms_post=2000,
max_num=2000,
nms_thr=0.7,
min_bbox_size=0),
rcnn=[
dict(
assigner=dict(
type='MaxIoUAssigner',
pos_iou_thr=0.5,
neg_iou_thr=0.5,
min_pos_iou=0.5,
match_low_quality=False,
ignore_iof_thr=-1),
sampler=dict(
type='RandomSampler',
num=512,
pos_fraction=0.25,
neg_pos_ub=-1,
add_gt_as_proposals=True),
mask_size=28,
pos_weight=-1,
debug=False),
dict(
assigner=dict(
type='MaxIoUAssigner',
pos_iou_thr=0.6,
neg_iou_thr=0.6,
min_pos_iou=0.6,
match_low_quality=False,
ignore_iof_thr=-1),
sampler=dict(
type='RandomSampler',
num=512,
pos_fraction=0.25,
neg_pos_ub=-1,
add_gt_as_proposals=True),
mask_size=28,
pos_weight=-1,
debug=False),
dict(
assigner=dict(
type='MaxIoUAssigner',
pos_iou_thr=0.7,
neg_iou_thr=0.7,
min_pos_iou=0.7,
match_low_quality=False,
ignore_iof_thr=-1),
sampler=dict(
type='RandomSampler',
num=512,
pos_fraction=0.25,
neg_pos_ub=-1,
add_gt_as_proposals=True),
mask_size=28,
pos_weight=-1,
debug=False)
]),
test_cfg=dict(
rpn=dict(
nms_across_levels=False,
nms_pre=1000,
nms_post=1000,
max_num=1000,
nms_thr=0.7,
min_bbox_size=0),
rcnn=dict(
score_thr=0.05,
nms=dict(type='nms', iou_threshold=0.5),
max_per_img=100,
mask_thr_binary=0.5)))
================================================
FILE: configs/_base_/models/centerpoint_01voxel_second_secfpn_nus.py
================================================
voxel_size = [0.1, 0.1, 0.2]
model = dict(
type='CenterPoint',
pts_voxel_layer=dict(
max_num_points=10, voxel_size=voxel_size, max_voxels=(90000, 120000)),
pts_voxel_encoder=dict(type='HardSimpleVFE', num_features=5),
pts_middle_encoder=dict(
type='SparseEncoder',
in_channels=5,
sparse_shape=[41, 1024, 1024],
output_channels=128,
order=('conv', 'norm', 'act'),
encoder_channels=((16, 16, 32), (32, 32, 64), (64, 64, 128), (128,
128)),
encoder_paddings=((0, 0, 1), (0, 0, 1), (0, 0, [0, 1, 1]), (0, 0)),
block_type='basicblock'),
pts_backbone=dict(
type='SECOND',
in_channels=256,
out_channels=[128, 256],
layer_nums=[5, 5],
layer_strides=[1, 2],
norm_cfg=dict(type='BN', eps=1e-3, momentum=0.01),
conv_cfg=dict(type='Conv2d', bias=False)),
pts_neck=dict(
type='SECONDFPN',
in_channels=[128, 256],
out_channels=[256, 256],
upsample_strides=[1, 2],
norm_cfg=dict(type='BN', eps=1e-3, momentum=0.01),
upsample_cfg=dict(type='deconv', bias=False),
use_conv_for_no_stride=True),
pts_bbox_head=dict(
type='CenterHead',
in_channels=sum([256, 256]),
tasks=[
dict(num_class=1, class_names=['car']),
dict(num_class=2, class_names=['truck', 'construction_vehicle']),
dict(num_class=2, class_names=['bus', 'trailer']),
dict(num_class=1, class_names=['barrier']),
dict(num_class=2, class_names=['motorcycle', 'bicycle']),
dict(num_class=2, class_names=['pedestrian', 'traffic_cone']),
],
common_heads=dict(
reg=(2, 2), height=(1, 2), dim=(3, 2), rot=(2, 2), vel=(2, 2)),
share_conv_channel=64,
bbox_coder=dict(
type='CenterPointBBoxCoder',
post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
max_num=500,
score_threshold=0.1,
out_size_factor=8,
voxel_size=voxel_size[:2],
code_size=9),
separate_head=dict(
type='SeparateHead', init_bias=-2.19, final_kernel=3),
loss_cls=dict(type='GaussianFocalLoss', reduction='mean'),
loss_bbox=dict(type='L1Loss', reduction='mean', loss_weight=0.25),
norm_bbox=True),
# model training and testing settings
train_cfg=dict(
pts=dict(
grid_size=[1024, 1024, 40],
voxel_size=voxel_size,
out_size_factor=8,
dense_reg=1,
gaussian_overlap=0.1,
max_objs=500,
min_radius=2,
code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2])),
test_cfg=dict(
pts=dict(
post_center_limit_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
max_per_img=500,
max_pool_nms=False,
min_radius=[4, 12, 10, 1, 0.85, 0.175],
score_threshold=0.1,
out_size_factor=8,
voxel_size=voxel_size[:2],
nms_type='rotate',
pre_max_size=1000,
post_max_size=83,
nms_thr=0.2)))
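# A minimal sketch of how grid_size and the head resolution above follow from
# the voxel size; the point cloud range is the one used by the CenterPoint
# nuScenes configs ([-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]), which overrides the
# default nus-3d range:
#
# voxel_size = [0.1, 0.1, 0.2]
# point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
# grid = [round((point_cloud_range[i + 3] - point_cloud_range[i]) / voxel_size[i])
#         for i in range(3)]  # -> [1024, 1024, 40], as in train_cfg above
# bev = grid[0] // 8          # out_size_factor=8 -> 128x128 CenterHead feature map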
================================================
FILE: configs/_base_/models/centerpoint_02pillar_second_secfpn_nus.py
================================================
voxel_size = [0.2, 0.2, 8]
model = dict(
type='CenterPoint',
pts_voxel_layer=dict(
max_num_points=20, voxel_size=voxel_size, max_voxels=(30000, 40000)),
pts_voxel_encoder=dict(
type='PillarFeatureNet',
in_channels=5,
feat_channels=[64],
with_distance=False,
voxel_size=(0.2, 0.2, 8),
norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.01),
legacy=False),
pts_middle_encoder=dict(
type='PointPillarsScatter', in_channels=64, output_shape=(512, 512)),
pts_backbone=dict(
type='SECOND',
in_channels=64,
out_channels=[64, 128, 256],
layer_nums=[3, 5, 5],
layer_strides=[2, 2, 2],
norm_cfg=dict(type='BN', eps=1e-3, momentum=0.01),
conv_cfg=dict(type='Conv2d', bias=False)),
pts_neck=dict(
type='SECONDFPN',
in_channels=[64, 128, 256],
out_channels=[128, 128, 128],
upsample_strides=[0.5, 1, 2],
norm_cfg=dict(type='BN', eps=1e-3, momentum=0.01),
upsample_cfg=dict(type='deconv', bias=False),
use_conv_for_no_stride=True),
pts_bbox_head=dict(
type='CenterHead',
in_channels=sum([128, 128, 128]),
tasks=[
dict(num_class=1, class_names=['car']),
dict(num_class=2, class_names=['truck', 'construction_vehicle']),
dict(num_class=2, class_names=['bus', 'trailer']),
dict(num_class=1, class_names=['barrier']),
dict(num_class=2, class_names=['motorcycle', 'bicycle']),
dict(num_class=2, class_names=['pedestrian', 'traffic_cone']),
],
common_heads=dict(
reg=(2, 2), height=(1, 2), dim=(3, 2), rot=(2, 2), vel=(2, 2)),
share_conv_channel=64,
bbox_coder=dict(
type='CenterPointBBoxCoder',
post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
max_num=500,
score_threshold=0.1,
out_size_factor=4,
voxel_size=voxel_size[:2],
code_size=9),
separate_head=dict(
type='SeparateHead', init_bias=-2.19, final_kernel=3),
loss_cls=dict(type='GaussianFocalLoss', reduction='mean'),
loss_bbox=dict(type='L1Loss', reduction='mean', loss_weight=0.25),
norm_bbox=True),
# model training and testing settings
train_cfg=dict(
pts=dict(
grid_size=[512, 512, 1],
voxel_size=voxel_size,
out_size_factor=4,
dense_reg=1,
gaussian_overlap=0.1,
max_objs=500,
min_radius=2,
code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2])),
test_cfg=dict(
pts=dict(
post_center_limit_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
max_per_img=500,
max_pool_nms=False,
min_radius=[4, 12, 10, 1, 0.85, 0.175],
score_threshold=0.1,
pc_range=[-51.2, -51.2],
out_size_factor=4,
voxel_size=voxel_size[:2],
nms_type='rotate',
pre_max_size=1000,
post_max_size=83,
nms_thr=0.2)))
================================================
FILE: configs/_base_/models/h3dnet.py
================================================
primitive_z_cfg = dict(
type='PrimitiveHead',
num_dims=2,
num_classes=18,
primitive_mode='z',
upper_thresh=100.0,
surface_thresh=0.5,
vote_module_cfg=dict(
in_channels=256,
vote_per_seed=1,
gt_per_seed=1,
conv_channels=(256, 256),
conv_cfg=dict(type='Conv1d'),
norm_cfg=dict(type='BN1d'),
norm_feats=True,
vote_loss=dict(
type='ChamferDistance',
mode='l1',
reduction='none',
loss_dst_weight=10.0)),
vote_aggregation_cfg=dict(
type='PointSAModule',
num_point=1024,
radius=0.3,
num_sample=16,
mlp_channels=[256, 128, 128, 128],
use_xyz=True,
normalize_xyz=True),
feat_channels=(128, 128),
conv_cfg=dict(type='Conv1d'),
norm_cfg=dict(type='BN1d'),
objectness_loss=dict(
type='CrossEntropyLoss',
class_weight=[0.4, 0.6],
reduction='mean',
loss_weight=30.0),
center_loss=dict(
type='ChamferDistance',
mode='l1',
reduction='sum',
loss_src_weight=0.5,
loss_dst_weight=0.5),
semantic_reg_loss=dict(
type='ChamferDistance',
mode='l1',
reduction='sum',
loss_src_weight=0.5,
loss_dst_weight=0.5),
semantic_cls_loss=dict(
type='CrossEntropyLoss', reduction='sum', loss_weight=1.0),
train_cfg=dict(
dist_thresh=0.2,
var_thresh=1e-2,
lower_thresh=1e-6,
num_point=100,
num_point_line=10,
line_thresh=0.2))
primitive_xy_cfg = dict(
type='PrimitiveHead',
num_dims=1,
num_classes=18,
primitive_mode='xy',
upper_thresh=100.0,
surface_thresh=0.5,
vote_module_cfg=dict(
in_channels=256,
vote_per_seed=1,
gt_per_seed=1,
conv_channels=(256, 256),
conv_cfg=dict(type='Conv1d'),
norm_cfg=dict(type='BN1d'),
norm_feats=True,
vote_loss=dict(
type='ChamferDistance',
mode='l1',
reduction='none',
loss_dst_weight=10.0)),
vote_aggregation_cfg=dict(
type='PointSAModule',
num_point=1024,
radius=0.3,
num_sample=16,
mlp_channels=[256, 128, 128, 128],
use_xyz=True,
normalize_xyz=True),
feat_channels=(128, 128),
conv_cfg=dict(type='Conv1d'),
norm_cfg=dict(type='BN1d'),
objectness_loss=dict(
type='CrossEntropyLoss',
class_weight=[0.4, 0.6],
reduction='mean',
loss_weight=30.0),
center_loss=dict(
type='ChamferDistance',
mode='l1',
reduction='sum',
loss_src_weight=0.5,
loss_dst_weight=0.5),
semantic_reg_loss=dict(
type='ChamferDistance',
mode='l1',
reduction='sum',
loss_src_weight=0.5,
loss_dst_weight=0.5),
semantic_cls_loss=dict(
type='CrossEntropyLoss', reduction='sum', loss_weight=1.0),
train_cfg=dict(
dist_thresh=0.2,
var_thresh=1e-2,
lower_thresh=1e-6,
num_point=100,
num_point_line=10,
line_thresh=0.2))
primitive_line_cfg = dict(
type='PrimitiveHead',
num_dims=0,
num_classes=18,
primitive_mode='line',
upper_thresh=100.0,
surface_thresh=0.5,
vote_module_cfg=dict(
in_channels=256,
vote_per_seed=1,
gt_per_seed=1,
conv_channels=(256, 256),
conv_cfg=dict(type='Conv1d'),
norm_cfg=dict(type='BN1d'),
norm_feats=True,
vote_loss=dict(
type='ChamferDistance',
mode='l1',
reduction='none',
loss_dst_weight=10.0)),
vote_aggregation_cfg=dict(
type='PointSAModule',
num_point=1024,
radius=0.3,
num_sample=16,
mlp_channels=[256, 128, 128, 128],
use_xyz=True,
normalize_xyz=True),
feat_channels=(128, 128),
conv_cfg=dict(type='Conv1d'),
norm_cfg=dict(type='BN1d'),
objectness_loss=dict(
type='CrossEntropyLoss',
class_weight=[0.4, 0.6],
reduction='mean',
loss_weight=30.0),
center_loss=dict(
type='ChamferDistance',
mode='l1',
reduction='sum',
loss_src_weight=1.0,
loss_dst_weight=1.0),
semantic_reg_loss=dict(
type='ChamferDistance',
mode='l1',
reduction='sum',
loss_src_weight=1.0,
loss_dst_weight=1.0),
semantic_cls_loss=dict(
type='CrossEntropyLoss', reduction='sum', loss_weight=2.0),
train_cfg=dict(
dist_thresh=0.2,
var_thresh=1e-2,
lower_thresh=1e-6,
num_point=100,
num_point_line=10,
line_thresh=0.2))
model = dict(
type='H3DNet',
backbone=dict(
type='MultiBackbone',
num_streams=4,
suffixes=['net0', 'net1', 'net2', 'net3'],
conv_cfg=dict(type='Conv1d'),
norm_cfg=dict(type='BN1d', eps=1e-5, momentum=0.01),
act_cfg=dict(type='ReLU'),
backbones=dict(
type='PointNet2SASSG',
in_channels=4,
num_points=(2048, 1024, 512, 256),
radius=(0.2, 0.4, 0.8, 1.2),
num_samples=(64, 32, 16, 16),
sa_channels=((64, 64, 128), (128, 128, 256), (128, 128, 256),
(128, 128, 256)),
fp_channels=((256, 256), (256, 256)),
norm_cfg=dict(type='BN2d'),
sa_cfg=dict(
type='PointSAModule',
pool_mod='max',
use_xyz=True,
normalize_xyz=True))),
rpn_head=dict(
type='VoteHead',
vote_module_cfg=dict(
in_channels=256,
vote_per_seed=1,
gt_per_seed=3,
conv_channels=(256, 256),
conv_cfg=dict(type='Conv1d'),
norm_cfg=dict(type='BN1d'),
norm_feats=True,
vote_loss=dict(
type='ChamferDistance',
mode='l1',
reduction='none',
loss_dst_weight=10.0)),
vote_aggregation_cfg=dict(
type='PointSAModule',
num_point=256,
radius=0.3,
num_sample=16,
mlp_channels=[256, 128, 128, 128],
use_xyz=True,
normalize_xyz=True),
pred_layer_cfg=dict(
in_channels=128, shared_conv_channels=(128, 128), bias=True),
conv_cfg=dict(type='Conv1d'),
norm_cfg=dict(type='BN1d'),
objectness_loss=dict(
type='CrossEntropyLoss',
class_weight=[0.2, 0.8],
reduction='sum',
loss_weight=5.0),
center_loss=dict(
type='ChamferDistance',
mode='l2',
reduction='sum',
loss_src_weight=10.0,
loss_dst_weight=10.0),
dir_class_loss=dict(
type='CrossEntropyLoss', reduction='sum', loss_weight=1.0),
dir_res_loss=dict(
type='SmoothL1Loss', reduction='sum', loss_weight=10.0),
size_class_loss=dict(
type='CrossEntropyLoss', reduction='sum', loss_weight=1.0),
size_res_loss=dict(
type='SmoothL1Loss', reduction='sum', loss_weight=10.0),
semantic_loss=dict(
type='CrossEntropyLoss', reduction='sum', loss_weight=1.0)),
roi_head=dict(
type='H3DRoIHead',
primitive_list=[primitive_z_cfg, primitive_xy_cfg, primitive_line_cfg],
bbox_head=dict(
type='H3DBboxHead',
gt_per_seed=3,
num_proposal=256,
suface_matching_cfg=dict(
type='PointSAModule',
num_point=256 * 6,
radius=0.5,
num_sample=32,
mlp_channels=[128 + 6, 128, 64, 32],
use_xyz=True,
normalize_xyz=True),
line_matching_cfg=dict(
type='PointSAModule',
num_point=256 * 12,
radius=0.5,
num_sample=32,
mlp_channels=[128 + 12, 128, 64, 32],
use_xyz=True,
normalize_xyz=True),
feat_channels=(128, 128),
primitive_refine_channels=[128, 128, 128],
upper_thresh=100.0,
surface_thresh=0.5,
line_thresh=0.5,
conv_cfg=dict(type='Conv1d'),
norm_cfg=dict(type='BN1d'),
objectness_loss=dict(
type='CrossEntropyLoss',
class_weight=[0.2, 0.8],
reduction='sum',
loss_weight=5.0),
center_loss=dict(
type='ChamferDistance',
mode='l2',
reduction='sum',
loss_src_weight=10.0,
loss_dst_weight=10.0),
dir_class_loss=dict(
type='CrossEntropyLoss', reduction='sum', loss_weight=0.1),
dir_res_loss=dict(
type='SmoothL1Loss', reduction='sum', loss_weight=10.0),
size_class_loss=dict(
type='CrossEntropyLoss', reduction='sum', loss_weight=0.1),
size_res_loss=dict(
type='SmoothL1Loss', reduction='sum', loss_weight=10.0),
semantic_loss=dict(
type='CrossEntropyLoss', reduction='sum', loss_weight=0.1),
cues_objectness_loss=dict(
type='CrossEntropyLoss',
class_weight=[0.3, 0.7],
reduction='mean',
loss_weight=5.0),
cues_semantic_loss=dict(
type='CrossEntropyLoss',
class_weight=[0.3, 0.7],
reduction='mean',
loss_weight=5.0),
proposal_objectness_loss=dict(
type='CrossEntropyLoss',
class_weight=[0.2, 0.8],
reduction='none',
loss_weight=5.0),
primitive_center_loss=dict(
type='MSELoss', reduction='none', loss_weight=1.0))),
# model training and testing settings
train_cfg=dict(
rpn=dict(
pos_distance_thr=0.3, neg_distance_thr=0.6, sample_mod='vote'),
rpn_proposal=dict(use_nms=False),
rcnn=dict(
pos_distance_thr=0.3,
neg_distance_thr=0.6,
sample_mod='vote',
far_threshold=0.6,
near_threshold=0.3,
mask_surface_threshold=0.3,
label_surface_threshold=0.3,
mask_line_threshold=0.3,
label_line_threshold=0.3)),
test_cfg=dict(
rpn=dict(
sample_mod='seed',
nms_thr=0.25,
score_thr=0.05,
per_class_proposal=True,
use_nms=False),
rcnn=dict(
sample_mod='seed',
nms_thr=0.25,
score_thr=0.05,
per_class_proposal=True)))
================================================
FILE: configs/_base_/models/hv_pointpillars_fpn_lyft.py
================================================
_base_ = './hv_pointpillars_fpn_nus.py'
# model settings (based on nuScenes model settings)
# Voxel size for voxel encoder
# Usually voxel size is changed consistently with the point cloud range
# If point cloud range is modified, do remember to change all related
# keys in the config.
model = dict(
pts_voxel_layer=dict(
max_num_points=20,
point_cloud_range=[-80, -80, -5, 80, 80, 3],
max_voxels=(60000, 60000)),
pts_voxel_encoder=dict(
feat_channels=[64], point_cloud_range=[-80, -80, -5, 80, 80, 3]),
pts_middle_encoder=dict(output_shape=[640, 640]),
pts_bbox_head=dict(
num_classes=9,
anchor_generator=dict(
ranges=[[-80, -80, -1.8, 80, 80, -1.8]], custom_values=[]),
bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder', code_size=7)),
# model training settings (based on nuScenes model settings)
train_cfg=dict(pts=dict(code_weight=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0])))
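# A minimal sketch (assuming mmcv is installed) of the delta-style inheritance
# used in this file: only the keys listed here replace the corresponding keys
# of the nuScenes base model; nested dicts are merged recursively, so e.g.
# voxel_size inside pts_voxel_layer is still inherited from the base.
#
# from mmcv import Config
# cfg = Config.fromfile('configs/_base_/models/hv_pointpillars_fpn_lyft.py')
# print(cfg.model.pts_voxel_layer.max_num_points)  # 20, overridden above
# print(cfg.model.pts_voxel_layer.voxel_size)      # [0.25, 0.25, 8], from the base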
================================================
FILE: configs/_base_/models/hv_pointpillars_fpn_nus.py
================================================
# model settings
# Voxel size for voxel encoder
# Usually voxel size is changed consistently with the point cloud range
# If point cloud range is modified, do remember to change all related
# keys in the config.
voxel_size = [0.25, 0.25, 8]
model = dict(
type='MVXFasterRCNN',
pts_voxel_layer=dict(
max_num_points=64,
point_cloud_range=[-50, -50, -5, 50, 50, 3],
voxel_size=voxel_size,
max_voxels=(30000, 40000)),
pts_voxel_encoder=dict(
type='HardVFE',
in_channels=4,
feat_channels=[64, 64],
with_distance=False,
voxel_size=voxel_size,
with_cluster_center=True,
with_voxel_center=True,
point_cloud_range=[-50, -50, -5, 50, 50, 3],
norm_cfg=dict(type='naiveSyncBN1d', eps=1e-3, momentum=0.01)),
pts_middle_encoder=dict(
type='PointPillarsScatter', in_channels=64, output_shape=[400, 400]),
pts_backbone=dict(
type='SECOND',
in_channels=64,
norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01),
layer_nums=[3, 5, 5],
layer_strides=[2, 2, 2],
out_channels=[64, 128, 256]),
pts_neck=dict(
type='FPN',
norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01),
act_cfg=dict(type='ReLU'),
in_channels=[64, 128, 256],
out_channels=256,
start_level=0,
num_outs=3),
pts_bbox_head=dict(
type='Anchor3DHead',
num_classes=10,
in_channels=256,
feat_channels=256,
use_direction_classifier=True,
anchor_generator=dict(
type='AlignedAnchor3DRangeGenerator',
ranges=[[-50, -50, -1.8, 50, 50, -1.8]],
scales=[1, 2, 4],
sizes=[
[0.8660, 2.5981, 1.], # 1.5/sqrt(3)
[0.5774, 1.7321, 1.], # 1/sqrt(3)
[1., 1., 1.],
[0.4, 0.4, 1],
],
custom_values=[0, 0],
rotations=[0, 1.57],
reshape_out=True),
assigner_per_size=False,
diff_rad_by_sin=True,
dir_offset=0.7854, # pi/4
dir_limit_offset=0,
bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder', code_size=9),
loss_cls=dict(
type='FocalLoss',
use_sigmoid=True,
gamma=2.0,
alpha=0.25,
loss_weight=1.0),
loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0),
loss_dir=dict(
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.2)),
# model training and testing settings
train_cfg=dict(
pts=dict(
assigner=dict(
type='MaxIoUAssigner',
iou_calculator=dict(type='BboxOverlapsNearest3D'),
pos_iou_thr=0.6,
neg_iou_thr=0.3,
min_pos_iou=0.3,
ignore_iof_thr=-1),
allowed_border=0,
code_weight=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2],
pos_weight=-1,
debug=False)),
test_cfg=dict(
pts=dict(
use_rotate_nms=True,
nms_across_levels=False,
nms_pre=1000,
nms_thr=0.2,
score_thr=0.05,
min_bbox_size=0,
max_num=500)))
================================================
FILE: configs/_base_/models/hv_pointpillars_fpn_range100_lyft.py
================================================
_base_ = './hv_pointpillars_fpn_nus.py'
# model settings (based on nuScenes model settings)
# Voxel size for voxel encoder
# Usually voxel size is changed consistently with the point cloud range
# If point cloud range is modified, do remember to change all related
# keys in the config.
model = dict(
pts_voxel_layer=dict(
max_num_points=20,
point_cloud_range=[-100, -100, -5, 100, 100, 3],
max_voxels=(60000, 60000)),
pts_voxel_encoder=dict(
feat_channels=[64], point_cloud_range=[-100, -100, -5, 100, 100, 3]),
pts_middle_encoder=dict(output_shape=[800, 800]),
pts_bbox_head=dict(
num_classes=9,
anchor_generator=dict(
ranges=[[-100, -100, -1.8, 100, 100, -1.8]], custom_values=[]),
bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder', code_size=7)),
# model training settings (based on nuScenes model settings)
train_cfg=dict(pts=dict(code_weight=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0])))
================================================
FILE: configs/_base_/models/hv_pointpillars_secfpn_kitti.py
================================================
voxel_size = [0.16, 0.16, 4]
model = dict(
type='VoxelNet',
voxel_layer=dict(
max_num_points=32,
point_cloud_range=[0, -39.68, -3, 69.12, 39.68, 1],
voxel_size=voxel_size,
max_voxels=(16000, 40000)),
voxel_encoder=dict(
type='PillarFeatureNet',
in_channels=4,
feat_channels=[64],
with_distance=False,
voxel_size=voxel_size,
point_cloud_range=[0, -39.68, -3, 69.12, 39.68, 1]),
middle_encoder=dict(
type='PointPillarsScatter', in_channels=64, output_shape=[496, 432]),
backbone=dict(
type='SECOND',
in_channels=64,
layer_nums=[3, 5, 5],
layer_strides=[2, 2, 2],
out_channels=[64, 128, 256]),
neck=dict(
type='SECONDFPN',
in_channels=[64, 128, 256],
upsample_strides=[1, 2, 4],
out_channels=[128, 128, 128]),
bbox_head=dict(
type='Anchor3DHead',
num_classes=3,
in_channels=384,
feat_channels=384,
use_direction_classifier=True,
anchor_generator=dict(
type='Anchor3DRangeGenerator',
ranges=[
[0, -39.68, -0.6, 70.4, 39.68, -0.6],
[0, -39.68, -0.6, 70.4, 39.68, -0.6],
[0, -39.68, -1.78, 70.4, 39.68, -1.78],
],
sizes=[[0.6, 0.8, 1.73], [0.6, 1.76, 1.73], [1.6, 3.9, 1.56]],
rotations=[0, 1.57],
reshape_out=False),
diff_rad_by_sin=True,
bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder'),
loss_cls=dict(
type='FocalLoss',
use_sigmoid=True,
gamma=2.0,
alpha=0.25,
loss_weight=1.0),
loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=2.0),
loss_dir=dict(
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.2)),
# model training and testing settings
train_cfg=dict(
assigner=[
dict( # for Pedestrian
type='MaxIoUAssigner',
iou_calculator=dict(type='BboxOverlapsNearest3D'),
pos_iou_thr=0.5,
neg_iou_thr=0.35,
min_pos_iou=0.35,
ignore_iof_thr=-1),
dict( # for Cyclist
type='MaxIoUAssigner',
iou_calculator=dict(type='BboxOverlapsNearest3D'),
pos_iou_thr=0.5,
neg_iou_thr=0.35,
min_pos_iou=0.35,
ignore_iof_thr=-1),
dict( # for Car
type='MaxIoUAssigner',
iou_calculator=dict(type='BboxOverlapsNearest3D'),
pos_iou_thr=0.6,
neg_iou_thr=0.45,
min_pos_iou=0.45,
ignore_iof_thr=-1),
],
allowed_border=0,
pos_weight=-1,
debug=False),
test_cfg=dict(
use_rotate_nms=True,
nms_across_levels=False,
nms_thr=0.01,
score_thr=0.1,
min_bbox_size=0,
nms_pre=100,
max_num=50))
================================================
FILE: configs/_base_/models/hv_pointpillars_secfpn_waymo.py
================================================
# model settings
# Voxel size for voxel encoder
# Usually voxel size is changed consistently with the point cloud range
# If point cloud range is modified, do remember to change all related
# keys in the config.
voxel_size = [0.32, 0.32, 6]
model = dict(
type='MVXFasterRCNN',
pts_voxel_layer=dict(
max_num_points=20,
point_cloud_range=[-74.88, -74.88, -2, 74.88, 74.88, 4],
voxel_size=voxel_size,
max_voxels=(32000, 32000)),
pts_voxel_encoder=dict(
type='HardVFE',
in_channels=5,
feat_channels=[64],
with_distance=False,
voxel_size=voxel_size,
with_cluster_center=True,
with_voxel_center=True,
point_cloud_range=[-74.88, -74.88, -2, 74.88, 74.88, 4],
norm_cfg=dict(type='naiveSyncBN1d', eps=1e-3, momentum=0.01)),
pts_middle_encoder=dict(
type='PointPillarsScatter', in_channels=64, output_shape=[468, 468]),
pts_backbone=dict(
type='SECOND',
in_channels=64,
norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01),
layer_nums=[3, 5, 5],
layer_strides=[1, 2, 2],
out_channels=[64, 128, 256]),
pts_neck=dict(
type='SECONDFPN',
norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01),
in_channels=[64, 128, 256],
upsample_strides=[1, 2, 4],
out_channels=[128, 128, 128]),
pts_bbox_head=dict(
type='Anchor3DHead',
num_classes=3,
in_channels=384,
feat_channels=384,
use_direction_classifier=True,
anchor_generator=dict(
type='AlignedAnchor3DRangeGenerator',
ranges=[[-74.88, -74.88, -0.0345, 74.88, 74.88, -0.0345],
[-74.88, -74.88, -0.1188, 74.88, 74.88, -0.1188],
[-74.88, -74.88, 0, 74.88, 74.88, 0]],
sizes=[
[2.08, 4.73, 1.77], # car
[0.84, 1.81, 1.77], # cyclist
[0.84, 0.91, 1.74] # pedestrian
],
rotations=[0, 1.57],
reshape_out=False),
diff_rad_by_sin=True,
dir_offset=0.7854, # pi/4
dir_limit_offset=0,
bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder', code_size=7),
loss_cls=dict(
type='FocalLoss',
use_sigmoid=True,
gamma=2.0,
alpha=0.25,
loss_weight=1.0),
loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0),
loss_dir=dict(
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.2)),
# model training and testing settings
train_cfg=dict(
pts=dict(
assigner=[
dict( # car
type='MaxIoUAssigner',
iou_calculator=dict(type='BboxOverlapsNearest3D'),
pos_iou_thr=0.55,
neg_iou_thr=0.4,
min_pos_iou=0.4,
ignore_iof_thr=-1),
dict( # cyclist
type='MaxIoUAssigner',
iou_calculator=dict(type='BboxOverlapsNearest3D'),
pos_iou_thr=0.5,
neg_iou_thr=0.3,
min_pos_iou=0.3,
ignore_iof_thr=-1),
dict( # pedestrian
type='MaxIoUAssigner',
iou_calculator=dict(type='BboxOverlapsNearest3D'),
pos_iou_thr=0.5,
neg_iou_thr=0.3,
min_pos_iou=0.3,
ignore_iof_thr=-1),
],
allowed_border=0,
code_weight=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
pos_weight=-1,
debug=False)),
test_cfg=dict(
pts=dict(
use_rotate_nms=True,
nms_across_levels=False,
nms_pre=4096,
nms_thr=0.25,
score_thr=0.1,
min_bbox_size=0,
max_num=500)))
================================================
FILE: configs/_base_/models/hv_second_secfpn_kitti.py
================================================
model = dict(
type='VoxelNet',
voxel_layer=dict(
max_num_points=5,
point_cloud_range=[0, -40, -3, 70.4, 40, 1],
voxel_size=[0.05, 0.05, 0.1],
max_voxels=(16000, 40000)),
voxel_encoder=dict(type='HardSimpleVFE'),
middle_encoder=dict(
type='SparseEncoder',
in_channels=4,
sparse_shape=[41, 1600, 1408],
order=('conv', 'norm', 'act')),
backbone=dict(
type='SECOND',
in_channels=256,
layer_nums=[5, 5],
layer_strides=[1, 2],
out_channels=[128, 256]),
neck=dict(
type='SECONDFPN',
in_channels=[128, 256],
upsample_strides=[1, 2],
out_channels=[256, 256]),
bbox_head=dict(
type='Anchor3DHead',
num_classes=3,
in_channels=512,
feat_channels=512,
use_direction_classifier=True,
anchor_generator=dict(
type='Anchor3DRangeGenerator',
ranges=[
[0, -40.0, -0.6, 70.4, 40.0, -0.6],
[0, -40.0, -0.6, 70.4, 40.0, -0.6],
[0, -40.0, -1.78, 70.4, 40.0, -1.78],
],
sizes=[[0.6, 0.8, 1.73], [0.6, 1.76, 1.73], [1.6, 3.9, 1.56]],
rotations=[0, 1.57],
reshape_out=False),
diff_rad_by_sin=True,
bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder'),
loss_cls=dict(
type='FocalLoss',
use_sigmoid=True,
gamma=2.0,
alpha=0.25,
loss_weight=1.0),
loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=2.0),
loss_dir=dict(
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.2)),
# model training and testing settings
train_cfg=dict(
assigner=[
dict( # for Pedestrian
type='MaxIoUAssigner',
iou_calculator=dict(type='BboxOverlapsNearest3D'),
pos_iou_thr=0.35,
neg_iou_thr=0.2,
min_pos_iou=0.2,
ignore_iof_thr=-1),
dict( # for Cyclist
type='MaxIoUAssigner',
iou_calculator=dict(type='BboxOverlapsNearest3D'),
pos_iou_thr=0.35,
neg_iou_thr=0.2,
min_pos_iou=0.2,
ignore_iof_thr=-1),
dict( # for Car
type='MaxIoUAssigner',
iou_calculator=dict(type='BboxOverlapsNearest3D'),
pos_iou_thr=0.6,
neg_iou_thr=0.45,
min_pos_iou=0.45,
ignore_iof_thr=-1),
],
allowed_border=0,
pos_weight=-1,
debug=False),
test_cfg=dict(
use_rotate_nms=True,
nms_across_levels=False,
nms_thr=0.01,
score_thr=0.1,
min_bbox_size=0,
nms_pre=100,
max_num=50))
================================================
FILE: configs/_base_/models/hv_second_secfpn_waymo.py
================================================
# model settings
# Voxel size for voxel encoder
# Usually voxel size is changed consistently with the point cloud range
# If point cloud range is modified, do remember to change all related
# keys in the config.
voxel_size = [0.08, 0.08, 0.1]
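# As an informal sanity check of that consistency (plain arithmetic, assuming the
# usual SECOND convention of one extra slice along z): with this voxel size and the
# point cloud range below,
#   x: (76.8 - (-76.8)) / 0.08 = 1920
#   y: (51.2 - (-51.2)) / 0.08 = 1280
#   z: (4 - (-2)) / 0.1 = 60, +1 -> 61
# which matches sparse_shape=[61, 1280, 1920] in the middle_encoder; if the range
# or voxel size changes, these (and the anchor ranges) need to be recomputed.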
model = dict(
type='VoxelNet',
voxel_layer=dict(
max_num_points=10,
point_cloud_range=[-76.8, -51.2, -2, 76.8, 51.2, 4],
voxel_size=voxel_size,
max_voxels=(80000, 90000)),
voxel_encoder=dict(type='HardSimpleVFE', num_features=5),
middle_encoder=dict(
type='SparseEncoder',
in_channels=5,
sparse_shape=[61, 1280, 1920],
order=('conv', 'norm', 'act')),
backbone=dict(
type='SECOND',
in_channels=384,
norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01),
layer_nums=[5, 5],
layer_strides=[1, 2],
out_channels=[128, 256]),
neck=dict(
type='SECONDFPN',
norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01),
in_channels=[128, 256],
upsample_strides=[1, 2],
out_channels=[256, 256]),
bbox_head=dict(
type='Anchor3DHead',
num_classes=3,
in_channels=512,
feat_channels=512,
use_direction_classifier=True,
anchor_generator=dict(
type='AlignedAnchor3DRangeGenerator',
ranges=[[-76.8, -51.2, -0.0345, 76.8, 51.2, -0.0345],
[-76.8, -51.2, 0, 76.8, 51.2, 0],
[-76.8, -51.2, -0.1188, 76.8, 51.2, -0.1188]],
sizes=[
[2.08, 4.73, 1.77], # car
[0.84, 0.91, 1.74], # pedestrian
[0.84, 1.81, 1.77] # cyclist
],
rotations=[0, 1.57],
reshape_out=False),
diff_rad_by_sin=True,
dir_offset=0.7854, # pi/4
dir_limit_offset=0,
bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder', code_size=7),
loss_cls=dict(
type='FocalLoss',
use_sigmoid=True,
gamma=2.0,
alpha=0.25,
loss_weight=1.0),
loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0),
loss_dir=dict(
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.2)),
# model training and testing settings
train_cfg=dict(
assigner=[
dict( # car
type='MaxIoUAssigner',
iou_calculator=dict(type='BboxOverlapsNearest3D'),
pos_iou_thr=0.55,
neg_iou_thr=0.4,
min_pos_iou=0.4,
ignore_iof_thr=-1),
dict( # pedestrian
type='MaxIoUAssigner',
iou_calculator=dict(type='BboxOverlapsNearest3D'),
pos_iou_thr=0.5,
neg_iou_thr=0.3,
min_pos_iou=0.3,
ignore_iof_thr=-1),
dict( # cyclist
type='MaxIoUAssigner',
iou_calculator=dict(type='BboxOverlapsNearest3D'),
pos_iou_thr=0.5,
neg_iou_thr=0.3,
min_pos_iou=0.3,
ignore_iof_thr=-1)
],
allowed_border=0,
code_weight=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
pos_weight=-1,
debug=False),
test_cfg=dict(
use_rotate_nms=True,
nms_across_levels=False,
nms_pre=4096,
nms_thr=0.25,
score_thr=0.1,
min_bbox_size=0,
max_num=500))
================================================
FILE: configs/_base_/models/imvotenet_image.py
================================================
model = dict(
type='ImVoteNet',
img_backbone=dict(
type='ResNet',
depth=50,
num_stages=4,
out_indices=(0, 1, 2, 3),
frozen_stages=1,
norm_cfg=dict(type='BN', requires_grad=False),
norm_eval=True,
style='caffe'),
img_neck=dict(
type='FPN',
in_channels=[256, 512, 1024, 2048],
out_channels=256,
num_outs=5),
img_rpn_head=dict(
type='RPNHead',
in_channels=256,
feat_channels=256,
anchor_generator=dict(
type='AnchorGenerator',
scales=[8],
ratios=[0.5, 1.0, 2.0],
strides=[4, 8, 16, 32, 64]),
bbox_coder=dict(
type='DeltaXYWHBBoxCoder',
target_means=[.0, .0, .0, .0],
target_stds=[1.0, 1.0, 1.0, 1.0]),
loss_cls=dict(
type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
loss_bbox=dict(type='L1Loss', loss_weight=1.0)),
img_roi_head=dict(
type='StandardRoIHead',
bbox_roi_extractor=dict(
type='SingleRoIExtractor',
roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
out_channels=256,
featmap_strides=[4, 8, 16, 32]),
bbox_head=dict(
type='Shared2FCBBoxHead',
in_channels=256,
fc_out_channels=1024,
roi_feat_size=7,
num_classes=10,
bbox_coder=dict(
type='DeltaXYWHBBoxCoder',
target_means=[0., 0., 0., 0.],
target_stds=[0.1, 0.1, 0.2, 0.2]),
reg_class_agnostic=False,
loss_cls=dict(
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
loss_bbox=dict(type='L1Loss', loss_weight=1.0))),
# model training and testing settings
train_cfg=dict(
img_rpn=dict(
assigner=dict(
type='MaxIoUAssigner',
pos_iou_thr=0.7,
neg_iou_thr=0.3,
min_pos_iou=0.3,
match_low_quality=True,
ignore_iof_thr=-1),
sampler=dict(
type='RandomSampler',
num=256,
pos_fraction=0.5,
neg_pos_ub=-1,
add_gt_as_proposals=False),
allowed_border=-1,
pos_weight=-1,
debug=False),
img_rpn_proposal=dict(
nms_across_levels=False,
nms_pre=2000,
nms_post=1000,
max_num=1000,
nms_thr=0.7,
min_bbox_size=0),
img_rcnn=dict(
assigner=dict(
type='MaxIoUAssigner',
pos_iou_thr=0.5,
neg_iou_thr=0.5,
min_pos_iou=0.5,
match_low_quality=False,
ignore_iof_thr=-1),
sampler=dict(
type='RandomSampler',
num=512,
pos_fraction=0.25,
neg_pos_ub=-1,
add_gt_as_proposals=True),
pos_weight=-1,
debug=False)),
test_cfg=dict(
img_rpn=dict(
nms_across_levels=False,
nms_pre=1000,
nms_post=1000,
max_num=1000,
nms_thr=0.7,
min_bbox_size=0),
img_rcnn=dict(
score_thr=0.05,
nms=dict(type='nms', iou_threshold=0.5),
max_per_img=100)))
================================================
FILE: configs/_base_/models/mask_rcnn_r50_fpn.py
================================================
# model settings
model = dict(
type='MaskRCNN',
pretrained='torchvision://resnet50',
backbone=dict(
type='ResNet',
depth=50,
num_stages=4,
out_indices=(0, 1, 2, 3),
frozen_stages=1,
norm_cfg=dict(type='BN', requires_grad=True),
norm_eval=True,
style='pytorch'),
neck=dict(
type='FPN',
in_channels=[256, 512, 1024, 2048],
out_channels=256,
num_outs=5),
rpn_head=dict(
type='RPNHead',
in_channels=256,
feat_channels=256,
anchor_generator=dict(
type='AnchorGenerator',
scales=[8],
ratios=[0.5, 1.0, 2.0],
strides=[4, 8, 16, 32, 64]),
bbox_coder=dict(
type='DeltaXYWHBBoxCoder',
target_means=[.0, .0, .0, .0],
target_stds=[1.0, 1.0, 1.0, 1.0]),
loss_cls=dict(
type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
loss_bbox=dict(type='L1Loss', loss_weight=1.0)),
roi_head=dict(
type='StandardRoIHead',
bbox_roi_extractor=dict(
type='SingleRoIExtractor',
roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
out_channels=256,
featmap_strides=[4, 8, 16, 32]),
bbox_head=dict(
type='Shared2FCBBoxHead',
in_channels=256,
fc_out_channels=1024,
roi_feat_size=7,
num_classes=80,
bbox_coder=dict(
type='DeltaXYWHBBoxCoder',
target_means=[0., 0., 0., 0.],
target_stds=[0.1, 0.1, 0.2, 0.2]),
reg_class_agnostic=False,
loss_cls=dict(
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
loss_bbox=dict(type='L1Loss', loss_weight=1.0)),
mask_roi_extractor=dict(
type='SingleRoIExtractor',
roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=0),
out_channels=256,
featmap_strides=[4, 8, 16, 32]),
mask_head=dict(
type='FCNMaskHead',
num_convs=4,
in_channels=256,
conv_out_channels=256,
num_classes=80,
loss_mask=dict(
type='CrossEntropyLoss', use_mask=True, loss_weight=1.0))),
# model training and testing settings
train_cfg=dict(
rpn=dict(
assigner=dict(
type='MaxIoUAssigner',
pos_iou_thr=0.7,
neg_iou_thr=0.3,
min_pos_iou=0.3,
match_low_quality=True,
ignore_iof_thr=-1),
sampler=dict(
type='RandomSampler',
num=256,
pos_fraction=0.5,
neg_pos_ub=-1,
add_gt_as_proposals=False),
allowed_border=-1,
pos_weight=-1,
debug=False),
rpn_proposal=dict(
nms_across_levels=False,
nms_pre=2000,
nms_post=1000,
max_num=1000,
nms_thr=0.7,
min_bbox_size=0),
rcnn=dict(
assigner=dict(
type='MaxIoUAssigner',
pos_iou_thr=0.5,
neg_iou_thr=0.5,
min_pos_iou=0.5,
match_low_quality=True,
ignore_iof_thr=-1),
sampler=dict(
type='RandomSampler',
num=512,
pos_fraction=0.25,
neg_pos_ub=-1,
add_gt_as_proposals=True),
mask_size=28,
pos_weight=-1,
debug=False)),
test_cfg=dict(
rpn=dict(
nms_across_levels=False,
nms_pre=1000,
nms_post=1000,
max_num=1000,
nms_thr=0.7,
min_bbox_size=0),
rcnn=dict(
score_thr=0.05,
nms=dict(type='nms', iou_threshold=0.5),
max_per_img=100,
mask_thr_binary=0.5)))
================================================
FILE: configs/_base_/models/votenet.py
================================================
model = dict(
type='VoteNet',
backbone=dict(
type='PointNet2SASSG',
in_channels=4,
num_points=(2048, 1024, 512, 256),
radius=(0.2, 0.4, 0.8, 1.2),
num_samples=(64, 32, 16, 16),
sa_channels=((64, 64, 128), (128, 128, 256), (128, 128, 256),
(128, 128, 256)),
fp_channels=((256, 256), (256, 256)),
norm_cfg=dict(type='BN2d'),
sa_cfg=dict(
type='PointSAModule',
pool_mod='max',
use_xyz=True,
normalize_xyz=True)),
bbox_head=dict(
type='VoteHead',
vote_module_cfg=dict(
in_channels=256,
vote_per_seed=1,
gt_per_seed=3,
conv_channels=(256, 256),
conv_cfg=dict(type='Conv1d'),
norm_cfg=dict(type='BN1d'),
norm_feats=True,
vote_loss=dict(
type='ChamferDistance',
mode='l1',
reduction='none',
loss_dst_weight=10.0)),
vote_aggregation_cfg=dict(
type='PointSAModule',
num_point=256,
radius=0.3,
num_sample=16,
mlp_channels=[256, 128, 128, 128],
use_xyz=True,
normalize_xyz=True),
pred_layer_cfg=dict(
in_channels=128, shared_conv_channels=(128, 128), bias=True),
conv_cfg=dict(type='Conv1d'),
norm_cfg=dict(type='BN1d'),
objectness_loss=dict(
type='CrossEntropyLoss',
class_weight=[0.2, 0.8],
reduction='sum',
loss_weight=5.0),
center_loss=dict(
type='ChamferDistance',
mode='l2',
reduction='sum',
loss_src_weight=10.0,
loss_dst_weight=10.0),
dir_class_loss=dict(
type='CrossEntropyLoss', reduction='sum', loss_weight=1.0),
dir_res_loss=dict(
type='SmoothL1Loss', reduction='sum', loss_weight=10.0),
size_class_loss=dict(
type='CrossEntropyLoss', reduction='sum', loss_weight=1.0),
size_res_loss=dict(
type='SmoothL1Loss', reduction='sum', loss_weight=10.0 / 3.0),
semantic_loss=dict(
type='CrossEntropyLoss', reduction='sum', loss_weight=1.0)),
# model training and testing settings
train_cfg=dict(
pos_distance_thr=0.3, neg_distance_thr=0.6, sample_mod='vote'),
test_cfg=dict(
sample_mod='seed',
nms_thr=0.25,
score_thr=0.05,
per_class_proposal=True))
================================================
FILE: configs/_base_/schedules/cyclic_20e.py
================================================
# For the nuScenes dataset, we usually evaluate the model at the end of training.
# Since the models are trained for 20 epochs by default, we set the evaluation
# interval to 20. Please change the interval accordingly if you do not
# use the default schedule.
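# A minimal sketch of that setting (the evaluation hook itself is defined in the
# downstream configs, not in this schedule file; the key below is assumed to be
# merged there):
# evaluation = dict(interval=20)  # evaluate once, after the final epoch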
# optimizer
# This schedule is mainly used by models on the nuScenes dataset
optimizer = dict(type='AdamW', lr=1e-4, weight_decay=0.01)
# max_norm=10 is better for SECOND
optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
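# As a hedged example, a SECOND-style model would typically override the clipping
# norm in its own config rather than editing this shared schedule:
# optimizer_config = dict(grad_clip=dict(max_norm=10, norm_type=2))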
lr_config = dict(
policy='cyclic',
target_ratio=(10, 1e-4),
cyclic_times=1,
step_ratio_up=0.4,
)
momentum_config = dict(
policy='cyclic',
target_ratio=(0.85 / 0.95, 1),
cyclic_times=1,
step_ratio_up=0.4,
)
# runtime settings
total_epochs = 20
================================================
FILE: configs/_base_/schedules/cyclic_40e.py
================================================
# This schedule is usually used by models trained on the KITTI dataset.
# The learning rate set in the cyclic schedule is the initial learning rate
# rather than the max learning rate. Since the target_ratio is (10, 1e-4),
# the learning rate rises from 0.0018 to 0.018, then decays to 0.0018*1e-4.
lr = 0.0018
# The optimizer follows the setting in SECOND.Pytorch, but here we use
# the official AdamW optimizer implemented by PyTorch.
optimizer = dict(type='AdamW', lr=lr, betas=(0.95, 0.99), weight_decay=0.01)
optimizer_config = dict(grad_clip=dict(max_norm=10, norm_type=2))
# We use cyclic learning rate and momentum schedule following SECOND.Pytorch
# https://github.com/traveller59/second.pytorch/blob/3aba19c9688274f75ebb5e576f65cfe54773c021/torchplus/train/learning_schedules_fastai.py#L69 # noqa
# We implement them in mmcv, for more details, please refer to
# https://github.com/open-mmlab/mmcv/blob/f48241a65aebfe07db122e9db320c31b685dc674/mmcv/runner/hooks/lr_updater.py#L327 # noqa
# https://github.com/open-mmlab/mmcv/blob/f48241a65aebfe07db122e9db320c31b685dc674/mmcv/runner/hooks/momentum_updater.py#L130 # noqa
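# A quick, informal sanity check of the values above (plain arithmetic only):
#   peak lr  = lr * target_ratio[0] = 0.0018 * 10   = 0.018
#   final lr = lr * target_ratio[1] = 0.0018 * 1e-4 = 1.8e-07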
lr_config = dict(
policy='cyclic',
target_ratio=(10, 1e-4),
cyclic_times=1,
step_ratio_up=0.4,
)
momentum_config = dict(
policy='cyclic',
target_ratio=(0.85 / 0.95, 1),
cyclic_times=1,
step_ratio_up=0.4,
)
# Although total_epochs is 40, this schedule is usually used with
# RepeatDataset with repeat ratio N, so the actual number of training
# epochs can be Nx40.
total_epochs = 40
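# As a hedged illustration of that wrapping (the data settings live in the
# dataset/benchmark configs, not in this schedule file), a repeat ratio of N=2
# looks like:
# data = dict(
#     train=dict(
#         type='RepeatDataset',
#         times=2,  # effective epochs = 2 x total_epochs
#         dataset=dict(...)))  # the underlying KittiDataset config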
================================================
FILE: configs/_base_/schedules/mmdet_schedule_1x.py
================================================
# optimizer
optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001)
optimizer_config = dict(grad_clip=None)
# learning policy
lr_config = dict(
policy='step',
warmup='linear',
warmup_iters=500,
warmup_ratio=0.001,
step=[8, 11])
total_epochs = 12
================================================
FILE: configs/_base_/schedules/schedule_2x.py
================================================
# optimizer
# This schedule is mainly used by models on the nuScenes dataset
optimizer = dict(type='AdamW', lr=0.001, weight_decay=0.01)
# max_norm=10 is better for SECOND
optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
lr_config = dict(
policy='step',
warmup='linear',
warmup_iters=1000,
warmup_ratio=1.0 / 1000,
step=[20, 23])
momentum_config = None
# runtime settings
total_epochs = 24
================================================
FILE: configs/_base_/schedules/schedule_3x.py
================================================
# optimizer
# This schedule is mainly used by models on indoor datasets,
# e.g., VoteNet on SUNRGBD and ScanNet
lr = 0.008 # max learning rate
optimizer = dict(type='AdamW', lr=lr, weight_decay=0.01)
optimizer_config = dict(grad_clip=dict(max_norm=10, norm_type=2))
lr_config = dict(policy='step', warmup=None, step=[24, 32])
# runtime settings
total_epochs = 36
================================================
FILE: configs/benchmark/hv_PartA2_secfpn_4x8_cyclic_80e_pcdet_kitti-3d-3class.py
================================================
# model settings
voxel_size = [0.05, 0.05, 0.1]
point_cloud_range = [0, -40, -3, 70.4, 40, 1] # velodyne coordinates, x, y, z
model = dict(
type='PartA2',
voxel_layer=dict(
max_num_points=5, # max_points_per_voxel
point_cloud_range=point_cloud_range,
voxel_size=voxel_size,
max_voxels=(16000, 40000) # (training, testing) max_voxels
),
voxel_encoder=dict(type='HardSimpleVFE'),
middle_encoder=dict(
type='SparseUNet',
in_channels=4,
sparse_shape=[41, 1600, 1408],
order=('conv', 'norm', 'act')),
backbone=dict(
type='SECOND',
in_channels=256,
layer_nums=[5, 5],
layer_strides=[1, 2],
out_channels=[128, 256]),
neck=dict(
type='SECONDFPN',
in_channels=[128, 256],
upsample_strides=[1, 2],
out_channels=[256, 256]),
rpn_head=dict(
type='PartA2RPNHead',
num_classes=3,
in_channels=512,
feat_channels=512,
use_direction_classifier=True,
anchor_generator=dict(
type='Anchor3DRangeGenerator',
ranges=[[0, -40.0, -0.6, 70.4, 40.0, -0.6],
[0, -40.0, -0.6, 70.4, 40.0, -0.6],
[0, -40.0, -1.78, 70.4, 40.0, -1.78]],
sizes=[[0.6, 0.8, 1.73], [0.6, 1.76, 1.73], [1.6, 3.9, 1.56]],
rotations=[0, 1.57],
reshape_out=False),
diff_rad_by_sin=True,
assigner_per_size=True,
assign_per_class=True,
bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder'),
loss_cls=dict(
type='FocalLoss',
use_sigmoid=True,
gamma=2.0,
alpha=0.25,
loss_weight=1.0),
loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=2.0),
loss_dir=dict(
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.2)),
roi_head=dict(
type='PartAggregationROIHead',
num_classes=3,
semantic_head=dict(
type='PointwiseSemanticHead',
in_channels=16,
extra_width=0.2,
seg_score_thr=0.3,
num_classes=3,
loss_seg=dict(
type='FocalLoss',
use_sigmoid=True,
reduction='sum',
gamma=2.0,
alpha=0.25,
loss_weight=1.0),
loss_part=dict(
type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0)),
seg_roi_extractor=dict(
type='Single3DRoIAwareExtractor',
roi_layer=dict(
type='RoIAwarePool3d',
out_size=14,
max_pts_per_voxel=128,
mode='max')),
part_roi_extractor=dict(
type='Single3DRoIAwareExtractor',
roi_layer=dict(
type='RoIAwarePool3d',
out_size=14,
max_pts_per_voxel=128,
mode='avg')),
bbox_head=dict(
type='PartA2BboxHead',
num_classes=3,
seg_in_channels=16,
part_in_channels=4,
seg_conv_channels=[64, 64],
part_conv_channels=[64, 64],
merge_conv_channels=[128, 128],
down_conv_channels=[128, 256],
bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder'),
shared_fc_channels=[256, 512, 512, 512],
cls_channels=[256, 256],
reg_channels=[256, 256],
dropout_ratio=0.1,
roi_feat_size=14,
with_corner_loss=True,
loss_bbox=dict(
type='SmoothL1Loss',
beta=1.0 / 9.0,
reduction='sum',
loss_weight=1.0),
loss_cls=dict(
type='CrossEntropyLoss',
use_sigmoid=True,
reduction='sum',
loss_weight=1.0))),
# model training and testing settings
train_cfg=dict(
rpn=dict(
assigner=[
dict( # for Pedestrian
type='MaxIoUAssigner',
iou_calculator=dict(type='BboxOverlapsNearest3D'),
pos_iou_thr=0.5,
neg_iou_thr=0.35,
min_pos_iou=0.35,
ignore_iof_thr=-1),
dict( # for Cyclist
type='MaxIoUAssigner',
iou_calculator=dict(type='BboxOverlapsNearest3D'),
pos_iou_thr=0.5,
neg_iou_thr=0.35,
min_pos_iou=0.35,
ignore_iof_thr=-1),
dict( # for Car
type='MaxIoUAssigner',
iou_calculator=dict(type='BboxOverlapsNearest3D'),
pos_iou_thr=0.6,
neg_iou_thr=0.45,
min_pos_iou=0.45,
ignore_iof_thr=-1)
],
allowed_border=0,
pos_weight=-1,
debug=False),
rpn_proposal=dict(
nms_pre=9000,
nms_post=512,
max_num=512,
nms_thr=0.8,
score_thr=0,
use_rotate_nms=False),
rcnn=dict(
assigner=[
dict( # for Pedestrian
type='MaxIoUAssigner',
iou_calculator=dict(
type='BboxOverlaps3D', coordinate='lidar'),
pos_iou_thr=0.55,
neg_iou_thr=0.55,
min_pos_iou=0.55,
ignore_iof_thr=-1),
dict( # for Cyclist
type='MaxIoUAssigner',
iou_calculator=dict(
type='BboxOverlaps3D', coordinate='lidar'),
pos_iou_thr=0.55,
neg_iou_thr=0.55,
min_pos_iou=0.55,
ignore_iof_thr=-1),
dict( # for Car
type='MaxIoUAssigner',
iou_calculator=dict(
type='BboxOverlaps3D', coordinate='lidar'),
pos_iou_thr=0.55,
neg_iou_thr=0.55,
min_pos_iou=0.55,
ignore_iof_thr=-1)
],
sampler=dict(
type='IoUNegPiecewiseSampler',
num=128,
pos_fraction=0.55,
neg_piece_fractions=[0.8, 0.2],
neg_iou_piece_thrs=[0.55, 0.1],
neg_pos_ub=-1,
add_gt_as_proposals=False,
return_iou=True),
cls_pos_thr=0.75,
cls_neg_thr=0.25)),
test_cfg=dict(
rpn=dict(
nms_pre=1024,
nms_post=100,
max_num=100,
nms_thr=0.7,
score_thr=0,
use_rotate_nms=True),
rcnn=dict(
use_rotate_nms=True,
use_raw_score=True,
nms_thr=0.01,
score_thr=0.3)))
# dataset settings
dataset_type = 'KittiDataset'
data_root = 'data/kitti/'
class_names = ['Pedestrian', 'Cyclist', 'Car']
input_modality = dict(use_lidar=True, use_camera=False)
db_sampler = dict(
data_root=data_root,
info_path=data_root + 'kitti_dbinfos_train.pkl',
rate=1.0,
prepare=dict(
filter_by_difficulty=[-1],
filter_by_min_points=dict(Car=5, Pedestrian=5, Cyclist=5)),
classes=class_names,
sample_groups=dict(Car=20, Pedestrian=15, Cyclist=15))
train_pipeline = [
dict(type='LoadPointsFromFile', coord_type='LIDAR', load_dim=4, use_dim=4),
dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True),
dict(type='ObjectSample', db_sampler=db_sampler),
dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5),
dict(
type='GlobalRotScaleTrans',
rot_range=[-0.78539816, 0.78539816],
scale_ratio_range=[0.95, 1.05]),
dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
dict(type='ObjectNameFilter', classes=class_names),
dict(type='PointShuffle'),
dict(type='DefaultFormatBundle3D', class_names=class_names),
dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
]
test_pipeline = [
dict(type='LoadPointsFromFile', coord_type='LIDAR', load_dim=4, use_dim=4),
dict(
type='MultiScaleFlipAug3D',
img_scale=(1333, 800),
pts_scale_ratio=1,
flip=False,
transforms=[
dict(
type='GlobalRotScaleTrans',
rot_range=[0, 0],
scale_ratio_range=[1., 1.],
translation_std=[0, 0, 0]),
dict(type='RandomFlip3D'),
dict(
type='PointsRangeFilter', point_cloud_range=point_cloud_range),
dict(
type='DefaultFormatBundle3D',
class_names=class_names,
with_label=False),
dict(type='Collect3D', keys=['points'])
])
]
data = dict(
samples_per_gpu=4,
workers_per_gpu=4,
train=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + 'kitti_infos_train.pkl',
split='training',
pts_prefix='velodyne_reduced',
pipeline=train_pipeline,
modality=input_modality,
classes=class_names,
test_mode=False),
val=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + 'kitti_infos_val.pkl',
split='training',
pts_prefix='velodyne_reduced',
pipeline=test_pipeline,
modality=input_modality,
classes=class_names,
test_mode=True),
test=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + 'kitti_infos_val.pkl',
split='training',
pts_prefix='velodyne_reduced',
pipeline=test_pipeline,
modality=input_modality,
classes=class_names,
test_mode=True))
# optimizer
lr = 0.001 # initial learning rate; the cyclic policy below raises it to 10x before decay
optimizer = dict(type='AdamW', lr=lr, betas=(0.95, 0.99), weight_decay=0.01)
optimizer_config = dict(grad_clip=dict(max_norm=10, norm_type=2))
lr_config = dict(
policy='cyclic',
target_ratio=(10, 1e-4),
cyclic_times=1,
step_ratio_up=0.4)
momentum_config = dict(
policy='cyclic',
target_ratio=(0.85 / 0.95, 1),
cyclic_times=1,
step_ratio_up=0.4)
checkpoint_config = dict(interval=1)
evaluation = dict(interval=1)
# yapf:disable
log_config = dict(
interval=50,
hooks=[
dict(type='TextLoggerHook'),
dict(type='TensorboardLoggerHook')
])
# yapf:enable
# runtime settings
total_epochs = 80
dist_params = dict(backend='nccl', port=29506)
log_level = 'INFO'
find_unused_parameters = True
work_dir = './work_dirs/parta2_secfpn_80e'
load_from = None
resume_from = None
workflow = [('train', 1)]
================================================
FILE: configs/benchmark/hv_pointpillars_secfpn_3x8_100e_det3d_kitti-3d-car.py
================================================
# model settings
voxel_size = [0.16, 0.16, 4]
point_cloud_range = [0, -39.68, -3, 69.12, 39.68, 1]
model = dict(
type='VoxelNet',
voxel_layer=dict(
max_num_points=64,
point_cloud_range=point_cloud_range,
voxel_size=voxel_size,
max_voxels=(12000, 20000)),
voxel_encoder=dict(
type='PillarFeatureNet',
in_channels=4,
feat_channels=[64],
with_distance=False,
voxel_size=voxel_size,
point_cloud_range=point_cloud_range),
middle_encoder=dict(
type='PointPillarsScatter', in_channels=64, output_shape=[496, 432]),
backbone=dict(
type='SECOND',
in_channels=64,
layer_nums=[3, 5, 5],
layer_strides=[2, 2, 2],
out_channels=[64, 128, 256]),
neck=dict(
type='SECONDFPN',
in_channels=[64, 128, 256],
upsample_strides=[1, 2, 4],
out_channels=[128, 128, 128]),
bbox_head=dict(
type='Anchor3DHead',
num_classes=1,
in_channels=384,
feat_channels=384,
use_direction_classifier=True,
anchor_generator=dict(
type='Anchor3DRangeGenerator',
ranges=[[0, -39.68, -1.78, 69.12, 39.68, -1.78]],
sizes=[[1.6, 3.9, 1.56]],
rotations=[0, 1.57],
reshape_out=True),
diff_rad_by_sin=True,
bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder'),
loss_cls=dict(
type='FocalLoss',
use_sigmoid=True,
gamma=2.0,
alpha=0.25,
loss_weight=1.0),
loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=2.0),
loss_dir=dict(
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.2)),
# model training and testing settings
train_cfg=dict(
assigner=dict(
type='MaxIoUAssigner',
iou_calculator=dict(type='BboxOverlapsNearest3D'),
pos_iou_thr=0.6,
neg_iou_thr=0.45,
min_pos_iou=0.45,
ignore_iof_thr=-1),
allowed_border=0,
pos_weight=-1,
debug=False),
test_cfg=dict(
use_rotate_nms=True,
nms_across_levels=False,
nms_thr=0.01,
score_thr=0.1,
min_bbox_size=0,
nms_pre=100,
max_num=50))
# dataset settings
dataset_type = 'KittiDataset'
data_root = 'data/kitti/'
class_names = ['Car']
input_modality = dict(use_lidar=True, use_camera=False)
db_sampler = dict(
data_root=data_root,
info_path=data_root + 'kitti_dbinfos_train.pkl',
rate=1.0,
prepare=dict(filter_by_difficulty=[-1], filter_by_min_points=dict(Car=5)),
sample_groups=dict(Car=15),
classes=class_names)
train_pipeline = [
dict(type='LoadPointsFromFile', coord_type='LIDAR', load_dim=4, use_dim=4),
dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True),
dict(type='ObjectSample', db_sampler=db_sampler),
dict(
type='ObjectNoise',
num_try=100,
loc_noise_std=[0.25, 0.25, 0.25],
global_rot_range=[0.0, 0.0],
rot_uniform_noise=[-0.15707963267, 0.15707963267]),
dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5),
dict(
type='GlobalRotScale',
rot_uniform_noise=[-0.78539816, 0.78539816],
scaling_uniform_noise=[0.95, 1.05]),
dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
dict(type='PointShuffle'),
dict(type='DefaultFormatBundle3D', class_names=class_names),
dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
]
test_pipeline = [
dict(type='LoadPointsFromFile', coord_type='LIDAR', load_dim=4, use_dim=4),
dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
dict(
type='DefaultFormatBundle3D',
class_names=class_names,
with_label=False),
dict(type='Collect3D', keys=['points'])
]
data = dict(
samples_per_gpu=3,
workers_per_gpu=3,
train=dict(
type='RepeatDataset',
times=2,
dataset=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + 'kitti_infos_train.pkl',
split='training',
pts_prefix='velodyne_reduced',
pipeline=train_pipeline,
modality=input_modality,
classes=class_names,
test_mode=False)),
val=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + 'kitti_infos_val.pkl',
split='training',
pts_prefix='velodyne_reduced',
pipeline=test_pipeline,
modality=input_modality,
classes=class_names,
test_mode=True),
test=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + 'kitti_infos_val.pkl',
split='training',
pts_prefix='velodyne_reduced',
pipeline=test_pipeline,
modality=input_modality,
classes=class_names,
test_mode=True))
# optimizer
lr = 0.001 # initial learning rate; the cyclic policy below raises it to 10x before decay
optimizer = dict(
type='AdamW',
lr=lr,
betas=(0.95, 0.99), # the momentum changes during training
weight_decay=0.01)
optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
# learning policy
lr_config = dict(
policy='cyclic',
target_ratio=(10, 1e-4),
cyclic_times=1,
step_ratio_up=0.4)
momentum_config = dict(
policy='cyclic',
target_ratio=(0.85 / 0.95, 1),
cyclic_times=1,
step_ratio_up=0.4)
checkpoint_config = dict(interval=1)
evaluation = dict(interval=1)
# yapf:disable
log_config = dict(
interval=50,
hooks=[
dict(type='TextLoggerHook'),
dict(type='TensorboardLoggerHook')
])
# yapf:enable
# runtime settings
total_epochs = 50
dist_params = dict(backend='nccl')
log_level = 'INFO'
work_dir = './work_dirs/pp_secfpn_100e'
load_from = None
resume_from = None
workflow = [('train', 50)]
================================================
FILE: configs/benchmark/hv_pointpillars_secfpn_4x8_80e_pcdet_kitti-3d-3class.py
================================================
# model settings
point_cloud_range = [0, -39.68, -3, 69.12, 39.68, 1]
voxel_size = [0.16, 0.16, 4]
model = dict(
type='VoxelNet',
voxel_layer=dict(
max_num_points=32, # max_points_per_voxel
point_cloud_range=point_cloud_range,
voxel_size=voxel_size,
max_voxels=(16000, 40000) # (training, testing) max_voxels
),
voxel_encoder=dict(
type='PillarFeatureNet',
in_channels=4,
feat_channels=[64],
with_distance=False,
voxel_size=voxel_size,
point_cloud_range=point_cloud_range,
),
middle_encoder=dict(
type='PointPillarsScatter',
in_channels=64,
output_shape=[496, 432],
),
backbone=dict(
type='SECOND',
in_channels=64,
layer_nums=[3, 5, 5],
layer_strides=[2, 2, 2],
out_channels=[64, 128, 256],
),
neck=dict(
type='SECONDFPN',
in_channels=[64, 128, 256],
upsample_strides=[1, 2, 4],
out_channels=[128, 128, 128],
),
bbox_head=dict(
type='Anchor3DHead',
num_classes=3,
in_channels=384,
feat_channels=384,
use_direction_classifier=True,
anchor_generator=dict(
type='Anchor3DRangeGenerator',
ranges=[
[0, -40.0, -0.6, 70.4, 40.0, -0.6],
[0, -40.0, -0.6, 70.4, 40.0, -0.6],
[0, -40.0, -1.78, 70.4, 40.0, -1.78],
],
sizes=[[0.6, 0.8, 1.73], [0.6, 1.76, 1.73], [1.6, 3.9, 1.56]],
rotations=[0, 1.57],
reshape_out=False),
diff_rad_by_sin=True,
bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder'),
loss_cls=dict(
type='FocalLoss',
use_sigmoid=True,
gamma=2.0,
alpha=0.25,
loss_weight=1.0),
loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=2.0),
loss_dir=dict(
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.2),
),
# model training and testing settings
train_cfg=dict(
assigner=[
dict( # for Pedestrian
type='MaxIoUAssigner',
iou_calculator=dict(type='BboxOverlapsNearest3D'),
pos_iou_thr=0.5,
neg_iou_thr=0.35,
min_pos_iou=0.35,
ignore_iof_thr=-1),
dict( # for Cyclist
type='MaxIoUAssigner',
iou_calculator=dict(type='BboxOverlapsNearest3D'),
pos_iou_thr=0.5,
neg_iou_thr=0.35,
min_pos_iou=0.35,
ignore_iof_thr=-1),
dict( # for Car
type='MaxIoUAssigner',
iou_calculator=dict(type='BboxOverlapsNearest3D'),
pos_iou_thr=0.6,
neg_iou_thr=0.45,
min_pos_iou=0.45,
ignore_iof_thr=-1),
],
allowed_border=0,
pos_weight=-1,
debug=False),
test_cfg=dict(
use_rotate_nms=True,
nms_across_levels=False,
nms_thr=0.01,
score_thr=0.1,
min_bbox_size=0,
nms_pre=100,
max_num=50))
# dataset settings
dataset_type = 'KittiDataset'
data_root = 'data/kitti/'
class_names = ['Pedestrian', 'Cyclist', 'Car']
input_modality = dict(use_lidar=True, use_camera=False)
db_sampler = dict(
data_root=data_root,
info_path=data_root + 'kitti_dbinfos_train.pkl',
rate=1.0,
prepare=dict(
filter_by_difficulty=[-1],
filter_by_min_points=dict(
Car=5,
Pedestrian=5,
Cyclist=5,
)),
classes=class_names,
sample_groups=dict(
Car=15,
Pedestrian=15,
Cyclist=15,
))
train_pipeline = [
dict(type='LoadPointsFromFile', coord_type='LIDAR', load_dim=4, use_dim=4),
dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True),
dict(type='ObjectSample', db_sampler=db_sampler),
dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5),
dict(
type='GlobalRotScaleTrans',
rot_range=[-0.78539816, 0.78539816],
scale_ratio_range=[0.95, 1.05]),
dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
dict(type='PointShuffle'),
dict(type='DefaultFormatBundle3D', class_names=class_names),
dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']),
]
test_pipeline = [
dict(type='LoadPointsFromFile', coord_type='LIDAR', load_dim=4, use_dim=4),
dict(
type='MultiScaleFlipAug3D',
img_scale=(1333, 800),
pts_scale_ratio=1,
flip=False,
transforms=[
dict(
type='GlobalRotScaleTrans',
rot_range=[0, 0],
scale_ratio_range=[1., 1.],
translation_std=[0, 0, 0]),
dict(type='RandomFlip3D'),
dict(
type='PointsRangeFilter', point_cloud_range=point_cloud_range),
dict(
type='DefaultFormatBundle3D',
class_names=class_names,
with_label=False),
dict(type='Collect3D', keys=['points'])
])
]
data = dict(
samples_per_gpu=4,
workers_per_gpu=4,
train=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + 'kitti_infos_train.pkl',
split='training',
pts_prefix='velodyne_reduced',
pipeline=train_pipeline,
modality=input_modality,
classes=class_names,
test_mode=False),
val=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + 'kitti_infos_val.pkl',
split='training',
pts_prefix='velodyne_reduced',
pipeline=test_pipeline,
modality=input_modality,
classes=class_names,
test_mode=True),
test=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + 'kitti_infos_val.pkl',
split='training',
pts_prefix='velodyne_reduced',
pipeline=test_pipeline,
modality=input_modality,
classes=class_names,
test_mode=True))
# optimizer
lr = 0.0003 # initial learning rate; the cyclic policy below raises it to 10x before decay
optimizer = dict(
type='AdamW',
lr=lr,
betas=(0.95, 0.99), # the momentum changes during training
weight_decay=0.01)
optimizer_config = dict(grad_clip=dict(max_norm=10, norm_type=2))
# learning policy
lr_config = dict(
policy='cyclic',
target_ratio=(10, 1e-4),
cyclic_times=1,
step_ratio_up=0.4)
momentum_config = dict(
policy='cyclic',
target_ratio=(0.85 / 0.95, 1),
cyclic_times=1,
step_ratio_up=0.4)
checkpoint_config = dict(interval=1)
evaluation = dict(interval=2)
# yapf:disable
log_config = dict(
interval=50,
hooks=[
dict(type='TextLoggerHook'),
dict(type='TensorboardLoggerHook')
])
# yapf:enable
# runtime settings
total_epochs = 80
dist_params = dict(backend='nccl')
log_level = 'INFO'
work_dir = './work_dirs/pp_secfpn_80e'
load_from = None
resume_from = None
workflow = [('train', 1)]
================================================
FILE: configs/benchmark/hv_second_secfpn_4x8_80e_pcdet_kitti-3d-3class.py
================================================
# model settings
voxel_size = [0.05, 0.05, 0.1]
point_cloud_range = [0, -40, -3, 70.4, 40, 1]
model = dict(
type='VoxelNet',
voxel_layer=dict(
max_num_points=5,
point_cloud_range=point_cloud_range,
voxel_size=voxel_size,
max_voxels=(16000, 40000)),
voxel_encoder=dict(type='HardSimpleVFE'),
middle_encoder=dict(
type='SparseEncoder',
in_channels=4,
sparse_shape=[41, 1600, 1408],
order=('conv', 'norm', 'act')),
backbone=dict(
type='SECOND',
in_channels=256,
layer_nums=[5, 5],
layer_strides=[1, 2],
out_channels=[128, 256]),
neck=dict(
type='SECONDFPN',
in_channels=[128, 256],
upsample_strides=[1, 2],
out_channels=[256, 256]),
bbox_head=dict(
type='Anchor3DHead',
num_classes=3,
in_channels=512,
feat_channels=512,
use_direction_classifier=True,
anchor_generator=dict(
type='Anchor3DRangeGenerator',
ranges=[
[0, -40.0, -0.6, 70.4, 40.0, -0.6],
[0, -40.0, -0.6, 70.4, 40.0, -0.6],
[0, -40.0, -1.78, 70.4, 40.0, -1.78],
],
sizes=[[0.6, 0.8, 1.73], [0.6, 1.76, 1.73], [1.6, 3.9, 1.56]],
rotations=[0, 1.57],
reshape_out=False),
diff_rad_by_sin=True,
bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder'),
loss_cls=dict(
type='FocalLoss',
use_sigmoid=True,
gamma=2.0,
alpha=0.25,
loss_weight=1.0),
loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=2.0),
loss_dir=dict(
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.2)),
# model training and testing settings
train_cfg=dict(
assigner=[
dict( # for Pedestrian
type='MaxIoUAssigner',
iou_calculator=dict(type='BboxOverlapsNearest3D'),
pos_iou_thr=0.5,
neg_iou_thr=0.35,
min_pos_iou=0.35,
ignore_iof_thr=-1),
dict( # for Cyclist
type='MaxIoUAssigner',
iou_calculator=dict(type='BboxOverlapsNearest3D'),
pos_iou_thr=0.5,
neg_iou_thr=0.35,
min_pos_iou=0.35,
ignore_iof_thr=-1),
dict( # for Car
type='MaxIoUAssigner',
iou_calculator=dict(type='BboxOverlapsNearest3D'),
pos_iou_thr=0.6,
neg_iou_thr=0.45,
min_pos_iou=0.45,
ignore_iof_thr=-1),
],
allowed_border=0,
pos_weight=-1,
debug=False),
test_cfg=dict(
use_rotate_nms=True,
nms_across_levels=False,
nms_thr=0.01,
score_thr=0.1,
min_bbox_size=0,
nms_pre=100,
max_num=50))
# dataset settings
dataset_type = 'KittiDataset'
data_root = 'data/kitti/'
class_names = ['Pedestrian', 'Cyclist', 'Car']
input_modality = dict(use_lidar=True, use_camera=False)
db_sampler = dict(
data_root=data_root,
info_path=data_root + 'kitti_dbinfos_train.pkl',
rate=1.0,
prepare=dict(
filter_by_difficulty=[-1],
filter_by_min_points=dict(
Car=5,
Pedestrian=5,
Cyclist=5,
)),
classes=class_names,
sample_groups=dict(
Car=20,
Pedestrian=15,
Cyclist=15,
))
file_client_args = dict(backend='disk')
# file_client_args = dict(
# backend='petrel', path_mapping=dict(data='s3://kitti_data/'))
train_pipeline = [
dict(
type='LoadPointsFromFile',
coord_type='LIDAR',
load_dim=4,
use_dim=4,
file_client_args=file_client_args),
dict(
type='LoadAnnotations3D',
with_bbox_3d=True,
with_label_3d=True,
file_client_args=file_client_args),
dict(type='ObjectSample', db_sampler=db_sampler),
dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5),
dict(
type='GlobalRotScaleTrans',
rot_range=[-0.78539816, 0.78539816],
scale_ratio_range=[0.95, 1.05]),
dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
dict(type='PointShuffle'),
dict(type='DefaultFormatBundle3D', class_names=class_names),
dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
]
test_pipeline = [
dict(
type='LoadPointsFromFile',
coord_type='LIDAR',
load_dim=4,
use_dim=4,
file_client_args=file_client_args),
dict(
type='MultiScaleFlipAug3D',
img_scale=(1333, 800),
pts_scale_ratio=1,
flip=False,
transforms=[
dict(
type='GlobalRotScaleTrans',
rot_range=[0, 0],
scale_ratio_range=[1., 1.],
translation_std=[0, 0, 0]),
dict(type='RandomFlip3D'),
dict(
type='PointsRangeFilter', point_cloud_range=point_cloud_range),
dict(
type='DefaultFormatBundle3D',
class_names=class_names,
with_label=False),
dict(type='Collect3D', keys=['points'])
])
]
data = dict(
samples_per_gpu=4,
workers_per_gpu=4,
train=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + 'kitti_infos_train.pkl',
split='training',
pts_prefix='velodyne_reduced',
pipeline=train_pipeline,
modality=input_modality,
classes=class_names,
test_mode=False),
val=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + 'kitti_infos_val.pkl',
split='training',
pts_prefix='velodyne_reduced',
pipeline=test_pipeline,
modality=input_modality,
classes=class_names,
test_mode=True),
test=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + 'kitti_infos_val.pkl',
split='training',
pts_prefix='velodyne_reduced',
pipeline=test_pipeline,
modality=input_modality,
classes=class_names,
test_mode=True))
# optimizer
lr = 0.0003 # initial learning rate; the cyclic policy below raises it to 10x before decay
optimizer = dict(type='AdamW', lr=lr, betas=(0.95, 0.99), weight_decay=0.01)
optimizer_config = dict(grad_clip=dict(max_norm=10, norm_type=2))
lr_config = dict(
policy='cyclic',
target_ratio=(10, 1e-4),
cyclic_times=1,
step_ratio_up=0.4)
momentum_config = dict(
policy='cyclic',
target_ratio=(0.85 / 0.95, 1),
cyclic_times=1,
step_ratio_up=0.4)
checkpoint_config = dict(interval=1)
evaluation = dict(interval=2)
# yapf:disable
log_config = dict(
interval=50,
hooks=[
dict(type='TextLoggerHook'),
dict(type='TensorboardLoggerHook')
])
# yapf:enable
# runtime settings
total_epochs = 80
dist_params = dict(backend='nccl')
log_level = 'INFO'
work_dir = './work_dirs/sec_secfpn_80e'
load_from = None
resume_from = None
workflow = [('train', 1)]
================================================
FILE: configs/centerpoint/README.md
================================================
# Center-based 3D Object Detection and Tracking
## Introduction
[ALGORITHM]
We implement CenterPoint and provide results and checkpoints on the nuScenes dataset.
We follow the style below to name config files, and contributors are advised to follow the same style. `{xxx}` is a required field and `[yyy]` is optional (a worked example follows the list).
- `{model}`: model type like `centerpoint`.
- `{model setting}`: voxel size and voxel type like `01voxel`, `02pillar`.
- `{backbone}`: backbone type like `second`.
- `{neck}`: neck type like `secfpn`.
- `[dcn]`: whether to use deformable convolution.
- `[circle]`: whether to use circular NMS.
- `[batch_per_gpu x gpu]`: samples per GPU and number of GPUs; 4x8 is used by default.
- `{schedule}`: training schedule; options are 1x, 2x, 20e, etc. 1x and 2x mean 12 and 24 epochs respectively, and 20e (adopted in cascade models) denotes 20 epochs. For 1x/2x, the initial learning rate decays by a factor of 10 at the 8th/16th and 11th/22nd epochs. For 20e, the initial learning rate decays by a factor of 10 at the 16th and 19th epochs.
- `{dataset}`: dataset like nus-3d, kitti-3d, lyft-3d, scannet-3d, sunrgbd-3d. We also indicate the number of classes when multiple settings exist, e.g., kitti-3d-3class and kitti-3d-car mean training on KITTI with 3 classes and a single class, respectively.
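As a concrete, informal reading of this convention (for illustration only; the breakdown below is not used programmatically anywhere in the codebase), one of the existing config names in this folder decomposes as follows:
```python
# Informal decomposition of an existing config name under the convention above.
name = 'centerpoint_0075voxel_second_secfpn_dcn_circlenms_4x8_cyclic_20e_nus'
# centerpoint -> {model}
# 0075voxel   -> {model setting}: 0.075 m voxels
# second      -> {backbone}
# secfpn      -> {neck}
# dcn         -> [dcn]: deformable convolution is used
# circlenms   -> [circle]: circular NMS is used
# 4x8         -> [batch_per_gpu x gpu]
# cyclic_20e  -> {schedule}
# nus         -> {dataset}: nuScenes
```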
```
@article{yin2021center,
title={Center-based 3D Object Detection and Tracking},
author={Yin, Tianwei and Zhou, Xingyi and Kr{\"a}henb{\"u}hl, Philipp},
journal={CVPR},
year={2021},
}
```
## Usage
### Test time augmentation
We support double-flip and scale augmentation at test time. To use test-time augmentation, modify the
`test_pipeline` and `test_cfg` in the config.
For example, we change `centerpoint_0075voxel_second_secfpn_circlenms_4x8_cyclic_20e_nus.py` to the following.
```python
_base_ = './centerpoint_0075voxel_second_secfpn_circlenms' \
'_4x8_cyclic_20e_nus.py'
model = dict(
test_cfg=dict(
pts=dict(
use_rotate_nms=True,
max_num=83)))
point_cloud_range = [-54, -54, -5.0, 54, 54, 3.0]
file_client_args = dict(backend='disk')
class_names = [
'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
]
test_pipeline = [
dict(
type='LoadPointsFromFile',
load_dim=5,
use_dim=5,
file_client_args=file_client_args),
dict(
type='LoadPointsFromMultiSweeps',
sweeps_num=9,
use_dim=[0, 1, 2, 3, 4],
file_client_args=file_client_args,
pad_empty_sweeps=True,
remove_close=True),
dict(
type='MultiScaleFlipAug3D',
img_scale=(1333, 800),
pts_scale_ratio=[0.95, 1.0, 1.05],
flip=True,
pcd_horizontal_flip=True,
pcd_vertical_flip=True,
transforms=[
dict(
type='GlobalRotScaleTrans',
rot_range=[0, 0],
scale_ratio_range=[1., 1.],
translation_std=[0, 0, 0]),
dict(type='RandomFlip3D', sync_2d=False),
dict(
type='PointsRangeFilter', point_cloud_range=point_cloud_range),
dict(
type='DefaultFormatBundle3D',
class_names=class_names,
with_label=False),
dict(type='Collect3D', keys=['points'])
])
]
data = dict(
val=dict(pipeline=test_pipeline), test=dict(pipeline=test_pipeline))
```
## Results
### CenterPoint
|Backbone| Voxel type (voxel size) |DCN|Circular NMS| Mem (GB) | Inf time (fps) | mAP |NDS| Download |
| :---------: |:-----: |:-----: | :------: | :------------: | :----: |:----: | :------: |:------: |
|[SECFPN](./centerpoint_01voxel_second_secfpn_circlenms_4x8_cyclic_20e_nus.py)|voxel (0.1)|✗|✓|4.9| |56.19|64.43|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/centerpoint/centerpoint_01voxel_second_secfpn_circlenms_4x8_cyclic_20e_nus/centerpoint_01voxel_second_secfpn_circlenms_4x8_cyclic_20e_nus_20201001_135205-5db91e00.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/centerpoint/centerpoint_01voxel_second_secfpn_circlenms_4x8_cyclic_20e_nus/centerpoint_01voxel_second_secfpn_circlenms_4x8_cyclic_20e_nus_20201001_135205.log.json)|
|above w/o circle nms|voxel (0.1)|✗|✗| | |56.56|64.46||
|[SECFPN](./centerpoint_01voxel_second_secfpn_dcn_circlenms_4x8_cyclic_20e_nus.py)|voxel (0.1)|✓|✓|5.2| |56.34|64.81|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/centerpoint/centerpoint_01voxel_second_secfpn_dcn_circlenms_4x8_cyclic_20e_nus/centerpoint_01voxel_second_secfpn_dcn_circlenms_4x8_cyclic_20e_nus_20201004_075317-26d8176c.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/centerpoint/centerpoint_01voxel_second_secfpn_dcn_circlenms_4x8_cyclic_20e_nus/centerpoint_01voxel_second_secfpn_dcn_circlenms_4x8_cyclic_20e_nus_20201004_075317.log.json)|
|above w/o circle nms|voxel (0.1)|✓|✗| | |56.60|64.90||
|[SECFPN](./centerpoint_0075voxel_second_secfpn_circlenms_4x8_cyclic_20e_nus.py)|voxel (0.075)|✗|✓|7.8| |57.34|65.23|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/centerpoint/centerpoint_0075voxel_second_secfpn_circlenms_4x8_cyclic_20e_nus/centerpoint_0075voxel_second_secfpn_circlenms_4x8_cyclic_20e_nus_20200925_230905-358fbe3b.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/centerpoint/centerpoint_0075voxel_second_secfpn_circlenms_4x8_cyclic_20e_nus/centerpoint_0075voxel_second_secfpn_circlenms_4x8_cyclic_20e_nus_20200925_230905.log.json)|
|above w/o circle nms|voxel (0.075)|✗|✗| | |57.63|65.39| |
|[SECFPN](./centerpoint_0075voxel_second_secfpn_dcn_circlenms_4x8_cyclic_20e_nus.py)|voxel (0.075)|✓|✓|8.5| |57.27|65.58|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/centerpoint/centerpoint_0075voxel_second_secfpn_dcn_circlenms_4x8_cyclic_20e_nus/centerpoint_0075voxel_second_secfpn_dcn_circlenms_4x8_cyclic_20e_nus_20200930_201619-67c8496f.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/centerpoint/centerpoint_0075voxel_second_secfpn_dcn_circlenms_4x8_cyclic_20e_nus/centerpoint_0075voxel_second_secfpn_dcn_circlenms_4x8_cyclic_20e_nus_20200930_201619.log.json)|
|above w/o circle nms|voxel (0.075)|✓|✗| | |57.43|65.63||
|above w/ double flip|voxel (0.075)|✓|✗| | |59.73|67.39||
|above w/ scale tta|voxel (0.075)|✓|✗| | |60.43|67.65||
|above w/ circle nms w/o scale tta|voxel (0.075)|✓|✗| | |59.52|67.24||
|[SECFPN](./centerpoint_02pillar_second_secfpn_circlenms_4x8_cyclic_20e_nus.py)|pillar (0.2)|✗|✓|4.4| |49.07|59.66|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/centerpoint/centerpoint_01voxel_second_secfpn_circlenms_4x8_cyclic_20e_nus/centerpoint_02pillar_second_secfpn_circlenms_4x8_cyclic_20e_nus_20201004_170716-a134a233.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/centerpoint/centerpoint_01voxel_second_secfpn_circlenms_4x8_cyclic_20e_nus/centerpoint_02pillar_second_secfpn_circlenms_4x8_cyclic_20e_nus_20201004_170716.log.json)|
|above w/o circle nms|pillar (0.2)|✗|✗| | |49.12|59.66||
|[SECFPN](./centerpoint_02pillar_second_secfpn_dcn_4x8_cyclic_20e_nus.py)|pillar (0.2)|✓|✗| 4.6| |48.8 |59.67 |[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/centerpoint/centerpoint_02pillar_second_secfpn_dcn_4x8_cyclic_20e_nus/centerpoint_02pillar_second_secfpn_dcn_4x8_cyclic_20e_nus_20200930_103722-3bb135f2.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/centerpoint/centerpoint_02pillar_second_secfpn_dcn_4x8_cyclic_20e_nus/centerpoint_02pillar_second_secfpn_dcn_4x8_cyclic_20e_nus_20200930_103722.log.json)|
|above w/ circle nms|pillar (0.2)|✓|✓| | |48.79|59.65||
================================================
FILE: configs/centerpoint/centerpoint_0075voxel_second_secfpn_4x8_cyclic_20e_nus.py
================================================
_base_ = ['./centerpoint_01voxel_second_secfpn_4x8_cyclic_20e_nus.py']
# If point cloud range is changed, the models should also change their point
# cloud range accordingly
voxel_size = [0.075, 0.075, 0.2]
point_cloud_range = [-54, -54, -5.0, 54, 54, 3.0]
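# As an informal sanity check (assuming the usual convention of one extra slice
# along z): x and y give (54 + 54) / 0.075 = 1440 and z gives (3 - (-5)) / 0.2 = 40,
# so sparse_shape becomes [41, 1440, 1440] and grid_size [1440, 1440, 40], matching
# the overrides below; adjust them together whenever the range or voxel size changes.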
# For nuScenes we usually do 10-class detection
class_names = [
'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
]
model = dict(
pts_voxel_layer=dict(
voxel_size=voxel_size, point_cloud_range=point_cloud_range),
pts_middle_encoder=dict(sparse_shape=[41, 1440, 1440]),
pts_bbox_head=dict(
bbox_coder=dict(
voxel_size=voxel_size[:2], pc_range=point_cloud_range[:2])),
train_cfg=dict(
pts=dict(
grid_size=[1440, 1440, 40],
voxel_size=voxel_size,
point_cloud_range=point_cloud_range)),
test_cfg=dict(
pts=dict(voxel_size=voxel_size[:2], pc_range=point_cloud_range[:2])))
dataset_type = 'NuScenesDataset'
data_root = 'data/nuscenes/'
file_client_args = dict(backend='disk')
db_sampler = dict(
data_root=data_root,
info_path=data_root + 'nuscenes_dbinfos_train.pkl',
rate=1.0,
prepare=dict(
filter_by_difficulty=[-1],
filter_by_min_points=dict(
car=5,
truck=5,
bus=5,
trailer=5,
construction_vehicle=5,
traffic_cone=5,
barrier=5,
motorcycle=5,
bicycle=5,
pedestrian=5)),
classes=class_names,
sample_groups=dict(
car=2,
truck=3,
construction_vehicle=7,
bus=4,
trailer=6,
barrier=2,
motorcycle=6,
bicycle=6,
pedestrian=2,
traffic_cone=2),
points_loader=dict(
type='LoadPointsFromFile',
coord_type='LIDAR',
load_dim=5,
use_dim=[0, 1, 2, 3, 4],
file_client_args=file_client_args))
train_pipeline = [
dict(
type='LoadPointsFromFile',
coord_type='LIDAR',
load_dim=5,
use_dim=5,
file_client_args=file_client_args),
dict(
type='LoadPointsFromMultiSweeps',
sweeps_num=9,
use_dim=[0, 1, 2, 3, 4],
file_client_args=file_client_args,
pad_empty_sweeps=True,
remove_close=True),
dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True),
dict(type='ObjectSample', db_sampler=db_sampler),
dict(
type='GlobalRotScaleTrans',
rot_range=[-0.3925, 0.3925],
scale_ratio_range=[0.95, 1.05],
translation_std=[0, 0, 0]),
dict(
type='RandomFlip3D',
sync_2d=False,
flip_ratio_bev_horizontal=0.5,
flip_ratio_bev_vertical=0.5),
dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
dict(type='ObjectNameFilter', classes=class_names),
dict(type='PointShuffle'),
dict(type='DefaultFormatBundle3D', class_names=class_names),
dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
]
test_pipeline = [
dict(
type='LoadPointsFromFile',
coord_type='LIDAR',
load_dim=5,
use_dim=5,
file_client_args=file_client_args),
dict(
type='LoadPointsFromMultiSweeps',
sweeps_num=9,
use_dim=[0, 1, 2, 3, 4],
file_client_args=file_client_args,
pad_empty_sweeps=True,
remove_close=True),
dict(
type='MultiScaleFlipAug3D',
img_scale=(1333, 800),
pts_scale_ratio=1,
flip=False,
transforms=[
dict(
type='GlobalRotScaleTrans',
rot_range=[0, 0],
scale_ratio_range=[1., 1.],
translation_std=[0, 0, 0]),
dict(type='RandomFlip3D'),
dict(
type='PointsRangeFilter', point_cloud_range=point_cloud_range),
dict(
type='DefaultFormatBundle3D',
class_names=class_names,
with_label=False),
dict(type='Collect3D', keys=['points'])
])
]
data = dict(
train=dict(dataset=dict(pipeline=train_pipeline)),
val=dict(pipeline=test_pipeline),
test=dict(pipeline=test_pipeline))
================================================
FILE: configs/centerpoint/centerpoint_0075voxel_second_secfpn_circlenms_4x8_cyclic_20e_nus.py
================================================
_base_ = ['./centerpoint_0075voxel_second_secfpn_4x8_cyclic_20e_nus.py']
model = dict(test_cfg=dict(pts=dict(nms_type='circle')))
================================================
FILE: configs/centerpoint/centerpoint_0075voxel_second_secfpn_dcn_4x8_cyclic_20e_nus.py
================================================
_base_ = ['./centerpoint_0075voxel_second_secfpn_4x8_cyclic_20e_nus.py']
model = dict(
pts_bbox_head=dict(
separate_head=dict(
type='DCNSeparateHead',
dcn_config=dict(
type='DCN',
in_channels=64,
out_channels=64,
kernel_size=3,
padding=1,
groups=4),
init_bias=-2.19,
final_kernel=3)))
================================================
FILE: configs/centerpoint/centerpoint_0075voxel_second_secfpn_dcn_4x8_cyclic_flip-tta_20e_nus.py
================================================
_base_ = './centerpoint_0075voxel_second_secfpn_dcn_4x8_cyclic_20e_nus.py'
point_cloud_range = [-54, -54, -5.0, 54, 54, 3.0]
file_client_args = dict(backend='disk')
class_names = [
'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
]
test_pipeline = [
dict(
type='LoadPointsFromFile',
coord_type='LIDAR',
load_dim=5,
use_dim=5,
file_client_args=file_client_args),
dict(
type='LoadPointsFromMultiSweeps',
sweeps_num=9,
use_dim=[0, 1, 2, 3, 4],
file_client_args=file_client_args,
pad_empty_sweeps=True,
remove_close=True),
dict(
type='MultiScaleFlipAug3D',
img_scale=(1333, 800),
pts_scale_ratio=1,
# Add double-flip augmentation
flip=True,
pcd_horizontal_flip=True,
pcd_vertical_flip=True,
transforms=[
dict(
type='GlobalRotScaleTrans',
rot_range=[0, 0],
scale_ratio_range=[1., 1.],
translation_std=[0, 0, 0]),
dict(type='RandomFlip3D', sync_2d=False),
dict(
type='PointsRangeFilter', point_cloud_range=point_cloud_range),
dict(
type='DefaultFormatBundle3D',
class_names=class_names,
with_label=False),
dict(type='Collect3D', keys=['points'])
])
]
data = dict(
val=dict(pipeline=test_pipeline), test=dict(pipeline=test_pipeline))
================================================
FILE: configs/centerpoint/centerpoint_0075voxel_second_secfpn_dcn_4x8_cyclic_tta_20e_nus.py
================================================
_base_ = './centerpoint_0075voxel_second_secfpn_dcn_4x8_cyclic_20e_nus.py'
test_cfg = dict(pts=dict(use_rotate_nms=True, max_num=500))
point_cloud_range = [-54, -54, -5.0, 54, 54, 3.0]
file_client_args = dict(backend='disk')
class_names = [
'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
]
test_pipeline = [
dict(
type='LoadPointsFromFile',
coord_type='LIDAR',
load_dim=5,
use_dim=5,
file_client_args=file_client_args),
dict(
type='LoadPointsFromMultiSweeps',
sweeps_num=9,
use_dim=[0, 1, 2, 3, 4],
file_client_args=file_client_args,
pad_empty_sweeps=True,
remove_close=True),
dict(
type='MultiScaleFlipAug3D',
img_scale=(1333, 800),
pts_scale_ratio=[0.95, 1.0, 1.05],
# Add double-flip augmentation
flip=True,
pcd_horizontal_flip=True,
pcd_vertical_flip=True,
transforms=[
dict(
type='GlobalRotScaleTrans',
rot_range=[0, 0],
scale_ratio_range=[1., 1.],
translation_std=[0, 0, 0]),
dict(type='RandomFlip3D', sync_2d=False),
dict(
type='PointsRangeFilter', point_cloud_range=point_cloud_range),
dict(
type='DefaultFormatBundle3D',
class_names=class_names,
with_label=False),
dict(type='Collect3D', keys=['points'])
])
]
data = dict(
val=dict(pipeline=test_pipeline), test=dict(pipeline=test_pipeline))
================================================
FILE: configs/centerpoint/centerpoint_0075voxel_second_secfpn_dcn_circlenms_4x8_cyclic_20e_nus.py
================================================
_base_ = ['./centerpoint_0075voxel_second_secfpn_4x8_cyclic_20e_nus.py']
model = dict(
pts_bbox_head=dict(
separate_head=dict(
type='DCNSeparateHead',
dcn_config=dict(
type='DCN',
in_channels=64,
out_channels=64,
kernel_size=3,
padding=1,
groups=4),
init_bias=-2.19,
final_kernel=3)),
test_cfg=dict(pts=dict(nms_type='circle')))
================================================
FILE: configs/centerpoint/centerpoint_0075voxel_second_secfpn_dcn_circlenms_4x8_cyclic_flip-tta_20e_nus.py
================================================
_base_ = './centerpoint_0075voxel_second_secfpn_dcn_' \
'circlenms_4x8_cyclic_20e_nus.py'
point_cloud_range = [-54, -54, -5.0, 54, 54, 3.0]
file_client_args = dict(backend='disk')
class_names = [
'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
]
test_pipeline = [
dict(
type='LoadPointsFromFile',
coord_type='LIDAR',
load_dim=5,
use_dim=5,
file_client_args=file_client_args),
dict(
type='LoadPointsFromMultiSweeps',
sweeps_num=9,
use_dim=[0, 1, 2, 3, 4],
file_client_args=file_client_args,
pad_empty_sweeps=True,
remove_close=True),
dict(
type='MultiScaleFlipAug3D',
img_scale=(1333, 800),
pts_scale_ratio=1,
# Add double-flip augmentation
flip=True,
pcd_horizontal_flip=True,
pcd_vertical_flip=True,
transforms=[
dict(
type='GlobalRotScaleTrans',
rot_range=[0, 0],
scale_ratio_range=[1., 1.],
translation_std=[0, 0, 0]),
dict(type='RandomFlip3D', sync_2d=False),
dict(
type='PointsRangeFilter', point_cloud_range=point_cloud_range),
dict(
type='DefaultFormatBundle3D',
class_names=class_names,
with_label=False),
dict(type='Collect3D', keys=['points'])
])
]
data = dict(
val=dict(pipeline=test_pipeline), test=dict(pipeline=test_pipeline))
================================================
FILE: configs/centerpoint/centerpoint_01voxel_second_secfpn_4x8_cyclic_20e_nus.py
================================================
_base_ = [
'../_base_/datasets/nus-3d.py',
'../_base_/models/centerpoint_01voxel_second_secfpn_nus.py',
'../_base_/schedules/cyclic_20e.py', '../_base_/default_runtime.py'
]
# If point cloud range is changed, the models should also change their point
# cloud range accordingly
point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
# For nuScenes we usually do 10-class detection
class_names = [
'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
]
model = dict(
pts_voxel_layer=dict(point_cloud_range=point_cloud_range),
pts_bbox_head=dict(bbox_coder=dict(pc_range=point_cloud_range[:2])),
# model training and testing settings
train_cfg=dict(pts=dict(point_cloud_range=point_cloud_range)),
test_cfg=dict(pts=dict(pc_range=point_cloud_range[:2])))
dataset_type = 'NuScenesDataset'
data_root = 'data/nuscenes/'
file_client_args = dict(backend='disk')
db_sampler = dict(
data_root=data_root,
info_path=data_root + 'nuscenes_dbinfos_train.pkl',
rate=1.0,
prepare=dict(
filter_by_difficulty=[-1],
filter_by_min_points=dict(
car=5,
truck=5,
bus=5,
trailer=5,
construction_vehicle=5,
traffic_cone=5,
barrier=5,
motorcycle=5,
bicycle=5,
pedestrian=5)),
classes=class_names,
sample_groups=dict(
car=2,
truck=3,
construction_vehicle=7,
bus=4,
trailer=6,
barrier=2,
motorcycle=6,
bicycle=6,
pedestrian=2,
traffic_cone=2),
points_loader=dict(
type='LoadPointsFromFile',
coord_type='LIDAR',
load_dim=5,
use_dim=[0, 1, 2, 3, 4],
file_client_args=file_client_args))
train_pipeline = [
dict(
type='LoadPointsFromFile',
coord_type='LIDAR',
load_dim=5,
use_dim=5,
file_client_args=file_client_args),
dict(
type='LoadPointsFromMultiSweeps',
sweeps_num=9,
use_dim=[0, 1, 2, 3, 4],
file_client_args=file_client_args,
pad_empty_sweeps=True,
remove_close=True),
dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True),
dict(type='ObjectSample', db_sampler=db_sampler),
dict(
type='GlobalRotScaleTrans',
rot_range=[-0.3925, 0.3925],
scale_ratio_range=[0.95, 1.05],
translation_std=[0, 0, 0]),
dict(
type='RandomFlip3D',
sync_2d=False,
flip_ratio_bev_horizontal=0.5,
flip_ratio_bev_vertical=0.5),
dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
dict(type='ObjectNameFilter', classes=class_names),
dict(type='PointShuffle'),
dict(type='DefaultFormatBundle3D', class_names=class_names),
dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
]
test_pipeline = [
dict(
type='LoadPointsFromFile',
coord_type='LIDAR',
load_dim=5,
use_dim=5,
file_client_args=file_client_args),
dict(
type='LoadPointsFromMultiSweeps',
sweeps_num=9,
use_dim=[0, 1, 2, 3, 4],
file_client_args=file_client_args,
pad_empty_sweeps=True,
remove_close=True),
dict(
type='MultiScaleFlipAug3D',
img_scale=(1333, 800),
pts_scale_ratio=1,
flip=False,
transforms=[
dict(
type='GlobalRotScaleTrans',
rot_range=[0, 0],
scale_ratio_range=[1., 1.],
translation_std=[0, 0, 0]),
dict(type='RandomFlip3D'),
dict(
type='PointsRangeFilter', point_cloud_range=point_cloud_range),
dict(
type='DefaultFormatBundle3D',
class_names=class_names,
with_label=False),
dict(type='Collect3D', keys=['points'])
])
]
data = dict(
train=dict(
type='CBGSDataset',
dataset=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + 'nuscenes_infos_train.pkl',
pipeline=train_pipeline,
classes=class_names,
test_mode=False,
use_valid_flag=True,
# we use box_type_3d='LiDAR' in kitti and nuscenes dataset
# and box_type_3d='Depth' in sunrgbd and scannet dataset.
box_type_3d='LiDAR')),
val=dict(pipeline=test_pipeline, classes=class_names),
test=dict(pipeline=test_pipeline, classes=class_names))
evaluation = dict(interval=20)
================================================
FILE: configs/centerpoint/centerpoint_01voxel_second_secfpn_circlenms_4x8_cyclic_20e_nus.py
================================================
_base_ = ['./centerpoint_01voxel_second_secfpn_4x8_cyclic_20e_nus.py']
model = dict(test_cfg=dict(pts=dict(nms_type='circle')))
================================================
FILE: configs/centerpoint/centerpoint_01voxel_second_secfpn_dcn_4x8_cyclic_20e_nus.py
================================================
_base_ = ['./centerpoint_01voxel_second_secfpn_4x8_cyclic_20e_nus.py']
model = dict(
pts_bbox_head=dict(
separate_head=dict(
type='DCNSeparateHead',
dcn_config=dict(
type='DCN',
in_channels=64,
out_channels=64,
kernel_size=3,
padding=1,
groups=4),
init_bias=-2.19,
final_kernel=3)))
================================================
FILE: configs/centerpoint/centerpoint_01voxel_second_secfpn_dcn_circlenms_4x8_cyclic_20e_nus.py
================================================
_base_ = ['./centerpoint_01voxel_second_secfpn_4x8_cyclic_20e_nus.py']
model = dict(
pts_bbox_head=dict(
separate_head=dict(
type='DCNSeparateHead',
dcn_config=dict(
type='DCN',
in_channels=64,
out_channels=64,
kernel_size=3,
padding=1,
groups=4),
init_bias=-2.19,
final_kernel=3)),
test_cfg=dict(pts=dict(nms_type='circle')))
================================================
FILE: configs/centerpoint/centerpoint_02pillar_second_secfpn_4x8_cyclic_20e_nus.py
================================================
_base_ = [
'../_base_/datasets/nus-3d.py',
'../_base_/models/centerpoint_02pillar_second_secfpn_nus.py',
'../_base_/schedules/cyclic_20e.py', '../_base_/default_runtime.py'
]
# If point cloud range is changed, the models should also change their point
# cloud range accordingly
point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
# For nuScenes we usually do 10-class detection
class_names = [
'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
]
model = dict(
pts_voxel_layer=dict(point_cloud_range=point_cloud_range),
pts_voxel_encoder=dict(point_cloud_range=point_cloud_range),
pts_bbox_head=dict(bbox_coder=dict(pc_range=point_cloud_range[:2])),
# model training and testing settings
train_cfg=dict(pts=dict(point_cloud_range=point_cloud_range)),
test_cfg=dict(pts=dict(pc_range=point_cloud_range[:2])))
dataset_type = 'NuScenesDataset'
data_root = 'data/nuscenes/'
file_client_args = dict(backend='disk')
db_sampler = dict(
data_root=data_root,
info_path=data_root + 'nuscenes_dbinfos_train.pkl',
rate=1.0,
prepare=dict(
filter_by_difficulty=[-1],
filter_by_min_points=dict(
car=5,
truck=5,
bus=5,
trailer=5,
construction_vehicle=5,
traffic_cone=5,
barrier=5,
motorcycle=5,
bicycle=5,
pedestrian=5)),
classes=class_names,
sample_groups=dict(
car=2,
truck=3,
construction_vehicle=7,
bus=4,
trailer=6,
barrier=2,
motorcycle=6,
bicycle=6,
pedestrian=2,
traffic_cone=2),
points_loader=dict(
type='LoadPointsFromFile',
coord_type='LIDAR',
load_dim=5,
use_dim=[0, 1, 2, 3, 4],
file_client_args=file_client_args))
train_pipeline = [
dict(
type='LoadPointsFromFile',
coord_type='LIDAR',
load_dim=5,
use_dim=5,
file_client_args=file_client_args),
dict(
type='LoadPointsFromMultiSweeps',
sweeps_num=9,
use_dim=[0, 1, 2, 3, 4],
file_client_args=file_client_args,
pad_empty_sweeps=True,
remove_close=True),
dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True),
dict(type='ObjectSample', db_sampler=db_sampler),
dict(
type='GlobalRotScaleTrans',
rot_range=[-0.3925, 0.3925],
scale_ratio_range=[0.95, 1.05],
translation_std=[0, 0, 0]),
dict(
type='RandomFlip3D',
sync_2d=False,
flip_ratio_bev_horizontal=0.5,
flip_ratio_bev_vertical=0.5),
dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
dict(type='ObjectNameFilter', classes=class_names),
dict(type='PointShuffle'),
dict(type='DefaultFormatBundle3D', class_names=class_names),
dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
]
test_pipeline = [
dict(
type='LoadPointsFromFile',
coord_type='LIDAR',
load_dim=5,
use_dim=5,
file_client_args=file_client_args),
dict(
type='LoadPointsFromMultiSweeps',
sweeps_num=9,
use_dim=[0, 1, 2, 3, 4],
file_client_args=file_client_args,
pad_empty_sweeps=True,
remove_close=True),
dict(
type='MultiScaleFlipAug3D',
img_scale=(1333, 800),
pts_scale_ratio=1,
flip=False,
transforms=[
dict(
type='GlobalRotScaleTrans',
rot_range=[0, 0],
scale_ratio_range=[1., 1.],
translation_std=[0, 0, 0]),
dict(type='RandomFlip3D'),
dict(
type='DefaultFormatBundle3D',
class_names=class_names,
with_label=False),
dict(type='Collect3D', keys=['points'])
])
]
data = dict(
train=dict(
type='CBGSDataset',
dataset=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + 'nuscenes_infos_train.pkl',
pipeline=train_pipeline,
classes=class_names,
test_mode=False,
use_valid_flag=True,
# we use box_type_3d='LiDAR' in kitti and nuscenes dataset
# and box_type_3d='Depth' in sunrgbd and scannet dataset.
box_type_3d='LiDAR')),
val=dict(pipeline=test_pipeline, classes=class_names),
test=dict(pipeline=test_pipeline, classes=class_names))
evaluation = dict(interval=20)
================================================
FILE: configs/centerpoint/centerpoint_02pillar_second_secfpn_circlenms_4x8_cyclic_20e_nus.py
================================================
_base_ = ['./centerpoint_02pillar_second_secfpn_4x8_cyclic_20e_nus.py']
model = dict(test_cfg=dict(pts=dict(nms_type='circle')))
================================================
FILE: configs/centerpoint/centerpoint_02pillar_second_secfpn_dcn_4x8_cyclic_20e_nus.py
================================================
_base_ = ['./centerpoint_02pillar_second_secfpn_4x8_cyclic_20e_nus.py']
model = dict(
pts_bbox_head=dict(
separate_head=dict(
type='DCNSeparateHead',
dcn_config=dict(
type='DCN',
in_channels=64,
out_channels=64,
kernel_size=3,
padding=1,
groups=4),
init_bias=-2.19,
final_kernel=3)))
================================================
FILE: configs/centerpoint/centerpoint_02pillar_second_secfpn_dcn_circlenms_4x8_cyclic_20e_nus.py
================================================
_base_ = ['./centerpoint_02pillar_second_secfpn_4x8_cyclic_20e_nus.py']
model = dict(
pts_bbox_head=dict(
separate_head=dict(
type='DCNSeparateHead',
dcn_config=dict(
type='DCN',
in_channels=64,
out_channels=64,
kernel_size=3,
padding=1,
groups=4),
init_bias=-2.19,
final_kernel=3)),
test_cfg=dict(pts=dict(nms_type='circle')))
================================================
FILE: configs/dynamic_voxelization/README.md
================================================
# Dynamic Voxelization
## Introduction
[ALGORITHM]
We implement Dynamic Voxelization, proposed in the paper below, and provide its results and models on the KITTI dataset.
```
@article{zhou2019endtoend,
title={End-to-End Multi-View Fusion for 3D Object Detection in LiDAR Point Clouds},
author={Yin Zhou and Pei Sun and Yu Zhang and Dragomir Anguelov and Jiyang Gao and Tom Ouyang and James Guo and Jiquan Ngiam and Vijay Vasudevan},
year={2019},
eprint={1910.06528},
archivePrefix={arXiv},
primaryClass={cs.CV}
}
```
## Results
### KITTI
| Model |Class| Lr schd | Mem (GB) | Inf time (fps) | mAP | Download |
| :---------: | :-----: |:-----: | :------: | :------------: | :----: | :------: |
|[SECOND](./dv_second_secfpn_6x8_80e_kitti-3d-car.py)|Car |cyclic 80e|5.5||78.83|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/dynamic_voxelization/dv_second_secfpn_6x8_80e_kitti-3d-car/dv_second_secfpn_6x8_80e_kitti-3d-car_20200620_235228-ac2c1c0c.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/dynamic_voxelization/dv_second_secfpn_6x8_80e_kitti-3d-car/dv_second_secfpn_6x8_80e_kitti-3d-car_20200620_235228.log.json)|
|[SECOND](./dv_second_secfpn_2x8_cosine_80e_kitti-3d-3class.py)| 3 Class|cosine 80e|5.5||65.10|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/dynamic_voxelization/dv_second_secfpn_2x8_cosine_80e_kitti-3d-3class/dv_second_secfpn_2x8_cosine_80e_kitti-3d-3class_20200620_231010-6aa607d3.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/dynamic_voxelization/dv_second_secfpn_2x8_cosine_80e_kitti-3d-3class/dv_second_secfpn_2x8_cosine_80e_kitti-3d-3class_20200620_231010.log.json)|
|[PointPillars](./dv_pointpillars_secfpn_6x8_160e_kitti-3d-car.py)| Car|cyclic 80e|4.7||77.76|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/dynamic_voxelization/dv_pointpillars_secfpn_6x8_160e_kitti-3d-car/dv_pointpillars_secfpn_6x8_160e_kitti-3d-car_20200620_230844-ee7b75c9.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/dynamic_voxelization/dv_pointpillars_secfpn_6x8_160e_kitti-3d-car/dv_pointpillars_secfpn_6x8_160e_kitti-3d-car_20200620_230844.log.json)|
================================================
FILE: configs/dynamic_voxelization/dv_pointpillars_secfpn_6x8_160e_kitti-3d-car.py
================================================
_base_ = '../pointpillars/hv_pointpillars_secfpn_6x8_160e_kitti-3d-car.py'
voxel_size = [0.16, 0.16, 4]
point_cloud_range = [0, -39.68, -3, 69.12, 39.68, 1]
model = dict(
type='DynamicVoxelNet',
voxel_layer=dict(
max_num_points=-1,
point_cloud_range=point_cloud_range,
voxel_size=voxel_size,
max_voxels=(-1, -1)),
voxel_encoder=dict(
type='DynamicPillarFeatureNet',
in_channels=4,
feat_channels=[64],
with_distance=False,
voxel_size=voxel_size,
point_cloud_range=point_cloud_range))
================================================
FILE: configs/dynamic_voxelization/dv_second_secfpn_2x8_cosine_80e_kitti-3d-3class.py
================================================
_base_ = '../second/hv_second_secfpn_6x8_80e_kitti-3d-3class.py'
point_cloud_range = [0, -40, -3, 70.4, 40, 1]
voxel_size = [0.05, 0.05, 0.1]
model = dict(
type='DynamicVoxelNet',
voxel_layer=dict(
_delete_=True,
max_num_points=-1,
point_cloud_range=point_cloud_range,
voxel_size=voxel_size,
max_voxels=(-1, -1)),
voxel_encoder=dict(
_delete_=True,
type='DynamicSimpleVFE',
voxel_size=voxel_size,
point_cloud_range=point_cloud_range))
# optimizer
lr = 0.003 # max learning rate
optimizer = dict(
_delete_=True,
type='AdamW',
lr=lr,
betas=(0.95, 0.99),  # the momentum changes during training
weight_decay=0.001)
lr_config = dict(
_delete_=True,
policy='CosineAnnealing',
warmup='linear',
warmup_iters=1000,
warmup_ratio=1.0 / 10,
min_lr_ratio=1e-5)
momentum_config = None
================================================
FILE: configs/dynamic_voxelization/dv_second_secfpn_6x8_80e_kitti-3d-car.py
================================================
_base_ = '../second/hv_second_secfpn_6x8_80e_kitti-3d-car.py'
point_cloud_range = [0, -40, -3, 70.4, 40, 1]
voxel_size = [0.05, 0.05, 0.1]
model = dict(
type='DynamicVoxelNet',
voxel_layer=dict(
_delete_=True,
max_num_points=-1,
point_cloud_range=point_cloud_range,
voxel_size=voxel_size,
max_voxels=(-1, -1)),
voxel_encoder=dict(
_delete_=True,
type='DynamicSimpleVFE',
voxel_size=voxel_size,
point_cloud_range=point_cloud_range))
================================================
FILE: configs/fp16/README.md
================================================
# Mixed Precision Training
## Introduction
[OTHERS]
We implement mixed precision training and apply it to VoxelNets (e.g., SECOND and PointPillars).
The results are in the following tables.
**Note**: For mixed precision training, we currently do not support PointNet-based methods (e.g., VoteNet).
Mixed precision training for PointNet-based methods will be supported in a future release.
## Results
### SECOND on KITTI dataset
| Backbone |Class| Lr schd | FP32 Mem (GB) | FP16 Mem (GB) | FP32 mAP | FP16 mAP |Download |
| :---------: | :-----: | :------: | :------------: | :----: |:----: | :------: | :------: |
| [SECFPN](./hv_second_secfpn_fp16_6x8_80e_kitti-3d-car.py)| Car |cyclic 80e|5.4|2.9|79.07|78.72|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/fp16/hv_second_secfpn_fp16_6x8_80e_kitti-3d-car/hv_second_secfpn_fp16_6x8_80e_kitti-3d-car_20200924_211301-1f5ad833.pth)| [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/fp16/hv_second_secfpn_fp16_6x8_80e_kitti-3d-car/hv_second_secfpn_fp16_6x8_80e_kitti-3d-car_20200924_211301.log.json)|
| [SECFPN](./hv_second_secfpn_fp16_6x8_80e_kitti-3d-3class.py)| 3 Class |cyclic 80e|5.4|2.9|64.41|67.4|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/fp16/hv_second_secfpn_fp16_6x8_80e_kitti-3d-3class/hv_second_secfpn_fp16_6x8_80e_kitti-3d-3class_20200925_110059-05f67bdf.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/fp16/hv_second_secfpn_fp16_6x8_80e_kitti-3d-3class/hv_second_secfpn_fp16_6x8_80e_kitti-3d-3class_20200925_110059.log.json)|
### PointPillars on nuScenes dataset
| Backbone | Lr schd | FP32 Mem (GB) | FP16 Mem (GB) | FP32 mAP | FP32 NDS| FP16 mAP | FP16 NDS| Download |
| :---------: | :-----: | :------: | :------------: | :----: |:----: | :----: |:----: | :------: |
|[SECFPN](./hv_pointpillars_secfpn_sbn-all_fp16_2x8_2x_nus-3d.py)|2x|16.4|8.37|35.17|49.7|35.19|50.27|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/fp16/hv_pointpillars_secfpn_sbn-all_fp16_2x8_2x_nus-3d/hv_pointpillars_secfpn_sbn-all_fp16_2x8_2x_nus-3d_20201020_222626-c3f0483e.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/fp16/hv_pointpillars_secfpn_sbn-all_fp16_2x8_2x_nus-3d/hv_pointpillars_secfpn_sbn-all_fp16_2x8_2x_nus-3d_20201020_222626.log.json)|
|[FPN](./hv_pointpillars_fpn_sbn-all_fp16_2x8_2x_nus-3d.py)|2x|16.4|8.40|40.0|53.3|39.26|53.26|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/fp16/hv_pointpillars_fpn_sbn-all_fp16_2x8_2x_nus-3d/hv_pointpillars_fpn_sbn-all_fp16_2x8_2x_nus-3d_20201021_120719-269f9dd6.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/fp16/hv_pointpillars_fpn_sbn-all_fp16_2x8_2x_nus-3d/hv_pointpillars_fpn_sbn-all_fp16_2x8_2x_nus-3d_20201021_120719.log.json)|
**Note**:
1. With mixed precision training, we can train PointPillars on the nuScenes dataset on 8 Titan XP GPUs with a batch size of 2, which would cause an OOM error without mixed precision training.
2. The loss scale for PointPillars on the nuScenes dataset is specifically tuned to keep the loss from becoming NaN. We find a loss scale of 32 more stable than 512, though 32 can still occasionally produce NaN (see the example config sketch below).
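To apply these settings to your own model, it is usually enough to inherit the corresponding full-precision config and add an `fp16` field. The sketch below assumes the PointPillars nuScenes base config in this repo; the `_base_` path, batch size, and loss scale are examples to adjust for your setup.
```python
# A minimal sketch: enable mixed precision on top of a full-precision config.
# The base config path and batch size below are examples, not requirements.
_base_ = '../pointpillars/hv_pointpillars_secfpn_sbn-all_4x8_2x_nus-3d.py'

# Reduce the per-GPU batch size if GPU memory is still tight.
data = dict(samples_per_gpu=2, workers_per_gpu=2)

# fp16 settings: loss_scale=32. is the value tuned for PointPillars on
# nuScenes above; the KITTI SECOND configs use loss_scale=512.
fp16 = dict(loss_scale=32.)
```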
================================================
FILE: configs/fp16/hv_pointpillars_fpn_sbn-all_fp16_2x8_2x_nus-3d.py
================================================
_base_ = '../pointpillars/hv_pointpillars_fpn_sbn-all_4x8_2x_nus-3d.py'
data = dict(samples_per_gpu=2, workers_per_gpu=2)
# fp16 settings, the loss scale is specifically tuned to avoid Nan
fp16 = dict(loss_scale=32.)
================================================
FILE: configs/fp16/hv_pointpillars_regnet-400mf_fpn_sbn-all_fp16_2x8_2x_nus-3d.py
================================================
_base_ = '../regnet/hv_pointpillars_regnet-400mf_fpn_sbn-all_4x8_2x_nus-3d.py'
data = dict(samples_per_gpu=2, workers_per_gpu=2)
# fp16 settings, the loss scale is specifically tuned to avoid Nan
fp16 = dict(loss_scale=32.)
================================================
FILE: configs/fp16/hv_pointpillars_secfpn_sbn-all_fp16_2x8_2x_nus-3d.py
================================================
_base_ = '../pointpillars/hv_pointpillars_secfpn_sbn-all_4x8_2x_nus-3d.py'
data = dict(samples_per_gpu=2, workers_per_gpu=2)
# fp16 settings, the loss scale is specifically tuned to avoid Nan
fp16 = dict(loss_scale=32.)
================================================
FILE: configs/fp16/hv_second_secfpn_fp16_6x8_80e_kitti-3d-3class.py
================================================
_base_ = '../second/hv_second_secfpn_6x8_80e_kitti-3d-3class.py'
# fp16 settings
fp16 = dict(loss_scale=512.)
================================================
FILE: configs/fp16/hv_second_secfpn_fp16_6x8_80e_kitti-3d-car.py
================================================
_base_ = '../second/hv_second_secfpn_6x8_80e_kitti-3d-car.py'
# fp16 settings
fp16 = dict(loss_scale=512.)
================================================
FILE: configs/free_anchor/README.md
================================================
# FreeAnchor for 3D Object Detection
## Introduction
[ALGORITHM]
We implement FreeAnchor in 3D detection systems and provide its first results with PointPillars on the nuScenes dataset.
With the implemented `FreeAnchor3DHead`, a PointPillars detector with a large backbone (e.g., RegNet-3.2GF) achieves top performance
on the nuScenes benchmark.
```
@inproceedings{zhang2019freeanchor,
title = {{FreeAnchor}: Learning to Match Anchors for Visual Object Detection},
author = {Zhang, Xiaosong and Wan, Fang and Liu, Chang and Ji, Rongrong and Ye, Qixiang},
booktitle = {Neural Information Processing Systems},
year = {2019}
}
```
## Usage
### Modify config
As in the [baseline config](hv_pointpillars_fpn_sbn-all_free-anchor_4x8_2x_nus-3d.py), we only need to replace the head of an existing one-stage detector with the FreeAnchor head.
Since the config inherits a common detector head, `_delete_=True` is necessary to avoid conflicts.
The hyperparameters are specifically tuned according to the original paper.
```python
_base_ = [
'../_base_/models/hv_pointpillars_fpn_nus.py',
'../_base_/datasets/nus-3d.py', '../_base_/schedules/schedule_2x.py',
'../_base_/default_runtime.py'
]
model = dict(
pts_bbox_head=dict(
_delete_=True,
type='FreeAnchor3DHead',
num_classes=10,
in_channels=256,
feat_channels=256,
use_direction_classifier=True,
pre_anchor_topk=25,
bbox_thr=0.5,
gamma=2.0,
alpha=0.5,
anchor_generator=dict(
type='AlignedAnchor3DRangeGenerator',
ranges=[[-50, -50, -1.8, 50, 50, -1.8]],
scales=[1, 2, 4],
sizes=[
[0.8660, 2.5981, 1.], # 1.5/sqrt(3)
[0.5774, 1.7321, 1.], # 1/sqrt(3)
[1., 1., 1.],
[0.4, 0.4, 1],
],
custom_values=[0, 0],
rotations=[0, 1.57],
reshape_out=True),
assigner_per_size=False,
diff_rad_by_sin=True,
dir_offset=0.7854, # pi/4
dir_limit_offset=0,
bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder', code_size=9),
loss_cls=dict(
type='FocalLoss',
use_sigmoid=True,
gamma=2.0,
alpha=0.25,
loss_weight=1.0),
loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=0.8),
loss_dir=dict(
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.2)),
# model training and testing settings
train_cfg = dict(
pts=dict(code_weight=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.25, 0.25])))
```
## Results
### PointPillars
| Backbone |FreeAnchor|Lr schd | Mem (GB) | Inf time (fps) | mAP |NDS| Download |
| :---------: |:-----: |:-----: | :------: | :------------: | :----: |:----: | :------: |
|[FPN](../pointpillars/hv_pointpillars_fpn_sbn-all_4x8_2x_nus-3d.py)|✗|2x|17.1||40.0|53.3|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/pointpillars/hv_pointpillars_fpn_sbn-all_4x8_2x_nus-3d/hv_pointpillars_fpn_sbn-all_4x8_2x_nus-3d_20200620_230405-2fa62f3d.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/pointpillars/hv_pointpillars_fpn_sbn-all_4x8_2x_nus-3d/hv_pointpillars_fpn_sbn-all_4x8_2x_nus-3d_20200620_230405.log.json)|
|[FPN](./hv_pointpillars_fpn_sbn-all_free-anchor_4x8_2x_nus-3d.py)|✓|2x|16.2||43.7|55.3|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/free_anchor/hv_pointpillars_fpn_sbn-all_free-anchor_4x8_2x_nus-3d/hv_pointpillars_fpn_sbn-all_free-anchor_4x8_2x_nus-3d_20200628_210537-09d359fc.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/free_anchor/hv_pointpillars_fpn_sbn-all_free-anchor_4x8_2x_nus-3d/hv_pointpillars_fpn_sbn-all_free-anchor_4x8_2x_nus-3d_20200628_210537.log.json)|
|[RegNetX-400MF-FPN](../regnet/hv_pointpillars_regnet-400mf_fpn_sbn-all_4x8_2x_nus-3d.py)|✗|2x|17.3||44.8|56.4|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/regnet/hv_pointpillars_regnet-400mf_fpn_sbn-all_4x8_2x_nus-3d/hv_pointpillars_regnet-400mf_fpn_sbn-all_4x8_2x_nus-3d_20200620_230239-c694dce7.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/regnet/hv_pointpillars_regnet-400mf_fpn_sbn-all_4x8_2x_nus-3d/hv_pointpillars_regnet-400mf_fpn_sbn-all_4x8_2x_nus-3d_20200620_230239.log.json)|
|[RegNetX-400MF-FPN](./hv_pointpillars_regnet-400mf_fpn_sbn-all_free-anchor_4x8_2x_nus-3d.py)|✓|2x|17.7||47.9|58.6|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/free_anchor/hv_pointpillars_regnet-400mf_fpn_sbn-all_free-anchor_4x8_2x_nus-3d/hv_pointpillars_regnet-400mf_fpn_sbn-all_free-anchor_4x8_2x_nus-3d_20200629_050311-a334765d.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/free_anchor/hv_pointpillars_regnet-400mf_fpn_sbn-all_free-anchor_4x8_2x_nus-3d/hv_pointpillars_regnet-400mf_fpn_sbn-all_free-anchor_4x8_2x_nus-3d_20200629_050311.log.json)|
|[RegNetX-1.6GF-FPN](./hv_pointpillars_regnet-1.6gf_fpn_sbn-all_free-anchor_4x8_2x_nus-3d.py)|✓|2x|24.3||51.2|60.8|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/free_anchor/hv_pointpillars_regnet-1.6gf_fpn_sbn-all_free-anchor_4x8_2x_nus-3d/hv_pointpillars_regnet-1.6gf_fpn_sbn-all_free-anchor_4x8_2x_nus-3d_20200629_105446-6ffa59cb.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/free_anchor/hv_pointpillars_regnet-1.6gf_fpn_sbn-all_free-anchor_4x8_2x_nus-3d/hv_pointpillars_regnet-1.6gf_fpn_sbn-all_free-anchor_4x8_2x_nus-3d_20200629_105446.log.json)|
|[RegNetX-1.6GF-FPN](./hv_pointpillars_regnet-1.6gf_fpn_sbn-all_free-anchor_strong-aug_4x8_3x_nus-3d.py)*|✓|3x|24.3||53.0|62.2|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/free_anchor/hv_pointpillars_regnet-1.6gf_fpn_sbn-all_free-anchor_strong-aug_4x8_3x_nus-3d/hv_pointpillars_regnet-1.6gf_fpn_sbn-all_free-anchor_strong-aug_4x8_3x_nus-3d_20200701_201531-036f7de3.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/free_anchor/hv_pointpillars_regnet-1.6gf_fpn_sbn-all_free-anchor_strong-aug_4x8_3x_nus-3d/hv_pointpillars_regnet-1.6gf_fpn_sbn-all_free-anchor_strong-aug_4x8_3x_nus-3d_20200701_201531.log.json)|
|[RegNetX-3.2GF-FPN](./hv_pointpillars_regnet-3.2gf_fpn_sbn-all_free-anchor_4x8_2x_nus-3d.py)|✓|2x|29.5||52.2|62.0|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/free_anchor/hv_pointpillars_regnet-3.2gf_fpn_sbn-all_free-anchor_4x8_2x_nus-3d/hv_pointpillars_regnet-3.2gf_fpn_sbn-all_free-anchor_4x8_2x_nus-3d_20200629_055854-658125b0.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/free_anchor/hv_pointpillars_regnet-3.2gf_fpn_sbn-all_free-anchor_4x8_2x_nus-3d/hv_pointpillars_regnet-3.2gf_fpn_sbn-all_free-anchor_4x8_2x_nus-3d_20200629_055854.log.json)|
|[RegNetX-3.2GF-FPN](./hv_pointpillars_regnet-3.2gf_fpn_sbn-all_free-anchor_strong-aug_4x8_3x_nus-3d.py)*|✓|3x|29.5||55.09|63.5|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/free_anchor/hv_pointpillars_regnet-3.2gf_fpn_sbn-all_free-anchor_strong-aug_4x8_3x_nus-3d/hv_pointpillars_regnet-3.2gf_fpn_sbn-all_free-anchor_strong-aug_4x8_3x_nus-3d_20200629_181452-297fdc66.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/free_anchor/hv_pointpillars_regnet-3.2gf_fpn_sbn-all_free-anchor_strong-aug_4x8_3x_nus-3d/hv_pointpillars_regnet-3.2gf_fpn_sbn-all_free-anchor_strong-aug_4x8_3x_nus-3d_20200629_181452.log.json)|
**Note**: Models marked with `*` are trained with stronger augmentation: vertical flip in bird's-eye view, global translation, and a larger range of global rotation.
================================================
FILE: configs/free_anchor/hv_pointpillars_fpn_sbn-all_free-anchor_4x8_2x_nus-3d.py
================================================
_base_ = [
'../_base_/models/hv_pointpillars_fpn_nus.py',
'../_base_/datasets/nus-3d.py', '../_base_/schedules/schedule_2x.py',
'../_base_/default_runtime.py'
]
model = dict(
pts_bbox_head=dict(
_delete_=True,
type='FreeAnchor3DHead',
num_classes=10,
in_channels=256,
feat_channels=256,
use_direction_classifier=True,
pre_anchor_topk=25,
bbox_thr=0.5,
gamma=2.0,
alpha=0.5,
anchor_generator=dict(
type='AlignedAnchor3DRangeGenerator',
ranges=[[-50, -50, -1.8, 50, 50, -1.8]],
scales=[1, 2, 4],
sizes=[
[0.8660, 2.5981, 1.], # 1.5/sqrt(3)
[0.5774, 1.7321, 1.], # 1/sqrt(3)
[1., 1., 1.],
[0.4, 0.4, 1],
],
custom_values=[0, 0],
rotations=[0, 1.57],
reshape_out=True),
assigner_per_size=False,
diff_rad_by_sin=True,
dir_offset=0.7854, # pi/4
dir_limit_offset=0,
bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder', code_size=9),
loss_cls=dict(
type='FocalLoss',
use_sigmoid=True,
gamma=2.0,
alpha=0.25,
loss_weight=1.0),
loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=0.8),
loss_dir=dict(
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.2)),
# model training and testing settings
train_cfg=dict(
pts=dict(code_weight=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.25, 0.25])))
================================================
FILE: configs/free_anchor/hv_pointpillars_regnet-1.6gf_fpn_sbn-all_free-anchor_4x8_2x_nus-3d.py
================================================
_base_ = './hv_pointpillars_fpn_sbn-all_free-anchor_4x8_2x_nus-3d.py'
model = dict(
pretrained=dict(pts='open-mmlab://regnetx_1.6gf'),
pts_backbone=dict(
_delete_=True,
type='NoStemRegNet',
arch='regnetx_1.6gf',
out_indices=(1, 2, 3),
frozen_stages=-1,
strides=(1, 2, 2, 2),
base_channels=64,
stem_channels=64,
norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01),
norm_eval=False,
style='pytorch'),
pts_neck=dict(in_channels=[168, 408, 912]))
================================================
FILE: configs/free_anchor/hv_pointpillars_regnet-1.6gf_fpn_sbn-all_free-anchor_strong-aug_4x8_3x_nus-3d.py
================================================
_base_ = './hv_pointpillars_fpn_sbn-all_free-anchor_4x8_2x_nus-3d.py'
model = dict(
pretrained=dict(pts='open-mmlab://regnetx_1.6gf'),
pts_backbone=dict(
_delete_=True,
type='NoStemRegNet',
arch='regnetx_1.6gf',
out_indices=(1, 2, 3),
frozen_stages=-1,
strides=(1, 2, 2, 2),
base_channels=64,
stem_channels=64,
norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01),
norm_eval=False,
style='pytorch'),
pts_neck=dict(in_channels=[168, 408, 912]))
# If point cloud range is changed, the models should also change their point
# cloud range accordingly
point_cloud_range = [-50, -50, -5, 50, 50, 3]
# For nuScenes we usually do 10-class detection
class_names = [
'car', 'truck', 'trailer', 'bus', 'construction_vehicle', 'bicycle',
'motorcycle', 'pedestrian', 'traffic_cone', 'barrier'
]
# file_client_args = dict(backend='disk')
# The petrel backend below reads data from ceph storage. If you load data from
# local disk, use the commented `file_client_args` above instead. See
# https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient
# for more details.
file_client_args = dict(
backend='petrel',
path_mapping=dict({
'./data/nuscenes/': 's3://nuscenes/nuscenes/',
'data/nuscenes/': 's3://nuscenes/nuscenes/'
}))
train_pipeline = [
dict(
type='LoadPointsFromFile',
coord_type='LIDAR',
load_dim=5,
use_dim=5,
file_client_args=file_client_args),
dict(
type='LoadPointsFromMultiSweeps',
sweeps_num=10,
file_client_args=file_client_args),
dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True),
dict(
type='GlobalRotScaleTrans',
rot_range=[-0.7854, 0.7854],
scale_ratio_range=[0.95, 1.05],
translation_std=[0.2, 0.2, 0.2]),
dict(
type='RandomFlip3D',
flip_ratio_bev_horizontal=0.5,
flip_ratio_bev_vertical=0.5),
dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
dict(type='ObjectNameFilter', classes=class_names),
dict(type='PointShuffle'),
dict(type='DefaultFormatBundle3D', class_names=class_names),
dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
]
data = dict(train=dict(pipeline=train_pipeline))
lr_config = dict(step=[28, 34])
evaluation = dict(interval=36)
total_epochs = 36
================================================
FILE: configs/free_anchor/hv_pointpillars_regnet-3.2gf_fpn_sbn-all_free-anchor_4x8_2x_nus-3d.py
================================================
_base_ = './hv_pointpillars_fpn_sbn-all_free-anchor_4x8_2x_nus-3d.py'
model = dict(
pretrained=dict(pts='open-mmlab://regnetx_3.2gf'),
pts_backbone=dict(
_delete_=True,
type='NoStemRegNet',
arch='regnetx_3.2gf',
out_indices=(1, 2, 3),
frozen_stages=-1,
strides=(1, 2, 2, 2),
base_channels=64,
stem_channels=64,
norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01),
norm_eval=False,
style='pytorch'),
pts_neck=dict(in_channels=[192, 432, 1008]))
================================================
FILE: configs/free_anchor/hv_pointpillars_regnet-3.2gf_fpn_sbn-all_free-anchor_strong-aug_4x8_3x_nus-3d.py
================================================
_base_ = './hv_pointpillars_fpn_sbn-all_free-anchor_4x8_2x_nus-3d.py'
model = dict(
pretrained=dict(pts='open-mmlab://regnetx_3.2gf'),
pts_backbone=dict(
_delete_=True,
type='NoStemRegNet',
arch='regnetx_3.2gf',
out_indices=(1, 2, 3),
frozen_stages=-1,
strides=(1, 2, 2, 2),
base_channels=64,
stem_channels=64,
norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01),
norm_eval=False,
style='pytorch'),
pts_neck=dict(in_channels=[192, 432, 1008]))
# If point cloud range is changed, the models should also change their point
# cloud range accordingly
point_cloud_range = [-50, -50, -5, 50, 50, 3]
# For nuScenes we usually do 10-class detection
class_names = [
'car', 'truck', 'trailer', 'bus', 'construction_vehicle', 'bicycle',
'motorcycle', 'pedestrian', 'traffic_cone', 'barrier'
]
# file_client_args = dict(backend='disk')
# The petrel backend below reads data from ceph storage. If you load data from
# local disk, use the commented `file_client_args` above instead. See
# https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient
# for more details.
file_client_args = dict(
backend='petrel',
path_mapping=dict({
'./data/nuscenes/': 's3://nuscenes/nuscenes/',
'data/nuscenes/': 's3://nuscenes/nuscenes/'
}))
train_pipeline = [
dict(
type='LoadPointsFromFile',
coord_type='LIDAR',
load_dim=5,
use_dim=5,
file_client_args=file_client_args),
dict(
type='LoadPointsFromMultiSweeps',
sweeps_num=10,
file_client_args=file_client_args),
dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True),
dict(
type='GlobalRotScaleTrans',
rot_range=[-0.7854, 0.7854],
scale_ratio_range=[0.9, 1.1],
translation_std=[0.2, 0.2, 0.2]),
dict(
type='RandomFlip3D',
flip_ratio_bev_horizontal=0.5,
flip_ratio_bev_vertical=0.5),
dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
dict(type='ObjectNameFilter', classes=class_names),
dict(type='PointShuffle'),
dict(type='DefaultFormatBundle3D', class_names=class_names),
dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
]
data = dict(train=dict(pipeline=train_pipeline))
lr_config = dict(step=[28, 34])
evaluation = dict(interval=36)
total_epochs = 36
================================================
FILE: configs/free_anchor/hv_pointpillars_regnet-400mf_fpn_sbn-all_free-anchor_4x8_2x_nus-3d.py
================================================
_base_ = './hv_pointpillars_fpn_sbn-all_free-anchor_4x8_2x_nus-3d.py'
model = dict(
pretrained=dict(pts='open-mmlab://regnetx_400mf'),
pts_backbone=dict(
_delete_=True,
type='NoStemRegNet',
arch='regnetx_400mf',
out_indices=(1, 2, 3),
frozen_stages=-1,
strides=(1, 2, 2, 2),
base_channels=64,
stem_channels=64,
norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01),
norm_eval=False,
style='pytorch'),
pts_neck=dict(in_channels=[64, 160, 384]))
================================================
FILE: configs/h3dnet/README.md
================================================
# H3DNet: 3D Object Detection Using Hybrid Geometric Primitives
## Introduction
[ALGORITHM]
We implement H3DNet and provide the results and checkpoints on the ScanNet dataset.
```
@inproceedings{zhang2020h3dnet,
author = {Zhang, Zaiwei and Sun, Bo and Yang, Haitao and Huang, Qixing},
title = {H3DNet: 3D Object Detection Using Hybrid Geometric Primitives},
booktitle = {Proceedings of the European Conference on Computer Vision},
year = {2020}
}
```
## Results
### ScanNet
| Backbone | Lr schd | Mem (GB) | Inf time (fps) | AP@0.25 |AP@0.5| Download |
| :---------: | :-----: | :------: | :------------: | :----: |:----: | :------: |
| [MultiBackbone](./h3dnet_3x8_scannet-3d-18class.py) | 3x |7.9||66.43|48.01|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/h3dnet/h3dnet_scannet-3d-18class/h3dnet_scannet-3d-18class_20200830_000136-02e36246.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/h3dnet/h3dnet_scannet-3d-18class/h3dnet_scannet-3d-18class_20200830_000136.log.json) |
================================================
FILE: configs/h3dnet/h3dnet_3x8_scannet-3d-18class.py
================================================
_base_ = [
'../_base_/datasets/scannet-3d-18class.py', '../_base_/models/h3dnet.py',
'../_base_/schedules/schedule_3x.py', '../_base_/default_runtime.py'
]
# model settings
model = dict(
rpn_head=dict(
num_classes=18,
bbox_coder=dict(
type='PartialBinBasedBBoxCoder',
num_sizes=18,
num_dir_bins=24,
with_rot=False,
mean_sizes=[[0.76966727, 0.8116021, 0.92573744],
[1.876858, 1.8425595, 1.1931566],
[0.61328, 0.6148609, 0.7182701],
[1.3955007, 1.5121545, 0.83443564],
[0.97949594, 1.0675149, 0.6329687],
[0.531663, 0.5955577, 1.7500148],
[0.9624706, 0.72462326, 1.1481868],
[0.83221924, 1.0490936, 1.6875663],
[0.21132214, 0.4206159, 0.5372846],
[1.4440073, 1.8970833, 0.26985747],
[1.0294262, 1.4040797, 0.87554324],
[1.3766412, 0.65521795, 1.6813129],
[0.6650819, 0.71111923, 1.298853],
[0.41999173, 0.37906948, 1.7513971],
[0.59359556, 0.5912492, 0.73919016],
[0.50867593, 0.50656086, 0.30136237],
[1.1511526, 1.0546296, 0.49706793],
[0.47535285, 0.49249494, 0.5802117]])),
roi_head=dict(
bbox_head=dict(
num_classes=18,
bbox_coder=dict(
type='PartialBinBasedBBoxCoder',
num_sizes=18,
num_dir_bins=24,
with_rot=False,
mean_sizes=[[0.76966727, 0.8116021, 0.92573744],
[1.876858, 1.8425595, 1.1931566],
[0.61328, 0.6148609, 0.7182701],
[1.3955007, 1.5121545, 0.83443564],
[0.97949594, 1.0675149, 0.6329687],
[0.531663, 0.5955577, 1.7500148],
[0.9624706, 0.72462326, 1.1481868],
[0.83221924, 1.0490936, 1.6875663],
[0.21132214, 0.4206159, 0.5372846],
[1.4440073, 1.8970833, 0.26985747],
[1.0294262, 1.4040797, 0.87554324],
[1.3766412, 0.65521795, 1.6813129],
[0.6650819, 0.71111923, 1.298853],
[0.41999173, 0.37906948, 1.7513971],
[0.59359556, 0.5912492, 0.73919016],
[0.50867593, 0.50656086, 0.30136237],
[1.1511526, 1.0546296, 0.49706793],
[0.47535285, 0.49249494, 0.5802117]]))))
data = dict(samples_per_gpu=3, workers_per_gpu=2)
# optimizer
# yapf:disable
log_config = dict(
interval=30,
hooks=[
dict(type='TextLoggerHook'),
dict(type='TensorboardLoggerHook')
])
# yapf:enable
================================================
FILE: configs/imvotenet/README.md
================================================
# ImVoteNet: Boosting 3D Object Detection in Point Clouds with Image Votes
## Introduction
[ALGORITHM]
We implement ImVoteNet and provide the results and checkpoints on the SUNRGBD dataset.
```
@inproceedings{qi2020imvotenet,
title={Imvotenet: Boosting 3D object detection in point clouds with image votes},
author={Qi, Charles R and Chen, Xinlei and Litany, Or and Guibas, Leonidas J},
booktitle={Proceedings of the IEEE/CVF conference on computer vision and pattern recognition},
pages={4404--4413},
year={2020}
}
```
## Results
### SUNRGBD-2D (Stage 1, image branch pre-train)
| Backbone | Lr schd | Mem (GB) | Inf time (fps) | AP@0.25 |AP@0.5| Download |
| :---------: | :-----: | :------: | :------------: | :----: |:----: | :------: |
| [PointNet++](./imvotenet_faster_rcnn_r50_fpn_2x4_sunrgbd-3d-10class.py) | |2.1| ||62.70|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/imvotenet/imvotenet_faster_rcnn_r50_fpn_2x4_sunrgbd-3d-10class/imvotenet_faster_rcnn_r50_fpn_2x4_sunrgbd-3d-10class_20210323_173222-cad62aeb.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/imvotenet/imvotenet_faster_rcnn_r50_fpn_2x4_sunrgbd-3d-10class/imvotenet_faster_rcnn_r50_fpn_2x4_sunrgbd-3d-10class_20210323_173222.log.json)|
### SUNRGBD-3D (Stage 2)
| Backbone | Lr schd | Mem (GB) | Inf time (fps) | AP@0.25 |AP@0.5| Download |
| :---------: | :-----: | :------: | :------------: | :----: |:----: | :------: |
| [PointNet++](./imvotenet_stage2_16x8_sunrgbd-3d-10class.py) | 3x |9.4| |64.04||[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/imvotenet/imvotenet_stage2_16x8_sunrgbd-3d-10class/imvotenet_stage2_16x8_sunrgbd-3d-10class_20210323_184021-d44dcb66.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/imvotenet/imvotenet_stage2_16x8_sunrgbd-3d-10class/imvotenet_stage2_16x8_sunrgbd-3d-10class_20210323_184021.log.json)|
================================================
FILE: configs/imvotenet/imvotenet_faster_rcnn_r50_fpn_2x4_sunrgbd-3d-10class.py
================================================
_base_ = [
'../_base_/datasets/sunrgbd-3d-10class.py', '../_base_/default_runtime.py',
'../_base_/models/imvotenet_image.py'
]
# use caffe img_norm
img_norm_cfg = dict(
mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False)
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='LoadAnnotations', with_bbox=True),
dict(
type='Resize',
img_scale=[(1333, 480), (1333, 504), (1333, 528), (1333, 552),
(1333, 576), (1333, 600)],
multiscale_mode='value',
keep_ratio=True),
dict(type='RandomFlip', flip_ratio=0.5),
dict(type='Normalize', **img_norm_cfg),
dict(type='Pad', size_divisor=32),
dict(type='DefaultFormatBundle'),
dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']),
]
test_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='MultiScaleFlipAug',
img_scale=(1333, 600),
flip=False,
transforms=[
dict(type='Resize', keep_ratio=True),
dict(type='RandomFlip'),
dict(type='Normalize', **img_norm_cfg),
dict(type='Pad', size_divisor=32),
dict(type='ImageToTensor', keys=['img']),
dict(type='Collect', keys=['img']),
])
]
data = dict(
samples_per_gpu=2,
workers_per_gpu=2,
train=dict(times=1, dataset=dict(pipeline=train_pipeline)),
val=dict(pipeline=test_pipeline),
test=dict(pipeline=test_pipeline))
optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001)
optimizer_config = dict(grad_clip=None)
lr_config = dict(
policy='step',
warmup='linear',
warmup_iters=500,
warmup_ratio=0.001,
step=[6])
total_epochs = 8
load_from = 'http://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r50_caffe_fpn_mstrain-poly_3x_coco/mask_rcnn_r50_caffe_fpn_mstrain-poly_3x_coco_bbox_mAP-0.408__segm_mAP-0.37_20200504_163245-42aa3d00.pth' # noqa
================================================
FILE: configs/imvotenet/imvotenet_stage2_16x8_sunrgbd-3d-10class.py
================================================
_base_ = [
'../_base_/datasets/sunrgbd-3d-10class.py',
'../_base_/schedules/schedule_3x.py', '../_base_/default_runtime.py',
'../_base_/models/imvotenet_image.py'
]
class_names = ('bed', 'table', 'sofa', 'chair', 'toilet', 'desk', 'dresser',
'night_stand', 'bookshelf', 'bathtub')
# use caffe img_norm
img_norm_cfg = dict(
mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False)
model = dict(
pts_backbone=dict(
type='PointNet2SASSG',
in_channels=4,
num_points=(2048, 1024, 512, 256),
radius=(0.2, 0.4, 0.8, 1.2),
num_samples=(64, 32, 16, 16),
sa_channels=((64, 64, 128), (128, 128, 256), (128, 128, 256),
(128, 128, 256)),
fp_channels=((256, 256), (256, 256)),
norm_cfg=dict(type='BN2d'),
sa_cfg=dict(
type='PointSAModule',
pool_mod='max',
use_xyz=True,
normalize_xyz=True)),
pts_bbox_heads=dict(
common=dict(
type='VoteHead',
num_classes=10,
bbox_coder=dict(
type='PartialBinBasedBBoxCoder',
num_sizes=10,
num_dir_bins=12,
with_rot=True,
mean_sizes=[[2.114256, 1.620300, 0.927272],
[0.791118, 1.279516, 0.718182],
[0.923508, 1.867419, 0.845495],
[0.591958, 0.552978, 0.827272],
[0.699104, 0.454178, 0.75625],
[0.69519, 1.346299, 0.736364],
[0.528526, 1.002642, 1.172878],
[0.500618, 0.632163, 0.683424],
[0.404671, 1.071108, 1.688889],
[0.76584, 1.398258, 0.472728]]),
pred_layer_cfg=dict(
in_channels=128, shared_conv_channels=(128, 128), bias=True),
conv_cfg=dict(type='Conv1d'),
norm_cfg=dict(type='BN1d'),
objectness_loss=dict(
type='CrossEntropyLoss',
class_weight=[0.2, 0.8],
reduction='sum',
loss_weight=5.0),
center_loss=dict(
type='ChamferDistance',
mode='l2',
reduction='sum',
loss_src_weight=10.0,
loss_dst_weight=10.0),
dir_class_loss=dict(
type='CrossEntropyLoss', reduction='sum', loss_weight=1.0),
dir_res_loss=dict(
type='SmoothL1Loss', reduction='sum', loss_weight=10.0),
size_class_loss=dict(
type='CrossEntropyLoss', reduction='sum', loss_weight=1.0),
size_res_loss=dict(
type='SmoothL1Loss', reduction='sum', loss_weight=10.0 / 3.0),
semantic_loss=dict(
type='CrossEntropyLoss', reduction='sum', loss_weight=1.0)),
joint=dict(
vote_module_cfg=dict(
in_channels=512,
vote_per_seed=1,
gt_per_seed=3,
conv_channels=(512, 256),
conv_cfg=dict(type='Conv1d'),
norm_cfg=dict(type='BN1d'),
norm_feats=True,
vote_loss=dict(
type='ChamferDistance',
mode='l1',
reduction='none',
loss_dst_weight=10.0)),
vote_aggregation_cfg=dict(
type='PointSAModule',
num_point=256,
radius=0.3,
num_sample=16,
mlp_channels=[512, 128, 128, 128],
use_xyz=True,
normalize_xyz=True)),
pts=dict(
vote_module_cfg=dict(
in_channels=256,
vote_per_seed=1,
gt_per_seed=3,
conv_channels=(256, 256),
conv_cfg=dict(type='Conv1d'),
norm_cfg=dict(type='BN1d'),
norm_feats=True,
vote_loss=dict(
type='ChamferDistance',
mode='l1',
reduction='none',
loss_dst_weight=10.0)),
vote_aggregation_cfg=dict(
type='PointSAModule',
num_point=256,
radius=0.3,
num_sample=16,
mlp_channels=[256, 128, 128, 128],
use_xyz=True,
normalize_xyz=True)),
img=dict(
vote_module_cfg=dict(
in_channels=256,
vote_per_seed=1,
gt_per_seed=3,
conv_channels=(256, 256),
conv_cfg=dict(type='Conv1d'),
norm_cfg=dict(type='BN1d'),
norm_feats=True,
vote_loss=dict(
type='ChamferDistance',
mode='l1',
reduction='none',
loss_dst_weight=10.0)),
vote_aggregation_cfg=dict(
type='PointSAModule',
num_point=256,
radius=0.3,
num_sample=16,
mlp_channels=[256, 128, 128, 128],
use_xyz=True,
normalize_xyz=True)),
loss_weights=[0.4, 0.3, 0.3]),
img_mlp=dict(
in_channel=18,
conv_channels=(256, 256),
conv_cfg=dict(type='Conv1d'),
norm_cfg=dict(type='BN1d'),
act_cfg=dict(type='ReLU')),
fusion_layer=dict(
type='VoteFusion',
num_classes=len(class_names),
max_imvote_per_pixel=3),
num_sampled_seed=1024,
freeze_img_branch=True,
# model training and testing settings
train_cfg=dict(
pts=dict(
pos_distance_thr=0.3, neg_distance_thr=0.6, sample_mod='vote')),
test_cfg=dict(
img_rcnn=dict(score_thr=0.1),
pts=dict(
sample_mod='seed',
nms_thr=0.25,
score_thr=0.05,
per_class_proposal=True)))
train_pipeline = [
dict(
type='LoadPointsFromFile',
coord_type='DEPTH',
shift_height=True,
load_dim=6,
use_dim=[0, 1, 2]),
dict(type='LoadImageFromFile'),
dict(type='LoadAnnotations3D'),
dict(type='LoadAnnotations', with_bbox=True),
dict(type='Resize', img_scale=(1333, 600), keep_ratio=True),
dict(type='RandomFlip', flip_ratio=0.0),
dict(type='Normalize', **img_norm_cfg),
dict(type='Pad', size_divisor=32),
dict(
type='RandomFlip3D',
sync_2d=False,
flip_ratio_bev_horizontal=0.5,
),
dict(
type='GlobalRotScaleTrans',
rot_range=[-0.523599, 0.523599],
scale_ratio_range=[0.85, 1.15],
shift_height=True),
dict(type='IndoorPointSample', num_points=20000),
dict(type='DefaultFormatBundle3D', class_names=class_names),
dict(
type='Collect3D',
keys=[
'img', 'gt_bboxes', 'gt_labels', 'points', 'gt_bboxes_3d',
'gt_labels_3d', 'calib'
])
]
test_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='LoadPointsFromFile',
coord_type='DEPTH',
shift_height=True,
load_dim=6,
use_dim=[0, 1, 2]),
dict(
type='MultiScaleFlipAug3D',
img_scale=(1333, 600),
pts_scale_ratio=1,
flip=False,
transforms=[
dict(type='Resize', keep_ratio=True),
dict(type='RandomFlip', flip_ratio=0.0),
dict(type='Normalize', **img_norm_cfg),
dict(type='Pad', size_divisor=32),
dict(
type='GlobalRotScaleTrans',
rot_range=[0, 0],
scale_ratio_range=[1., 1.],
translation_std=[0, 0, 0]),
dict(
type='RandomFlip3D',
sync_2d=False,
flip_ratio_bev_horizontal=0.5,
),
dict(type='IndoorPointSample', num_points=20000),
dict(
type='DefaultFormatBundle3D',
class_names=class_names,
with_label=False),
dict(type='Collect3D', keys=['img', 'points', 'calib'])
]),
]
data = dict(
train=dict(dataset=dict(pipeline=train_pipeline)),
val=dict(pipeline=test_pipeline),
test=dict(pipeline=test_pipeline))
# You may also use your own pre-trained image branch
load_from = 'https://download.openmmlab.com/mmdetection3d/v0.1.0_models/imvotenet/imvotenet_faster_rcnn_r50_fpn_2x4_sunrgbd-3d-10class/imvotenet_faster_rcnn_r50_fpn_2x4_sunrgbd-3d-10class_20210323_173222-cad62aeb.pth' # noqa
================================================
FILE: configs/mvxnet/README.md
================================================
# MVX-Net: Multimodal VoxelNet for 3D Object Detection
## Introduction
[ALGORITHM]
We implement MVX-Net and provide its results and models on the KITTI dataset.
```
@inproceedings{sindagi2019mvx,
title={MVX-Net: Multimodal voxelnet for 3D object detection},
author={Sindagi, Vishwanath A and Zhou, Yin and Tuzel, Oncel},
booktitle={2019 International Conference on Robotics and Automation (ICRA)},
pages={7276--7282},
year={2019},
organization={IEEE}
}
```
## Results
### KITTI
| Backbone |Class| Lr schd | Mem (GB) | Inf time (fps) | mAP | Download |
| :---------: | :-----: | :------: | :------------: | :----: |:----: | :------: |
| [SECFPN](./dv_mvx-fpn_second_secfpn_adamw_2x8_80e_kitti-3d-3class.py)|3 Class|cosine 80e|6.7||63.0|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/mvxnet/dv_mvx-fpn_second_secfpn_adamw_2x8_80e_kitti-3d-3class/dv_mvx-fpn_second_secfpn_adamw_2x8_80e_kitti-3d-3class_20200621_003904-10140f2d.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/mvxnet/dv_mvx-fpn_second_secfpn_adamw_2x8_80e_kitti-3d-3class/dv_mvx-fpn_second_secfpn_adamw_2x8_80e_kitti-3d-3class_20200621_003904.log.json)|
================================================
FILE: configs/mvxnet/dv_mvx-fpn_second_secfpn_adamw_2x8_80e_kitti-3d-3class.py
================================================
# model settings
voxel_size = [0.05, 0.05, 0.1]
point_cloud_range = [0, -40, -3, 70.4, 40, 1]
model = dict(
type='DynamicMVXFasterRCNN',
img_backbone=dict(
type='ResNet',
depth=50,
num_stages=4,
out_indices=(0, 1, 2, 3),
frozen_stages=1,
norm_cfg=dict(type='BN', requires_grad=False),
norm_eval=True,
style='caffe'),
img_neck=dict(
type='FPN',
in_channels=[256, 512, 1024, 2048],
out_channels=256,
num_outs=5),
pts_voxel_layer=dict(
max_num_points=-1,
point_cloud_range=point_cloud_range,
voxel_size=voxel_size,
max_voxels=(-1, -1),
),
pts_voxel_encoder=dict(
type='DynamicVFE',
in_channels=4,
feat_channels=[64, 64],
with_distance=False,
voxel_size=voxel_size,
with_cluster_center=True,
with_voxel_center=True,
point_cloud_range=point_cloud_range,
fusion_layer=dict(
type='PointFusion',
img_channels=256,
pts_channels=64,
mid_channels=128,
out_channels=128,
img_levels=[0, 1, 2, 3, 4],
align_corners=False,
activate_out=True,
fuse_out=False)),
pts_middle_encoder=dict(
type='SparseEncoder',
in_channels=128,
sparse_shape=[41, 1600, 1408],
order=('conv', 'norm', 'act')),
pts_backbone=dict(
type='SECOND',
in_channels=256,
layer_nums=[5, 5],
layer_strides=[1, 2],
out_channels=[128, 256]),
pts_neck=dict(
type='SECONDFPN',
in_channels=[128, 256],
upsample_strides=[1, 2],
out_channels=[256, 256]),
pts_bbox_head=dict(
type='Anchor3DHead',
num_classes=3,
in_channels=512,
feat_channels=512,
use_direction_classifier=True,
anchor_generator=dict(
type='Anchor3DRangeGenerator',
ranges=[
[0, -40.0, -0.6, 70.4, 40.0, -0.6],
[0, -40.0, -0.6, 70.4, 40.0, -0.6],
[0, -40.0, -1.78, 70.4, 40.0, -1.78],
],
sizes=[[0.6, 0.8, 1.73], [0.6, 1.76, 1.73], [1.6, 3.9, 1.56]],
rotations=[0, 1.57],
reshape_out=False),
assigner_per_size=True,
diff_rad_by_sin=True,
assign_per_class=True,
bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder'),
loss_cls=dict(
type='FocalLoss',
use_sigmoid=True,
gamma=2.0,
alpha=0.25,
loss_weight=1.0),
loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=2.0),
loss_dir=dict(
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.2)),
# model training and testing settings
train_cfg=dict(
pts=dict(
assigner=[
dict( # for Pedestrian
type='MaxIoUAssigner',
iou_calculator=dict(type='BboxOverlapsNearest3D'),
pos_iou_thr=0.35,
neg_iou_thr=0.2,
min_pos_iou=0.2,
ignore_iof_thr=-1),
dict( # for Cyclist
type='MaxIoUAssigner',
iou_calculator=dict(type='BboxOverlapsNearest3D'),
pos_iou_thr=0.35,
neg_iou_thr=0.2,
min_pos_iou=0.2,
ignore_iof_thr=-1),
dict( # for Car
type='MaxIoUAssigner',
iou_calculator=dict(type='BboxOverlapsNearest3D'),
pos_iou_thr=0.6,
neg_iou_thr=0.45,
min_pos_iou=0.45,
ignore_iof_thr=-1),
],
allowed_border=0,
pos_weight=-1,
debug=False)),
test_cfg=dict(
pts=dict(
use_rotate_nms=True,
nms_across_levels=False,
nms_thr=0.01,
score_thr=0.1,
min_bbox_size=0,
nms_pre=100,
max_num=50)))
# dataset settings
dataset_type = 'KittiDataset'
data_root = 'data/kitti/'
class_names = ['Pedestrian', 'Cyclist', 'Car']
img_norm_cfg = dict(
mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False)
input_modality = dict(use_lidar=True, use_camera=True)
train_pipeline = [
dict(type='LoadPointsFromFile', coord_type='LIDAR', load_dim=4, use_dim=4),
dict(type='LoadImageFromFile'),
dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True),
dict(
type='Resize',
img_scale=[(640, 192), (2560, 768)],
multiscale_mode='range',
keep_ratio=True),
dict(
type='GlobalRotScaleTrans',
rot_range=[-0.78539816, 0.78539816],
scale_ratio_range=[0.95, 1.05],
translation_std=[0.2, 0.2, 0.2]),
dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5),
dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
dict(type='PointShuffle'),
dict(type='Normalize', **img_norm_cfg),
dict(type='Pad', size_divisor=32),
dict(type='DefaultFormatBundle3D', class_names=class_names),
dict(
type='Collect3D',
keys=['points', 'img', 'gt_bboxes_3d', 'gt_labels_3d']),
]
test_pipeline = [
dict(type='LoadPointsFromFile', coord_type='LIDAR', load_dim=4, use_dim=4),
dict(type='LoadImageFromFile'),
dict(
type='MultiScaleFlipAug3D',
img_scale=(1280, 384),
pts_scale_ratio=1,
flip=False,
transforms=[
dict(type='Resize', multiscale_mode='value', keep_ratio=True),
dict(
type='GlobalRotScaleTrans',
rot_range=[0, 0],
scale_ratio_range=[1., 1.],
translation_std=[0, 0, 0]),
dict(type='RandomFlip3D'),
dict(type='Normalize', **img_norm_cfg),
dict(type='Pad', size_divisor=32),
dict(
type='PointsRangeFilter', point_cloud_range=point_cloud_range),
dict(
type='DefaultFormatBundle3D',
class_names=class_names,
with_label=False),
dict(type='Collect3D', keys=['points', 'img'])
])
]
data = dict(
samples_per_gpu=2,
workers_per_gpu=2,
train=dict(
type='RepeatDataset',
times=2,
dataset=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + 'kitti_infos_train.pkl',
split='training',
pts_prefix='velodyne_reduced',
pipeline=train_pipeline,
modality=input_modality,
classes=class_names,
test_mode=False)),
val=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + 'kitti_infos_val.pkl',
split='training',
pts_prefix='velodyne_reduced',
pipeline=test_pipeline,
modality=input_modality,
classes=class_names,
test_mode=True),
test=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + 'kitti_infos_val.pkl',
split='training',
pts_prefix='velodyne_reduced',
pipeline=test_pipeline,
modality=input_modality,
classes=class_names,
test_mode=True))
# Training settings
optimizer = dict(type='AdamW', lr=0.003, betas=(0.95, 0.99), weight_decay=0.01)
# max_norm=10 is better for SECOND
optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
lr_config = dict(
policy='CosineAnnealing',
warmup='linear',
warmup_iters=1000,
warmup_ratio=1.0 / 10,
min_lr_ratio=1e-5)
momentum_config = None
checkpoint_config = dict(interval=1)
# yapf:disable
log_config = dict(
interval=50,
hooks=[
dict(type='TextLoggerHook'),
dict(type='TensorboardLoggerHook')
])
# yapf:enable
evaluation = dict(interval=1)
# runtime settings
total_epochs = 40
dist_params = dict(backend='nccl')
log_level = 'INFO'
work_dir = None
# You may need to download the model first if the network is unstable
load_from = 'https://download.openmmlab.com/mmdetection3d/pretrain_models/mvx_faster_rcnn_detectron2-caffe_20e_coco-pretrain_gt-sample_kitti-3-class_moderate-79.3_20200207-a4a6a3c7.pth' # noqa
resume_from = None
workflow = [('train', 1)]
================================================
FILE: configs/nuimages/README.md
================================================
# NuImages Results
## Introduction
[DATASET]
We support and provide some baseline results on the [nuImages dataset](https://www.nuscenes.org/nuimages).
We follow the class mapping of the nuScenes dataset, which maps the original categories into 10 foreground categories.
The convert script can be found [here](https://github.com/open-mmlab/mmdetection3d/blob/master/tools/data_converter/nuimage_converter.py).
The baseline results include instance segmentation models, e.g., Mask R-CNN, Cascade Mask R-CNN, and HTC.
We will support panoptic segmentation models in the future.

The dataset converted by the script of v0.6.0 only supports instance segmentation. Since v0.7.0, we also support producing the semantic segmentation mask of each image; thus, we can train HTC or semantic segmentation models using the dataset. To convert the nuImages dataset into COCO format, please use the command below:
```shell
python -u tools/data_converter/nuimage_converter.py --data-root ${DATA_ROOT} --version ${VERIONS} \
--out-dir ${OUT_DIR} --nproc ${NUM_WORKERS} --extra-tag ${TAG}
```
- `--data-root`: the root of the dataset, defaults to `./data/nuimages`.
- `--version`: the version of the dataset, defaults to `v1.0-mini`. To get the full dataset, please use `--version v1.0-train v1.0-val v1.0-mini`
- `--out-dir`: the output directory of annotations and semantic masks, defaults to `./data/nuimages/annotations/`.
- `--nproc`: number of workers for data preparation, defaults to `4`. A larger number can reduce the preparation time, since images are processed in parallel.
- `--extra-tag`: extra tag of the annotations, defaults to `nuimages`. This can be used to separate annotations processed at different times for comparison (a full example invocation is shown below).
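For example, an invocation that converts the full dataset with 8 workers could look like the following (the paths simply follow the defaults listed above and are only illustrative):
```shell
python -u tools/data_converter/nuimage_converter.py \
    --data-root ./data/nuimages \
    --version v1.0-train v1.0-val v1.0-mini \
    --out-dir ./data/nuimages/annotations/ \
    --nproc 8 \
    --extra-tag nuimages
```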
## Results
### Instance Segmentation
We report Mask R-CNN and Cascade Mask R-CNN results on nuImages.
|Method | Backbone|Pretraining | Lr schd | Mem (GB) | Box AP | Mask AP |Download |
| :---------: |:---------: | :---------: | :-----: |:-----: | :------: | :------------: | :----: |
| Mask R-CNN| [R-50](./mask_rcnn_r50_fpn_1x_nuim.py) |IN|1x|7.4|47.8 |38.4|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/nuimages_semseg/mask_rcnn_r50_fpn_1x_nuim/mask_rcnn_r50_fpn_1x_nuim_20201008_195238-e99f5182.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/nuimages_semseg/mask_rcnn_r50_fpn_1x_nuim/mask_rcnn_r50_fpn_1x_nuim_20201008_195238.log.json)|
| Mask R-CNN| [R-50](./mask_rcnn_r50_fpn_coco-2x_1x_nuim.py) |IN+COCO-2x|1x|7.4|49.7|40.5|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/nuimages_semseg/mask_rcnn_r50_fpn_coco-2x_1x_nuim/mask_rcnn_r50_fpn_coco-2x_1x_nuim_20201008_195238-b1742a60.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/nuimages_semseg/mask_rcnn_r50_fpn_coco-2x_1x_nuim/mask_rcnn_r50_fpn_coco-2x_1x_nuim_20201008_195238.log.json)|
| Mask R-CNN| [R-50-CAFFE](./mask_rcnn_r50_caffe_fpn_1x_nuim.py) |IN|1x|7.0|47.7|38.2|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/nuimages_semseg/mask_rcnn_r50_caffe_fpn_1x_nuim/) | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/nuimages_semseg/mask_rcnn_r50_caffe_fpn_1x_nuim/)|
| Mask R-CNN| [R-50-CAFFE](./mask_rcnn_r50_caffe_fpn_coco-3x_1x_nuim.py) |IN+COCO-3x|1x|7.0|49.9|40.8|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/nuimages_semseg/mask_rcnn_r50_caffe_fpn_coco-3x_1x_nuim/mask_rcnn_r50_caffe_fpn_coco-3x_1x_nuim_20201008_195305-661a992e.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/nuimages_semseg/mask_rcnn_r50_caffe_fpn_coco-3x_1x_nuim/mask_rcnn_r50_caffe_fpn_coco-3x_1x_nuim_20201008_195305.log.json)|
| Mask R-CNN| [R-50-CAFFE](./mask_rcnn_r50_caffe_fpn_coco-3x_1x_nuim.py) |IN+COCO-3x|20e|7.0|50.6|41.3|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/nuimages_semseg/mask_rcnn_r50_caffe_fpn_coco-3x_20e_nuim/mask_rcnn_r50_caffe_fpn_coco-3x_20e_nuim_20201009_125002-5529442c.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/nuimages_semseg/mask_rcnn_r50_caffe_fpn_coco-3x_20e_nuim/mask_rcnn_r50_caffe_fpn_coco-3x_20e_nuim_20201009_125002.log.json)|
| Mask R-CNN| [R-101](./mask_rcnn_r101_fpn_1x_nuim.py) |IN|1x|10.9|48.9|39.1|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/nuimages_semseg/mask_rcnn_r101_fpn_1x_nuim/mask_rcnn_r101_fpn_1x_nuim_20201024_134803-65c7623a.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/nuimages_semseg/mask_rcnn_r101_fpn_1x_nuim/mask_rcnn_r101_fpn_1x_nuim_20201024_134803.log.json)|
| Mask R-CNN| [X-101_32x4d](./mask_rcnn_x101_32x4d_fpn_1x_nuim.py) |IN|1x|13.3|50.4|40.5|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/nuimages_semseg/mask_rcnn_x101_32x4d_fpn_1x_nuim/mask_rcnn_x101_32x4d_fpn_1x_nuim_20201024_135741-b699ab37.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/nuimages_semseg/mask_rcnn_x101_32x4d_fpn_1x_nuim/mask_rcnn_x101_32x4d_fpn_1x_nuim_20201024_135741.log.json)|
| Cascade Mask R-CNN| [R-50](./cascade_mask_rcnn_r50_fpn_1x_nuim.py) |IN|1x|8.9|50.8|40.4|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/nuimages_semseg/cascade_mask_rcnn_r50_fpn_1x_nuim/cascade_mask_rcnn_r50_fpn_1x_nuim_20201008_195342-1147c036.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/nuimages_semseg/cascade_mask_rcnn_r50_fpn_1x_nuim/cascade_mask_rcnn_r50_fpn_1x_nuim_20201008_195342.log.json)|
| Cascade Mask R-CNN| [R-50](./cascade_mask_rcnn_r50_fpn_coco-20e_1x_nuim.py) |IN+COCO-20e|1x|8.9|52.8|42.2|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/nuimages_semseg/cascade_mask_rcnn_r50_fpn_coco-20e_1x_nuim/cascade_mask_rcnn_r50_fpn_coco-20e_1x_nuim_20201009_124158-ad0540e3.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/nuimages_semseg/cascade_mask_rcnn_r50_fpn_coco-20e_1x_nuim/cascade_mask_rcnn_r50_fpn_coco-20e_1x_nuim_20201009_124158.log.json)|
| Cascade Mask R-CNN| [R-50](./cascade_mask_rcnn_r50_fpn_coco-20e_20e_nuim.py) |IN+COCO-20e|20e|8.9|52.8|42.2|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/nuimages_semseg/cascade_mask_rcnn_r50_fpn_coco-20e_20e_nuim/cascade_mask_rcnn_r50_fpn_coco-20e_20e_nuim_20201009_124951-40963960.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/nuimages_semseg/cascade_mask_rcnn_r50_fpn_coco-20e_20e_nuim/cascade_mask_rcnn_r50_fpn_coco-20e_20e_nuim_20201009_124951.log.json)|
| Cascade Mask R-CNN| [R-101](./cascade_mask_rcnn_r101_fpn_1x_nuim.py) |IN|1x|12.5|51.5|40.7|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/nuimages_semseg/cascade_mask_rcnn_r101_fpn_1x_nuim/cascade_mask_rcnn_r101_fpn_1x_nuim_20201024_134804-45215b1e.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/nuimages_semseg/cascade_mask_rcnn_r101_fpn_1x_nuim/cascade_mask_rcnn_r101_fpn_1x_nuim_20201024_134804.log.json)|
| Cascade Mask R-CNN| [X-101_32x4d](./cascade_mask_rcnn_x101_32x4d_fpn_1x_nuim.py) |IN|1x|14.9|52.8|41.6|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/nuimages_semseg/cascade_mask_rcnn_x101_32x4d_fpn_1x_nuim/cascade_mask_rcnn_x101_32x4d_fpn_1x_nuim_20201024_135753-e0e49778.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/nuimages_semseg/cascade_mask_rcnn_x101_32x4d_fpn_1x_nuim/cascade_mask_rcnn_x101_32x4d_fpn_1x_nuim_20201024_135753.log.json)|
| HTC w/o semantic|[R-50](./htc_without_semantic_r50_fpn_1x_nuim.py) |IN|1x||||[model]() | [log]()|
| HTC|[R-50](./htc_r50_fpn_1x_nuim.py) |IN|1x||||[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/nuimages_semseg/) | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/nuimages_semseg/)|
| HTC|[R-50](./htc_r50_fpn_coco-20e_1x_nuim.py) |IN+COCO-20e|1x|11.6|53.8|43.8|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/nuimages_semseg/htc_r50_fpn_coco-20e_1x_nuim/htc_r50_fpn_coco-20e_1x_nuim_20201010_070203-0b53a65e.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/nuimages_semseg/htc_r50_fpn_coco-20e_1x_nuim/htc_r50_fpn_coco-20e_1x_nuim_20201010_070203.log.json)|
| HTC|[R-50](./htc_r50_fpn_coco-20e_20e_nuim.py) |IN+COCO-20e|20e|11.6|54.8|44.4|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/nuimages_semseg/htc_r50_fpn_coco-20e_20e_nuim/htc_r50_fpn_coco-20e_20e_nuim_20201008_211415-d6c60a2c.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/nuimages_semseg/htc_r50_fpn_coco-20e_20e_nuim/htc_r50_fpn_coco-20e_20e_nuim_20201008_211415.log.json)|
| HTC|[X-101_64x4d + DCN_c3-c5](./htc_x101_64x4d_fpn_dconv_c3-c5_coco-20e_16x1_20e_nuim.py) |IN+COCO-20e|20e|13.3|57.3|46.4|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/nuimages_semseg/htc_x101_64x4d_fpn_dconv_c3-c5_coco-20e_16x1_20e_nuim/htc_x101_64x4d_fpn_dconv_c3-c5_coco-20e_16x1_20e_nuim_20201008_211222-0b16ac4b.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/nuimages_semseg/htc_x101_64x4d_fpn_dconv_c3-c5_coco-20e_16x1_20e_nuim/htc_x101_64x4d_fpn_dconv_c3-c5_coco-20e_16x1_20e_nuim_20201008_211222.log.json)|
**Note**:
1. `IN` means only using the ImageNet pre-trained backbone. `IN+COCO-Nx` and `IN+COCO-Ne` mean the backbone is first pre-trained on ImageNet, and then the detector is pre-trained on the COCO train2017 dataset with `Nx` and `N`-epoch schedules, respectively.
2. All the training hyper-parameters follow the standard schedules on the COCO dataset, except that the images are resized in the range from 1280 x 720 to 1920 x 1080 (a relative ratio of 0.8 to 1.2) since the original images are of size 1600 x 900; see the pipeline sketch after these notes.
3. The class order in the detectors released in v0.6.0 is different from the order in the configs because of a bug in the conversion script. This bug has been fixed since v0.7.0, and models trained with the correct class order are also released. If you have used nuImages data converted with v0.6.0, please re-convert it with the conversion script using the above-mentioned command.
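For reference, a minimal sketch of the training-time resize entry implied by note 2 (the same `Resize` step appears in the nuImages configs below; other pipeline steps are omitted):
```python
# Multi-scale training resize: the (1280, 720)-(1920, 1080) range corresponds
# to roughly 0.8x-1.2x of the native 1600 x 900 nuImages resolution.
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
    dict(
        type='Resize',
        img_scale=[(1280, 720), (1920, 1080)],  # lower / upper scale bounds
        multiscale_mode='range',  # sample a scale uniformly within the range
        keep_ratio=True),
    # ... RandomFlip, Normalize, Pad, DefaultFormatBundle, Collect ...
]
```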
================================================
FILE: configs/nuimages/cascade_mask_rcnn_r101_fpn_1x_nuim.py
================================================
_base_ = './cascade_mask_rcnn_r50_fpn_1x_nuim.py'
model = dict(pretrained='torchvision://resnet101', backbone=dict(depth=101))
================================================
FILE: configs/nuimages/cascade_mask_rcnn_r50_fpn_1x_nuim.py
================================================
_base_ = [
'../_base_/models/cascade_mask_rcnn_r50_fpn.py',
'../_base_/datasets/nuim_instance.py',
'../_base_/schedules/mmdet_schedule_1x.py', '../_base_/default_runtime.py'
]
model = dict(
roi_head=dict(
bbox_head=[
dict(
type='Shared2FCBBoxHead',
in_channels=256,
fc_out_channels=1024,
roi_feat_size=7,
num_classes=10,
bbox_coder=dict(
type='DeltaXYWHBBoxCoder',
target_means=[0., 0., 0., 0.],
target_stds=[0.1, 0.1, 0.2, 0.2]),
reg_class_agnostic=True,
loss_cls=dict(
type='CrossEntropyLoss',
use_sigmoid=False,
loss_weight=1.0),
loss_bbox=dict(type='SmoothL1Loss', beta=1.0,
loss_weight=1.0)),
dict(
type='Shared2FCBBoxHead',
in_channels=256,
fc_out_channels=1024,
roi_feat_size=7,
num_classes=10,
bbox_coder=dict(
type='DeltaXYWHBBoxCoder',
target_means=[0., 0., 0., 0.],
target_stds=[0.05, 0.05, 0.1, 0.1]),
reg_class_agnostic=True,
loss_cls=dict(
type='CrossEntropyLoss',
use_sigmoid=False,
loss_weight=1.0),
loss_bbox=dict(type='SmoothL1Loss', beta=1.0,
loss_weight=1.0)),
dict(
type='Shared2FCBBoxHead',
in_channels=256,
fc_out_channels=1024,
roi_feat_size=7,
num_classes=10,
bbox_coder=dict(
type='DeltaXYWHBBoxCoder',
target_means=[0., 0., 0., 0.],
target_stds=[0.033, 0.033, 0.067, 0.067]),
reg_class_agnostic=True,
loss_cls=dict(
type='CrossEntropyLoss',
use_sigmoid=False,
loss_weight=1.0),
loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0))
],
mask_head=dict(num_classes=10)))
================================================
FILE: configs/nuimages/cascade_mask_rcnn_r50_fpn_coco-20e_1x_nuim.py
================================================
_base_ = './cascade_mask_rcnn_r50_fpn_1x_nuim.py'
load_from = 'http://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_r50_fpn_20e_coco/cascade_mask_rcnn_r50_fpn_20e_coco_bbox_mAP-0.419__segm_mAP-0.365_20200504_174711-4af8e66e.pth' # noqa
================================================
FILE: configs/nuimages/cascade_mask_rcnn_r50_fpn_coco-20e_20e_nuim.py
================================================
_base_ = './cascade_mask_rcnn_r50_fpn_1x_nuim.py'
# learning policy
lr_config = dict(step=[16, 19])
total_epochs = 20
load_from = 'http://download.openmmlab.com/mmdetection/v2.0/cascade_rcnn/cascade_mask_rcnn_r50_fpn_20e_coco/cascade_mask_rcnn_r50_fpn_20e_coco_bbox_mAP-0.419__segm_mAP-0.365_20200504_174711-4af8e66e.pth' # noqa
================================================
FILE: configs/nuimages/cascade_mask_rcnn_x101_32x4d_fpn_1x_nuim.py
================================================
_base_ = './cascade_mask_rcnn_r50_fpn_1x_nuim.py'
model = dict(
pretrained='open-mmlab://resnext101_32x4d',
backbone=dict(
type='ResNeXt',
depth=101,
groups=32,
base_width=4,
num_stages=4,
out_indices=(0, 1, 2, 3),
frozen_stages=1,
norm_cfg=dict(type='BN', requires_grad=True),
style='pytorch'))
================================================
FILE: configs/nuimages/htc_r50_fpn_1x_nuim.py
================================================
_base_ = './htc_without_semantic_r50_fpn_1x_nuim.py'
model = dict(
roi_head=dict(
semantic_roi_extractor=dict(
type='SingleRoIExtractor',
roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=0),
out_channels=256,
featmap_strides=[8]),
semantic_head=dict(
type='FusedSemanticHead',
num_ins=5,
fusion_level=1,
num_convs=4,
in_channels=256,
conv_out_channels=256,
num_classes=32,
ignore_label=0,
loss_weight=0.2)))
data_root = 'data/nuimages/'
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='LoadAnnotations', with_bbox=True, with_mask=True, with_seg=True),
dict(
type='Resize',
img_scale=[(1280, 720), (1920, 1080)],
multiscale_mode='range',
keep_ratio=True),
dict(type='RandomFlip', flip_ratio=0.5),
dict(type='Normalize', **img_norm_cfg),
dict(type='Pad', size_divisor=32),
dict(type='SegRescale', scale_factor=1 / 8),
dict(type='DefaultFormatBundle'),
dict(
type='Collect',
keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks', 'gt_semantic_seg'])
]
data = dict(
train=dict(
seg_prefix=data_root + 'annotations/semantic_masks/',
pipeline=train_pipeline))
================================================
FILE: configs/nuimages/htc_r50_fpn_coco-20e_1x_nuim.py
================================================
_base_ = './htc_r50_fpn_1x_nuim.py'
load_from = 'http://download.openmmlab.com/mmdetection/v2.0/htc/htc_r50_fpn_20e_coco/htc_r50_fpn_20e_coco_20200319-fe28c577.pth' # noqa
================================================
FILE: configs/nuimages/htc_r50_fpn_coco-20e_20e_nuim.py
================================================
_base_ = './htc_r50_fpn_coco-20e_1x_nuim.py'
# learning policy
lr_config = dict(step=[16, 19])
total_epochs = 20
================================================
FILE: configs/nuimages/htc_without_semantic_r50_fpn_1x_nuim.py
================================================
_base_ = [
'../_base_/datasets/nuim_instance.py',
'../_base_/schedules/mmdet_schedule_1x.py', '../_base_/default_runtime.py'
]
# model settings
model = dict(
type='HybridTaskCascade',
pretrained='torchvision://resnet50',
backbone=dict(
type='ResNet',
depth=50,
num_stages=4,
out_indices=(0, 1, 2, 3),
frozen_stages=1,
norm_cfg=dict(type='BN', requires_grad=True),
norm_eval=True,
style='pytorch'),
neck=dict(
type='FPN',
in_channels=[256, 512, 1024, 2048],
out_channels=256,
num_outs=5),
rpn_head=dict(
type='RPNHead',
in_channels=256,
feat_channels=256,
anchor_generator=dict(
type='AnchorGenerator',
scales=[8],
ratios=[0.5, 1.0, 2.0],
strides=[4, 8, 16, 32, 64]),
bbox_coder=dict(
type='DeltaXYWHBBoxCoder',
target_means=[.0, .0, .0, .0],
target_stds=[1.0, 1.0, 1.0, 1.0]),
loss_cls=dict(
type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0)),
roi_head=dict(
type='HybridTaskCascadeRoIHead',
interleaved=True,
mask_info_flow=True,
num_stages=3,
stage_loss_weights=[1, 0.5, 0.25],
bbox_roi_extractor=dict(
type='SingleRoIExtractor',
roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
out_channels=256,
featmap_strides=[4, 8, 16, 32]),
bbox_head=[
dict(
type='Shared2FCBBoxHead',
in_channels=256,
fc_out_channels=1024,
roi_feat_size=7,
num_classes=10,
bbox_coder=dict(
type='DeltaXYWHBBoxCoder',
target_means=[0., 0., 0., 0.],
target_stds=[0.1, 0.1, 0.2, 0.2]),
reg_class_agnostic=True,
loss_cls=dict(
type='CrossEntropyLoss',
use_sigmoid=False,
loss_weight=1.0),
loss_bbox=dict(type='SmoothL1Loss', beta=1.0,
loss_weight=1.0)),
dict(
type='Shared2FCBBoxHead',
in_channels=256,
fc_out_channels=1024,
roi_feat_size=7,
num_classes=10,
bbox_coder=dict(
type='DeltaXYWHBBoxCoder',
target_means=[0., 0., 0., 0.],
target_stds=[0.05, 0.05, 0.1, 0.1]),
reg_class_agnostic=True,
loss_cls=dict(
type='CrossEntropyLoss',
use_sigmoid=False,
loss_weight=1.0),
loss_bbox=dict(type='SmoothL1Loss', beta=1.0,
loss_weight=1.0)),
dict(
type='Shared2FCBBoxHead',
in_channels=256,
fc_out_channels=1024,
roi_feat_size=7,
num_classes=10,
bbox_coder=dict(
type='DeltaXYWHBBoxCoder',
target_means=[0., 0., 0., 0.],
target_stds=[0.033, 0.033, 0.067, 0.067]),
reg_class_agnostic=True,
loss_cls=dict(
type='CrossEntropyLoss',
use_sigmoid=False,
loss_weight=1.0),
loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0))
],
mask_roi_extractor=dict(
type='SingleRoIExtractor',
roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=0),
out_channels=256,
featmap_strides=[4, 8, 16, 32]),
mask_head=[
dict(
type='HTCMaskHead',
with_conv_res=False,
num_convs=4,
in_channels=256,
conv_out_channels=256,
num_classes=10,
loss_mask=dict(
type='CrossEntropyLoss', use_mask=True, loss_weight=1.0)),
dict(
type='HTCMaskHead',
num_convs=4,
in_channels=256,
conv_out_channels=256,
num_classes=10,
loss_mask=dict(
type='CrossEntropyLoss', use_mask=True, loss_weight=1.0)),
dict(
type='HTCMaskHead',
num_convs=4,
in_channels=256,
conv_out_channels=256,
num_classes=10,
loss_mask=dict(
type='CrossEntropyLoss', use_mask=True, loss_weight=1.0))
]),
# model training and testing settings
train_cfg=dict(
rpn=dict(
assigner=dict(
type='MaxIoUAssigner',
pos_iou_thr=0.7,
neg_iou_thr=0.3,
min_pos_iou=0.3,
ignore_iof_thr=-1),
sampler=dict(
type='RandomSampler',
num=256,
pos_fraction=0.5,
neg_pos_ub=-1,
add_gt_as_proposals=False),
allowed_border=0,
pos_weight=-1,
debug=False),
rpn_proposal=dict(
nms_across_levels=False,
nms_pre=2000,
nms_post=2000,
max_num=2000,
nms_thr=0.7,
min_bbox_size=0),
rcnn=[
dict(
assigner=dict(
type='MaxIoUAssigner',
pos_iou_thr=0.5,
neg_iou_thr=0.5,
min_pos_iou=0.5,
ignore_iof_thr=-1),
sampler=dict(
type='RandomSampler',
num=512,
pos_fraction=0.25,
neg_pos_ub=-1,
add_gt_as_proposals=True),
mask_size=28,
pos_weight=-1,
debug=False),
dict(
assigner=dict(
type='MaxIoUAssigner',
pos_iou_thr=0.6,
neg_iou_thr=0.6,
min_pos_iou=0.6,
ignore_iof_thr=-1),
sampler=dict(
type='RandomSampler',
num=512,
pos_fraction=0.25,
neg_pos_ub=-1,
add_gt_as_proposals=True),
mask_size=28,
pos_weight=-1,
debug=False),
dict(
assigner=dict(
type='MaxIoUAssigner',
pos_iou_thr=0.7,
neg_iou_thr=0.7,
min_pos_iou=0.7,
ignore_iof_thr=-1),
sampler=dict(
type='RandomSampler',
num=512,
pos_fraction=0.25,
neg_pos_ub=-1,
add_gt_as_proposals=True),
mask_size=28,
pos_weight=-1,
debug=False)
]),
test_cfg=dict(
rpn=dict(
nms_across_levels=False,
nms_pre=1000,
nms_post=1000,
max_num=1000,
nms_thr=0.7,
min_bbox_size=0),
rcnn=dict(
score_thr=0.001,
nms=dict(type='nms', iou_threshold=0.5),
max_per_img=100,
mask_thr_binary=0.5)))
================================================
FILE: configs/nuimages/htc_x101_64x4d_fpn_dconv_c3-c5_coco-20e_16x1_20e_nuim.py
================================================
_base_ = './htc_r50_fpn_1x_nuim.py'
model = dict(
pretrained='open-mmlab://resnext101_64x4d',
backbone=dict(
type='ResNeXt',
depth=101,
groups=64,
base_width=4,
num_stages=4,
out_indices=(0, 1, 2, 3),
frozen_stages=1,
norm_cfg=dict(type='BN', requires_grad=True),
norm_eval=True,
style='pytorch',
dcn=dict(type='DCN', deform_groups=1, fallback_on_stride=False),
stage_with_dcn=(False, True, True, True)))
data = dict(samples_per_gpu=1, workers_per_gpu=1)
# learning policy
lr_config = dict(step=[16, 19])
total_epochs = 20
load_from = 'http://download.openmmlab.com/mmdetection/v2.0/htc/htc_x101_64x4d_fpn_dconv_c3-c5_mstrain_400_1400_16x1_20e_coco/htc_x101_64x4d_fpn_dconv_c3-c5_mstrain_400_1400_16x1_20e_coco_20200312-946fd751.pth' # noqa
================================================
FILE: configs/nuimages/mask_rcnn_r101_fpn_1x_nuim.py
================================================
_base_ = './mask_rcnn_r50_fpn_1x_nuim.py'
model = dict(pretrained='torchvision://resnet101', backbone=dict(depth=101))
================================================
FILE: configs/nuimages/mask_rcnn_r50_caffe_fpn_1x_nuim.py
================================================
_base_ = [
'../_base_/models/mask_rcnn_r50_fpn.py',
'../_base_/datasets/nuim_instance.py',
'../_base_/schedules/mmdet_schedule_1x.py', '../_base_/default_runtime.py'
]
model = dict(
pretrained='open-mmlab://detectron2/resnet50_caffe',
backbone=dict(norm_cfg=dict(requires_grad=False), style='caffe'),
roi_head=dict(
bbox_head=dict(num_classes=10), mask_head=dict(num_classes=10)))
# use caffe img_norm
img_norm_cfg = dict(
mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False)
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
dict(
type='Resize',
img_scale=[(1280, 720), (1920, 1080)],
multiscale_mode='range',
keep_ratio=True),
dict(type='RandomFlip', flip_ratio=0.5),
dict(type='Normalize', **img_norm_cfg),
dict(type='Pad', size_divisor=32),
dict(type='DefaultFormatBundle'),
dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']),
]
test_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='MultiScaleFlipAug',
img_scale=(1600, 900),
flip=False,
transforms=[
dict(type='Resize', keep_ratio=True),
dict(type='RandomFlip'),
dict(type='Normalize', **img_norm_cfg),
dict(type='Pad', size_divisor=32),
dict(type='ImageToTensor', keys=['img']),
dict(type='Collect', keys=['img']),
])
]
data = dict(
train=dict(pipeline=train_pipeline),
val=dict(pipeline=test_pipeline),
test=dict(pipeline=test_pipeline))
================================================
FILE: configs/nuimages/mask_rcnn_r50_caffe_fpn_coco-3x_1x_nuim.py
================================================
_base_ = [
'../_base_/models/mask_rcnn_r50_fpn.py',
'../_base_/datasets/nuim_instance.py',
'../_base_/schedules/mmdet_schedule_1x.py', '../_base_/default_runtime.py'
]
model = dict(
pretrained='open-mmlab://detectron2/resnet50_caffe',
backbone=dict(norm_cfg=dict(requires_grad=False), style='caffe'),
roi_head=dict(
bbox_head=dict(num_classes=10), mask_head=dict(num_classes=10)))
# use caffe img_norm
img_norm_cfg = dict(
mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False)
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
dict(
type='Resize',
img_scale=[(1280, 720), (1920, 1080)],
multiscale_mode='range',
keep_ratio=True),
dict(type='RandomFlip', flip_ratio=0.5),
dict(type='Normalize', **img_norm_cfg),
dict(type='Pad', size_divisor=32),
dict(type='DefaultFormatBundle'),
dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']),
]
test_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='MultiScaleFlipAug',
img_scale=(1600, 900),
flip=False,
transforms=[
dict(type='Resize', keep_ratio=True),
dict(type='RandomFlip'),
dict(type='Normalize', **img_norm_cfg),
dict(type='Pad', size_divisor=32),
dict(type='ImageToTensor', keys=['img']),
dict(type='Collect', keys=['img']),
])
]
data = dict(
train=dict(pipeline=train_pipeline),
val=dict(pipeline=test_pipeline),
test=dict(pipeline=test_pipeline))
load_from = 'https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r50_caffe_fpn_mstrain-poly_3x_coco/mask_rcnn_r50_caffe_fpn_mstrain-poly_3x_coco_bbox_mAP-0.408__segm_mAP-0.37_20200504_163245-42aa3d00.pth' # noqa
================================================
FILE: configs/nuimages/mask_rcnn_r50_caffe_fpn_coco-3x_20e_nuim.py
================================================
_base_ = [
'../_base_/models/mask_rcnn_r50_fpn.py',
'../_base_/datasets/nuim_instance.py',
'../_base_/schedules/mmdet_schedule_1x.py', '../_base_/default_runtime.py'
]
model = dict(
pretrained='open-mmlab://detectron2/resnet50_caffe',
backbone=dict(norm_cfg=dict(requires_grad=False), style='caffe'),
roi_head=dict(
bbox_head=dict(num_classes=10), mask_head=dict(num_classes=10)))
# use caffe img_norm
img_norm_cfg = dict(
mean=[103.530, 116.280, 123.675], std=[1.0, 1.0, 1.0], to_rgb=False)
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
dict(
type='Resize',
img_scale=[(1280, 720), (1920, 1080)],
multiscale_mode='range',
keep_ratio=True),
dict(type='RandomFlip', flip_ratio=0.5),
dict(type='Normalize', **img_norm_cfg),
dict(type='Pad', size_divisor=32),
dict(type='DefaultFormatBundle'),
dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']),
]
test_pipeline = [
dict(type='LoadImageFromFile'),
dict(
type='MultiScaleFlipAug',
img_scale=(1600, 900),
flip=False,
transforms=[
dict(type='Resize', keep_ratio=True),
dict(type='RandomFlip'),
dict(type='Normalize', **img_norm_cfg),
dict(type='Pad', size_divisor=32),
dict(type='ImageToTensor', keys=['img']),
dict(type='Collect', keys=['img']),
])
]
data = dict(
train=dict(pipeline=train_pipeline),
val=dict(pipeline=test_pipeline),
test=dict(pipeline=test_pipeline))
# learning policy
lr_config = dict(step=[16, 19])
total_epochs = 20
load_from = 'http://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r50_caffe_fpn_mstrain-poly_3x_coco/mask_rcnn_r50_caffe_fpn_mstrain-poly_3x_coco_bbox_mAP-0.408__segm_mAP-0.37_20200504_163245-42aa3d00.pth' # noqa
================================================
FILE: configs/nuimages/mask_rcnn_r50_fpn_1x_nuim.py
================================================
_base_ = [
'../_base_/models/mask_rcnn_r50_fpn.py',
'../_base_/datasets/nuim_instance.py',
'../_base_/schedules/mmdet_schedule_1x.py', '../_base_/default_runtime.py'
]
model = dict(
roi_head=dict(
bbox_head=dict(num_classes=10), mask_head=dict(num_classes=10)))
================================================
FILE: configs/nuimages/mask_rcnn_r50_fpn_coco-2x_1x_nuim.py
================================================
_base_ = [
'../_base_/models/mask_rcnn_r50_fpn.py',
'../_base_/datasets/nuim_instance.py',
'../_base_/schedules/mmdet_schedule_1x.py', '../_base_/default_runtime.py'
]
model = dict(
roi_head=dict(
bbox_head=dict(num_classes=10), mask_head=dict(num_classes=10)))
load_from = 'https://download.openmmlab.com/mmdetection/v2.0/mask_rcnn/mask_rcnn_r50_fpn_2x_coco/mask_rcnn_r50_fpn_2x_coco_bbox_mAP-0.392__segm_mAP-0.354_20200505_003907-3e542a40.pth' # noqa
================================================
FILE: configs/nuimages/mask_rcnn_r50_fpn_coco-2x_1x_nus-2d.py
================================================
_base_ = [
'../_base_/models/mask_rcnn_r50_fpn.py',
'../_base_/datasets/nuim_instance.py',
'../_base_/schedules/mmdet_schedule_1x.py', '../_base_/default_runtime.py'
]
model = dict(
roi_head=dict(
bbox_head=dict(num_classes=10), mask_head=dict(num_classes=10)))
file_client_args = dict(
backend='petrel',
path_mapping=dict({
'./data/nuscenes/': 's3://nuscenes/nuscenes/',
'data/nuscenes/': 's3://nuscenes/nuscenes/'
}))
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
test_pipeline = [
dict(type='LoadImageFromFile', file_client_args=file_client_args),
dict(
type='MultiScaleFlipAug',
img_scale=(1600, 900),
flip=False,
transforms=[
dict(type='Resize', keep_ratio=True),
dict(type='RandomFlip'),
dict(type='Normalize', **img_norm_cfg),
dict(type='Pad', size_divisor=32),
dict(type='ImageToTensor', keys=['img']),
dict(type='Collect', keys=['img']),
])
]
data_root = 'data/nuimages/'
# data = dict(
# val=dict(
# ann_file=data_root + 'annotations/nuimages_v1.0-mini.json'),
# test=dict(
# ann_file=data_root + 'annotations/nuimages_v1.0-mini.json'))
================================================
FILE: configs/nuimages/mask_rcnn_swinT_coco-2x_1x_nuim.py
================================================
_base_ = [
'../_base_/datasets/nuim_instance.py', '../_base_/default_runtime.py'
]
model = dict(
type='MaskRCNN',
backbone=dict(
type='SwinTransformer',
embed_dims=96,
depths=[2, 2, 6, 2],
num_heads=[3, 6, 12, 24],
window_size=7,
mlp_ratio=4,
qkv_bias=True,
qk_scale=None,
drop_rate=0.,
attn_drop_rate=0.,
drop_path_rate=0.2,
patch_norm=True,
out_indices=(0, 1, 2, 3),
with_cp=False,
convert_weights=True,
),
neck=dict(
type='FPN',
in_channels=[96, 192, 384, 768],
out_channels=256,
num_outs=5),
rpn_head=dict(
type='RPNHead',
in_channels=256,
feat_channels=256,
anchor_generator=dict(
type='AnchorGenerator',
scales=[8],
ratios=[0.5, 1.0, 2.0],
strides=[4, 8, 16, 32, 64]),
bbox_coder=dict(
type='DeltaXYWHBBoxCoder',
target_means=[.0, .0, .0, .0],
target_stds=[1.0, 1.0, 1.0, 1.0]),
loss_cls=dict(
type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
loss_bbox=dict(type='L1Loss', loss_weight=1.0)),
roi_head=dict(
type='StandardRoIHead',
bbox_roi_extractor=dict(
type='SingleRoIExtractor',
roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
out_channels=256,
featmap_strides=[4, 8, 16, 32]),
bbox_head=dict(
type='Shared2FCBBoxHead',
in_channels=256,
fc_out_channels=1024,
roi_feat_size=7,
num_classes=10,
bbox_coder=dict(
type='DeltaXYWHBBoxCoder',
target_means=[0., 0., 0., 0.],
target_stds=[0.1, 0.1, 0.2, 0.2]),
reg_class_agnostic=False,
loss_cls=dict(
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
loss_bbox=dict(type='L1Loss', loss_weight=1.0)),
mask_roi_extractor=dict(
type='SingleRoIExtractor',
roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=0),
out_channels=256,
featmap_strides=[4, 8, 16, 32]),
mask_head=dict(
type='FCNMaskHead',
num_convs=4,
in_channels=256,
conv_out_channels=256,
num_classes=10,
loss_mask=dict(
type='CrossEntropyLoss', use_mask=True, loss_weight=1.0))),
# model training and testing settings
train_cfg=dict(
rpn=dict(
assigner=dict(
type='MaxIoUAssigner',
pos_iou_thr=0.7,
neg_iou_thr=0.3,
min_pos_iou=0.3,
match_low_quality=True,
ignore_iof_thr=-1),
sampler=dict(
type='RandomSampler',
num=256,
pos_fraction=0.5,
neg_pos_ub=-1,
add_gt_as_proposals=False),
allowed_border=-1,
pos_weight=-1,
debug=False),
rpn_proposal=dict(
nms_across_levels=False,
nms_pre=2000,
nms_post=1000,
max_num=1000,
nms_thr=0.7,
min_bbox_size=0),
rcnn=dict(
assigner=dict(
type='MaxIoUAssigner',
pos_iou_thr=0.5,
neg_iou_thr=0.5,
min_pos_iou=0.5,
match_low_quality=True,
ignore_iof_thr=-1),
sampler=dict(
type='RandomSampler',
num=512,
pos_fraction=0.25,
neg_pos_ub=-1,
add_gt_as_proposals=True),
mask_size=28,
pos_weight=-1,
debug=False)),
test_cfg=dict(
rpn=dict(
nms_across_levels=False,
nms_pre=1000,
nms_post=1000,
max_num=1000,
nms_thr=0.7,
min_bbox_size=0),
rcnn=dict(
score_thr=0.05,
nms=dict(type='nms', iou_threshold=0.5),
max_per_img=100,
mask_thr_binary=0.5)))
load_from = '/data/yc_code/ImplicitFusion/checkpoints/mask_rcnn_swin-t-p4-w7_fpn_ms-crop-3x_coco_20210906_131725-bacf6f7b.pth' # noqa
data = dict(
samples_per_gpu=2,
workers_per_gpu=4
)
# optimizer
optimizer = dict(
type='AdamW',
lr=0.000025,
betas=(0.9, 0.999),
weight_decay=0.05,
paramwise_cfg=dict(
custom_keys={
'absolute_pos_embed': dict(decay_mult=0.),
'relative_position_bias_table': dict(decay_mult=0.),
'norm': dict(decay_mult=0.)
}))
lr_config = dict(policy='step', warmup='linear', warmup_iters=1000, warmup_ratio=0.001, step=[8, 11])
optimizer_config = dict(grad_clip=None)
runner = dict(type='EpochBasedRunner', max_epochs=12)
================================================
FILE: configs/nuimages/mask_rcnn_x101_32x4d_fpn_1x_nuim.py
================================================
_base_ = './mask_rcnn_r50_fpn_1x_nuim.py'
model = dict(
pretrained='open-mmlab://resnext101_32x4d',
backbone=dict(
type='ResNeXt',
depth=101,
groups=32,
base_width=4,
num_stages=4,
out_indices=(0, 1, 2, 3),
frozen_stages=1,
norm_cfg=dict(type='BN', requires_grad=True),
style='pytorch'))
================================================
FILE: configs/nuscenes.md
================================================
# MODEL ZOO
## Common settings and notes
- The experiments are run with PyTorch 1.7.0, CUDA 10.1 and CUDNN 7.6
- The training is conducted on 8 Tesla V100 GPUs
- For the *fade strategy* proposed by PointAugmenting (disabling the copy-and-paste augmentation for the last 5 epochs), we currently implement it by manually stopping training at epoch 15 and resuming training without the copy-and-paste augmentation (see the sketch below). If you find a more elegant way to implement this strategy, please let us know; we would really appreciate it. The fade strategy removes many false positives and improves mAP remarkably, especially for TransFusion-L, while having less influence on TransFusion.
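A minimal sketch of one way to set up the resumed, augmentation-free stage (the `_base_` file name, the checkpoint path and the exact pipeline steps are assumptions; mirror the ones in your own base config):
```python
# Fade stage: resume from the epoch-15 checkpoint and drop the copy-and-paste
# (ObjectSample / GT-sampling) step from the training pipeline.
_base_ = './transfusion_nusc_voxel_L.py'  # assumed base config name

class_names = [
    'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
    'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
]
train_pipeline = [
    dict(type='LoadPointsFromFile', coord_type='LIDAR', load_dim=5, use_dim=5),
    dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True),
    # dict(type='ObjectSample', db_sampler=db_sampler),  # <- removed for fading
    dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5),
    dict(
        type='GlobalRotScaleTrans',
        rot_range=[-0.78539816, 0.78539816],
        scale_ratio_range=[0.95, 1.05]),
    dict(type='PointShuffle'),
    dict(type='DefaultFormatBundle3D', class_names=class_names),
    dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
]
# If the base config wraps the training set (e.g. CBGSDataset), use
# train=dict(dataset=dict(pipeline=train_pipeline)) instead.
data = dict(train=dict(pipeline=train_pipeline))
resume_from = 'work_dirs/transfusion_nusc_voxel_L/epoch_15.pth'  # assumed path
```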
## Pretrained 2D Backbones
- DLA34: Following PointAugmenting, we directly reuse the checkpoints pretrained on the monocular 3D detection task provided by [CenterNet](https://github.com/xingyizhou/CenterTrack/blob/master/readme/MODEL_ZOO.md#monocular-3d-detection-tracking).
- ResNet50 on instance segmentation: We acquire the model pretrained on nuImages from [MMDetection3D](https://github.com/open-mmlab/mmdetection3d/blob/v0.12.0/configs/nuimages/README.md).
- ResNet50 on 2D detection: We train a model using the [config](https://github.com/open-mmlab/mmdetection3d/blob/v0.12.0/configs/nuimages/mask_rcnn_r50_fpn_1x_nuim.py) of instance segmentation but remove the mask head (see the sketch below).
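A minimal sketch of such a box-only variant (not a config shipped in this repo; whether `MaskRCNN` tolerates a missing mask head or the type must be switched to `FasterRCNN` depends on the mmdet version):
```python
# Start from the nuImages Mask R-CNN instance-segmentation config and drop
# the mask branch so that only 2D boxes are trained.
_base_ = './nuimages/mask_rcnn_r50_fpn_1x_nuim.py'  # assumed relative path
model = dict(
    # type='FasterRCNN',  # possibly needed, see note above
    roi_head=dict(
        mask_roi_extractor=None,  # remove mask RoI feature extraction
        mask_head=None))          # remove the mask prediction head
```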
## nuScenes 3D Detection
All the LiDAR-only models are trained for 20 epochs; the fusion-based models are further trained for 6 epochs from the pretrained LiDAR backbone. We freeze the weights of the LiDAR backbone to save GPU memory (a sketch of the freezing is shown below).
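A minimal sketch of how the LiDAR branch could be frozen; the `pts_*` module prefixes follow the usual mmdetection3d naming for multi-modal detectors and are assumptions with respect to this codebase:
```python
import torch.nn as nn

def freeze_lidar_branch(model: nn.Module,
                        prefixes=('pts_voxel_encoder', 'pts_middle_encoder',
                                  'pts_backbone', 'pts_neck')):
    """Stop gradient updates for the LiDAR branch and fix its BN statistics.

    Re-apply this after every model.train() call (e.g. from a training hook),
    since model.train() switches the frozen BN layers back to training mode.
    """
    for name, param in model.named_parameters():
        if name.startswith(prefixes):
            param.requires_grad = False
    for name, module in model.named_modules():
        if name.startswith(prefixes) and isinstance(
                module, nn.modules.batchnorm._BatchNorm):
            module.eval()
```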
| Model | Backbone | mAP | NDS |
|---------|--------|--------|---------|
| [TransFusion-L](configs/transfusion_nusc_pillar_L.py) | PointPillars | 54.51 | 62.66 |
| [TransFusion](configs/transfusion_nusc_pillar_LC.py) | PointPillars | 60.21 | 65.50 |
| [TransFusion-L](configs/transfusion_nusc_voxel_L.py) | VoxelNet | 65.06 | 70.10 |
| [TransFusion](configs/transfusion_nusc_voxel_LC.py) | VoxelNet | 67.49 | 71.28 |
## nuScenes 3D Tracking
We perform tracking-by-detection using the same tracking algorithm as proposed by CenterPoint.
| Model | Backbone | AMOTA | AMOTP |
|---------|--------|--------|---------|
| [TransFusion-L](configs/transfusion_nusc_voxel_L.py) | VoxelNet | 0.703 | 0.553 |
| [TransFusion](configs/transfusion_nusc_voxel_LC.py) | VoxelNet | 0.725 | 0.561 |
## nuScenes Leaderboard
### Detection
We use 300 object queries during inference for the online submission, which gives slightly better performance (see the override sketch below). We do not use any test-time augmentation or model ensembling.
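A minimal sketch of the inference-time override (the `pts_bbox_head.num_proposals` field name and the `_base_` file name are assumptions about the TransFusion config):
```python
# Keep the trained weights but decode 300 object queries instead of the
# training default for the test-server submission.
_base_ = './transfusion_nusc_voxel_LC.py'  # assumed config name, see table above
model = dict(pts_bbox_head=dict(num_proposals=300))
```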
| Model | Backbone | Test mAP | Test NDS | Link |
|---------|--------|--------|---------|---------|
| TransFusion-L | VoxelNet | 65.52 | 70.23 | [Detection](https://drive.google.com/file/d/1Wk8p2LJEhwfKfhsKzlU9vDBOd0zn38dN/view?usp=sharing)
| TransFusion | VoxelNet | 68.90 | 71.68 | [Detection](https://drive.google.com/file/d/1X7_ig4v5A2vKsiHtUGtgeMN-0RJKsM6W/view?usp=sharing)
### Tracking
| Model | Backbone | Test AMOTA | Test AMOTP | Link |
|---------|--------|--------|---------|---------|
| TransFusion-L | VoxelNet | 0.686 | 0.529 | [Detection](https://drive.google.com/file/d/1Wk8p2LJEhwfKfhsKzlU9vDBOd0zn38dN/view?usp=sharing) / [Tracking](https://drive.google.com/file/d/1pKvRBUsM9h1Xgturd0Ae_bnGt0m_j3hk/view?usp=sharing)|
| TransFusion | VoxelNet | 0.718 | 0.551 | [Detection](https://drive.google.com/file/d/1X7_ig4v5A2vKsiHtUGtgeMN-0RJKsM6W/view?usp=sharing) / [Tracking](https://drive.google.com/file/d/1EVuS-MAg_HSXUVqMrXEs4-RpZp0p5cfv/view?usp=sharing)|
================================================
FILE: configs/parta2/README.md
================================================
# From Points to Parts: 3D Object Detection from Point Cloud with Part-aware and Part-aggregation Network
## Introduction
[ALGORITHM]
We implement Part-A^2 and provide its results and checkpoints on the KITTI dataset.
```
@article{shi2020points,
title={From points to parts: 3d object detection from point cloud with part-aware and part-aggregation network},
author={Shi, Shaoshuai and Wang, Zhe and Shi, Jianping and Wang, Xiaogang and Li, Hongsheng},
journal={IEEE Transactions on Pattern Analysis and Machine Intelligence},
year={2020},
publisher={IEEE}
}
```
## Results
### KITTI
| Backbone |Class| Lr schd | Mem (GB) | Inf time (fps) | mAP | Download |
| :---------: | :-----: |:-----: | :------: | :------------: | :----: |:----: |
| [SECFPN](./hv_PartA2_secfpn_2x8_cyclic_80e_kitti-3d-3class.py) |3 Class|cyclic 80e|4.1||67.9|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/parta2/hv_PartA2_secfpn_2x8_cyclic_80e_kitti-3d-3class/hv_PartA2_secfpn_2x8_cyclic_80e_kitti-3d-3class_20200620_230724-a2672098.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/parta2/hv_PartA2_secfpn_2x8_cyclic_80e_kitti-3d-3class/hv_PartA2_secfpn_2x8_cyclic_80e_kitti-3d-3class_20200620_230724.log.json)|
| [SECFPN](./hv_PartA2_secfpn_2x8_cyclic_80e_kitti-3d-car.py) |Car |cyclic 80e|4.0||79.16|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/parta2/hv_PartA2_secfpn_2x8_cyclic_80e_kitti-3d-car/hv_PartA2_secfpn_2x8_cyclic_80e_kitti-3d-car_20200620_230755-f2a38b9a.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/parta2/hv_PartA2_secfpn_2x8_cyclic_80e_kitti-3d-car/hv_PartA2_secfpn_2x8_cyclic_80e_kitti-3d-car_20200620_230755.log.json)|
================================================
FILE: configs/parta2/hv_PartA2_secfpn_2x8_cyclic_80e_kitti-3d-3class.py
================================================
_base_ = ['../_base_/schedules/cyclic_40e.py', '../_base_/default_runtime.py']
# model settings
voxel_size = [0.05, 0.05, 0.1]
point_cloud_range = [0, -40, -3, 70.4, 40, 1]
model = dict(
type='PartA2',
voxel_layer=dict(
max_num_points=5,
point_cloud_range=point_cloud_range,
voxel_size=voxel_size,
max_voxels=(16000, 40000)),
voxel_encoder=dict(type='HardSimpleVFE'),
middle_encoder=dict(
type='SparseUNet',
in_channels=4,
sparse_shape=[41, 1600, 1408],
order=('conv', 'norm', 'act')),
backbone=dict(
type='SECOND',
in_channels=256,
layer_nums=[5, 5],
layer_strides=[1, 2],
out_channels=[128, 256]),
neck=dict(
type='SECONDFPN',
in_channels=[128, 256],
upsample_strides=[1, 2],
out_channels=[256, 256]),
rpn_head=dict(
type='PartA2RPNHead',
num_classes=3,
in_channels=512,
feat_channels=512,
use_direction_classifier=True,
anchor_generator=dict(
type='Anchor3DRangeGenerator',
ranges=[[0, -40.0, -0.6, 70.4, 40.0, -0.6],
[0, -40.0, -0.6, 70.4, 40.0, -0.6],
[0, -40.0, -1.78, 70.4, 40.0, -1.78]],
sizes=[[0.6, 0.8, 1.73], [0.6, 1.76, 1.73], [1.6, 3.9, 1.56]],
rotations=[0, 1.57],
reshape_out=False),
diff_rad_by_sin=True,
assigner_per_size=True,
assign_per_class=True,
bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder'),
loss_cls=dict(
type='FocalLoss',
use_sigmoid=True,
gamma=2.0,
alpha=0.25,
loss_weight=1.0),
loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=2.0),
loss_dir=dict(
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.2)),
roi_head=dict(
type='PartAggregationROIHead',
num_classes=3,
semantic_head=dict(
type='PointwiseSemanticHead',
in_channels=16,
extra_width=0.2,
seg_score_thr=0.3,
num_classes=3,
loss_seg=dict(
type='FocalLoss',
use_sigmoid=True,
reduction='sum',
gamma=2.0,
alpha=0.25,
loss_weight=1.0),
loss_part=dict(
type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0)),
seg_roi_extractor=dict(
type='Single3DRoIAwareExtractor',
roi_layer=dict(
type='RoIAwarePool3d',
out_size=14,
max_pts_per_voxel=128,
mode='max')),
part_roi_extractor=dict(
type='Single3DRoIAwareExtractor',
roi_layer=dict(
type='RoIAwarePool3d',
out_size=14,
max_pts_per_voxel=128,
mode='avg')),
bbox_head=dict(
type='PartA2BboxHead',
num_classes=3,
seg_in_channels=16,
part_in_channels=4,
seg_conv_channels=[64, 64],
part_conv_channels=[64, 64],
merge_conv_channels=[128, 128],
down_conv_channels=[128, 256],
bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder'),
shared_fc_channels=[256, 512, 512, 512],
cls_channels=[256, 256],
reg_channels=[256, 256],
dropout_ratio=0.1,
roi_feat_size=14,
with_corner_loss=True,
loss_bbox=dict(
type='SmoothL1Loss',
beta=1.0 / 9.0,
reduction='sum',
loss_weight=1.0),
loss_cls=dict(
type='CrossEntropyLoss',
use_sigmoid=True,
reduction='sum',
loss_weight=1.0))),
# model training and testing settings
train_cfg=dict(
rpn=dict(
assigner=[
dict( # for Pedestrian
type='MaxIoUAssigner',
iou_calculator=dict(type='BboxOverlapsNearest3D'),
pos_iou_thr=0.5,
neg_iou_thr=0.35,
min_pos_iou=0.35,
ignore_iof_thr=-1),
dict( # for Cyclist
type='MaxIoUAssigner',
iou_calculator=dict(type='BboxOverlapsNearest3D'),
pos_iou_thr=0.5,
neg_iou_thr=0.35,
min_pos_iou=0.35,
ignore_iof_thr=-1),
dict( # for Car
type='MaxIoUAssigner',
iou_calculator=dict(type='BboxOverlapsNearest3D'),
pos_iou_thr=0.6,
neg_iou_thr=0.45,
min_pos_iou=0.45,
ignore_iof_thr=-1)
],
allowed_border=0,
pos_weight=-1,
debug=False),
rpn_proposal=dict(
nms_pre=9000,
nms_post=512,
max_num=512,
nms_thr=0.8,
score_thr=0,
use_rotate_nms=False),
rcnn=dict(
assigner=[
dict( # for Pedestrian
type='MaxIoUAssigner',
iou_calculator=dict(
type='BboxOverlaps3D', coordinate='lidar'),
pos_iou_thr=0.55,
neg_iou_thr=0.55,
min_pos_iou=0.55,
ignore_iof_thr=-1),
dict( # for Cyclist
type='MaxIoUAssigner',
iou_calculator=dict(
type='BboxOverlaps3D', coordinate='lidar'),
pos_iou_thr=0.55,
neg_iou_thr=0.55,
min_pos_iou=0.55,
ignore_iof_thr=-1),
dict( # for Car
type='MaxIoUAssigner',
iou_calculator=dict(
type='BboxOverlaps3D', coordinate='lidar'),
pos_iou_thr=0.55,
neg_iou_thr=0.55,
min_pos_iou=0.55,
ignore_iof_thr=-1)
],
sampler=dict(
type='IoUNegPiecewiseSampler',
num=128,
pos_fraction=0.55,
neg_piece_fractions=[0.8, 0.2],
neg_iou_piece_thrs=[0.55, 0.1],
neg_pos_ub=-1,
add_gt_as_proposals=False,
return_iou=True),
cls_pos_thr=0.75,
cls_neg_thr=0.25)),
test_cfg=dict(
rpn=dict(
nms_pre=1024,
nms_post=100,
max_num=100,
nms_thr=0.7,
score_thr=0,
use_rotate_nms=True),
rcnn=dict(
use_rotate_nms=True,
use_raw_score=True,
nms_thr=0.01,
score_thr=0.1)))
# dataset settings
dataset_type = 'KittiDataset'
data_root = 'data/kitti/'
class_names = ['Pedestrian', 'Cyclist', 'Car']
input_modality = dict(use_lidar=True, use_camera=False)
db_sampler = dict(
data_root=data_root,
info_path=data_root + 'kitti_dbinfos_train.pkl',
rate=1.0,
prepare=dict(
filter_by_difficulty=[-1],
filter_by_min_points=dict(Car=5, Pedestrian=10, Cyclist=10)),
classes=class_names,
sample_groups=dict(Car=12, Pedestrian=6, Cyclist=6))
train_pipeline = [
dict(type='LoadPointsFromFile', coord_type='LIDAR', load_dim=4, use_dim=4),
dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True),
dict(type='ObjectSample', db_sampler=db_sampler),
dict(
type='ObjectNoise',
num_try=100,
translation_std=[1.0, 1.0, 0.5],
global_rot_range=[0.0, 0.0],
rot_range=[-0.78539816, 0.78539816]),
dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5),
dict(
type='GlobalRotScaleTrans',
rot_range=[-0.78539816, 0.78539816],
scale_ratio_range=[0.95, 1.05]),
dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
dict(type='ObjectNameFilter', classes=class_names),
dict(type='PointShuffle'),
dict(type='DefaultFormatBundle3D', class_names=class_names),
dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
]
test_pipeline = [
dict(type='LoadPointsFromFile', coord_type='LIDAR', load_dim=4, use_dim=4),
dict(
type='MultiScaleFlipAug3D',
img_scale=(1333, 800),
pts_scale_ratio=1,
flip=False,
transforms=[
dict(
type='GlobalRotScaleTrans',
rot_range=[0, 0],
scale_ratio_range=[1., 1.],
translation_std=[0, 0, 0]),
dict(type='RandomFlip3D'),
dict(
type='PointsRangeFilter', point_cloud_range=point_cloud_range),
dict(
type='DefaultFormatBundle3D',
class_names=class_names,
with_label=False),
dict(type='Collect3D', keys=['points'])
])
]
data = dict(
samples_per_gpu=2,
workers_per_gpu=2,
train=dict(
type='RepeatDataset',
times=2,
dataset=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + 'kitti_infos_train.pkl',
split='training',
pts_prefix='velodyne_reduced',
pipeline=train_pipeline,
modality=input_modality,
classes=class_names,
test_mode=False)),
val=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + 'kitti_infos_val.pkl',
split='training',
pts_prefix='velodyne_reduced',
pipeline=test_pipeline,
modality=input_modality,
classes=class_names,
test_mode=True),
test=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + 'kitti_infos_val.pkl',
split='training',
pts_prefix='velodyne_reduced',
pipeline=test_pipeline,
modality=input_modality,
classes=class_names,
test_mode=True))
# Part-A2 uses a different learning rate from what SECOND uses.
lr = 0.001
optimizer = dict(lr=lr)
find_unused_parameters = True
================================================
FILE: configs/parta2/hv_PartA2_secfpn_2x8_cyclic_80e_kitti-3d-car.py
================================================
_base_ = './hv_PartA2_secfpn_2x8_cyclic_80e_kitti-3d-3class.py'
voxel_size = [0.05, 0.05, 0.1]
point_cloud_range = [0, -40, -3, 70.4, 40, 1] # velodyne coordinates, x, y, z
model = dict(
rpn_head=dict(
type='PartA2RPNHead',
num_classes=1,
anchor_generator=dict(
_delete_=True,
type='Anchor3DRangeGenerator',
ranges=[[0, -40.0, -1.78, 70.4, 40.0, -1.78]],
sizes=[[1.6, 3.9, 1.56]],
rotations=[0, 1.57],
reshape_out=False)),
roi_head=dict(
num_classes=1,
semantic_head=dict(num_classes=1),
bbox_head=dict(num_classes=1)),
# model training and testing settings
train_cfg=dict(
_delete_=True,
rpn=dict(
assigner=dict(
type='MaxIoUAssigner',
iou_calculator=dict(type='BboxOverlapsNearest3D'),
pos_iou_thr=0.6,
neg_iou_thr=0.45,
min_pos_iou=0.45,
ignore_iof_thr=-1),
allowed_border=0,
pos_weight=-1,
debug=False),
rpn_proposal=dict(
nms_pre=9000,
nms_post=512,
max_num=512,
nms_thr=0.8,
score_thr=0,
use_rotate_nms=False),
rcnn=dict(
assigner=dict( # for Car
type='MaxIoUAssigner',
iou_calculator=dict(type='BboxOverlaps3D', coordinate='lidar'),
pos_iou_thr=0.55,
neg_iou_thr=0.55,
min_pos_iou=0.55,
ignore_iof_thr=-1),
sampler=dict(
type='IoUNegPiecewiseSampler',
num=128,
pos_fraction=0.55,
neg_piece_fractions=[0.8, 0.2],
neg_iou_piece_thrs=[0.55, 0.1],
neg_pos_ub=-1,
add_gt_as_proposals=False,
return_iou=True),
cls_pos_thr=0.75,
cls_neg_thr=0.25)),
test_cfg=dict(
rpn=dict(
nms_pre=1024,
nms_post=100,
max_num=100,
nms_thr=0.7,
score_thr=0,
use_rotate_nms=True),
rcnn=dict(
use_rotate_nms=True,
use_raw_score=True,
nms_thr=0.01,
score_thr=0.1)))
# dataset settings
dataset_type = 'KittiDataset'
data_root = 'data/kitti/'
class_names = ['Car']
input_modality = dict(use_lidar=True, use_camera=False)
db_sampler = dict(
data_root=data_root,
info_path=data_root + 'kitti_dbinfos_train.pkl',
rate=1.0,
prepare=dict(filter_by_difficulty=[-1], filter_by_min_points=dict(Car=5)),
classes=class_names,
sample_groups=dict(Car=15))
train_pipeline = [
dict(type='LoadPointsFromFile', coord_type='LIDAR', load_dim=4, use_dim=4),
dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True),
dict(type='ObjectSample', db_sampler=db_sampler),
dict(
type='ObjectNoise',
num_try=100,
translation_std=[1.0, 1.0, 0.5],
global_rot_range=[0.0, 0.0],
rot_range=[-0.78539816, 0.78539816]),
dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5),
dict(
type='GlobalRotScaleTrans',
rot_range=[-0.78539816, 0.78539816],
scale_ratio_range=[0.95, 1.05]),
dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
dict(type='ObjectNameFilter', classes=class_names),
dict(type='PointShuffle'),
dict(type='DefaultFormatBundle3D', class_names=class_names),
dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
]
test_pipeline = [
dict(type='LoadPointsFromFile', coord_type='LIDAR', load_dim=4, use_dim=4),
dict(
type='MultiScaleFlipAug3D',
img_scale=(1333, 800),
pts_scale_ratio=1,
flip=False,
transforms=[
dict(
type='GlobalRotScaleTrans',
rot_range=[0, 0],
scale_ratio_range=[1., 1.],
translation_std=[0, 0, 0]),
dict(type='RandomFlip3D'),
dict(
type='PointsRangeFilter', point_cloud_range=point_cloud_range),
dict(
type='DefaultFormatBundle3D',
class_names=class_names,
with_label=False),
dict(type='Collect3D', keys=['points'])
])
]
data = dict(
train=dict(dataset=dict(pipeline=train_pipeline, classes=class_names)),
val=dict(pipeline=test_pipeline, classes=class_names),
test=dict(pipeline=test_pipeline, classes=class_names))
find_unused_parameters = True
================================================
FILE: configs/pointpillars/README.md
================================================
# PointPillars: Fast Encoders for Object Detection from Point Clouds
## Introduction
[ALGORITHM]
We implement PointPillars and provide the results and checkpoints on the KITTI, nuScenes, Lyft and Waymo datasets.
```
@inproceedings{lang2019pointpillars,
title={Pointpillars: Fast encoders for object detection from point clouds},
author={Lang, Alex H and Vora, Sourabh and Caesar, Holger and Zhou, Lubing and Yang, Jiong and Beijbom, Oscar},
booktitle={Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition},
pages={12697--12705},
year={2019}
}
```
## Results
### KITTI
| Backbone|Class | Lr schd | Mem (GB) | Inf time (fps) | AP |Download |
| :---------: | :-----: |:-----: | :------: | :------------: | :----: | :------: |
| [SECFPN](./hv_pointpillars_secfpn_6x8_160e_kitti-3d-car.py)|Car|cyclic 160e|5.4||77.1|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/pointpillars/hv_pointpillars_secfpn_6x8_160e_kitti-3d-car/hv_pointpillars_secfpn_6x8_160e_kitti-3d-car_20200620_230614-77663cd6.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/pointpillars/hv_pointpillars_secfpn_6x8_160e_kitti-3d-car/hv_pointpillars_secfpn_6x8_160e_kitti-3d-car_20200620_230614.log.json)|
| [SECFPN](./hv_pointpillars_secfpn_6x8_160e_kitti-3d-3class.py)|3 Class|cyclic 160e|5.5||59.5|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/pointpillars/hv_pointpillars_secfpn_6x8_160e_kitti-3d-3class/hv_pointpillars_secfpn_6x8_160e_kitti-3d-3class_20200620_230421-aa0f3adb.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/pointpillars/hv_pointpillars_secfpn_6x8_160e_kitti-3d-3class/hv_pointpillars_secfpn_6x8_160e_kitti-3d-3class_20200620_230421.log.json)|
### nuScenes
| Backbone | Lr schd | Mem (GB) | Inf time (fps) | mAP |NDS| Download |
| :---------: | :-----: | :------: | :------------: | :----: |:----: | :------: |
|[SECFPN](./hv_pointpillars_secfpn_sbn-all_4x8_2x_nus-3d.py)|2x|16.4||35.17|49.7|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/pointpillars/hv_pointpillars_secfpn_sbn-all_4x8_2x_nus-3d/hv_pointpillars_secfpn_sbn-all_4x8_2x_nus-3d_20200620_230725-0817d270.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/pointpillars/hv_pointpillars_secfpn_sbn-all_4x8_2x_nus-3d/hv_pointpillars_secfpn_sbn-all_4x8_2x_nus-3d_20200620_230725.log.json)|
|[FPN](./hv_pointpillars_fpn_sbn-all_4x8_2x_nus-3d.py)|2x|16.4||40.0|53.3|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/pointpillars/hv_pointpillars_fpn_sbn-all_4x8_2x_nus-3d/hv_pointpillars_fpn_sbn-all_4x8_2x_nus-3d_20200620_230405-2fa62f3d.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/pointpillars/hv_pointpillars_fpn_sbn-all_4x8_2x_nus-3d/hv_pointpillars_fpn_sbn-all_4x8_2x_nus-3d_20200620_230405.log.json)|
### Lyft
| Backbone | Lr schd | Mem (GB) | Inf time (fps) | Private Score | Public Score | Download |
| :---------: | :-----: | :------: | :------------: | :----: |:----: | :------: |
|[SECFPN](./hv_pointpillars_secfpn_sbn-all_4x8_2x_lyft-3d.py)|2x|||13.4|13.4||
|[FPN](./hv_pointpillars_fpn_sbn-all_4x8_2x_lyft-3d.py)|2x|||14.0|14.2||
### Waymo
| Backbone | Load Interval | Class | Lr schd | Mem (GB) | Inf time (fps) | mAP@L1 | mAPH@L1 | mAP@L2 | **mAPH@L2** | Download |
| :-------: | :-----------: |:-----:| :------:| :------: | :------------: | :----: | :-----: | :-----: | :-----: | :------: |
| [SECFPN](./hv_pointpillars_secfpn_sbn_2x16_2x_waymoD5-3d-car.py)|5|Car|2x|7.76||70.2|69.6|62.6|62.1|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/pointpillars/hv_pointpillars_secfpn_sbn_2x16_2x_waymoD5-3d-car/hv_pointpillars_secfpn_sbn_2x16_2x_waymoD5-3d-car_20200901_204315-302fc3e7.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/pointpillars/hv_pointpillars_secfpn_sbn_2x16_2x_waymoD5-3d-car/hv_pointpillars_secfpn_sbn_2x16_2x_waymoD5-3d-car_20200901_204315.log.json)|
| [SECFPN](./hv_pointpillars_secfpn_sbn_2x16_2x_waymoD5-3d-3class.py)|5|3 Class|2x|8.12||64.7|57.6|58.4|52.1|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/pointpillars/hv_pointpillars_secfpn_sbn_2x16_2x_waymoD5-3d-3class/hv_pointpillars_secfpn_sbn_2x16_2x_waymoD5-3d-3class_20200831_204144-d1a706b1.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/pointpillars/hv_pointpillars_secfpn_sbn_2x16_2x_waymoD5-3d-3class/hv_pointpillars_secfpn_sbn_2x16_2x_waymoD5-3d-3class_20200831_204144.log.json)|
| above @ Car|||2x|8.12||68.5|67.9|60.1|59.6| |
| above @ Pedestrian|||2x|8.12||67.8|50.6|59.6|44.3| |
| above @ Cyclist|||2x|8.12||57.7|54.4|55.5|52.4| |
| [SECFPN](./hv_pointpillars_secfpn_sbn_2x16_2x_waymo-3d-car.py)|1|Car|2x|7.76||72.1|71.5|63.6|63.1|[log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/pointpillars/hv_pointpillars_secfpn_sbn_2x16_2x_waymo-3d-car/hv_pointpillars_secfpn_sbn_2x16_2x_waymo-3d-car.log.json)|
| [SECFPN](./hv_pointpillars_secfpn_sbn_2x16_2x_waymo-3d-3class.py)|1|3 Class|2x|8.12||68.8|63.3|62.6|57.6|[log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/pointpillars/hv_pointpillars_secfpn_sbn_2x16_2x_waymo-3d-3class/hv_pointpillars_secfpn_sbn_2x16_2x_waymo-3d-3class.log.json)|
| above @ Car|||2x|8.12||71.6|71.0|63.1|62.5| |
| above @ Pedestrian|||2x|8.12||70.6|56.7|62.9|50.2| |
| above @ Cyclist|||2x|8.12||64.4|62.3|61.9|59.9| |
#### Note:
- **Metric**: For models trained with 3 classes, the average APH@L2 (mAPH@L2) over all categories is reported and used to rank the model. For models trained with only 1 class, the APH@L2 of that class is reported and used to rank the model.
- **Data Split**: Here we provide several baselines for the Waymo dataset, among which D5 means that we divide the dataset into 5 folds and only use one fold for efficient experiments (see the sketch after these notes). Using the complete dataset can boost the performance a lot, especially for the detection of cyclists and pedestrians, where an improvement of more than 5 mAP or mAPH can be expected.
- **Implementation Details**: We basically follow the implementation in the [paper](https://arxiv.org/pdf/1912.04838.pdf) in terms of the network architecture (with a stride of 1 for the first convolutional block). Different settings of voxelization, data augmentation and hyper-parameters make these baselines outperform those in the paper by about 7 mAP for cars and 4 mAP for pedestrians with only a subset of the whole dataset. All of these results are achieved without bells and whistles, e.g. ensembling, multi-scale training and test-time augmentation.
- **License Agreement**: To comply with the [license agreement of the Waymo dataset](https://waymo.com/open/terms/), the pre-trained models on the Waymo dataset are not released. We still release the training logs as a reference to ease future research.
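A minimal sketch of the `D5` subsampling mentioned in the Data Split note; field names follow the mmdetection3d `WaymoDataset` convention and unrelated fields are omitted:
```python
# The Waymo training infos are subsampled by loading only every 5th frame.
data_root = 'data/waymo/kitti_format/'
data = dict(
    train=dict(
        type='WaymoDataset',
        data_root=data_root,
        ann_file=data_root + 'waymo_infos_train.pkl',
        load_interval=5,  # keep 1 of every 5 frames; set to 1 for the full set
        # split / pipeline / modality / classes omitted for brevity
    ))
```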
================================================
FILE: configs/pointpillars/hv_pointpillars_fpn_sbn-all_2x8_2x_lyft-3d.py
================================================
_base_ = [
'../_base_/models/hv_pointpillars_fpn_lyft.py',
'../_base_/datasets/lyft-3d.py', '../_base_/schedules/schedule_2x.py',
'../_base_/default_runtime.py'
]
================================================
FILE: configs/pointpillars/hv_pointpillars_fpn_sbn-all_4x8_2x_nus-3d.py
================================================
_base_ = [
'../_base_/models/hv_pointpillars_fpn_nus.py',
'../_base_/datasets/nus-3d.py', '../_base_/schedules/schedule_2x.py',
'../_base_/default_runtime.py'
]
================================================
FILE: configs/pointpillars/hv_pointpillars_fpn_sbn-all_range100_2x8_2x_lyft-3d.py
================================================
_base_ = [
'../_base_/models/hv_pointpillars_fpn_range100_lyft.py',
'../_base_/datasets/range100_lyft-3d.py',
'../_base_/schedules/schedule_2x.py', '../_base_/default_runtime.py'
]
================================================
FILE: configs/pointpillars/hv_pointpillars_secfpn_6x8_160e_kitti-3d-3class.py
================================================
_base_ = [
'../_base_/models/hv_pointpillars_secfpn_kitti.py',
'../_base_/datasets/kitti-3d-3class.py',
'../_base_/schedules/cyclic_40e.py', '../_base_/default_runtime.py'
]
point_cloud_range = [0, -39.68, -3, 69.12, 39.68, 1]
# dataset settings
data_root = 'data/kitti/'
class_names = ['Pedestrian', 'Cyclist', 'Car']
# PointPillars adopts different sampling strategies for different classes
db_sampler = dict(
data_root=data_root,
info_path=data_root + 'kitti_dbinfos_train.pkl',
rate=1.0,
prepare=dict(
filter_by_difficulty=[-1],
filter_by_min_points=dict(Car=5, Pedestrian=10, Cyclist=10)),
classes=class_names,
sample_groups=dict(Car=15, Pedestrian=10, Cyclist=10))
# PointPillars uses different augmentation hyper-parameters
train_pipeline = [
dict(type='LoadPointsFromFile', coord_type='LIDAR', load_dim=4, use_dim=4),
dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True),
dict(type='ObjectSample', db_sampler=db_sampler),
dict(
type='ObjectNoise',
num_try=100,
translation_std=[0.25, 0.25, 0.25],
global_rot_range=[0.0, 0.0],
rot_range=[-0.15707963267, 0.15707963267]),
dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5),
dict(
type='GlobalRotScaleTrans',
rot_range=[-0.78539816, 0.78539816],
scale_ratio_range=[0.95, 1.05]),
dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
dict(type='PointShuffle'),
dict(type='DefaultFormatBundle3D', class_names=class_names),
dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
]
test_pipeline = [
dict(type='LoadPointsFromFile', coord_type='LIDAR', load_dim=4, use_dim=4),
dict(
type='MultiScaleFlipAug3D',
img_scale=(1333, 800),
pts_scale_ratio=1,
flip=False,
transforms=[
dict(
type='GlobalRotScaleTrans',
rot_range=[0, 0],
scale_ratio_range=[1., 1.],
translation_std=[0, 0, 0]),
dict(type='RandomFlip3D'),
dict(
type='PointsRangeFilter', point_cloud_range=point_cloud_range),
dict(
type='DefaultFormatBundle3D',
class_names=class_names,
with_label=False),
dict(type='Collect3D', keys=['points'])
])
]
data = dict(
train=dict(dataset=dict(pipeline=train_pipeline, classes=class_names)),
val=dict(pipeline=test_pipeline, classes=class_names),
test=dict(pipeline=test_pipeline, classes=class_names))
# In practice PointPillars also uses a different schedule
# optimizer
lr = 0.001
optimizer = dict(lr=lr)
# max_norm=35 was slightly better than 10 for PointPillars in the earlier
# development of the codebase, so we keep this setting. We did not
# specifically tune this parameter.
optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
# Use evaluation interval=2 to reduce the number of evaluations
evaluation = dict(interval=2)
# PointPillars usually needs a longer schedule than SECOND, so we simply double
# the training schedule. Keep in mind that since we use RepeatDataset with a
# repeat factor of 2, we actually train for 160 epochs.
total_epochs = 80
================================================
FILE: configs/pointpillars/hv_pointpillars_secfpn_6x8_160e_kitti-3d-car.py
================================================
# model settings
_base_ = './hv_pointpillars_secfpn_6x8_160e_kitti-3d-3class.py'
point_cloud_range = [0, -39.68, -3, 69.12, 39.68, 1]
model = dict(
bbox_head=dict(
type='Anchor3DHead',
num_classes=1,
anchor_generator=dict(
_delete_=True,
type='Anchor3DRangeGenerator',
ranges=[[0, -39.68, -1.78, 69.12, 39.68, -1.78]],
sizes=[[1.6, 3.9, 1.56]],
rotations=[0, 1.57],
reshape_out=True)),
# model training and testing settings
train_cfg=dict(
_delete_=True,
assigner=dict(
type='MaxIoUAssigner',
iou_calculator=dict(type='BboxOverlapsNearest3D'),
pos_iou_thr=0.6,
neg_iou_thr=0.45,
min_pos_iou=0.45,
ignore_iof_thr=-1),
allowed_border=0,
pos_weight=-1,
debug=False))
# dataset settings
dataset_type = 'KittiDataset'
data_root = 'data/kitti/'
class_names = ['Car']
db_sampler = dict(
data_root=data_root,
info_path=data_root + 'kitti_dbinfos_train.pkl',
rate=1.0,
prepare=dict(filter_by_difficulty=[-1], filter_by_min_points=dict(Car=5)),
sample_groups=dict(Car=15),
classes=class_names)
train_pipeline = [
dict(type='LoadPointsFromFile', coord_type='LIDAR', load_dim=4, use_dim=4),
dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True),
dict(type='ObjectSample', db_sampler=db_sampler),
dict(
type='ObjectNoise',
num_try=100,
translation_std=[0.25, 0.25, 0.25],
global_rot_range=[0.0, 0.0],
rot_range=[-0.15707963267, 0.15707963267]),
dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5),
dict(
type='GlobalRotScaleTrans',
rot_range=[-0.78539816, 0.78539816],
scale_ratio_range=[0.95, 1.05]),
dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
dict(type='PointShuffle'),
dict(type='DefaultFormatBundle3D', class_names=class_names),
dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
]
test_pipeline = [
dict(type='LoadPointsFromFile', coord_type='LIDAR', load_dim=4, use_dim=4),
dict(
type='MultiScaleFlipAug3D',
img_scale=(1333, 800),
pts_scale_ratio=1,
flip=False,
transforms=[
dict(
type='GlobalRotScaleTrans',
rot_range=[0, 0],
scale_ratio_range=[1., 1.],
translation_std=[0, 0, 0]),
dict(type='RandomFlip3D'),
dict(
type='PointsRangeFilter', point_cloud_range=point_cloud_range),
dict(
type='DefaultFormatBundle3D',
class_names=class_names,
with_label=False),
dict(type='Collect3D', keys=['points'])
])
]
data = dict(
train=dict(
type='RepeatDataset',
times=2,
dataset=dict(pipeline=train_pipeline, classes=class_names)),
val=dict(pipeline=test_pipeline, classes=class_names),
test=dict(pipeline=test_pipeline, classes=class_names))
================================================
FILE: configs/pointpillars/hv_pointpillars_secfpn_sbn-all_2x8_2x_lyft-3d.py
================================================
_base_ = [
'../_base_/models/hv_pointpillars_fpn_lyft.py',
'../_base_/datasets/lyft-3d.py',
'../_base_/schedules/schedule_2x.py',
'../_base_/default_runtime.py',
]
# model settings
model = dict(
pts_neck=dict(
_delete_=True,
type='SECONDFPN',
norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01),
in_channels=[64, 128, 256],
upsample_strides=[1, 2, 4],
out_channels=[128, 128, 128]),
pts_bbox_head=dict(
in_channels=384,
feat_channels=384,
anchor_generator=dict(
_delete_=True,
type='AlignedAnchor3DRangeGenerator',
ranges=[[-80, -80, -1.0715024, 80, 80, -1.0715024],
[-80, -80, -0.3033737, 80, 80, -0.3033737],
[-80, -80, -0.3519405, 80, 80, -0.3519405],
[-80, -80, -0.8871424, 80, 80, -0.8871424],
[-80, -80, -0.6276341, 80, 80, -0.6276341],
[-80, -80, -1.3220503, 80, 80, -1.3220503],
[-80, -80, -1.0709302, 80, 80, -1.0709302],
[-80, -80, -0.9122268, 80, 80, -0.9122268],
[-80, -80, -1.8012227, 80, 80, -1.8012227]],
sizes=[
[1.92, 4.75, 1.71], # car
[2.84, 10.24, 3.44], # truck
[2.92, 12.70, 3.42], # bus
[2.42, 6.52, 2.34], # emergency vehicle
[2.75, 8.17, 3.20], # other vehicle
[0.96, 2.35, 1.59], # motorcycle
[0.63, 1.76, 1.44], # bicycle
[0.76, 0.80, 1.76], # pedestrian
[0.35, 0.73, 0.50] # animal
],
rotations=[0, 1.57],
reshape_out=True)))
================================================
FILE: configs/pointpillars/hv_pointpillars_secfpn_sbn-all_4x8_2x_nus-3d.py
================================================
_base_ = [
'../_base_/models/hv_pointpillars_fpn_nus.py',
'../_base_/datasets/nus-3d.py',
'../_base_/schedules/schedule_2x.py',
'../_base_/default_runtime.py',
]
# model settings
model = dict(
pts_neck=dict(
_delete_=True,
type='SECONDFPN',
norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01),
in_channels=[64, 128, 256],
upsample_strides=[1, 2, 4],
out_channels=[128, 128, 128]),
pts_bbox_head=dict(
in_channels=384,
feat_channels=384,
anchor_generator=dict(
_delete_=True,
type='AlignedAnchor3DRangeGenerator',
ranges=[
[-49.6, -49.6, -1.80032795, 49.6, 49.6, -1.80032795],
[-49.6, -49.6, -1.74440365, 49.6, 49.6, -1.74440365],
[-49.6, -49.6, -1.68526504, 49.6, 49.6, -1.68526504],
[-49.6, -49.6, -1.67339111, 49.6, 49.6, -1.67339111],
[-49.6, -49.6, -1.61785072, 49.6, 49.6, -1.61785072],
[-49.6, -49.6, -1.80984986, 49.6, 49.6, -1.80984986],
[-49.6, -49.6, -1.763965, 49.6, 49.6, -1.763965],
],
sizes=[
[1.95017717, 4.60718145, 1.72270761], # car
[2.4560939, 6.73778078, 2.73004906], # truck
[2.87427237, 12.01320693, 3.81509561], # trailer
[0.60058911, 1.68452161, 1.27192197], # bicycle
[0.66344886, 0.7256437, 1.75748069], # pedestrian
[0.39694519, 0.40359262, 1.06232151], # traffic_cone
[2.49008838, 0.48578221, 0.98297065], # barrier
],
custom_values=[0, 0],
rotations=[0, 1.57],
reshape_out=True)))
================================================
FILE: configs/pointpillars/hv_pointpillars_secfpn_sbn-all_range100_2x8_2x_lyft-3d.py
================================================
_base_ = [
'../_base_/models/hv_pointpillars_fpn_range100_lyft.py',
'../_base_/datasets/range100_lyft-3d.py',
'../_base_/schedules/schedule_2x.py', '../_base_/default_runtime.py'
]
# model settings
model = dict(
pts_neck=dict(
_delete_=True,
type='SECONDFPN',
norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01),
in_channels=[64, 128, 256],
upsample_strides=[1, 2, 4],
out_channels=[128, 128, 128]),
pts_bbox_head=dict(
in_channels=384,
feat_channels=384,
anchor_generator=dict(
_delete_=True,
type='AlignedAnchor3DRangeGenerator',
ranges=[[-100, -100, -1.0715024, 100, 100, -1.0715024],
[-100, -100, -0.3033737, 100, 100, -0.3033737],
[-100, -100, -0.3519405, 100, 100, -0.3519405],
[-100, -100, -0.8871424, 100, 100, -0.8871424],
[-100, -100, -0.6276341, 100, 100, -0.6276341],
[-100, -100, -1.3220503, 100, 100, -1.3220503],
[-100, -100, -1.0709302, 100, 100, -1.0709302],
[-100, -100, -0.9122268, 100, 100, -0.9122268],
[-100, -100, -1.8012227, 100, 100, -1.8012227]],
sizes=[
[1.92, 4.75, 1.71], # car
[2.84, 10.24, 3.44], # truck
[2.92, 12.70, 3.42], # bus
[2.42, 6.52, 2.34], # emergency vehicle
[2.75, 8.17, 3.20], # other vehicle
[0.96, 2.35, 1.59], # motorcycle
[0.63, 1.76, 1.44], # bicycle
[0.76, 0.80, 1.76], # pedestrian
[0.35, 0.73, 0.50] # animal
],
rotations=[0, 1.57],
reshape_out=True)))
================================================
FILE: configs/pointpillars/hv_pointpillars_secfpn_sbn_2x16_2x_waymo-3d-3class.py
================================================
_base_ = [
'../_base_/models/hv_pointpillars_secfpn_waymo.py',
'../_base_/datasets/waymoD5-3d-3class.py',
'../_base_/schedules/schedule_2x.py',
'../_base_/default_runtime.py',
]
# data settings
data = dict(train=dict(dataset=dict(load_interval=1)))
================================================
FILE: configs/pointpillars/hv_pointpillars_secfpn_sbn_2x16_2x_waymo-3d-car.py
================================================
_base_ = [
'../_base_/models/hv_pointpillars_secfpn_waymo.py',
'../_base_/datasets/waymoD5-3d-car.py',
'../_base_/schedules/schedule_2x.py',
'../_base_/default_runtime.py',
]
# data settings
data = dict(train=dict(dataset=dict(load_interval=1)))
# model settings
model = dict(
type='MVXFasterRCNN',
pts_bbox_head=dict(
type='Anchor3DHead',
num_classes=1,
anchor_generator=dict(
type='AlignedAnchor3DRangeGenerator',
ranges=[[-74.88, -74.88, -0.0345, 74.88, 74.88, -0.0345]],
sizes=[[2.08, 4.73, 1.77]],
rotations=[0, 1.57],
reshape_out=True)),
# model training and testing settings
train_cfg=dict(
_delete_=True,
pts=dict(
assigner=dict(
type='MaxIoUAssigner',
iou_calculator=dict(type='BboxOverlapsNearest3D'),
pos_iou_thr=0.55,
neg_iou_thr=0.4,
min_pos_iou=0.4,
ignore_iof_thr=-1),
allowed_border=0,
code_weight=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
pos_weight=-1,
debug=False)))
================================================
FILE: configs/pointpillars/hv_pointpillars_secfpn_sbn_2x16_2x_waymoD5-3d-3class.py
================================================
_base_ = [
'../_base_/models/hv_pointpillars_secfpn_waymo.py',
'../_base_/datasets/waymoD5-3d-3class.py',
'../_base_/schedules/schedule_2x.py',
'../_base_/default_runtime.py',
]
================================================
FILE: configs/pointpillars/hv_pointpillars_secfpn_sbn_2x16_2x_waymoD5-3d-car.py
================================================
_base_ = [
'../_base_/models/hv_pointpillars_secfpn_waymo.py',
'../_base_/datasets/waymoD5-3d-car.py',
'../_base_/schedules/schedule_2x.py',
'../_base_/default_runtime.py',
]
# model settings
model = dict(
type='MVXFasterRCNN',
pts_bbox_head=dict(
type='Anchor3DHead',
num_classes=1,
anchor_generator=dict(
type='AlignedAnchor3DRangeGenerator',
ranges=[[-74.88, -74.88, -0.0345, 74.88, 74.88, -0.0345]],
sizes=[[2.08, 4.73, 1.77]],
rotations=[0, 1.57],
reshape_out=True)),
# model training and testing settings
train_cfg=dict(
_delete_=True,
pts=dict(
assigner=dict(
type='MaxIoUAssigner',
iou_calculator=dict(type='BboxOverlapsNearest3D'),
pos_iou_thr=0.55,
neg_iou_thr=0.4,
min_pos_iou=0.4,
ignore_iof_thr=-1),
allowed_border=0,
code_weight=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
pos_weight=-1,
debug=False)))
================================================
FILE: configs/regnet/README.md
================================================
# Designing Network Design Spaces
## Introduction
[BACKBONE]
We implement RegNetX models in 3D detection systems and provide their first results with PointPillars on the nuScenes dataset.
The pre-trained models are converted from [model zoo of pycls](https://github.com/facebookresearch/pycls/blob/master/MODEL_ZOO.md) and maintained in [mmcv](https://github.com/open-mmlab/mmcv).
```
@article{radosavovic2020designing,
title={Designing Network Design Spaces},
author={Ilija Radosavovic and Raj Prateek Kosaraju and Ross Girshick and Kaiming He and Piotr Dollár},
year={2020},
eprint={2003.13678},
archivePrefix={arXiv},
primaryClass={cs.CV}
}
```
## Usage
To use a RegNet model, there are two steps:
1. Convert the model to the ResNet style supported by MMDetection
2. Modify the backbone and neck in the config accordingly
### Convert model
We already provide models with FLOPs ranging from 800M to 12G in our model zoo.
For more general usage, we also provide the script `regnet2mmdet.py` in the tools directory to convert the keys of models pretrained by [pycls](https://github.com/facebookresearch/pycls/) to
the ResNet-style checkpoints used in MMDetection.
```bash
python -u tools/model_converters/regnet2mmdet.py ${PRETRAIN_PATH} ${STORE_PATH}
```
This script converts the model from `PRETRAIN_PATH` and stores the converted model in `STORE_PATH`.
### Modify config
Users can modify the backbone's `depth` and the corresponding keys in `arch` in the config according to the configs in the [pycls model zoo](https://github.com/facebookresearch/pycls/blob/master/MODEL_ZOO.md).
The parameter `in_channels` of the FPN can be found in Figures 15 & 16 of the paper (`wi` in the legend).
This directory already provides some configs with their performance, using RegNetX models from the 800MF to 12GF level.
For other pre-trained or self-implemented RegNet models, users are responsible for checking these parameters themselves.
**Note**: Although Figs. 15 & 16 also provide `w0`, `wa`, `wm`, `group_w`, and `bot_mul` for `arch`, those values are quantized and thus inaccurate; using them sometimes produces a backbone that does not match the keys in the pre-trained model.
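For illustration, below is a minimal sketch of step 2 for a RegNetX-400MF backbone; the values simply mirror `hv_pointpillars_regnet-400mf_fpn_sbn-all_4x8_2x_nus-3d.py` in this directory, so only the pretrained checkpoint, the backbone `arch`, and the FPN `in_channels` change relative to the PointPillars base config.

```python
# Sketch of the backbone/neck overrides for RegNetX-400MF; values mirror
# hv_pointpillars_regnet-400mf_fpn_sbn-all_4x8_2x_nus-3d.py in this directory.
_base_ = [
    '../_base_/models/hv_pointpillars_fpn_nus.py',
    '../_base_/datasets/nus-3d.py',
    '../_base_/schedules/schedule_2x.py',
    '../_base_/default_runtime.py',
]
model = dict(
    type='MVXFasterRCNN',
    pretrained=dict(pts='open-mmlab://regnetx_400mf'),
    pts_backbone=dict(
        _delete_=True,
        type='NoStemRegNet',
        # arch keys follow the pycls model zoo entry for RegNetX-400MF
        arch=dict(w0=24, wa=24.48, wm=2.54, group_w=16, depth=22, bot_mul=1.0),
        out_indices=(1, 2, 3),
        frozen_stages=-1,
        strides=(1, 2, 2, 2),
        base_channels=64,
        stem_channels=64,
        norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01),
        norm_eval=False,
        style='pytorch'),
    # in_channels follow the stage widths (wi) in Fig. 15 & 16 of the paper
    pts_neck=dict(in_channels=[64, 160, 384]))
```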
## Results
### nuScenes
| Backbone | Lr schd | Mem (GB) | Inf time (fps) | mAP |NDS| Download |
| :---------: | :-----: | :------: | :------------: | :----: |:----: | :------: |
|[SECFPN](../pointpillars/hv_pointpillars_secfpn_sbn-all_4x8_2x_nus-3d.py)|2x|16.4||35.17|49.7|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/pointpillars/hv_pointpillars_secfpn_sbn-all_4x8_2x_nus-3d/hv_pointpillars_secfpn_sbn-all_4x8_2x_nus-3d_20200620_230725-0817d270.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/pointpillars/hv_pointpillars_secfpn_sbn-all_4x8_2x_nus-3d/hv_pointpillars_secfpn_sbn-all_4x8_2x_nus-3d_20200620_230725.log.json)|
|[RegNetX-400MF-SECFPN](./hv_pointpillars_regnet-400mf_secfpn_sbn-all_4x8_2x_nus-3d.py)| 2x |16.4||41.2|55.2|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/regnet/hv_pointpillars_regnet-400mf_secfpn_sbn-all_4x8_2x_nus-3d/hv_pointpillars_regnet-400mf_secfpn_sbn-all_4x8_2x_nus-3d_20200620_230334-53044f32.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/regnet/hv_pointpillars_regnet-400mf_secfpn_sbn-all_4x8_2x_nus-3d/hv_pointpillars_regnet-400mf_secfpn_sbn-all_4x8_2x_nus-3d_20200620_230334.log.json)|
|[FPN](../pointpillars/hv_pointpillars_fpn_sbn-all_4x8_2x_nus-3d.py)|2x|17.1||40.0|53.3|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/pointpillars/hv_pointpillars_fpn_sbn-all_4x8_2x_nus-3d/hv_pointpillars_fpn_sbn-all_4x8_2x_nus-3d_20200620_230405-2fa62f3d.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/pointpillars/hv_pointpillars_fpn_sbn-all_4x8_2x_nus-3d/hv_pointpillars_fpn_sbn-all_4x8_2x_nus-3d_20200620_230405.log.json)|
|[RegNetX-400MF-FPN](./hv_pointpillars_regnet-400mf_fpn_sbn-all_4x8_2x_nus-3d.py)|2x|17.3||44.8|56.4|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/regnet/hv_pointpillars_regnet-400mf_fpn_sbn-all_4x8_2x_nus-3d/hv_pointpillars_regnet-400mf_fpn_sbn-all_4x8_2x_nus-3d_20200620_230239-c694dce7.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/regnet/hv_pointpillars_regnet-400mf_fpn_sbn-all_4x8_2x_nus-3d/hv_pointpillars_regnet-400mf_fpn_sbn-all_4x8_2x_nus-3d_20200620_230239.log.json)|
|[RegNetX-1.6gF-FPN](./hv_pointpillars_regnet-1.6gf_fpn_sbn-all_4x8_2x_nus-3d.py)|2x|24.0||48.2|59.3|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/regnet/hv_pointpillars_regnet-1.6gf_fpn_sbn-all_4x8_2x_nus-3d/hv_pointpillars_regnet-1.6gf_fpn_sbn-all_4x8_2x_nus-3d_20200629_050311-dcd4e090.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/regnet/hv_pointpillars_regnet-1.6gf_fpn_sbn-all_4x8_2x_nus-3d/hv_pointpillars_regnet-1.6gf_fpn_sbn-all_4x8_2x_nus-3d_20200629_050311.log.json)|
### Lyft
| Backbone | Lr schd | Mem (GB) | Inf time (fps) | Private Score | Public Score | Download |
| :---------: | :-----: | :------: | :------------: | :----: |:----: | :------: |
|[SECFPN](../pointpillars/hv_pointpillars_secfpn_sbn-all_2x8_2x_lyft-3d.py)|2x|||13.4|13.4||
|[RegNetX-400MF-SECFPN](./hv_pointpillars_regnet-400mf_secfpn_sbn-all_2x8_2x_lyft-3d.py)| 2x ||||||
|[FPN](../pointpillars/hv_pointpillars_fpn_sbn-all_2x8_2x_lyft-3d.py)|2x|||14.0|14.2||
|[RegNetX-400MF-FPN](./hv_pointpillars_regnet-400mf_fpn_sbn-all_2x8_2x_lyft-3d.py)|2x|||15.5|15.6||
================================================
FILE: configs/regnet/hv_pointpillars_regnet-1.6gf_fpn_sbn-all_4x8_2x_nus-3d.py
================================================
_base_ = [
'../_base_/models/hv_pointpillars_fpn_nus.py',
'../_base_/datasets/nus-3d.py',
'../_base_/schedules/schedule_2x.py',
'../_base_/default_runtime.py',
]
# model settings
model = dict(
type='MVXFasterRCNN',
pretrained=dict(pts='open-mmlab://regnetx_1.6gf'),
pts_backbone=dict(
_delete_=True,
type='NoStemRegNet',
arch='regnetx_1.6gf',
out_indices=(1, 2, 3),
frozen_stages=-1,
strides=(1, 2, 2, 2),
base_channels=64,
stem_channels=64,
norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01),
norm_eval=False,
style='pytorch'),
pts_neck=dict(in_channels=[168, 408, 912]))
================================================
FILE: configs/regnet/hv_pointpillars_regnet-400mf_fpn_sbn-all_2x8_2x_lyft-3d.py
================================================
_base_ = [
'../_base_/models/hv_pointpillars_fpn_lyft.py',
'../_base_/datasets/lyft-3d.py',
'../_base_/schedules/schedule_2x.py',
'../_base_/default_runtime.py',
]
# model settings
model = dict(
type='MVXFasterRCNN',
pretrained=dict(pts='open-mmlab://regnetx_400mf'),
pts_backbone=dict(
_delete_=True,
type='NoStemRegNet',
arch=dict(w0=24, wa=24.48, wm=2.54, group_w=16, depth=22, bot_mul=1.0),
out_indices=(1, 2, 3),
frozen_stages=-1,
strides=(1, 2, 2, 2),
base_channels=64,
stem_channels=64,
norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01),
norm_eval=False,
style='pytorch'),
pts_neck=dict(in_channels=[64, 160, 384]))
================================================
FILE: configs/regnet/hv_pointpillars_regnet-400mf_fpn_sbn-all_4x8_2x_nus-3d.py
================================================
_base_ = [
'../_base_/models/hv_pointpillars_fpn_nus.py',
'../_base_/datasets/nus-3d.py',
'../_base_/schedules/schedule_2x.py',
'../_base_/default_runtime.py',
]
# model settings
model = dict(
type='MVXFasterRCNN',
pretrained=dict(pts='open-mmlab://regnetx_400mf'),
pts_backbone=dict(
_delete_=True,
type='NoStemRegNet',
arch=dict(w0=24, wa=24.48, wm=2.54, group_w=16, depth=22, bot_mul=1.0),
out_indices=(1, 2, 3),
frozen_stages=-1,
strides=(1, 2, 2, 2),
base_channels=64,
stem_channels=64,
norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01),
norm_eval=False,
style='pytorch'),
pts_neck=dict(in_channels=[64, 160, 384]))
================================================
FILE: configs/regnet/hv_pointpillars_regnet-400mf_fpn_sbn-all_range100_2x8_2x_lyft-3d.py
================================================
_base_ = [
'../_base_/models/hv_pointpillars_fpn_range100_lyft.py',
'../_base_/datasets/range100_lyft-3d.py',
'../_base_/schedules/schedule_2x.py',
'../_base_/default_runtime.py',
]
# model settings
model = dict(
type='MVXFasterRCNN',
pretrained=dict(pts='open-mmlab://regnetx_400mf'),
pts_backbone=dict(
_delete_=True,
type='NoStemRegNet',
arch=dict(w0=24, wa=24.48, wm=2.54, group_w=16, depth=22, bot_mul=1.0),
out_indices=(1, 2, 3),
frozen_stages=-1,
strides=(1, 2, 2, 2),
base_channels=64,
stem_channels=64,
norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01),
norm_eval=False,
style='pytorch'),
pts_neck=dict(in_channels=[64, 160, 384]))
================================================
FILE: configs/regnet/hv_pointpillars_regnet-400mf_secfpn_sbn-all_2x8_2x_lyft-3d.py
================================================
_base_ = './hv_pointpillars_regnet-400mf_fpn_sbn-all_2x8_2x_lyft-3d.py'
# model settings
model = dict(
pts_neck=dict(
type='SECONDFPN',
_delete_=True,
norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01),
in_channels=[64, 160, 384],
upsample_strides=[1, 2, 4],
out_channels=[128, 128, 128]),
pts_bbox_head=dict(
type='Anchor3DHead',
in_channels=384,
feat_channels=384,
anchor_generator=dict(
_delete_=True,
type='AlignedAnchor3DRangeGenerator',
ranges=[[-80, -80, -1.0715024, 80, 80, -1.0715024],
[-80, -80, -0.3033737, 80, 80, -0.3033737],
[-80, -80, -0.3519405, 80, 80, -0.3519405],
[-80, -80, -0.8871424, 80, 80, -0.8871424],
[-80, -80, -0.6276341, 80, 80, -0.6276341],
[-80, -80, -1.3220503, 80, 80, -1.3220503],
[-80, -80, -1.0709302, 80, 80, -1.0709302],
[-80, -80, -0.9122268, 80, 80, -0.9122268],
[-80, -80, -1.8012227, 80, 80, -1.8012227]],
sizes=[
[1.92, 4.75, 1.71], # car
[2.84, 10.24, 3.44], # truck
[2.92, 12.70, 3.42], # bus
[2.42, 6.52, 2.34], # emergency vehicle
[2.75, 8.17, 3.20], # other vehicle
[0.96, 2.35, 1.59], # motorcycle
[0.63, 1.76, 1.44], # bicycle
[0.76, 0.80, 1.76], # pedestrian
[0.35, 0.73, 0.50] # animal
],
rotations=[0, 1.57],
reshape_out=True)))
================================================
FILE: configs/regnet/hv_pointpillars_regnet-400mf_secfpn_sbn-all_4x8_2x_nus-3d.py
================================================
_base_ = './hv_pointpillars_regnet-400mf_fpn_sbn-all_4x8_2x_nus-3d.py'
# model settings
model = dict(
pts_neck=dict(
type='SECONDFPN',
_delete_=True,
norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01),
in_channels=[64, 160, 384],
upsample_strides=[1, 2, 4],
out_channels=[128, 128, 128]),
pts_bbox_head=dict(
type='Anchor3DHead',
in_channels=384,
feat_channels=384,
anchor_generator=dict(
_delete_=True,
type='AlignedAnchor3DRangeGenerator',
ranges=[
[-49.6, -49.6, -1.80032795, 49.6, 49.6, -1.80032795],
[-49.6, -49.6, -1.74440365, 49.6, 49.6, -1.74440365],
[-49.6, -49.6, -1.68526504, 49.6, 49.6, -1.68526504],
[-49.6, -49.6, -1.67339111, 49.6, 49.6, -1.67339111],
[-49.6, -49.6, -1.61785072, 49.6, 49.6, -1.61785072],
[-49.6, -49.6, -1.80984986, 49.6, 49.6, -1.80984986],
[-49.6, -49.6, -1.763965, 49.6, 49.6, -1.763965],
],
sizes=[
[1.95017717, 4.60718145, 1.72270761], # car
[2.4560939, 6.73778078, 2.73004906], # truck
[2.87427237, 12.01320693, 3.81509561], # trailer
[0.60058911, 1.68452161, 1.27192197], # bicycle
[0.66344886, 0.7256437, 1.75748069], # pedestrian
[0.39694519, 0.40359262, 1.06232151], # traffic_cone
[2.49008838, 0.48578221, 0.98297065], # barrier
],
custom_values=[0, 0],
rotations=[0, 1.57],
reshape_out=True)))
================================================
FILE: configs/regnet/hv_pointpillars_regnet-400mf_secfpn_sbn-all_range100_2x8_2x_lyft-3d.py
================================================
_base_ = \
'./hv_pointpillars_regnet-400mf_fpn_sbn-all_range100_2x8_2x_lyft-3d.py'
# model settings
model = dict(
pts_neck=dict(
type='SECONDFPN',
_delete_=True,
norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01),
in_channels=[64, 160, 384],
upsample_strides=[1, 2, 4],
out_channels=[128, 128, 128]),
pts_bbox_head=dict(
type='Anchor3DHead',
in_channels=384,
feat_channels=384,
anchor_generator=dict(
_delete_=True,
type='AlignedAnchor3DRangeGenerator',
ranges=[[-100, -100, -1.0715024, 100, 100, -1.0715024],
[-100, -100, -0.3033737, 100, 100, -0.3033737],
[-100, -100, -0.3519405, 100, 100, -0.3519405],
[-100, -100, -0.8871424, 100, 100, -0.8871424],
[-100, -100, -0.6276341, 100, 100, -0.6276341],
[-100, -100, -1.3220503, 100, 100, -1.3220503],
[-100, -100, -1.0709302, 100, 100, -1.0709302],
[-100, -100, -0.9122268, 100, 100, -0.9122268],
[-100, -100, -1.8012227, 100, 100, -1.8012227]],
sizes=[
[1.92, 4.75, 1.71], # car
[2.84, 10.24, 3.44], # truck
[2.92, 12.70, 3.42], # bus
[2.42, 6.52, 2.34], # emergency vehicle
[2.75, 8.17, 3.20], # other vehicle
[0.96, 2.35, 1.59], # motorcycle
[0.63, 1.76, 1.44], # bicycle
[0.76, 0.80, 1.76], # pedestrian
[0.35, 0.73, 0.50] # animal
],
rotations=[0, 1.57],
reshape_out=True)))
================================================
FILE: configs/second/README.md
================================================
# SECOND: Sparsely Embedded Convolutional Detection
## Introduction
[ALGORITHM]
We implement SECOND and provide the results and checkpoints on the KITTI dataset.
```
@article{yan2018second,
title={Second: Sparsely embedded convolutional detection},
author={Yan, Yan and Mao, Yuxing and Li, Bo},
journal={Sensors},
year={2018},
publisher={Multidisciplinary Digital Publishing Institute}
}
```
## Results
### KITTI
| Backbone |Class| Lr schd | Mem (GB) | Inf time (fps) | mAP |Download |
| :---------: | :-----: | :------: | :------------: | :----: |:----: | :------: |
| [SECFPN](./hv_second_secfpn_6x8_80e_kitti-3d-car.py)| Car |cyclic 80e|5.4||79.07|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/second/hv_second_secfpn_6x8_80e_kitti-3d-car/hv_second_secfpn_6x8_80e_kitti-3d-car_20200620_230238-393f000c.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/second/hv_second_secfpn_6x8_80e_kitti-3d-car/hv_second_secfpn_6x8_80e_kitti-3d-car_20200620_230238.log.json)|
| [SECFPN](./hv_second_secfpn_6x8_80e_kitti-3d-3class.py)| 3 Class |cyclic 80e|5.4||64.41|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/second/hv_second_secfpn_6x8_80e_kitti-3d-3class/hv_second_secfpn_6x8_80e_kitti-3d-3class_20200620_230238-9208083a.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/second/hv_second_secfpn_6x8_80e_kitti-3d-3class/hv_second_secfpn_6x8_80e_kitti-3d-3class_20200620_230238.log.json)|
### Waymo
| Backbone | Load Interval | Class | Lr schd | Mem (GB) | Inf time (fps) | mAP@L1 | mAPH@L1 | mAP@L2 | **mAPH@L2** | Download |
| :-------: | :-----------: |:-----:| :------:| :------: | :------------: | :----: | :-----: | :-----: | :-----: | :------: |
| [SECFPN](./hv_second_secfpn_sbn_2x16_2x_waymoD5-3d-3class.py)|5|3 Class|2x|8.12||65.3|61.7|58.9|55.7|[log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/second/hv_second_secfpn_sbn_4x8_2x_waymoD5-3d-3class/hv_second_secfpn_sbn_4x8_2x_waymoD5-3d-3class_20201115_112448.log.json)|
| above @ Car|||2x|8.12||67.1|66.6|58.7|58.2| |
| above @ Pedestrian|||2x|8.12||68.1|59.1|59.5|51.5| |
| above @ Cyclist|||2x|8.12||60.7|59.5|58.4|57.3| |
Note: See more details about the metrics and the Waymo data split [HERE](https://github.com/open-mmlab/mmdetection3d/tree/master/configs/pointpillars). For implementation details, we basically follow the original settings. All of these results are achieved without bells and whistles, e.g. ensembling, multi-scale training and test augmentation.
================================================
FILE: configs/second/hv_second_secfpn_6x8_80e_kitti-3d-3class.py
================================================
_base_ = [
'../_base_/models/hv_second_secfpn_kitti.py',
'../_base_/datasets/kitti-3d-3class.py',
'../_base_/schedules/cyclic_40e.py', '../_base_/default_runtime.py'
]
================================================
FILE: configs/second/hv_second_secfpn_6x8_80e_kitti-3d-car.py
================================================
_base_ = [
'../_base_/models/hv_second_secfpn_kitti.py',
'../_base_/datasets/kitti-3d-car.py', '../_base_/schedules/cyclic_40e.py',
'../_base_/default_runtime.py'
]
point_cloud_range = [0, -40, -3, 70.4, 40, 1]
model = dict(
bbox_head=dict(
type='Anchor3DHead',
num_classes=1,
anchor_generator=dict(
_delete_=True,
type='Anchor3DRangeGenerator',
ranges=[[0, -40.0, -1.78, 70.4, 40.0, -1.78]],
sizes=[[1.6, 3.9, 1.56]],
rotations=[0, 1.57],
reshape_out=True)),
# model training and testing settings
train_cfg=dict(
_delete_=True,
assigner=dict(
type='MaxIoUAssigner',
iou_calculator=dict(type='BboxOverlapsNearest3D'),
pos_iou_thr=0.6,
neg_iou_thr=0.45,
min_pos_iou=0.45,
ignore_iof_thr=-1),
allowed_border=0,
pos_weight=-1,
debug=False))
================================================
FILE: configs/second/hv_second_secfpn_sbn_2x16_2x_waymoD5-3d-3class.py
================================================
_base_ = [
'../_base_/models/hv_second_secfpn_waymo.py',
'../_base_/datasets/waymoD5-3d-3class.py',
'../_base_/schedules/schedule_2x.py',
'../_base_/default_runtime.py',
]
dataset_type = 'WaymoDataset'
data_root = 'data/waymo/kitti_format/'
class_names = ['Car', 'Pedestrian', 'Cyclist']
point_cloud_range = [-76.8, -51.2, -2, 76.8, 51.2, 4]
input_modality = dict(use_lidar=True, use_camera=False)
db_sampler = dict(
data_root=data_root,
info_path=data_root + 'waymo_dbinfos_train.pkl',
rate=1.0,
prepare=dict(
filter_by_difficulty=[-1],
filter_by_min_points=dict(Car=5, Pedestrian=5, Cyclist=5)),
classes=class_names,
sample_groups=dict(Car=15, Pedestrian=10, Cyclist=10),
points_loader=dict(
type='LoadPointsFromFile', load_dim=5, use_dim=[0, 1, 2, 3, 4]))
train_pipeline = [
dict(type='LoadPointsFromFile', coord_type='LIDAR', load_dim=6, use_dim=5),
dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True),
dict(type='ObjectSample', db_sampler=db_sampler),
dict(
type='RandomFlip3D',
sync_2d=False,
flip_ratio_bev_horizontal=0.5,
flip_ratio_bev_vertical=0.5),
dict(
type='GlobalRotScaleTrans',
rot_range=[-0.78539816, 0.78539816],
scale_ratio_range=[0.95, 1.05]),
dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
dict(type='PointShuffle'),
dict(type='DefaultFormatBundle3D', class_names=class_names),
dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
]
test_pipeline = [
dict(type='LoadPointsFromFile', coord_type='LIDAR', load_dim=6, use_dim=5),
dict(
type='MultiScaleFlipAug3D',
img_scale=(1333, 800),
pts_scale_ratio=1,
flip=False,
transforms=[
dict(
type='GlobalRotScaleTrans',
rot_range=[0, 0],
scale_ratio_range=[1., 1.],
translation_std=[0, 0, 0]),
dict(type='RandomFlip3D'),
dict(
type='PointsRangeFilter', point_cloud_range=point_cloud_range),
dict(
type='DefaultFormatBundle3D',
class_names=class_names,
with_label=False),
dict(type='Collect3D', keys=['points'])
])
]
data = dict(
samples_per_gpu=4,
workers_per_gpu=4,
train=dict(
type='RepeatDataset',
times=2,
dataset=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + 'waymo_infos_train.pkl',
split='training',
pipeline=train_pipeline,
modality=input_modality,
classes=class_names,
test_mode=False,
# we use box_type_3d='LiDAR' in kitti and nuscenes dataset
# and box_type_3d='Depth' in sunrgbd and scannet dataset.
box_type_3d='LiDAR',
# load one frame every five frames
load_interval=5)),
val=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + 'waymo_infos_val.pkl',
split='training',
pipeline=test_pipeline,
modality=input_modality,
classes=class_names,
test_mode=True,
box_type_3d='LiDAR'),
test=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + 'waymo_infos_val.pkl',
split='training',
pipeline=test_pipeline,
modality=input_modality,
classes=class_names,
test_mode=True,
box_type_3d='LiDAR'))
================================================
FILE: configs/sparsefusion_nusc_voxel_LC_SwinT.py
================================================
point_cloud_range = [-54.0, -54.0, -5.0, 54.0, 54.0, 3.0]
class_names = [
'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
]
voxel_size = [0.075, 0.075, 0.2]
out_size_factor = 8
evaluation = dict(interval=1)
dataset_type = 'NuScenesDataset_ViewInfo'
data_root = 'data/nuscenes/'
input_modality = dict(
use_lidar=True,
use_camera=True,
use_radar=False,
use_map=False,
use_external=False)
img_scale = (800, 448)
num_views = 6
img_norm_cfg = dict(mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
train_pipeline = [
dict(
type='LoadPointsFromFile',
coord_type='LIDAR',
load_dim=5,
use_dim=[0, 1, 2, 3, 4],
),
dict(
type='LoadPointsFromMultiSweeps',
sweeps_num=10,
use_dim=[0, 1, 2, 3, 4],
),
dict(type='MyLoadAnnotations3D', with_bbox_3d=True, with_label_3d=True, with_bbox=True, with_label=True, with_centers=True, with_cam_bbox=True, with_visible=True),
dict(type='LoadMultiViewImageFromFiles'),
dict(
type='OurGlobalRotScaleTrans',
rot_range=[-0.3925 * 2, 0.3925 * 2],
scale_ratio_range=[0.9, 1.1],
translation_std=[0.5, 0.5, 0.5],
),
dict(
type='OurRandomFlip3D',
sync_2d=False,
flip_ratio_bev_horizontal=0.5,
flip_ratio_bev_vertical=0.5),
# dict(type='PhotoMetricDistortionMultiViewImage', swap_channel=False), # color augmentation cannot improve the performance
dict(type='OurRandomAffine', scaling_ratio_range=(0.9, 1.1), flip_ratio=0.5, flip_sync_3d=True),
dict(type='MyResize', img_scale=img_scale, keep_ratio=True),
dict(type='MyNormalize', **img_norm_cfg),
dict(type='MyPad', size_divisor=32),
dict(type='SparseDepth', scale_factors=[4], exp_time=0),
dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
dict(type='OurObjectRangeFilter', point_cloud_range=point_cloud_range),
dict(type='ObjectNameFilter', classes=class_names),
dict(type='PointShuffle'),
dict(type='DefaultFormatBundle3D', class_names=class_names),
dict(type='Collect3D', keys=['points', 'img', 'gt_bboxes_3d', 'gt_labels_3d', 'gt_bboxes', 'gt_labels', 'gt_pts_centers_view', 'gt_img_centers_view', 'gt_bboxes_cam_view', 'gt_bboxes_lidar_view', 'sparse_depth', 'gt_visible_3d'])
]
test_pipeline = [
dict(
type='LoadPointsFromFile',
coord_type='LIDAR',
load_dim=5,
use_dim=[0, 1, 2, 3, 4],
),
dict(
type='LoadPointsFromMultiSweeps',
sweeps_num=10,
use_dim=[0, 1, 2, 3, 4],
),
dict(type='LoadMultiViewImageFromFiles'),
dict(
type='MultiScaleFlipAug3D',
img_scale=img_scale,
pts_scale_ratio=1,
flip=False,
transforms=[
dict(
type='GlobalRotScaleTrans',
rot_range=[0, 0],
scale_ratio_range=[1.0, 1.0],
translation_std=[0, 0, 0]),
dict(type='RandomFlip3D'),
dict(type='MyResize', img_scale=img_scale, keep_ratio=True),
dict(type='MyNormalize', **img_norm_cfg),
dict(type='MyPad', size_divisor=32),
dict(type='SparseDepth', scale_factors=[4]),
dict(
type='DefaultFormatBundle3D',
class_names=class_names,
with_label=False),
dict(type='Collect3D', keys=['points', 'img', 'sparse_depth'])
])
]
# Our default setting uses 4 GPUs with 3 samples per GPU; please keep the learning rate consistent with your batch size
data = dict(
samples_per_gpu=3,
workers_per_gpu=4,
train=dict(
type='CBGSDataset',
dataset=dict(
type=dataset_type,
data_root=data_root,
num_views=num_views,
ann_file=data_root + '/nuscenes_infos_w_views_train.pkl',
load_interval=1,
pipeline=train_pipeline,
classes=class_names,
modality=input_modality,
test_mode=False,
box_type_3d='LiDAR')),
val=dict(
type=dataset_type,
data_root=data_root,
num_views=num_views,
ann_file=data_root + '/nuscenes_infos_w_views_val.pkl',
load_interval=1,
pipeline=test_pipeline,
classes=class_names,
modality=input_modality,
test_mode=True,
box_type_3d='LiDAR'),
test=dict(
type=dataset_type,
data_root=data_root,
num_views=num_views,
ann_file=data_root + '/nuscenes_infos_w_views_val.pkl',
load_interval=1,
pipeline=test_pipeline,
classes=class_names,
modality=input_modality,
test_mode=True,
box_type_3d='LiDAR'))
model = dict(
type='SparseFusionDetector',
freeze_img=False,
img_backbone=dict(
type='SwinTransformer',
embed_dims=96,
depths=[2, 2, 6, 2],
num_heads=[3, 6, 12, 24],
window_size=7,
mlp_ratio=4,
qkv_bias=True,
qk_scale=None,
drop_rate=0.,
attn_drop_rate=0.,
drop_path_rate=0.2,
patch_norm=True,
out_indices=(0, 1, 2, 3),
with_cp=False,
convert_weights=True,
),
img_neck=dict(
type='FPN',
in_channels=[96, 192, 384, 768],
out_channels=256,
num_outs=5),
pts_voxel_layer=dict(
max_num_points=10,
voxel_size=voxel_size,
max_voxels=(120000, 160000),
point_cloud_range=point_cloud_range),
pts_voxel_encoder=dict(
type='HardSimpleVFE',
num_features=5,
),
pts_middle_encoder=dict(
type='SparseEncoder',
in_channels=5,
sparse_shape=[41, 1440, 1440],
output_channels=128,
order=('conv', 'norm', 'act'),
encoder_channels=((16, 16, 32), (32, 32, 64), (64, 64, 128), (128, 128)),
encoder_paddings=((0, 0, 1), (0, 0, 1), (0, 0, [0, 1, 1]), (0, 0)),
block_type='basicblock'),
pts_backbone=dict(
type='SECOND',
in_channels=256,
out_channels=[128, 256],
layer_nums=[5, 5],
layer_strides=[1, 2],
norm_cfg=dict(type='BN', eps=0.001, momentum=0.01),
conv_cfg=dict(type='Conv2d', bias=False)),
pts_neck=dict(
type='SECONDFPN',
in_channels=[128, 256],
out_channels=[256, 256],
upsample_strides=[1, 2],
norm_cfg=dict(type='BN', eps=0.001, momentum=0.01),
upsample_cfg=dict(type='deconv', bias=False),
use_conv_for_no_stride=True),
pts_bbox_head=dict(
type='SparseFusionHead2D_Deform',
num_views=num_views,
in_channels_img=256,
out_size_factor_img=4,
in_channels=256 * 2,
hidden_channel=128,
num_heads=8,
num_classes=len(class_names),
ffn_channel=256,
dropout=0.1,
bn_momentum=0.1,
activation='relu',
img_reg_bn=False,
img_reg_layer=3,
common_heads=dict(center=(2, 2), height=(1, 2), dim=(3, 2), rot=(2, 2), vel=(2, 2)),
num_proposals=200, # query number in the LiDAR branch
num_img_proposals=200, # query number in the camera branch
level_num=4,
num_pts_decoder_layers=1, # number of transformer layers in the point detector (if you set it >1, ensure it is consistent with your pretrained LiDAR-only model or set "freeze_lidar_detector=False")
num_img_decoder_layers=1, # number of transformer layers in the image detector
num_fusion_decoder_layers=1, # number of the transformer layers in the fusion stage
initialize_by_heatmap=True, # initialize the queries based on the heatmap (we never set it as False)
semantic_transfer=True, # whether to use semantic transfer (camera to LiDAR)
cross_only=True, # if false, output heatmap would be the average of semantic transfer and the LiDAR-only heatmap of TransFusion-L
cross_heatmap_layer=1,
nms_kernel_size=3, # suppress nearby proposals when initializing queries for the LiDAR branch
geometric_transfer=True, # whether to use geometric transfer
depth_input_channel=2, # channel number of depth features. Do not change it unless you modify the SparseDepth class in "mmdet3d/datasets/pipelines/loading.py"
img_heatmap_layer=2,
img_nms_kernel_size=3, # suppress nearby proposals when initializing queries for the camera branch
view_transform=True, # whether to transform the coordinate for the output bboxes of the camera branch
use_camera='se', # "se" or None: whether to encode the camera parameters in the view transformation
bbox_coder=dict(
type='TransFusionBBoxCoder',
pc_range=point_cloud_range[:2],
voxel_size=voxel_size[:2],
out_size_factor=out_size_factor,
post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
score_threshold=0.0,
code_size=10,
),
bbox_2d_coder=dict(
type='CameraBBoxCoder',
code_size=10,
),
loss_cls=dict(type='FocalLoss', use_sigmoid=True, gamma=2, alpha=0.25, reduction='mean', loss_weight=1.0),
loss_bbox=dict(type='L1Loss', reduction='mean', loss_weight=0.25),
loss_heatmap=dict(type='GaussianFocalLoss', reduction='mean', loss_weight=0.1),
loss_heatmap_2d=dict(type='GaussianFocalLoss', reduction='mean', loss_weight=0.1),
loss_center_2d=dict(type='L1Loss', reduction='mean', loss_weight=5.0),
),
train_cfg=dict(
pts=dict(
dataset='nuScenes',
assigner=dict(
type='HungarianAssigner3D',
iou_calculator=dict(type='BboxOverlaps3D', coordinate='lidar'),
cls_cost=dict(type='FocalLossCost', gamma=2, alpha=0.25, weight=0.15),
reg_cost=dict(type='BBoxBEVL1Cost', weight=0.25),
iou_cost=dict(type='IoU3DCost', weight=0.25)
),
assigner_2d=dict(
type='HungarianAssignerCameraBox',
iou_calculator=dict(type='BboxOverlaps3D', coordinate='camera'),
cls_cost=dict(type='FocalLossCost', gamma=2, alpha=0.25, weight=0.15),
reg_cost=dict(type='BBoxBEVL1Cost', weight=0.25),
iou_cost=dict(type='IoU3DCost', weight=0.25),
),
pos_weight=-1,
gaussian_overlap=0.1,
gaussian_overlap_2d=0.1,
min_radius=2,
max_radius=999,
grid_size=[1440, 1440, 40], # [x_len, y_len, 1]
voxel_size=voxel_size,
out_size_factor=out_size_factor,
code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2],
img_code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2],
point_cloud_range=point_cloud_range)),
test_cfg=dict(
pts=dict(
dataset='nuScenes',
grid_size=[1440, 1440, 40],
img_scale=img_scale,
out_size_factor=out_size_factor,
pc_range=point_cloud_range,
voxel_size=voxel_size,
nms_type='circle',
)))
optimizer = dict(
type='AdamW',
lr=0.000075,
weight_decay=0.01,
paramwise_cfg=dict(
custom_keys={
'img_backbone': dict(lr_mult=0.1, decay_mult=5),
'img_neck': dict(lr_mult=0.1),
'pts_voxel_layer': dict(lr_mult=0.1),
'pts_voxel_encoder': dict(lr_mult=0.1),
'pts_middle_encoder': dict(lr_mult=0.1),
'pts_backbone': dict(lr_mult=0.1),
'pts_neck': dict(lr_mult=0.1),
'pts_bbox_head.point_transformer': dict(lr_mult=0.1),
'pts_bbox_head.class_encoding': dict(lr_mult=0.1),
'pts_bbox_head.heatmap_head': dict(lr_mult=0.1),
'pts_bbox_head.shared_conv': dict(lr_mult=0.1),
'absolute_pos_embed': dict(decay_mult=0.),
'relative_position_bias_table': dict(decay_mult=0.),
'norm': dict(decay_mult=0.)
}),
) # for 4gpu * 3sample_per_gpu
optimizer_config = dict(grad_clip=dict(max_norm=0.1, norm_type=2))
lr_config = dict(
policy='cyclic',
target_ratio=(8, 0.0001),
cyclic_times=1,
step_ratio_up=0.4,
)
momentum_config = dict(
policy='cyclic',
target_ratio=(0.8947368421052632, 1),
cyclic_times=1,
step_ratio_up=0.4)
total_epochs = 6
checkpoint_config = dict(interval=1)
log_config = dict(
interval=50,
hooks=[dict(type='TextLoggerHook'),
dict(type='TensorboardLoggerHook')])
dist_params = dict(backend='nccl')
log_level = 'INFO'
work_dir = None
load_from = 'checkpoints/sparsefusion_voxel0075_SwinT_initial.pth'
resume_from = None
workflow = [('train', 1)]
gpu_ids = range(0, 8)
freeze_lidar_components = True # freeze the LiDAR backbone
freeze_lidar_detector = True # freeze the LiDAR detector
find_unused_parameters = True
# Evaluating bboxes of pts_bbox
# mAP: 0.7102
# mATE: 0.2778
# mASE: 0.2477
# mAOE: 0.2701
# mAVE: 0.2529
# mAAE: 0.1881
# NDS: 0.7314
# Eval time: 133.6s
#
# Per-class results:
# Object Class AP ATE ASE AOE AVE AAE
# car 0.883 0.171 0.147 0.067 0.263 0.184
# truck 0.651 0.306 0.176 0.078 0.230 0.216
# bus 0.777 0.306 0.178 0.043 0.396 0.256
# trailer 0.453 0.527 0.211 0.466 0.184 0.164
# construction_vehicle 0.308 0.686 0.420 0.857 0.124 0.316
# pedestrian 0.897 0.128 0.280 0.328 0.215 0.099
# motorcycle 0.823 0.188 0.236 0.216 0.421 0.254
# bicycle 0.727 0.164 0.262 0.314 0.189 0.016
# traffic_cone 0.803 0.118 0.298 nan nan nan
# barrier 0.779 0.185 0.269 0.060 nan nan
================================================
FILE: configs/sparsefusion_nusc_voxel_LC_r50.py
================================================
point_cloud_range = [-54.0, -54.0, -5.0, 54.0, 54.0, 3.0]
class_names = [
'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
]
voxel_size = [0.075, 0.075, 0.2]
out_size_factor = 8
evaluation = dict(interval=1)
dataset_type = 'NuScenesDataset_ViewInfo'
data_root = 'data/nuscenes/'
input_modality = dict(
use_lidar=True,
use_camera=True,
use_radar=False,
use_map=False,
use_external=False)
img_scale = (800, 448)
num_views = 6
img_norm_cfg = dict(mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
train_pipeline = [
dict(
type='LoadPointsFromFile',
coord_type='LIDAR',
load_dim=5,
use_dim=[0, 1, 2, 3, 4],
),
dict(
type='LoadPointsFromMultiSweeps',
sweeps_num=10,
use_dim=[0, 1, 2, 3, 4],
),
dict(type='MyLoadAnnotations3D', with_bbox_3d=True, with_label_3d=True, with_bbox=True, with_label=True, with_centers=True, with_cam_bbox=True, with_visible=True),
dict(type='LoadMultiViewImageFromFiles'),
dict(
type='OurGlobalRotScaleTrans',
rot_range=[-0.3925 * 2, 0.3925 * 2],
scale_ratio_range=[0.9, 1.1],
translation_std=[0.5, 0.5, 0.5],
),
dict(
type='OurRandomFlip3D',
sync_2d=False,
flip_ratio_bev_horizontal=0.5,
flip_ratio_bev_vertical=0.5),
# dict(type='PhotoMetricDistortionMultiViewImage', swap_channel=False), # color augmentation cannot improve the performance
dict(type='OurRandomAffine', scaling_ratio_range=(0.9, 1.1), flip_ratio=0.5, flip_sync_3d=True),
dict(type='MyResize', img_scale=img_scale, keep_ratio=True),
dict(type='MyNormalize', **img_norm_cfg),
dict(type='MyPad', size_divisor=32),
dict(type='SparseDepth', scale_factors=[4], exp_time=0),
dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
dict(type='OurObjectRangeFilter', point_cloud_range=point_cloud_range),
dict(type='ObjectNameFilter', classes=class_names),
dict(type='PointShuffle'),
dict(type='DefaultFormatBundle3D', class_names=class_names),
dict(type='Collect3D', keys=['points', 'img', 'gt_bboxes_3d', 'gt_labels_3d', 'gt_bboxes', 'gt_labels', 'gt_pts_centers_view', 'gt_img_centers_view', 'gt_bboxes_cam_view', 'gt_bboxes_lidar_view', 'sparse_depth', 'gt_visible_3d'])
]
test_pipeline = [
dict(
type='LoadPointsFromFile',
coord_type='LIDAR',
load_dim=5,
use_dim=[0, 1, 2, 3, 4],
),
dict(
type='LoadPointsFromMultiSweeps',
sweeps_num=10,
use_dim=[0, 1, 2, 3, 4],
),
dict(type='LoadMultiViewImageFromFiles'),
dict(
type='MultiScaleFlipAug3D',
img_scale=img_scale,
pts_scale_ratio=1,
flip=False,
transforms=[
dict(
type='GlobalRotScaleTrans',
rot_range=[0, 0],
scale_ratio_range=[1.0, 1.0],
translation_std=[0, 0, 0]),
dict(type='RandomFlip3D'),
dict(type='MyResize', img_scale=img_scale, keep_ratio=True),
dict(type='MyNormalize', **img_norm_cfg),
dict(type='MyPad', size_divisor=32),
dict(type='SparseDepth', scale_factors=[4]),
dict(
type='DefaultFormatBundle3D',
class_names=class_names,
with_label=False),
dict(type='Collect3D', keys=['points', 'img', 'sparse_depth'])
])
]
# Our default setting uses 4 GPUs with 4 samples per GPU; please keep the learning rate consistent with your batch size
data = dict(
samples_per_gpu=4,
workers_per_gpu=4,
train=dict(
type='CBGSDataset',
dataset=dict(
type=dataset_type,
data_root=data_root,
num_views=num_views,
ann_file=data_root + '/nuscenes_infos_w_views_train.pkl',
load_interval=1,
pipeline=train_pipeline,
classes=class_names,
modality=input_modality,
test_mode=False,
box_type_3d='LiDAR')),
val=dict(
type=dataset_type,
data_root=data_root,
num_views=num_views,
ann_file=data_root + '/nuscenes_infos_w_views_val.pkl',
load_interval=1,
pipeline=test_pipeline,
classes=class_names,
modality=input_modality,
test_mode=True,
box_type_3d='LiDAR'),
test=dict(
type=dataset_type,
data_root=data_root,
num_views=num_views,
ann_file=data_root + '/nuscenes_infos_w_views_val.pkl',
load_interval=1,
pipeline=test_pipeline,
classes=class_names,
modality=input_modality,
test_mode=True,
box_type_3d='LiDAR'))
model = dict(
type='SparseFusionDetector',
freeze_img=False,
img_backbone=dict(
type='ResNet',
depth=50,
num_stages=4,
out_indices=(0, 1, 2, 3),
frozen_stages=1,
in_channels=3,
norm_cfg=dict(type='BN', requires_grad=True),
norm_eval=True,
style='pytorch',
),
img_neck=dict(
type='FPN',
in_channels=[256, 512, 1024, 2048],
out_channels=256,
num_outs=5),
pts_voxel_layer=dict(
max_num_points=10,
voxel_size=voxel_size,
max_voxels=(120000, 160000),
point_cloud_range=point_cloud_range),
pts_voxel_encoder=dict(
type='HardSimpleVFE',
num_features=5,
),
pts_middle_encoder=dict(
type='SparseEncoder',
in_channels=5,
sparse_shape=[41, 1440, 1440],
output_channels=128,
order=('conv', 'norm', 'act'),
encoder_channels=((16, 16, 32), (32, 32, 64), (64, 64, 128), (128, 128)),
encoder_paddings=((0, 0, 1), (0, 0, 1), (0, 0, [0, 1, 1]), (0, 0)),
block_type='basicblock'),
pts_backbone=dict(
type='SECOND',
in_channels=256,
out_channels=[128, 256],
layer_nums=[5, 5],
layer_strides=[1, 2],
norm_cfg=dict(type='BN', eps=0.001, momentum=0.01),
conv_cfg=dict(type='Conv2d', bias=False)),
pts_neck=dict(
type='SECONDFPN',
in_channels=[128, 256],
out_channels=[256, 256],
upsample_strides=[1, 2],
norm_cfg=dict(type='BN', eps=0.001, momentum=0.01),
upsample_cfg=dict(type='deconv', bias=False),
use_conv_for_no_stride=True),
pts_bbox_head=dict(
type='SparseFusionHead2D_Deform',
num_views=num_views,
in_channels_img=256,
out_size_factor_img=4,
in_channels=256 * 2,
hidden_channel=128,
num_heads=8,
num_classes=len(class_names),
ffn_channel=256,
dropout=0.1,
bn_momentum=0.1,
activation='relu',
img_reg_bn=False,
img_reg_layer=3,
common_heads=dict(center=(2, 2), height=(1, 2), dim=(3, 2), rot=(2, 2), vel=(2, 2)),
num_proposals=200, # query number in the LiDAR branch
num_img_proposals=200, # query number in the camera branch
level_num=4,
num_pts_decoder_layers=1, # number of transformer layers in the point detector (if you set it >1, ensure it is consistent with your pretrained LiDAR-only model or set "freeze_lidar_detector=False")
num_img_decoder_layers=1, # number of transformer layers in the image detector
num_fusion_decoder_layers=1, # number of the transformer layers in the fusion stage
initialize_by_heatmap=True, # initialize the queries based on the heatmap (we never set it as False)
semantic_transfer=True, # whether to use semantic transfer (camera to LiDAR)
cross_only=True, # if false, output heatmap would be the average of semantic transfer and the LiDAR-only heatmap of TransFusion-L
cross_heatmap_layer=1,
nms_kernel_size=3, # suppress nearby proposals when initializing queries for the LiDAR branch
geometric_transfer=True, # whether to use geometric transfer
depth_input_channel=2, # channel number of depth features. Do not change it unless you modify the SparseDepth class in "mmdet3d/datasets/pipelines/loading.py"
img_heatmap_layer=2,
img_nms_kernel_size=3, # suppress nearby proposals when initializing queries for the camera branch
view_transform=True, # whether to transform the coordinate for the output bboxes of the camera branch
use_camera='se', # "se" or None: whether to encode the camera parameters in the view transformation
bbox_coder=dict(
type='TransFusionBBoxCoder',
pc_range=point_cloud_range[:2],
voxel_size=voxel_size[:2],
out_size_factor=out_size_factor,
post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
score_threshold=0.0,
code_size=10,
),
bbox_2d_coder=dict(
type='CameraBBoxCoder',
code_size=10,
),
loss_cls=dict(type='FocalLoss', use_sigmoid=True, gamma=2, alpha=0.25, reduction='mean', loss_weight=1.0),
loss_bbox=dict(type='L1Loss', reduction='mean', loss_weight=0.25),
loss_heatmap=dict(type='GaussianFocalLoss', reduction='mean', loss_weight=0.1),
loss_heatmap_2d=dict(type='GaussianFocalLoss', reduction='mean', loss_weight=0.1),
loss_center_2d=dict(type='L1Loss', reduction='mean', loss_weight=5.0),
),
train_cfg=dict(
pts=dict(
dataset='nuScenes',
assigner=dict(
type='HungarianAssigner3D',
iou_calculator=dict(type='BboxOverlaps3D', coordinate='lidar'),
cls_cost=dict(type='FocalLossCost', gamma=2, alpha=0.25, weight=0.15),
reg_cost=dict(type='BBoxBEVL1Cost', weight=0.25),
iou_cost=dict(type='IoU3DCost', weight=0.25)
),
assigner_2d=dict(
type='HungarianAssignerCameraBox',
iou_calculator=dict(type='BboxOverlaps3D', coordinate='camera'),
cls_cost=dict(type='FocalLossCost', gamma=2, alpha=0.25, weight=0.15),
reg_cost=dict(type='BBoxBEVL1Cost', weight=0.25),
iou_cost=dict(type='IoU3DCost', weight=0.25),
),
pos_weight=-1,
gaussian_overlap=0.1,
gaussian_overlap_2d=0.1,
min_radius=2,
max_radius=999,
grid_size=[1440, 1440, 40], # [x_len, y_len, 1]
voxel_size=voxel_size,
out_size_factor=out_size_factor,
code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2],
img_code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2],
point_cloud_range=point_cloud_range)),
test_cfg=dict(
pts=dict(
dataset='nuScenes',
grid_size=[1440, 1440, 40],
img_scale=img_scale,
out_size_factor=out_size_factor,
pc_range=point_cloud_range,
voxel_size=voxel_size,
nms_type='circle',
)))
optimizer = dict(
type='AdamW',
lr=0.0001,
weight_decay=0.01,
paramwise_cfg=dict(
custom_keys={
'img_backbone': dict(lr_mult=0.1),
'img_neck': dict(lr_mult=0.1),
'pts_voxel_layer': dict(lr_mult=0.1),
'pts_voxel_encoder': dict(lr_mult=0.1),
'pts_middle_encoder': dict(lr_mult=0.1),
'pts_backbone': dict(lr_mult=0.1),
'pts_neck': dict(lr_mult=0.1),
'pts_bbox_head.point_transformer': dict(lr_mult=0.1),
'pts_bbox_head.class_encoding': dict(lr_mult=0.1),
'pts_bbox_head.heatmap_head': dict(lr_mult=0.1),
'pts_bbox_head.shared_conv': dict(lr_mult=0.1),
}),
) # for 4gpu * 4sample_per_gpu
optimizer_config = dict(grad_clip=dict(max_norm=0.1, norm_type=2))
lr_config = dict(
policy='cyclic',
target_ratio=(8, 0.0001),
cyclic_times=1,
step_ratio_up=0.4,
)
momentum_config = dict(
policy='cyclic',
target_ratio=(0.8947368421052632, 1),
cyclic_times=1,
step_ratio_up=0.4)
total_epochs = 6
checkpoint_config = dict(interval=1)
log_config = dict(
interval=50,
hooks=[dict(type='TextLoggerHook'),
dict(type='TensorboardLoggerHook')])
dist_params = dict(backend='nccl')
log_level = 'INFO'
work_dir = None
load_from = 'checkpoints/sparsefusion_voxel0075_R50_initial.pth'
resume_from = None
workflow = [('train', 1)]
gpu_ids = range(0, 8)
freeze_lidar_components = True # freeze the LiDAR backbone
freeze_lidar_detector = True # freeze the LiDAR detector
find_unused_parameters = True
# Evaluating bboxes of pts_bbox
# mAP: 0.7051
# mATE: 0.2757
# mASE: 0.2506
# mAOE: 0.2767
# mAVE: 0.2562
# mAAE: 0.1869
# NDS: 0.7279
# Eval time: 137.2s
#
# Per-class results:
# Object Class AP ATE ASE AOE AVE AAE
# car 0.883 0.171 0.146 0.066 0.262 0.187
# truck 0.643 0.305 0.177 0.071 0.235 0.211
# bus 0.775 0.304 0.177 0.044 0.411 0.250
# trailer 0.447 0.522 0.214 0.432 0.179 0.159
# construction_vehicle 0.303 0.669 0.424 0.842 0.127 0.326
# pedestrian 0.898 0.127 0.282 0.329 0.216 0.104
# motorcycle 0.810 0.189 0.241 0.215 0.426 0.249
# bicycle 0.712 0.164 0.263 0.422 0.193 0.010
# traffic_cone 0.808 0.118 0.309 nan nan nan
# barrier 0.772 0.188 0.273 0.068 nan nan
================================================
FILE: configs/ssn/README.md
================================================
# SSN: Shape Signature Networks for Multi-class Object Detection from Point Clouds
## Introduction
[ALGORITHM]
We implement PointPillars with the shape-aware grouping heads used in SSN and provide the results and checkpoints on the nuScenes and Lyft datasets.
```
@inproceedings{zhu2020ssn,
title={SSN: Shape Signature Networks for Multi-class Object Detection from Point Clouds},
author={Zhu, Xinge and Ma, Yuexin and Wang, Tai and Xu, Yan and Shi, Jianping and Lin, Dahua},
booktitle={Proceedings of the European Conference on Computer Vision},
year={2020}
}
```
## Results
### NuScenes
| Backbone | Lr schd | Mem (GB) | Inf time (fps) | mAP | NDS | Download |
| :---------: | :-----: | :------: | :------------: | :----: |:----: | :------: |
|[SECFPN](../pointpillars/hv_pointpillars_secfpn_sbn-all_4x8_2x_nus-3d.py)|2x|16.4||35.17|49.76|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/pointpillars/hv_pointpillars_secfpn_sbn-all_4x8_2x_nus-3d/hv_pointpillars_secfpn_sbn-all_4x8_2x_nus-3d_20200620_230725-0817d270.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/pointpillars/hv_pointpillars_secfpn_sbn-all_4x8_2x_nus-3d/hv_pointpillars_secfpn_sbn-all_4x8_2x_nus-3d_20200620_230725.log.json)|
|[SSN](./hv_ssn_secfpn_sbn-all_2x16_2x_nus-3d.py)|2x|9.62||41.56|54.83|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/ssn/hv_ssn_secfpn_sbn-all_2x16_2x_nus-3d/hv_ssn_secfpn_sbn-all_2x16_2x_nus-3d_20201023_193737-5fda3f00.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/ssn/hv_ssn_secfpn_sbn-all_2x16_2x_nus-3d/hv_ssn_secfpn_sbn-all_2x16_2x_nus-3d_20201023_193737.log.json)|
|[RegNetX-400MF-SECFPN](../regnet/hv_pointpillars_regnet-400mf_secfpn_sbn-all_4x8_2x_nus-3d.py)|2x|16.4||41.15|55.20|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/regnet/hv_pointpillars_regnet-400mf_secfpn_sbn-all_4x8_2x_nus-3d/hv_pointpillars_regnet-400mf_secfpn_sbn-all_4x8_2x_nus-3d_20200620_230334-53044f32.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/regnet/hv_pointpillars_regnet-400mf_secfpn_sbn-all_4x8_2x_nus-3d/hv_pointpillars_regnet-400mf_secfpn_sbn-all_4x8_2x_nus-3d_20200620_230334.log.json)|
|[RegNetX-400MF-SSN](./hv_ssn_regnet-400mf_secfpn_sbn-all_2x16_2x_nus-3d.py)|2x|10.26||46.95|58.24|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/ssn/hv_ssn_regnet-400mf_secfpn_sbn-all_2x16_2x_nus-3d/hv_ssn_regnet-400mf_secfpn_sbn-all_2x16_2x_nus-3d_20201024_232447-7af3d8c8.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/ssn/hv_ssn_regnet-400mf_secfpn_sbn-all_2x16_2x_nus-3d/hv_ssn_regnet-400mf_secfpn_sbn-all_2x16_2x_nus-3d_20201024_232447.log.json)|
### Lyft
| Backbone | Lr schd | Mem (GB) | Inf time (fps) | Private Score | Public Score | Download |
| :---------: | :-----: | :------: | :------------: | :----: |:----: | :------: |
|[SECFPN](../pointpillars/hv_pointpillars_secfpn_sbn-all_4x8_2x_lyft-3d.py)|2x|||13.4|13.4||
|[SSN](./hv_ssn_secfpn_sbn-all_2x16_2x_lyft-3d.py)|2x|8.30||17.4|17.5|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/ssn/hv_ssn_secfpn_sbn-all_2x16_2x_lyft-3d/hv_ssn_secfpn_sbn-all_2x16_2x_lyft-3d_20201016_220844-3058d9fc.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/ssn/hv_ssn_secfpn_sbn-all_2x16_2x_lyft-3d/hv_ssn_secfpn_sbn-all_2x16_2x_lyft-3d_20201016_220844.log.json)|
|[RegNetX-400MF-SSN](./hv_ssn_regnet-400mf_secfpn_sbn-all_1x16_2x_lyft-3d.py)|2x|9.98||18.1|18.3|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/ssn/hv_ssn_regnet-400mf_secfpn_sbn-all_2x16_2x_lyft-3d/hv_ssn_regnet-400mf_secfpn_sbn-all_2x16_2x_lyft-3d_20201025_213155-4532096c.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/ssn/hv_ssn_regnet-400mf_secfpn_sbn-all_2x16_2x_lyft-3d/hv_ssn_regnet-400mf_secfpn_sbn-all_2x16_2x_lyft-3d_20201025_213155.log.json)|
Note:
The main difference between the shape-aware grouping heads and the original SECOND FPN heads is that the former group objects with similar sizes and shapes together and design a shape-specific head for each group. Heavier heads (with more convolutions and larger strides) are used for large objects, while lighter heads are used for small objects. Note that the outputs of different groups may have different feature-map sizes, so an anchor generator tailored to these feature maps is also needed in the implementation (a rough sketch follows below).
Users could try other settings for the head design. Our implementation basically follows the one [HERE](https://github.com/xinge008/SSN).
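As a rough illustration (not part of the configs), the sketch below shows why the per-task strides lead to different output grid sizes and hence require a per-class anchor generator. The class groups mirror the nuScenes config in this folder; `neck_size = 200` is only an assumed example value for the BEV feature map after the SECONDFPN neck.
```python
# Rough sketch: each task head applies its own shared conv strides to the neck
# output, so stride-2 groups predict on a coarser BEV grid than stride-1 groups,
# which is why a per-class (per-task) anchor generator is required.
neck_size = 200  # assumed BEV feature-map size after the neck

tasks = {
    'bicycle/motorcycle': (1, 1),
    'pedestrian': (1, 1),
    'traffic_cone/barrier': (1, 1),
    'car': (2, 1, 1),
    'truck/trailer/bus/construction_vehicle': (2, 1, 1),
}

for group, strides in tasks.items():
    out = neck_size
    for stride in strides:
        out //= stride
    print(f'{group}: {out}x{out} output grid')  # e.g. 200x200 vs. 100x100
```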
================================================
FILE: configs/ssn/hv_ssn_regnet-400mf_secfpn_sbn-all_1x16_2x_lyft-3d.py
================================================
_base_ = './hv_ssn_secfpn_sbn-all_2x16_2x_lyft-3d.py'
# model settings
model = dict(
type='MVXFasterRCNN',
pretrained=dict(pts='open-mmlab://regnetx_400mf'),
pts_backbone=dict(
_delete_=True,
type='NoStemRegNet',
arch=dict(w0=24, wa=24.48, wm=2.54, group_w=16, depth=22, bot_mul=1.0),
out_indices=(1, 2, 3),
frozen_stages=-1,
strides=(1, 2, 2, 2),
base_channels=64,
stem_channels=64,
norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01),
norm_eval=False,
style='pytorch'),
pts_neck=dict(in_channels=[64, 160, 384]))
# dataset settings
data = dict(samples_per_gpu=1, workers_per_gpu=2)
================================================
FILE: configs/ssn/hv_ssn_regnet-400mf_secfpn_sbn-all_2x16_2x_nus-3d.py
================================================
_base_ = './hv_ssn_secfpn_sbn-all_2x16_2x_nus-3d.py'
# model settings
model = dict(
type='MVXFasterRCNN',
pretrained=dict(pts='open-mmlab://regnetx_400mf'),
pts_backbone=dict(
_delete_=True,
type='NoStemRegNet',
arch=dict(w0=24, wa=24.48, wm=2.54, group_w=16, depth=22, bot_mul=1.0),
out_indices=(1, 2, 3),
frozen_stages=-1,
strides=(1, 2, 2, 2),
base_channels=64,
stem_channels=64,
norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01),
norm_eval=False,
style='pytorch'),
pts_neck=dict(in_channels=[64, 160, 384]))
================================================
FILE: configs/ssn/hv_ssn_secfpn_sbn-all_2x16_2x_lyft-3d.py
================================================
_base_ = [
'../_base_/models/hv_pointpillars_fpn_lyft.py',
'../_base_/datasets/lyft-3d.py',
'../_base_/schedules/schedule_2x.py',
'../_base_/default_runtime.py',
]
point_cloud_range = [-100, -100, -5, 100, 100, 3]
# Note that the order of class names should be consistent with
# the following anchors' order
class_names = [
'bicycle', 'motorcycle', 'pedestrian', 'animal', 'car',
'emergency_vehicle', 'bus', 'other_vehicle', 'truck'
]
train_pipeline = [
dict(type='LoadPointsFromFile', coord_type='LIDAR', load_dim=5, use_dim=5),
dict(type='LoadPointsFromMultiSweeps', sweeps_num=10),
dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True),
dict(
type='GlobalRotScaleTrans',
rot_range=[-0.3925, 0.3925],
scale_ratio_range=[0.95, 1.05],
translation_std=[0, 0, 0]),
dict(
type='RandomFlip3D',
sync_2d=False,
flip_ratio_bev_horizontal=0.5,
flip_ratio_bev_vertical=0.5),
dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
dict(type='PointShuffle'),
dict(type='DefaultFormatBundle3D', class_names=class_names),
dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
]
test_pipeline = [
dict(type='LoadPointsFromFile', coord_type='LIDAR', load_dim=5, use_dim=5),
dict(type='LoadPointsFromMultiSweeps', sweeps_num=10),
dict(
type='MultiScaleFlipAug3D',
img_scale=(1333, 800),
pts_scale_ratio=1,
flip=False,
transforms=[
dict(
type='GlobalRotScaleTrans',
rot_range=[0, 0],
scale_ratio_range=[1., 1.],
translation_std=[0, 0, 0]),
dict(type='RandomFlip3D'),
dict(
type='PointsRangeFilter', point_cloud_range=point_cloud_range),
dict(
type='DefaultFormatBundle3D',
class_names=class_names,
with_label=False),
dict(type='Collect3D', keys=['points'])
])
]
data = dict(
samples_per_gpu=2,
workers_per_gpu=4,
train=dict(pipeline=train_pipeline, classes=class_names),
val=dict(pipeline=test_pipeline, classes=class_names),
test=dict(pipeline=test_pipeline, classes=class_names))
# model settings
model = dict(
pts_voxel_layer=dict(point_cloud_range=[-100, -100, -5, 100, 100, 3]),
pts_voxel_encoder=dict(
feat_channels=[32, 64],
point_cloud_range=[-100, -100, -5, 100, 100, 3]),
pts_middle_encoder=dict(output_shape=[800, 800]),
pts_neck=dict(
_delete_=True,
type='SECONDFPN',
norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01),
in_channels=[64, 128, 256],
upsample_strides=[1, 2, 4],
out_channels=[128, 128, 128]),
pts_bbox_head=dict(
_delete_=True,
type='ShapeAwareHead',
num_classes=9,
in_channels=384,
feat_channels=384,
use_direction_classifier=True,
anchor_generator=dict(
type='AlignedAnchor3DRangeGeneratorPerCls',
ranges=[[-100, -100, -1.0709302, 100, 100, -1.0709302],
[-100, -100, -1.3220503, 100, 100, -1.3220503],
[-100, -100, -0.9122268, 100, 100, -0.9122268],
[-100, -100, -1.8012227, 100, 100, -1.8012227],
[-100, -100, -1.0715024, 100, 100, -1.0715024],
[-100, -100, -0.8871424, 100, 100, -0.8871424],
[-100, -100, -0.3519405, 100, 100, -0.3519405],
[-100, -100, -0.6276341, 100, 100, -0.6276341],
[-100, -100, -0.3033737, 100, 100, -0.3033737]],
sizes=[
[0.63, 1.76, 1.44], # bicycle
[0.96, 2.35, 1.59], # motorcycle
[0.76, 0.80, 1.76], # pedestrian
[0.35, 0.73, 0.50], # animal
[1.92, 4.75, 1.71], # car
[2.42, 6.52, 2.34], # emergency vehicle
[2.92, 12.70, 3.42], # bus
[2.75, 8.17, 3.20], # other vehicle
[2.84, 10.24, 3.44] # truck
],
custom_values=[],
rotations=[0, 1.57],
reshape_out=False),
tasks=[
dict(
num_class=2,
class_names=['bicycle', 'motorcycle'],
shared_conv_channels=(64, 64),
shared_conv_strides=(1, 1),
norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01)),
dict(
num_class=2,
class_names=['pedestrian', 'animal'],
shared_conv_channels=(64, 64),
shared_conv_strides=(1, 1),
norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01)),
dict(
num_class=2,
class_names=['car', 'emergency_vehicle'],
shared_conv_channels=(64, 64, 64),
shared_conv_strides=(2, 1, 1),
norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01)),
dict(
num_class=3,
class_names=['bus', 'other_vehicle', 'truck'],
shared_conv_channels=(64, 64, 64),
shared_conv_strides=(2, 1, 1),
norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01))
],
assign_per_class=True,
diff_rad_by_sin=True,
dir_offset=0.7854, # pi/4
dir_limit_offset=0,
bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder', code_size=7),
loss_cls=dict(
type='FocalLoss',
use_sigmoid=True,
gamma=2.0,
alpha=0.25,
loss_weight=1.0),
loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0),
loss_dir=dict(
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.2)))
# model training and testing settings
train_cfg = dict(
_delete_=True,
pts=dict(
assigner=[
dict( # bicycle
type='MaxIoUAssigner',
iou_calculator=dict(type='BboxOverlapsNearest3D'),
pos_iou_thr=0.55,
neg_iou_thr=0.4,
min_pos_iou=0.4,
ignore_iof_thr=-1),
dict( # motorcycle
type='MaxIoUAssigner',
iou_calculator=dict(type='BboxOverlapsNearest3D'),
pos_iou_thr=0.55,
neg_iou_thr=0.4,
min_pos_iou=0.4,
ignore_iof_thr=-1),
dict( # pedestrian
type='MaxIoUAssigner',
iou_calculator=dict(type='BboxOverlapsNearest3D'),
pos_iou_thr=0.55,
neg_iou_thr=0.4,
min_pos_iou=0.4,
ignore_iof_thr=-1),
dict( # animal
type='MaxIoUAssigner',
iou_calculator=dict(type='BboxOverlapsNearest3D'),
pos_iou_thr=0.55,
neg_iou_thr=0.4,
min_pos_iou=0.4,
ignore_iof_thr=-1),
dict( # car
type='MaxIoUAssigner',
iou_calculator=dict(type='BboxOverlapsNearest3D'),
pos_iou_thr=0.6,
neg_iou_thr=0.45,
min_pos_iou=0.45,
ignore_iof_thr=-1),
dict( # emergency vehicle
type='MaxIoUAssigner',
iou_calculator=dict(type='BboxOverlapsNearest3D'),
pos_iou_thr=0.55,
neg_iou_thr=0.4,
min_pos_iou=0.4,
ignore_iof_thr=-1),
dict( # bus
type='MaxIoUAssigner',
iou_calculator=dict(type='BboxOverlapsNearest3D'),
pos_iou_thr=0.6,
neg_iou_thr=0.45,
min_pos_iou=0.45,
ignore_iof_thr=-1),
dict( # other vehicle
type='MaxIoUAssigner',
iou_calculator=dict(type='BboxOverlapsNearest3D'),
pos_iou_thr=0.55,
neg_iou_thr=0.4,
min_pos_iou=0.4,
ignore_iof_thr=-1),
dict( # truck
type='MaxIoUAssigner',
iou_calculator=dict(type='BboxOverlapsNearest3D'),
pos_iou_thr=0.6,
neg_iou_thr=0.45,
min_pos_iou=0.45,
ignore_iof_thr=-1)
],
allowed_border=0,
code_weight=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
pos_weight=-1,
debug=False))
================================================
FILE: configs/ssn/hv_ssn_secfpn_sbn-all_2x16_2x_nus-3d.py
================================================
_base_ = [
'../_base_/models/hv_pointpillars_fpn_nus.py',
'../_base_/datasets/nus-3d.py',
'../_base_/schedules/schedule_2x.py',
'../_base_/default_runtime.py',
]
# Note that the order of class names should be consistent with
# the following anchors' order
point_cloud_range = [-50, -50, -5, 50, 50, 3]
class_names = [
'bicycle', 'motorcycle', 'pedestrian', 'traffic_cone', 'barrier', 'car',
'truck', 'trailer', 'bus', 'construction_vehicle'
]
train_pipeline = [
dict(type='LoadPointsFromFile', coord_type='LIDAR', load_dim=5, use_dim=5),
dict(type='LoadPointsFromMultiSweeps', sweeps_num=10),
dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True),
dict(
type='GlobalRotScaleTrans',
rot_range=[-0.3925, 0.3925],
scale_ratio_range=[0.95, 1.05],
translation_std=[0, 0, 0]),
dict(
type='RandomFlip3D',
sync_2d=False,
flip_ratio_bev_horizontal=0.5,
flip_ratio_bev_vertical=0.5),
dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
dict(type='PointShuffle'),
dict(type='DefaultFormatBundle3D', class_names=class_names),
dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
]
test_pipeline = [
dict(type='LoadPointsFromFile', coord_type='LIDAR', load_dim=5, use_dim=5),
dict(type='LoadPointsFromMultiSweeps', sweeps_num=10),
dict(
type='MultiScaleFlipAug3D',
img_scale=(1333, 800),
pts_scale_ratio=1,
flip=False,
transforms=[
dict(
type='GlobalRotScaleTrans',
rot_range=[0, 0],
scale_ratio_range=[1., 1.],
translation_std=[0, 0, 0]),
dict(type='RandomFlip3D'),
dict(
type='PointsRangeFilter', point_cloud_range=point_cloud_range),
dict(
type='DefaultFormatBundle3D',
class_names=class_names,
with_label=False),
dict(type='Collect3D', keys=['points'])
])
]
data = dict(
samples_per_gpu=2,
workers_per_gpu=4,
train=dict(pipeline=train_pipeline, classes=class_names),
val=dict(pipeline=test_pipeline, classes=class_names),
test=dict(pipeline=test_pipeline, classes=class_names))
# model settings
model = dict(
pts_voxel_layer=dict(max_num_points=20),
pts_voxel_encoder=dict(feat_channels=[64, 64]),
pts_neck=dict(
_delete_=True,
type='SECONDFPN',
norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01),
in_channels=[64, 128, 256],
upsample_strides=[1, 2, 4],
out_channels=[128, 128, 128]),
pts_bbox_head=dict(
_delete_=True,
type='ShapeAwareHead',
num_classes=10,
in_channels=384,
feat_channels=384,
use_direction_classifier=True,
anchor_generator=dict(
type='AlignedAnchor3DRangeGeneratorPerCls',
ranges=[[-50, -50, -1.67339111, 50, 50, -1.67339111],
[-50, -50, -1.71396371, 50, 50, -1.71396371],
[-50, -50, -1.61785072, 50, 50, -1.61785072],
[-50, -50, -1.80984986, 50, 50, -1.80984986],
[-50, -50, -1.76396500, 50, 50, -1.76396500],
[-50, -50, -1.80032795, 50, 50, -1.80032795],
[-50, -50, -1.74440365, 50, 50, -1.74440365],
[-50, -50, -1.68526504, 50, 50, -1.68526504],
[-50, -50, -1.80673031, 50, 50, -1.80673031],
[-50, -50, -1.64824291, 50, 50, -1.64824291]],
sizes=[
[0.60058911, 1.68452161, 1.27192197], # bicycle
[0.76279481, 2.09973778, 1.44403034], # motorcycle
[0.66344886, 0.72564370, 1.75748069], # pedestrian
[0.39694519, 0.40359262, 1.06232151], # traffic cone
[2.49008838, 0.48578221, 0.98297065], # barrier
[1.95017717, 4.60718145, 1.72270761], # car
[2.45609390, 6.73778078, 2.73004906], # truck
[2.87427237, 12.01320693, 3.81509561], # trailer
[2.94046906, 11.1885991, 3.47030982], # bus
[2.73050468, 6.38352896, 3.13312415] # construction vehicle
],
custom_values=[0, 0],
rotations=[0, 1.57],
reshape_out=False),
tasks=[
dict(
num_class=2,
class_names=['bicycle', 'motorcycle'],
shared_conv_channels=(64, 64),
shared_conv_strides=(1, 1),
norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01)),
dict(
num_class=1,
class_names=['pedestrian'],
shared_conv_channels=(64, 64),
shared_conv_strides=(1, 1),
norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01)),
dict(
num_class=2,
class_names=['traffic_cone', 'barrier'],
shared_conv_channels=(64, 64),
shared_conv_strides=(1, 1),
norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01)),
dict(
num_class=1,
class_names=['car'],
shared_conv_channels=(64, 64, 64),
shared_conv_strides=(2, 1, 1),
norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01)),
dict(
num_class=4,
class_names=[
'truck', 'trailer', 'bus', 'construction_vehicle'
],
shared_conv_channels=(64, 64, 64),
shared_conv_strides=(2, 1, 1),
norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01))
],
assign_per_class=True,
diff_rad_by_sin=True,
dir_offset=0.7854, # pi/4
dir_limit_offset=0,
bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder', code_size=9),
loss_cls=dict(
type='FocalLoss',
use_sigmoid=True,
gamma=2.0,
alpha=0.25,
loss_weight=1.0),
loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0),
loss_dir=dict(
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.2)),
# model training and testing settings
train_cfg=dict(
_delete_=True,
pts=dict(
assigner=[
dict( # bicycle
type='MaxIoUAssigner',
iou_calculator=dict(type='BboxOverlapsNearest3D'),
pos_iou_thr=0.5,
neg_iou_thr=0.35,
min_pos_iou=0.35,
ignore_iof_thr=-1),
dict( # motorcycle
type='MaxIoUAssigner',
iou_calculator=dict(type='BboxOverlapsNearest3D'),
pos_iou_thr=0.5,
neg_iou_thr=0.3,
min_pos_iou=0.3,
ignore_iof_thr=-1),
dict( # pedestrian
type='MaxIoUAssigner',
iou_calculator=dict(type='BboxOverlapsNearest3D'),
pos_iou_thr=0.6,
neg_iou_thr=0.4,
min_pos_iou=0.4,
ignore_iof_thr=-1),
dict( # traffic cone
type='MaxIoUAssigner',
iou_calculator=dict(type='BboxOverlapsNearest3D'),
pos_iou_thr=0.6,
neg_iou_thr=0.4,
min_pos_iou=0.4,
ignore_iof_thr=-1),
dict( # barrier
type='MaxIoUAssigner',
iou_calculator=dict(type='BboxOverlapsNearest3D'),
pos_iou_thr=0.55,
neg_iou_thr=0.4,
min_pos_iou=0.4,
ignore_iof_thr=-1),
dict( # car
type='MaxIoUAssigner',
iou_calculator=dict(type='BboxOverlapsNearest3D'),
pos_iou_thr=0.6,
neg_iou_thr=0.45,
min_pos_iou=0.45,
ignore_iof_thr=-1),
dict( # truck
type='MaxIoUAssigner',
iou_calculator=dict(type='BboxOverlapsNearest3D'),
pos_iou_thr=0.55,
neg_iou_thr=0.4,
min_pos_iou=0.4,
ignore_iof_thr=-1),
dict( # trailer
type='MaxIoUAssigner',
iou_calculator=dict(type='BboxOverlapsNearest3D'),
pos_iou_thr=0.5,
neg_iou_thr=0.35,
min_pos_iou=0.35,
ignore_iof_thr=-1),
dict( # bus
type='MaxIoUAssigner',
iou_calculator=dict(type='BboxOverlapsNearest3D'),
pos_iou_thr=0.55,
neg_iou_thr=0.4,
min_pos_iou=0.4,
ignore_iof_thr=-1),
dict( # construction vehicle
type='MaxIoUAssigner',
iou_calculator=dict(type='BboxOverlapsNearest3D'),
pos_iou_thr=0.5,
neg_iou_thr=0.35,
min_pos_iou=0.35,
ignore_iof_thr=-1)
],
allowed_border=0,
code_weight=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2],
pos_weight=-1,
debug=False)))
================================================
FILE: configs/transfusion_nusc_pillar_L.py
================================================
point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
class_names = [
'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
]
voxel_size = [0.2, 0.2, 8]
out_size_factor = 4
evaluation = dict(interval=1)
dataset_type = 'NuScenesDataset'
data_root = 'data/nuscenes/'
input_modality = dict(
use_lidar=True,
use_camera=False,
use_radar=False,
use_map=False,
use_external=False)
train_pipeline = [
dict(
type='LoadPointsFromFile',
coord_type='LIDAR',
load_dim=5,
use_dim=[0, 1, 2, 3, 4],
),
dict(
type='LoadPointsFromMultiSweeps',
sweeps_num=10,
use_dim=[0, 1, 2, 3, 4],
),
dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True),
dict(
type='ObjectSample',
db_sampler=dict(
data_root=None,
info_path=data_root + 'nuscenes_dbinfos_train.pkl',
rate=1.0,
prepare=dict(
filter_by_difficulty=[-1],
filter_by_min_points=dict(
car=5,
truck=5,
bus=5,
trailer=5,
construction_vehicle=5,
traffic_cone=5,
barrier=5,
motorcycle=5,
bicycle=5,
pedestrian=5)),
classes=class_names,
sample_groups=dict(
car=2,
truck=3,
construction_vehicle=7,
bus=4,
trailer=6,
barrier=2,
motorcycle=6,
bicycle=6,
pedestrian=2,
traffic_cone=2),
points_loader=dict(
type='LoadPointsFromFile',
coord_type='LIDAR',
load_dim=5,
use_dim=[0, 1, 2, 3, 4],
))),
dict(
type='GlobalRotScaleTrans',
rot_range=[-0.3925 * 2, 0.3925 * 2],
scale_ratio_range=[0.9, 1.1],
translation_std=[0.5, 0.5, 0.5]),
dict(
type='RandomFlip3D',
sync_2d=False,
flip_ratio_bev_horizontal=0.5,
flip_ratio_bev_vertical=0.5),
dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
dict(type='ObjectNameFilter', classes=class_names),
dict(type='PointShuffle'),
dict(type='DefaultFormatBundle3D', class_names=class_names),
dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
]
test_pipeline = [
dict(
type='LoadPointsFromFile',
coord_type='LIDAR',
load_dim=5,
use_dim=[0, 1, 2, 3, 4],
),
dict(
type='LoadPointsFromMultiSweeps',
sweeps_num=10,
use_dim=[0, 1, 2, 3, 4],
),
dict(
type='MultiScaleFlipAug3D',
img_scale=(1333, 800),
pts_scale_ratio=1,
flip=False,
transforms=[
dict(
type='GlobalRotScaleTrans',
rot_range=[0, 0],
scale_ratio_range=[1.0, 1.0],
translation_std=[0, 0, 0]),
dict(type='RandomFlip3D'),
dict(
type='DefaultFormatBundle3D',
class_names=class_names,
with_label=False),
dict(type='Collect3D', keys=['points'])
])
]
data = dict(
samples_per_gpu=2,
workers_per_gpu=6,
train=dict(
type='CBGSDataset',
dataset=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + '/nuscenes_infos_train.pkl',
load_interval=1,
pipeline=train_pipeline,
classes=class_names,
modality=input_modality,
test_mode=False,
box_type_3d='LiDAR')),
val=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + '/nuscenes_infos_val.pkl',
load_interval=1,
pipeline=test_pipeline,
classes=class_names,
modality=input_modality,
test_mode=True,
box_type_3d='LiDAR'),
test=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + '/nuscenes_infos_val.pkl',
load_interval=1,
pipeline=test_pipeline,
classes=class_names,
modality=input_modality,
test_mode=True,
box_type_3d='LiDAR'))
model = dict(
type='TransFusionDetector',
pts_voxel_layer=dict(
max_num_points=20,
voxel_size=voxel_size,
max_voxels=(30000, 60000),
point_cloud_range=point_cloud_range),
pts_voxel_encoder=dict(
type='PillarFeatureNet',
in_channels=5,
feat_channels=[64],
with_distance=False,
voxel_size=voxel_size,
norm_cfg=dict(type='BN1d', eps=0.001, momentum=0.01),
point_cloud_range=point_cloud_range,
),
pts_middle_encoder=dict(
type='PointPillarsScatter', in_channels=64, output_shape=(512, 512)
),
pts_backbone=dict(
type='SECOND',
in_channels=64,
out_channels=[64, 128, 256],
layer_nums=[3, 5, 5],
layer_strides=[2, 2, 2],
norm_cfg=dict(type='BN', eps=0.001, momentum=0.01),
conv_cfg=dict(type='Conv2d', bias=False)),
pts_neck=dict(
type='SECONDFPN',
in_channels=[64, 128, 256],
out_channels=[128, 128, 128],
upsample_strides=[0.5, 1, 2],
norm_cfg=dict(type='BN', eps=0.001, momentum=0.01),
upsample_cfg=dict(type='deconv', bias=False),
use_conv_for_no_stride=True),
pts_bbox_head=dict(
type='TransFusionHead',
num_proposals=200,
auxiliary=True,
in_channels=128 * 3,
hidden_channel=128,
num_classes=len(class_names),
num_decoder_layers=1,
num_heads=8,
learnable_query_pos=False,
initialize_by_heatmap=True,
nms_kernel_size=3,
ffn_channel=256,
dropout=0.1,
bn_momentum=0.1,
activation='relu',
common_heads=dict(center=(2, 2), height=(1, 2), dim=(3, 2), rot=(2, 2), vel=(2, 2)),
bbox_coder=dict(
type='TransFusionBBoxCoder',
pc_range=point_cloud_range[:2],
voxel_size=voxel_size[:2],
out_size_factor=out_size_factor,
post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
score_threshold=0.0,
code_size=10,
),
loss_cls=dict(type='FocalLoss', use_sigmoid=True, gamma=2, alpha=0.25, reduction='mean', loss_weight=1.0),
# loss_iou=dict(type='CrossEntropyLoss', use_sigmoid=True, reduction='mean', loss_weight=0.0),
loss_bbox=dict(type='L1Loss', reduction='mean', loss_weight=0.25),
loss_heatmap=dict(type='GaussianFocalLoss', reduction='mean', loss_weight=1.0),
),
train_cfg=dict(
pts=dict(
dataset='nuScenes',
assigner=dict(
type='HungarianAssigner3D',
iou_calculator=dict(type='BboxOverlaps3D', coordinate='lidar'),
cls_cost=dict(type='FocalLossCost', gamma=2, alpha=0.25, weight=0.15),
reg_cost=dict(type='BBoxBEVL1Cost', weight=0.25),
iou_cost=dict(type='IoU3DCost', weight=0.25)
),
pos_weight=-1,
gaussian_overlap=0.1,
min_radius=2,
grid_size=[512, 512, 1], # [x_len, y_len, 1]
voxel_size=voxel_size,
out_size_factor=out_size_factor,
code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2],
point_cloud_range=point_cloud_range)),
test_cfg=dict(
pts=dict(
dataset='nuScenes',
grid_size=[512, 512, 1],
out_size_factor=out_size_factor,
pc_range=point_cloud_range[0:2],
voxel_size=voxel_size[:2],
nms_type=None,
)))
optimizer = dict(type='AdamW', lr=0.0001, weight_decay=0.01) # for 8 GPUs x 2 samples per GPU
optimizer_config = dict(grad_clip=dict(max_norm=0.1, norm_type=2))
lr_config = dict(
policy='cyclic',
target_ratio=(10, 0.0001),
cyclic_times=1,
step_ratio_up=0.4)
momentum_config = dict(
policy='cyclic',
target_ratio=(0.8947368421052632, 1),
cyclic_times=1,
step_ratio_up=0.4)
total_epochs = 20
checkpoint_config = dict(interval=1)
log_config = dict(
interval=50,
hooks=[dict(type='TextLoggerHook'),
dict(type='TensorboardLoggerHook')])
dist_params = dict(backend='nccl')
log_level = 'INFO'
work_dir = None
load_from = None
resume_from = None
workflow = [('train', 1)]
gpu_ids = range(0, 8)
================================================
FILE: configs/transfusion_nusc_pillar_LC.py
================================================
point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]
class_names = [
'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
]
voxel_size = [0.2, 0.2, 8]
out_size_factor = 4
evaluation = dict(interval=1)
dataset_type = 'NuScenesDataset'
data_root = 'data/nuscenes/'
input_modality = dict(
use_lidar=True,
use_camera=True,
use_radar=False,
use_map=False,
use_external=False)
img_scale = (800, 448)
num_views = 6
img_norm_cfg = dict(mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
train_pipeline = [
dict(
type='LoadPointsFromFile',
coord_type='LIDAR',
load_dim=5,
use_dim=[0, 1, 2, 3, 4],
),
dict(
type='LoadPointsFromMultiSweeps',
sweeps_num=10,
use_dim=[0, 1, 2, 3, 4],
),
dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True),
dict(type='LoadMultiViewImageFromFiles'),
# dict(
# type='GlobalRotScaleTrans',
# rot_range=[-0.3925 * 2, 0.3925 * 2],
# scale_ratio_range=[0.9, 1.1],
# translation_std=[0.5, 0.5, 0.5]),
# dict(
# type='RandomFlip3D',
# sync_2d=True,
# flip_ratio_bev_horizontal=0.5,
# flip_ratio_bev_vertical=0.5),
dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
dict(type='ObjectNameFilter', classes=class_names),
dict(type='PointShuffle'),
dict(type='MyResize', img_scale=img_scale, keep_ratio=True),
dict(type='MyNormalize', **img_norm_cfg),
dict(type='MyPad', size_divisor=32),
dict(type='DefaultFormatBundle3D', class_names=class_names),
dict(type='Collect3D', keys=['points', 'img', 'gt_bboxes_3d', 'gt_labels_3d'])
]
test_pipeline = [
dict(
type='LoadPointsFromFile',
coord_type='LIDAR',
load_dim=5,
use_dim=[0, 1, 2, 3, 4],
),
dict(
type='LoadPointsFromMultiSweeps',
sweeps_num=10,
use_dim=[0, 1, 2, 3, 4],
),
dict(type='LoadMultiViewImageFromFiles'),
dict(
type='MultiScaleFlipAug3D',
img_scale=img_scale,
pts_scale_ratio=1,
flip=False,
transforms=[
dict(
type='GlobalRotScaleTrans',
rot_range=[0, 0],
scale_ratio_range=[1.0, 1.0],
translation_std=[0, 0, 0]),
dict(type='RandomFlip3D'),
dict(type='MyResize', img_scale=img_scale, keep_ratio=True),
dict(type='MyNormalize', **img_norm_cfg),
dict(type='MyPad', size_divisor=32),
dict(
type='DefaultFormatBundle3D',
class_names=class_names,
with_label=False),
dict(type='Collect3D', keys=['points', 'img'])
])
]
data = dict(
samples_per_gpu=2,
workers_per_gpu=6,
train=dict(
type='CBGSDataset',
dataset=dict(
type=dataset_type,
data_root=data_root,
num_views=num_views,
ann_file=data_root + '/nuscenes_infos_train.pkl',
load_interval=1,
pipeline=train_pipeline,
classes=class_names,
modality=input_modality,
test_mode=False,
box_type_3d='LiDAR')),
val=dict(
type=dataset_type,
data_root=data_root,
num_views=num_views,
ann_file=data_root + '/nuscenes_infos_val.pkl',
load_interval=1,
pipeline=test_pipeline,
classes=class_names,
modality=input_modality,
test_mode=True,
box_type_3d='LiDAR'),
test=dict(
type=dataset_type,
data_root=data_root,
num_views=num_views,
ann_file=data_root + '/nuscenes_infos_val.pkl',
load_interval=1,
pipeline=test_pipeline,
classes=class_names,
modality=input_modality,
test_mode=True,
box_type_3d='LiDAR'))
model = dict(
type='TransFusionDetector',
freeze_img=True,
# img_backbone=dict(
# type='DLASeg',
# num_layers=34,
# heads={},
# head_convs=-1,
# ),
img_backbone=dict(
type='ResNet',
depth=50,
num_stages=4,
out_indices=(0, 1, 2, 3),
frozen_stages=1,
norm_cfg=dict(type='BN', requires_grad=True),
norm_eval=True,
style='pytorch'),
img_neck=dict(
type='FPN',
in_channels=[256, 512, 1024, 2048],
out_channels=256,
num_outs=5),
pts_voxel_layer=dict(
max_num_points=20,
voxel_size=voxel_size,
max_voxels=(30000, 60000),
point_cloud_range=point_cloud_range),
pts_voxel_encoder=dict(
type='PillarFeatureNet',
in_channels=5,
feat_channels=[64],
with_distance=False,
voxel_size=voxel_size,
norm_cfg=dict(type='BN1d', eps=0.001, momentum=0.01),
point_cloud_range=point_cloud_range,
),
pts_middle_encoder=dict(
type='PointPillarsScatter', in_channels=64, output_shape=(512, 512)
),
pts_backbone=dict(
type='SECOND',
in_channels=64,
out_channels=[64, 128, 256],
layer_nums=[3, 5, 5],
layer_strides=[2, 2, 2],
norm_cfg=dict(type='BN', eps=0.001, momentum=0.01),
conv_cfg=dict(type='Conv2d', bias=False)),
pts_neck=dict(
type='SECONDFPN',
in_channels=[64, 128, 256],
out_channels=[128, 128, 128],
upsample_strides=[0.5, 1, 2],
norm_cfg=dict(type='BN', eps=0.001, momentum=0.01),
upsample_cfg=dict(type='deconv', bias=False),
use_conv_for_no_stride=True),
pts_bbox_head=dict(
type='TransFusionHead',
fuse_img=True,
num_views=num_views,
in_channels_img=256,
out_size_factor_img=4,
num_proposals=200,
auxiliary=True,
in_channels=128 * 3,
hidden_channel=128,
num_classes=len(class_names),
num_decoder_layers=1,
num_heads=8,
learnable_query_pos=False,
initialize_by_heatmap=True,
nms_kernel_size=3,
ffn_channel=256,
dropout=0.1,
bn_momentum=0.1,
activation='relu',
common_heads=dict(center=(2, 2), height=(1, 2), dim=(3, 2), rot=(2, 2), vel=(2, 2)),
bbox_coder=dict(
type='TransFusionBBoxCoder',
pc_range=point_cloud_range[:2],
voxel_size=voxel_size[:2],
out_size_factor=out_size_factor,
post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
score_threshold=0.0,
code_size=10,
),
loss_cls=dict(type='FocalLoss', use_sigmoid=True, gamma=2, alpha=0.25, reduction='mean', loss_weight=1.0),
# loss_iou=dict(type='CrossEntropyLoss', use_sigmoid=True, reduction='mean', loss_weight=0.0),
loss_bbox=dict(type='L1Loss', reduction='mean', loss_weight=0.25),
loss_heatmap=dict(type='GaussianFocalLoss', reduction='mean', loss_weight=1.0),
),
train_cfg=dict(
pts=dict(
dataset='nuScenes',
assigner=dict(
type='HungarianAssigner3D',
iou_calculator=dict(type='BboxOverlaps3D', coordinate='lidar'),
cls_cost=dict(type='FocalLossCost', gamma=2, alpha=0.25, weight=0.15),
reg_cost=dict(type='BBoxBEVL1Cost', weight=0.25),
iou_cost=dict(type='IoU3DCost', weight=0.25)
),
pos_weight=-1,
gaussian_overlap=0.1,
min_radius=2,
grid_size=[512, 512, 1], # [x_len, y_len, 1]
voxel_size=voxel_size,
out_size_factor=out_size_factor,
code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2],
point_cloud_range=point_cloud_range)),
test_cfg=dict(
pts=dict(
dataset='nuScenes',
grid_size=[512, 512, 1],
out_size_factor=out_size_factor,
pc_range=point_cloud_range[0:2],
voxel_size=voxel_size[:2],
nms_type=None,
)))
optimizer = dict(type='AdamW', lr=0.0001, weight_decay=0.01) # for 8 GPUs x 2 samples per GPU
optimizer_config = dict(grad_clip=dict(max_norm=0.1, norm_type=2))
lr_config = dict(
policy='cyclic',
target_ratio=(10, 0.0001),
cyclic_times=1,
step_ratio_up=0.4)
momentum_config = dict(
policy='cyclic',
target_ratio=(0.8947368421052632, 1),
cyclic_times=1,
step_ratio_up=0.4)
total_epochs = 6
checkpoint_config = dict(interval=1)
log_config = dict(
interval=50,
hooks=[dict(type='TextLoggerHook'),
dict(type='TensorboardLoggerHook')])
dist_params = dict(backend='nccl')
log_level = 'INFO'
work_dir = None
load_from = 'checkpoints/fusion_pillar02_R50.pth'
resume_from = None
workflow = [('train', 1)]
gpu_ids = range(0, 8)
freeze_lidar_components = True
find_unused_parameters = True
================================================
FILE: configs/transfusion_nusc_voxel_L.py
================================================
point_cloud_range = [-54.0, -54.0, -5.0, 54.0, 54.0, 3.0]
class_names = [
'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
]
voxel_size = [0.075, 0.075, 0.2]
out_size_factor = 8
evaluation = dict(interval=1)
dataset_type = 'NuScenesDataset'
data_root = 'data/nuscenes/'
input_modality = dict(
use_lidar=True,
use_camera=False,
use_radar=False,
use_map=False,
use_external=False)
train_pipeline = [
dict(
type='LoadPointsFromFile',
coord_type='LIDAR',
load_dim=5,
use_dim=[0, 1, 2, 3, 4],
),
dict(
type='LoadPointsFromMultiSweeps',
sweeps_num=10,
use_dim=[0, 1, 2, 3, 4],
),
dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True),
dict(
type='ObjectSample',
db_sampler=dict(
data_root=None,
info_path=data_root + 'nuscenes_dbinfos_train.pkl',
rate=1.0,
prepare=dict(
filter_by_difficulty=[-1],
filter_by_min_points=dict(
car=5,
truck=5,
bus=5,
trailer=5,
construction_vehicle=5,
traffic_cone=5,
barrier=5,
motorcycle=5,
bicycle=5,
pedestrian=5)),
classes=class_names,
sample_groups=dict(
car=2,
truck=3,
construction_vehicle=7,
bus=4,
trailer=6,
barrier=2,
motorcycle=6,
bicycle=6,
pedestrian=2,
traffic_cone=2),
points_loader=dict(
type='LoadPointsFromFile',
coord_type='LIDAR',
load_dim=5,
use_dim=[0, 1, 2, 3, 4],
))),
dict(
type='GlobalRotScaleTrans',
rot_range=[-0.3925 * 2, 0.3925 * 2],
scale_ratio_range=[0.9, 1.1],
translation_std=[0.5, 0.5, 0.5]),
dict(
type='RandomFlip3D',
sync_2d=False,
flip_ratio_bev_horizontal=0.5,
flip_ratio_bev_vertical=0.5),
dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
dict(type='ObjectNameFilter', classes=class_names),
dict(type='PointShuffle'),
dict(type='DefaultFormatBundle3D', class_names=class_names),
dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
]
test_pipeline = [
dict(
type='LoadPointsFromFile',
coord_type='LIDAR',
load_dim=5,
use_dim=[0, 1, 2, 3, 4],
),
dict(
type='LoadPointsFromMultiSweeps',
sweeps_num=10,
use_dim=[0, 1, 2, 3, 4],
),
dict(
type='MultiScaleFlipAug3D',
img_scale=(1333, 800),
pts_scale_ratio=1,
flip=False,
transforms=[
dict(
type='GlobalRotScaleTrans',
rot_range=[0, 0],
scale_ratio_range=[1.0, 1.0],
translation_std=[0, 0, 0]),
dict(type='RandomFlip3D'),
dict(
type='DefaultFormatBundle3D',
class_names=class_names,
with_label=False),
dict(type='Collect3D', keys=['points'])
])
]
data = dict(
samples_per_gpu=2,
workers_per_gpu=6,
train=dict(
type='CBGSDataset',
dataset=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + '/nuscenes_infos_train.pkl',
load_interval=1,
pipeline=train_pipeline,
classes=class_names,
modality=input_modality,
test_mode=False,
box_type_3d='LiDAR')),
val=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + '/nuscenes_infos_val.pkl',
load_interval=1,
pipeline=test_pipeline,
classes=class_names,
modality=input_modality,
test_mode=True,
box_type_3d='LiDAR'),
test=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + '/nuscenes_infos_val.pkl',
load_interval=1,
pipeline=test_pipeline,
classes=class_names,
modality=input_modality,
test_mode=True,
box_type_3d='LiDAR'))
model = dict(
type='TransFusionDetector',
pts_voxel_layer=dict(
max_num_points=10,
voxel_size=voxel_size,
max_voxels=(120000, 160000),
point_cloud_range=point_cloud_range),
pts_voxel_encoder=dict(
type='HardSimpleVFE',
num_features=5,
),
pts_middle_encoder=dict(
type='SparseEncoder',
in_channels=5,
sparse_shape=[41, 1440, 1440],
output_channels=128,
order=('conv', 'norm', 'act'),
encoder_channels=((16, 16, 32), (32, 32, 64), (64, 64, 128), (128, 128)),
encoder_paddings=((0, 0, 1), (0, 0, 1), (0, 0, [0, 1, 1]), (0, 0)),
block_type='basicblock'),
pts_backbone=dict(
type='SECOND',
in_channels=256,
out_channels=[128, 256],
layer_nums=[5, 5],
layer_strides=[1, 2],
norm_cfg=dict(type='BN', eps=0.001, momentum=0.01),
conv_cfg=dict(type='Conv2d', bias=False)),
pts_neck=dict(
type='SECONDFPN',
in_channels=[128, 256],
out_channels=[256, 256],
upsample_strides=[1, 2],
norm_cfg=dict(type='BN', eps=0.001, momentum=0.01),
upsample_cfg=dict(type='deconv', bias=False),
use_conv_for_no_stride=True),
pts_bbox_head=dict(
type='TransFusionHead',
num_proposals=200,
auxiliary=True,
in_channels=256 * 2,
hidden_channel=128,
num_classes=len(class_names),
num_decoder_layers=1,
num_heads=8,
learnable_query_pos=False,
initialize_by_heatmap=True,
nms_kernel_size=3,
ffn_channel=256,
dropout=0.1,
bn_momentum=0.1,
activation='relu',
common_heads=dict(center=(2, 2), height=(1, 2), dim=(3, 2), rot=(2, 2), vel=(2, 2)),
bbox_coder=dict(
type='TransFusionBBoxCoder',
pc_range=point_cloud_range[:2],
voxel_size=voxel_size[:2],
out_size_factor=out_size_factor,
post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
score_threshold=0.0,
code_size=10,
),
loss_cls=dict(type='FocalLoss', use_sigmoid=True, gamma=2, alpha=0.25, reduction='mean', loss_weight=1.0),
# loss_iou=dict(type='CrossEntropyLoss', use_sigmoid=True, reduction='mean', loss_weight=0.0),
loss_bbox=dict(type='L1Loss', reduction='mean', loss_weight=0.25),
loss_heatmap=dict(type='GaussianFocalLoss', reduction='mean', loss_weight=1.0),
),
train_cfg=dict(
pts=dict(
dataset='nuScenes',
assigner=dict(
type='HungarianAssigner3D',
iou_calculator=dict(type='BboxOverlaps3D', coordinate='lidar'),
cls_cost=dict(type='FocalLossCost', gamma=2, alpha=0.25, weight=0.15),
reg_cost=dict(type='BBoxBEVL1Cost', weight=0.25),
iou_cost=dict(type='IoU3DCost', weight=0.25)
),
pos_weight=-1,
gaussian_overlap=0.1,
min_radius=2,
            grid_size=[1440, 1440, 40], # [x_len, y_len, z_len]
voxel_size=voxel_size,
out_size_factor=out_size_factor,
code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2],
point_cloud_range=point_cloud_range)),
test_cfg=dict(
pts=dict(
dataset='nuScenes',
grid_size=[1440, 1440, 40],
out_size_factor=out_size_factor,
pc_range=point_cloud_range[0:2],
voxel_size=voxel_size[:2],
nms_type=None,
)))
optimizer = dict(type='AdamW', lr=0.0001, weight_decay=0.01) # for 8 GPUs x 2 samples per GPU
optimizer_config = dict(grad_clip=dict(max_norm=0.1, norm_type=2))
lr_config = dict(
policy='cyclic',
target_ratio=(10, 0.0001),
cyclic_times=1,
step_ratio_up=0.4)
momentum_config = dict(
policy='cyclic',
target_ratio=(0.8947368421052632, 1),
cyclic_times=1,
step_ratio_up=0.4)
total_epochs = 20
checkpoint_config = dict(interval=1)
log_config = dict(
interval=50,
hooks=[dict(type='TextLoggerHook'),
dict(type='TensorboardLoggerHook')])
dist_params = dict(backend='nccl')
log_level = 'INFO'
work_dir = None
load_from = None
resume_from = None
workflow = [('train', 1)]
gpu_ids = range(0, 8)
================================================
FILE: configs/transfusion_nusc_voxel_LC.py
================================================
point_cloud_range = [-54.0, -54.0, -5.0, 54.0, 54.0, 3.0]
class_names = [
'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier',
'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone'
]
voxel_size = [0.075, 0.075, 0.2]
out_size_factor = 8
evaluation = dict(interval=1)
dataset_type = 'NuScenesDataset'
data_root = 'data/nuscenes/'
input_modality = dict(
use_lidar=True,
use_camera=True,
use_radar=False,
use_map=False,
use_external=False)
img_scale = (800, 448)
num_views = 6
img_norm_cfg = dict(mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
train_pipeline = [
dict(
type='LoadPointsFromFile',
coord_type='LIDAR',
load_dim=5,
use_dim=[0, 1, 2, 3, 4],
),
dict(
type='LoadPointsFromMultiSweeps',
sweeps_num=10,
use_dim=[0, 1, 2, 3, 4],
),
dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True),
dict(type='LoadMultiViewImageFromFiles'),
# dict(
# type='GlobalRotScaleTrans',
# rot_range=[-0.3925 * 2, 0.3925 * 2],
# scale_ratio_range=[0.9, 1.1],
# translation_std=[0.5, 0.5, 0.5]),
# dict(
# type='RandomFlip3D',
# sync_2d=True,
# flip_ratio_bev_horizontal=0.5,
# flip_ratio_bev_vertical=0.5),
dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
dict(type='ObjectNameFilter', classes=class_names),
dict(type='PointShuffle'),
dict(type='MyResize', img_scale=img_scale, keep_ratio=True),
dict(type='MyNormalize', **img_norm_cfg),
dict(type='MyPad', size_divisor=32),
dict(type='DefaultFormatBundle3D', class_names=class_names),
dict(type='Collect3D', keys=['points', 'img', 'gt_bboxes_3d', 'gt_labels_3d'])
]
test_pipeline = [
dict(
type='LoadPointsFromFile',
coord_type='LIDAR',
load_dim=5,
use_dim=[0, 1, 2, 3, 4],
),
dict(
type='LoadPointsFromMultiSweeps',
sweeps_num=10,
use_dim=[0, 1, 2, 3, 4],
),
dict(type='LoadMultiViewImageFromFiles'),
dict(
type='MultiScaleFlipAug3D',
img_scale=img_scale,
pts_scale_ratio=1,
flip=False,
transforms=[
dict(
type='GlobalRotScaleTrans',
rot_range=[0, 0],
scale_ratio_range=[1.0, 1.0],
translation_std=[0, 0, 0]),
dict(type='RandomFlip3D'),
dict(type='MyResize', img_scale=img_scale, keep_ratio=True),
dict(type='MyNormalize', **img_norm_cfg),
dict(type='MyPad', size_divisor=32),
dict(
type='DefaultFormatBundle3D',
class_names=class_names,
with_label=False),
dict(type='Collect3D', keys=['points', 'img'])
])
]
data = dict(
samples_per_gpu=2,
workers_per_gpu=6,
train=dict(
type='CBGSDataset',
dataset=dict(
type=dataset_type,
data_root=data_root,
num_views=num_views,
ann_file=data_root + '/nuscenes_infos_train_20pc.pkl',
load_interval=1,
pipeline=train_pipeline,
classes=class_names,
modality=input_modality,
test_mode=False,
box_type_3d='LiDAR')),
val=dict(
type=dataset_type,
data_root=data_root,
num_views=num_views,
ann_file=data_root + '/nuscenes_infos_val.pkl',
load_interval=1,
pipeline=test_pipeline,
classes=class_names,
modality=input_modality,
test_mode=True,
box_type_3d='LiDAR'),
test=dict(
type=dataset_type,
data_root=data_root,
num_views=num_views,
ann_file=data_root + '/nuscenes_infos_val.pkl',
load_interval=1,
pipeline=test_pipeline,
classes=class_names,
modality=input_modality,
test_mode=True,
box_type_3d='LiDAR'))
model = dict(
type='TransFusionDetector',
freeze_img=True,
# img_backbone=dict(
# type='DLASeg',
# num_layers=34,
# heads={},
# head_convs=-1,
# ),
img_backbone=dict(
type='ResNet',
depth=50,
num_stages=4,
out_indices=(0, 1, 2, 3),
frozen_stages=1,
norm_cfg=dict(type='BN', requires_grad=True),
norm_eval=True,
style='pytorch'),
img_neck=dict(
type='FPN',
in_channels=[256, 512, 1024, 2048],
out_channels=256,
num_outs=5),
pts_voxel_layer=dict(
max_num_points=10,
voxel_size=voxel_size,
max_voxels=(120000, 160000),
point_cloud_range=point_cloud_range),
pts_voxel_encoder=dict(
type='HardSimpleVFE',
num_features=5,
),
pts_middle_encoder=dict(
type='SparseEncoder',
in_channels=5,
sparse_shape=[41, 1440, 1440],
output_channels=128,
order=('conv', 'norm', 'act'),
encoder_channels=((16, 16, 32), (32, 32, 64), (64, 64, 128), (128, 128)),
encoder_paddings=((0, 0, 1), (0, 0, 1), (0, 0, [0, 1, 1]), (0, 0)),
block_type='basicblock'),
pts_backbone=dict(
type='SECOND',
in_channels=256,
out_channels=[128, 256],
layer_nums=[5, 5],
layer_strides=[1, 2],
norm_cfg=dict(type='BN', eps=0.001, momentum=0.01),
conv_cfg=dict(type='Conv2d', bias=False)),
pts_neck=dict(
type='SECONDFPN',
in_channels=[128, 256],
out_channels=[256, 256],
upsample_strides=[1, 2],
norm_cfg=dict(type='BN', eps=0.001, momentum=0.01),
upsample_cfg=dict(type='deconv', bias=False),
use_conv_for_no_stride=True),
pts_bbox_head=dict(
type='TransFusionHead',
fuse_img=True,
num_views=num_views,
in_channels_img=256,
out_size_factor_img=4,
num_proposals=200,
auxiliary=True,
in_channels=256 * 2,
hidden_channel=128,
num_classes=len(class_names),
num_decoder_layers=1,
num_heads=8,
learnable_query_pos=False,
initialize_by_heatmap=True,
nms_kernel_size=3,
ffn_channel=256,
dropout=0.1,
bn_momentum=0.1,
activation='relu',
common_heads=dict(center=(2, 2), height=(1, 2), dim=(3, 2), rot=(2, 2), vel=(2, 2)),
bbox_coder=dict(
type='TransFusionBBoxCoder',
pc_range=point_cloud_range[:2],
voxel_size=voxel_size[:2],
out_size_factor=out_size_factor,
post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
score_threshold=0.0,
code_size=10,
),
loss_cls=dict(type='FocalLoss', use_sigmoid=True, gamma=2, alpha=0.25, reduction='mean', loss_weight=1.0),
# loss_iou=dict(type='CrossEntropyLoss', use_sigmoid=True, reduction='mean', loss_weight=0.0),
loss_bbox=dict(type='L1Loss', reduction='mean', loss_weight=0.25),
loss_heatmap=dict(type='GaussianFocalLoss', reduction='mean', loss_weight=1.0),
),
train_cfg=dict(
pts=dict(
dataset='nuScenes',
assigner=dict(
type='HungarianAssigner3D',
iou_calculator=dict(type='BboxOverlaps3D', coordinate='lidar'),
cls_cost=dict(type='FocalLossCost', gamma=2, alpha=0.25, weight=0.15),
reg_cost=dict(type='BBoxBEVL1Cost', weight=0.25),
iou_cost=dict(type='IoU3DCost', weight=0.25)
),
pos_weight=-1,
gaussian_overlap=0.1,
min_radius=2,
            grid_size=[1440, 1440, 40], # [x_len, y_len, z_len]
voxel_size=voxel_size,
out_size_factor=out_size_factor,
code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2],
point_cloud_range=point_cloud_range)),
test_cfg=dict(
pts=dict(
dataset='nuScenes',
grid_size=[1440, 1440, 40],
out_size_factor=out_size_factor,
pc_range=point_cloud_range[0:2],
voxel_size=voxel_size[:2],
nms_type=None,
)))
optimizer = dict(type='AdamW', lr=0.0001, weight_decay=0.01) # for 8 GPUs x 2 samples per GPU
optimizer_config = dict(grad_clip=dict(max_norm=0.1, norm_type=2))
lr_config = dict(
policy='cyclic',
target_ratio=(10, 0.0001),
cyclic_times=1,
step_ratio_up=0.4)
momentum_config = dict(
policy='cyclic',
target_ratio=(0.8947368421052632, 1),
cyclic_times=1,
step_ratio_up=0.4)
total_epochs = 6
checkpoint_config = dict(interval=1)
log_config = dict(
interval=50,
hooks=[dict(type='TextLoggerHook'),
dict(type='TensorboardLoggerHook')])
dist_params = dict(backend='nccl')
log_level = 'INFO'
work_dir = None
load_from = 'checkpoints/fusion_voxel0075_R50.pth'
resume_from = None
workflow = [('train', 1)]
gpu_ids = range(0, 8)
freeze_lidar_components = True
find_unused_parameters = True
================================================
FILE: configs/transfusion_waymo_voxel_L.py
================================================
point_cloud_range = [-75.2, -75.2, -2, 75.2, 75.2, 4]
class_names = ['Car', 'Pedestrian', 'Cyclist']
voxel_size = [0.1, 0.1, 0.15]
out_size_factor = 8
evaluation = dict(interval=1)
dataset_type = 'WaymoDataset'
data_root = 'data/waymo/kitti_format'
input_modality = dict(
use_lidar=True,
use_camera=False,
use_radar=False,
use_map=False,
use_external=False)
train_pipeline = [
dict(type='LoadPointsFromFile', coord_type='LIDAR', load_dim=6, use_dim=5),
dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True),
# dict(type='ObjectSample',
# db_sampler=dict(
# data_root=data_root,
# info_path=data_root + '/waymo_dbinfos_train.pkl',
# rate=1.0,
# prepare=dict(
# filter_by_difficulty=[-1],
# filter_by_min_points=dict(Car=5, Pedestrian=5, Cyclist=5)),
# classes=class_names,
# sample_groups=dict(Car=15, Pedestrian=10, Cyclist=10),
# points_loader=dict(
# type='LoadPointsFromFile', coord_type='LIDAR', load_dim=5, use_dim=[0, 1, 2, 3, 4]))
# ),
dict(
type='RandomFlip3D',
sync_2d=False,
flip_ratio_bev_horizontal=0.5,
flip_ratio_bev_vertical=0.5),
dict(
type='GlobalRotScaleTrans',
rot_range=[-0.78539816, 0.78539816],
scale_ratio_range=[0.95, 1.05],
),
dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
dict(type='PointShuffle'),
dict(type='DefaultFormatBundle3D', class_names=class_names),
dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
]
test_pipeline = [
dict(type='LoadPointsFromFile', coord_type='LIDAR', load_dim=6, use_dim=5),
dict(
type='MultiScaleFlipAug3D',
img_scale=(800, 1333),
pts_scale_ratio=1,
flip=False,
transforms=[
dict(
type='GlobalRotScaleTrans',
rot_range=[0, 0],
scale_ratio_range=[1.0, 1.0],
translation_std=[0, 0, 0]),
dict(type='RandomFlip3D'),
dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
dict(
type='DefaultFormatBundle3D',
class_names=class_names,
with_label=False),
dict(type='Collect3D', keys=['points'])
])
]
data = dict(
samples_per_gpu=4,
workers_per_gpu=6,
train=dict(
type='RepeatDataset',
times=1,
dataset=dict(
type=dataset_type,
data_root=data_root,
load_interval=1,
ann_file=data_root + '/waymo_infos_train.pkl',
split='training',
pipeline=train_pipeline,
classes=class_names,
modality=input_modality,
test_mode=False,
box_type_3d='LiDAR')),
val=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + '/waymo_infos_val.pkl',
split='training',
load_interval=10,
pipeline=test_pipeline,
classes=class_names,
modality=input_modality,
test_mode=True,
box_type_3d='LiDAR'),
test=dict(
type=dataset_type,
data_root=data_root,
ann_file=data_root + '/waymo_infos_val.pkl',
split='training',
load_interval=10,
pipeline=test_pipeline,
classes=class_names,
modality=input_modality,
test_mode=True,
box_type_3d='LiDAR'))
model = dict(
type='TransFusionDetector',
pts_voxel_layer=dict(
max_num_points=5,
voxel_size=voxel_size,
max_voxels=150000,
point_cloud_range=point_cloud_range),
pts_voxel_encoder=dict(
type='HardVFE',
in_channels=5,
# num_features=5,
feat_channels=[64],
with_distance=False,
with_cluster_center=False,
with_voxel_center=False,
voxel_size=voxel_size,
norm_cfg=dict(type='BN1d', eps=0.001, momentum=0.01),
point_cloud_range=point_cloud_range,
),
pts_middle_encoder=dict(
type='SparseEncoder',
in_channels=64,
sparse_shape=[41, 1504, 1504],
output_channels=128,
order=('conv', 'norm', 'act'),
encoder_channels=((16, 16, 32), (32, 32, 64), (64, 64, 128), (128, 128)),
encoder_paddings=((0, 0, 1), (0, 0, 1), (0, 0, [0, 1, 1]), (0, 0)),
block_type='basicblock'),
pts_backbone=dict(
type='SECOND',
in_channels=256,
out_channels=[128, 256],
layer_nums=[5, 5],
layer_strides=[1, 2],
norm_cfg=dict(type='BN', eps=0.001, momentum=0.01),
conv_cfg=dict(type='Conv2d', bias=False)),
pts_neck=dict(
type='SECONDFPN',
in_channels=[128, 256],
out_channels=[256, 256],
upsample_strides=[1, 2],
norm_cfg=dict(type='BN', eps=0.001, momentum=0.01),
upsample_cfg=dict(type='deconv', bias=False),
use_conv_for_no_stride=True),
pts_bbox_head=dict(
type='TransFusionHead',
num_proposals=300,
auxiliary=True,
in_channels=256 * 2,
hidden_channel=128,
num_classes=len(class_names),
num_decoder_layers=1,
num_heads=8,
learnable_query_pos=False,
initialize_by_heatmap=True,
nms_kernel_size=3,
ffn_channel=256,
dropout=0.1,
bn_momentum=0.1,
activation='relu',
common_heads=dict(center=(2, 2), height=(1, 2), dim=(3, 2), rot=(2, 2)),
bbox_coder=dict(
type='TransFusionBBoxCoder',
pc_range=point_cloud_range[:2],
voxel_size=voxel_size[:2],
out_size_factor=out_size_factor,
post_center_range=[-80, -80, -10.0, 80, 80, 10.0],
score_threshold=0.0,
code_size=8,
),
loss_cls=dict(type='FocalLoss', use_sigmoid=True, gamma=2, alpha=0.25, reduction='mean', loss_weight=1.0),
# loss_iou=dict(type='CrossEntropyLoss', use_sigmoid=True, reduction='mean', loss_weight=0.0),
loss_bbox=dict(type='L1Loss', reduction='mean', loss_weight=2.0),
loss_heatmap=dict(type='GaussianFocalLoss', reduction='mean', loss_weight=1.0),
),
train_cfg=dict(
pts=dict(
dataset='Waymo',
assigner=dict(
type='HungarianAssigner3D',
iou_calculator=dict(type='BboxOverlaps3D', coordinate='lidar'),
cls_cost=dict(type='FocalLossCost', gamma=2, alpha=0.25, weight=0.6),
reg_cost=dict(type='BBoxBEVL1Cost', weight=2.0),
iou_cost=dict(type='IoU3DCost', weight=2.0)
),
pos_weight=-1,
gaussian_overlap=0.1,
min_radius=2,
            grid_size=[1504, 1504, 40], # [x_len, y_len, z_len]
voxel_size=voxel_size,
out_size_factor=out_size_factor,
code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
point_cloud_range=point_cloud_range)),
test_cfg=dict(
pts=dict(
dataset='Waymo',
grid_size=[1504, 1504, 40],
out_size_factor=out_size_factor,
voxel_size=voxel_size[:2],
nms_type=None,
)))
optimizer = dict(type='AdamW', lr=0.0001, weight_decay=0.01) # for 8 GPUs x 4 samples per GPU
optimizer_config = dict(grad_clip=dict(max_norm=0.1, norm_type=2))
lr_config = dict(
policy='cyclic',
target_ratio=(10, 0.0001),
cyclic_times=1,
step_ratio_up=0.4)
momentum_config = dict(
policy='cyclic',
target_ratio=(0.8947368421052632, 1),
cyclic_times=1,
step_ratio_up=0.4)
total_epochs = 36
checkpoint_config = dict(interval=1)
log_config = dict(
interval=50,
hooks=[dict(type='TextLoggerHook'),
dict(type='TensorboardLoggerHook')])
dist_params = dict(backend='nccl')
log_level = 'INFO'
work_dir = None
load_from = None
resume_from = None
workflow = [('train', 1)]
gpu_ids = range(0, 8)
================================================
FILE: configs/transfusion_waymo_voxel_LC.py
================================================
point_cloud_range = [-75.2, -75.2, -2, 75.2, 75.2, 4]
class_names = ['Car', 'Pedestrian', 'Cyclist']
voxel_size = [0.1, 0.1, 0.15]
out_size_factor = 8
evaluation = dict(interval=1)
dataset_type = 'WaymoDataset'
data_root = 'data/waymo/kitti_format'
input_modality = dict(
use_lidar=True,
use_camera=True,
use_radar=False,
use_map=False,
use_external=False)
img_scale = (640, 960)
num_views = 5
img_norm_cfg = dict(mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
train_pipeline = [
dict(type='LoadPointsFromFile', coord_type='LIDAR', load_dim=6, use_dim=5),
dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True),
dict(type='LoadMultiViewImageFromFiles', img_scale=(1280, 1920)),
dict(type='MyResize', img_scale=img_scale, keep_ratio=True),
dict(type='MyNormalize', **img_norm_cfg),
dict(type='MyPad', size_divisor=32),
# dict(
# type='RandomFlip3D',
# sync_2d=True,
# flip_ratio_bev_horizontal=0.5,
# flip_ratio_bev_vertical=0.5),
# dict(
# type='GlobalRotScaleTrans',
# rot_range=[-0.78539816, 0.78539816],
# scale_ratio_range=[0.95, 1.05],
# ),
dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
dict(type='PointShuffle'),
dict(type='DefaultFormatBundle3D', class_names=class_names),
dict(type='Collect3D', keys=['points', 'img', 'gt_bboxes_3d', 'gt_labels_3d'])
]
test_pipeline = [
dict(type='LoadPointsFromFile', coord_type='LIDAR', load_dim=6, use_dim=5),
dict(type='LoadMultiViewImageFromFiles', img_scale=(1280, 1920)),
dict(
type='MultiScaleFlipAug3D',
img_scale=img_scale,
pts_scale_ratio=1,
flip=False,
transforms=[
dict(
type='GlobalRotScaleTrans',
rot_range=[0, 0],
scale_ratio_range=[1.0, 1.0],
translation_std=[0, 0, 0]),
dict(type='RandomFlip3D'),
dict(type='MyNormalize', **img_norm_cfg),
dict(type='MyResize', img_scale=img_scale, keep_ratio=True),
dict(type='MyPad', size_divisor=32),
dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
dict(
type='DefaultFormatBundle3D',
class_names=class_names,
with_label=False),
dict(type='Collect3D', keys=['points', 'img'])
])
]
data = dict(
samples_per_gpu=2,
workers_per_gpu=6,
train=dict(
type='RepeatDataset',
times=1,
dataset=dict(
type=dataset_type,
data_root=data_root,
load_interval=1,
num_views=num_views,
ann_file=data_root + '/waymo_infos_train.pkl',
split='training',
pipeline=train_pipeline,
classes=class_names,
modality=input_modality,
test_mode=False,
box_type_3d='LiDAR')),
val=dict(
type=dataset_type,
data_root=data_root,
num_views=num_views,
ann_file=data_root + '/waymo_infos_val.pkl',
split='training',
load_interval=10,
pipeline=test_pipeline,
classes=class_names,
modality=input_modality,
test_mode=True,
box_type_3d='LiDAR'),
test=dict(
type=dataset_type,
data_root=data_root,
num_views=num_views,
ann_file=data_root + '/waymo_infos_val.pkl',
split='training',
load_interval=10,
pipeline=test_pipeline,
classes=class_names,
modality=input_modality,
test_mode=True,
box_type_3d='LiDAR'))
model = dict(
type='TransFusionDetector',
freeze_img=True,
img_backbone=dict(
type='ResNet',
depth=50,
num_stages=4,
out_indices=(0, 1, 2, 3),
frozen_stages=1,
norm_cfg=dict(type='BN', requires_grad=True),
norm_eval=True,
style='pytorch'),
img_neck=dict(
type='FPN',
in_channels=[256, 512, 1024, 2048],
out_channels=256,
num_outs=5),
pts_voxel_layer=dict(
max_num_points=5,
voxel_size=voxel_size,
max_voxels=150000,
point_cloud_range=point_cloud_range),
pts_voxel_encoder=dict(
type='HardVFE',
in_channels=5,
feat_channels=[64],
with_distance=False,
with_cluster_center=False,
with_voxel_center=False,
voxel_size=voxel_size,
norm_cfg=dict(type='BN1d', eps=0.001, momentum=0.01),
point_cloud_range=point_cloud_range,
),
pts_middle_encoder=dict(
type='SparseEncoder',
in_channels=64,
sparse_shape=[41, 1504, 1504],
output_channels=128,
order=('conv', 'norm', 'act'),
encoder_channels=((16, 16, 32), (32, 32, 64), (64, 64, 128), (128, 128)),
encoder_paddings=((0, 0, 1), (0, 0, 1), (0, 0, [0, 1, 1]), (0, 0)),
block_type='basicblock'),
pts_backbone=dict(
type='SECOND',
in_channels=256,
out_channels=[128, 256],
layer_nums=[5, 5],
layer_strides=[1, 2],
norm_cfg=dict(type='BN', eps=0.001, momentum=0.01),
conv_cfg=dict(type='Conv2d', bias=False)),
pts_neck=dict(
type='SECONDFPN',
in_channels=[128, 256],
out_channels=[256, 256],
upsample_strides=[1, 2],
norm_cfg=dict(type='BN', eps=0.001, momentum=0.01),
upsample_cfg=dict(type='deconv', bias=False),
use_conv_for_no_stride=True),
pts_bbox_head=dict(
type='TransFusionHead',
fuse_img=True,
num_views=num_views,
in_channels_img=256,
out_size_factor_img=4,
num_proposals=300,
auxiliary=True,
in_channels=256 * 2,
hidden_channel=128,
num_classes=len(class_names),
num_decoder_layers=1,
num_heads=8,
learnable_query_pos=False,
initialize_by_heatmap=True,
nms_kernel_size=3,
ffn_channel=256,
dropout=0.1,
bn_momentum=0.1,
activation='relu',
common_heads=dict(center=(2, 2), height=(1, 2), dim=(3, 2), rot=(2, 2)),
bbox_coder=dict(
type='TransFusionBBoxCoder',
pc_range=point_cloud_range[:2],
voxel_size=voxel_size[:2],
out_size_factor=out_size_factor,
post_center_range=[-80, -80, -10.0, 80, 80, 10.0],
score_threshold=0.0,
code_size=8,
),
loss_cls=dict(type='FocalLoss', use_sigmoid=True, gamma=2, alpha=0.25, reduction='mean', loss_weight=1.0),
# loss_iou=dict(type='CrossEntropyLoss', use_sigmoid=True, reduction='mean', loss_weight=0.0),
loss_bbox=dict(type='L1Loss', reduction='mean', loss_weight=2.0),
loss_heatmap=dict(type='GaussianFocalLoss', reduction='mean', loss_weight=1.0),
),
train_cfg=dict(
pts=dict(
dataset='Waymo',
assigner=dict(
type='HungarianAssigner3D',
iou_calculator=dict(type='BboxOverlaps3D', coordinate='lidar'),
cls_cost=dict(type='FocalLossCost', gamma=2, alpha=0.25, weight=0.6),
reg_cost=dict(type='BBoxBEVL1Cost', weight=2.0),
iou_cost=dict(type='IoU3DCost', weight=2.0)
),
pos_weight=-1,
gaussian_overlap=0.1,
min_radius=2,
grid_size=[1504, 1504, 40],  # [x_len, y_len, z_len]
voxel_size=voxel_size,
out_size_factor=out_size_factor,
code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
point_cloud_range=point_cloud_range)),
test_cfg=dict(
pts=dict(
dataset='Waymo',
pc_range=point_cloud_range[:2],
grid_size=[1504, 1504, 40],
out_size_factor=out_size_factor,
voxel_size=voxel_size[:2],
nms_type=None,
)))
optimizer = dict(type='AdamW', lr=0.0001, weight_decay=0.01)  # for 8 GPUs * 2 samples per GPU
optimizer_config = dict(grad_clip=dict(max_norm=0.1, norm_type=2))
lr_config = dict(
policy='cyclic',
target_ratio=(10, 0.0001),
cyclic_times=1,
step_ratio_up=0.4)
momentum_config = dict(
policy='cyclic',
target_ratio=(0.8947368421052632, 1),
cyclic_times=1,
step_ratio_up=0.4)
total_epochs = 12
checkpoint_config = dict(interval=1)
log_config = dict(
interval=50,
hooks=[dict(type='TextLoggerHook'),
dict(type='TensorboardLoggerHook')])
dist_params = dict(backend='nccl')
log_level = 'INFO'
work_dir = None
load_from = 'checkpoints/waymo_36e_R50.pth'
resume_from = None
workflow = [('train', 1)]
freeze_lidar_components = True
find_unused_parameters = True
gpu_ids = range(0, 8)
================================================
FILE: configs/votenet/README.md
================================================
# Deep Hough Voting for 3D Object Detection in Point Clouds
## Introduction
[ALGORITHM]
We implement VoteNet and provide the results and checkpoints on the ScanNet and SUNRGBD datasets.
```
@inproceedings{qi2019deep,
author = {Qi, Charles R and Litany, Or and He, Kaiming and Guibas, Leonidas J},
title = {Deep Hough Voting for 3D Object Detection in Point Clouds},
booktitle = {Proceedings of the IEEE International Conference on Computer Vision},
year = {2019}
}
```
## Results
### ScanNet
| Backbone | Lr schd | Mem (GB) | Inf time (fps) | AP@0.25 |AP@0.5| Download |
| :---------: | :-----: | :------: | :------------: | :----: |:----: | :------: |
| [PointNet++](./votenet_8x8_scannet-3d-18class.py) | 3x |4.1||62.90|39.91|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/votenet/votenet_8x8_scannet-3d-18class/votenet_8x8_scannet-3d-18class_20200620_230238-2cea9c3a.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/votenet/votenet_8x8_scannet-3d-18class/votenet_8x8_scannet-3d-18class_20200620_230238.log.json)|
### SUNRGBD
| Backbone | Lr schd | Mem (GB) | Inf time (fps) | AP@0.25 |AP@0.5| Download |
| :---------: | :-----: | :------: | :------------: | :----: |:----: | :------: |
| [PointNet++](./votenet_16x8_sunrgbd-3d-10class.py) | 3x |8.1||59.07|35.77|[model](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/votenet/votenet_16x8_sunrgbd-3d-10class/votenet_16x8_sunrgbd-3d-10class_20200620_230238-4483c0c0.pth) | [log](https://download.openmmlab.com/mmdetection3d/v0.1.0_models/votenet/votenet_16x8_sunrgbd-3d-10class/votenet_16x8_sunrgbd-3d-10class_20200620_230238.log.json)|
**Notice**: If your current mmdetection3d version is >= 0.6.0 and you are using checkpoints downloaded from the links above (or checkpoints trained with mmdetection3d version < 0.6.0), the checkpoints must first be converted via [tools/model_converters/convert_votenet_checkpoints.py](../../tools/model_converters/convert_votenet_checkpoints.py):
```
python ./tools/model_converters/convert_votenet_checkpoints.py ${ORIGINAL_CHECKPOINT_PATH} --out=${NEW_CHECKPOINT_PATH}
```
Then you can use the converted checkpoints following [getting_started.md](../../docs/getting_started.md).
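For example, converting the ScanNet checkpoint downloaded from the table above could look like the following (the output path is only an illustrative choice):
```
python ./tools/model_converters/convert_votenet_checkpoints.py \
    checkpoints/votenet_8x8_scannet-3d-18class_20200620_230238-2cea9c3a.pth \
    --out=checkpoints/votenet_8x8_scannet-3d-18class_converted.pth
```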
## Indeterminism
Since test data preparation randomly downsamples the points, and the test script uses fixed random seeds while the random seeds of validation in training are not fixed, the test results may be slightly different from the results reported above.
## IoU loss
Adding an IoU loss (simply 1 - IoU) boosts VoteNet's performance. To use the IoU loss, add this loss term to the config file:
```python
iou_loss=dict(type='AxisAlignedIoULoss', reduction='sum', loss_weight=10.0 / 3.0)
```
| Backbone | Lr schd | Mem (GB) | Inf time (fps) | AP@0.25 |AP@0.5| Download |
| :---------: | :-----: | :------: | :------------: | :----: |:----: | :------: |
| [PointNet++](./votenet_iouloss_8x8_scannet-3d-18class.py) | 3x |4.1||63.81|44.21|/|
For now, we only support calculating the IoU loss for axis-aligned bounding boxes, since the CUDA op for general 3D IoU calculation does not implement the backward method. Therefore, the IoU loss can currently only be used on the ScanNet dataset.
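Concretely, the IoU-loss row above is produced by a config that simply inherits the ScanNet config and adds the loss term (see `configs/votenet/votenet_iouloss_8x8_scannet-3d-18class.py`):
```python
_base_ = ['./votenet_8x8_scannet-3d-18class.py']
# model settings, add iou loss
model = dict(
    bbox_head=dict(
        iou_loss=dict(
            type='AxisAlignedIoULoss', reduction='sum', loss_weight=10.0 / 3.0)))
```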
================================================
FILE: configs/votenet/votenet_16x8_sunrgbd-3d-10class.py
================================================
_base_ = [
'../_base_/datasets/sunrgbd-3d-10class.py', '../_base_/models/votenet.py',
'../_base_/schedules/schedule_3x.py', '../_base_/default_runtime.py'
]
# model settings
model = dict(
bbox_head=dict(
num_classes=10,
bbox_coder=dict(
type='PartialBinBasedBBoxCoder',
num_sizes=10,
num_dir_bins=12,
with_rot=True,
mean_sizes=[
[2.114256, 1.620300, 0.927272], [0.791118, 1.279516, 0.718182],
[0.923508, 1.867419, 0.845495], [0.591958, 0.552978, 0.827272],
[0.699104, 0.454178, 0.75625], [0.69519, 1.346299, 0.736364],
[0.528526, 1.002642, 1.172878], [0.500618, 0.632163, 0.683424],
[0.404671, 1.071108, 1.688889], [0.76584, 1.398258, 0.472728]
]),
))
================================================
FILE: configs/votenet/votenet_8x8_scannet-3d-18class.py
================================================
_base_ = [
'../_base_/datasets/scannet-3d-18class.py', '../_base_/models/votenet.py',
'../_base_/schedules/schedule_3x.py', '../_base_/default_runtime.py'
]
# model settings
model = dict(
bbox_head=dict(
num_classes=18,
bbox_coder=dict(
type='PartialBinBasedBBoxCoder',
num_sizes=18,
num_dir_bins=1,
with_rot=False,
mean_sizes=[[0.76966727, 0.8116021, 0.92573744],
[1.876858, 1.8425595, 1.1931566],
[0.61328, 0.6148609, 0.7182701],
[1.3955007, 1.5121545, 0.83443564],
[0.97949594, 1.0675149, 0.6329687],
[0.531663, 0.5955577, 1.7500148],
[0.9624706, 0.72462326, 1.1481868],
[0.83221924, 1.0490936, 1.6875663],
[0.21132214, 0.4206159, 0.5372846],
[1.4440073, 1.8970833, 0.26985747],
[1.0294262, 1.4040797, 0.87554324],
[1.3766412, 0.65521795, 1.6813129],
[0.6650819, 0.71111923, 1.298853],
[0.41999173, 0.37906948, 1.7513971],
[0.59359556, 0.5912492, 0.73919016],
[0.50867593, 0.50656086, 0.30136237],
[1.1511526, 1.0546296, 0.49706793],
[0.47535285, 0.49249494, 0.5802117]])))
# optimizer
# yapf:disable
log_config = dict(
interval=30,
hooks=[
dict(type='TextLoggerHook'),
dict(type='TensorboardLoggerHook')
])
# yapf:enable
================================================
FILE: configs/votenet/votenet_iouloss_8x8_scannet-3d-18class.py
================================================
_base_ = ['./votenet_8x8_scannet-3d-18class.py']
# model settings, add iou loss
model = dict(
bbox_head=dict(
iou_loss=dict(
type='AxisAlignedIoULoss', reduction='sum', loss_weight=10.0 /
3.0)))
================================================
FILE: configs/waymo.md
================================================
# MODEL ZOO
## Common settings and notes
- The experiments are run with PyTorch 1.7.0, CUDA 10.1 and CUDNN 7.6
- The training is conducted on 8 Tesla V100 GPUs
## Waymo 3D Detection
We try a few training schedules for TransFusion-L and list the performance below. The fusion-based models are further trained for 6 epochs from the pretrained LiDAR backbone. We freeze the weights of the LiDAR backbone to save GPU memory.
| Model | Backbone | Epochs | Veh_L2 | Ped_L2 | Cyc_L2 | MAPH |
|---------|--------|--------|---------|---------|---------|---------|
| [TransFusion-L](configs/transfusion_waymo_voxel_L.py) | VoxelNet | 12 | 63.86 | 62.84 | 67.17 | 64.63 |
| [TransFusion-L](configs/transfusion_waymo_voxel_L.py) | VoxelNet | 24 | 64.54 | 63.39 | 66.43 | 64.78 |
| [TransFusion-L](configs/transfusion_waymo_voxel_L.py) | VoxelNet | 36 | 65.07 | 63.70 | 65.97 | 64.91 |
| [TransFusion](configs/transfusion_waymo_voxel_LC.py) | VoxelNet | 36 + 6 | 65.11 | 64.02 | 67.40 | 65.51 |
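For the fusion fine-tuning stage, the relevant switches look like the sketch below (the values mirror the Waymo fusion config in this repo; point `load_from` to your own pretrained TransFusion-L checkpoint):
```python
# start from a pretrained TransFusion-L (LiDAR-only) checkpoint
load_from = 'checkpoints/waymo_36e_R50.pth'
# freeze the LiDAR branch to save GPU memory
freeze_lidar_components = True
find_unused_parameters = True
# keep the image backbone/neck frozen as well
model = dict(type='TransFusionDetector', freeze_img=True)
```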
================================================
FILE: demo/pcd_demo.py
================================================
from argparse import ArgumentParser
from mmdet3d.apis import inference_detector, init_detector, show_result_meshlab
def main():
parser = ArgumentParser()
parser.add_argument('pcd', help='Point cloud file')
parser.add_argument('config', help='Config file')
parser.add_argument('checkpoint', help='Checkpoint file')
parser.add_argument(
'--device', default='cuda:0', help='Device used for inference')
parser.add_argument(
'--score-thr', type=float, default=0.6, help='bbox score threshold')
parser.add_argument(
'--out-dir', type=str, default='demo', help='dir to save results')
args = parser.parse_args()
# build the model from a config file and a checkpoint file
model = init_detector(args.config, args.checkpoint, device=args.device)
# run inference on a single point cloud
result, data = inference_detector(model, args.pcd)
# show the results
show_result_meshlab(data, result, args.out_dir)
if __name__ == '__main__':
main()
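# A minimal usage sketch (placeholder paths, not shipped with this repo):
#   python demo/pcd_demo.py ${PCD_FILE} ${CONFIG_FILE} ${CHECKPOINT_FILE} \
#       --out-dir demo --score-thr 0.6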
================================================
FILE: docker/Dockerfile
================================================
ARG PYTORCH="1.6.0"
ARG CUDA="10.1"
ARG CUDNN="7"
FROM pytorch/pytorch:${PYTORCH}-cuda${CUDA}-cudnn${CUDNN}-devel
ENV TORCH_CUDA_ARCH_LIST="6.0 6.1 7.0+PTX"
ENV TORCH_NVCC_FLAGS="-Xfatbin -compress-all"
ENV CMAKE_PREFIX_PATH="$(dirname $(which conda))/../"
RUN apt-get update && apt-get install -y ffmpeg libsm6 libxext6 git ninja-build libglib2.0-0 libsm6 libxrender-dev libxext6 \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/*
# Install MMCV
RUN pip install mmcv-full==latest+torch1.6.0+cu101 -f https://openmmlab.oss-accelerate.aliyuncs.com/mmcv/dist/index.html
RUN pip install mmdet
# Install MMDetection
RUN conda clean --all
RUN git clone https://github.com/open-mmlab/mmdetection3d.git /mmdetection3d
WORKDIR /mmdetection3d
ENV FORCE_CUDA="1"
RUN pip install -r requirements/build.txt
RUN pip install --no-cache-dir -e .
# uninstall pycocotools installed by nuscenes-devkit and reinstall mmpycocotools
RUN pip uninstall pycocotools --no-cache-dir -y
RUN pip install mmpycocotools --no-cache-dir --force --no-deps
================================================
FILE: mmdet3d/__init__.py
================================================
import mmcv
import mmdet
from .version import __version__, short_version
def digit_version(version_str):
digit_version = []
for x in version_str.split('.'):
if x.isdigit():
digit_version.append(int(x))
elif x.find('rc') != -1:
patch_version = x.split('rc')
digit_version.append(int(patch_version[0]) - 1)
digit_version.append(int(patch_version[1]))
return digit_version
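# e.g. digit_version('1.3.0') -> [1, 3, 0], while
# digit_version('1.3.0rc1') -> [1, 3, -1, 1], so release candidates
# compare as smaller than the corresponding release in the checks below.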
mmcv_minimum_version = '1.2.4'
mmcv_maximum_version = '1.4.0'
mmcv_version = digit_version(mmcv.__version__)
assert (mmcv_version >= digit_version(mmcv_minimum_version)
and mmcv_version <= digit_version(mmcv_maximum_version)), \
f'MMCV=={mmcv.__version__} is used but incompatible. ' \
f'Please install mmcv>={mmcv_minimum_version}, <={mmcv_maximum_version}.'
mmdet_minimum_version = '2.5.0'
mmdet_maximum_version = '3.0.0'
mmdet_version = digit_version(mmdet.__version__)
assert (mmdet_version >= digit_version(mmdet_minimum_version)
and mmdet_version <= digit_version(mmdet_maximum_version)), \
f'MMDET=={mmdet.__version__} is used but incompatible. ' \
f'Please install mmdet>={mmdet_minimum_version}, ' \
f'<={mmdet_maximum_version}.'
__all__ = ['__version__', 'short_version']
================================================
FILE: mmdet3d/apis/__init__.py
================================================
from .inference import (convert_SyncBN, inference_detector, init_detector,
show_result_meshlab)
from .test import single_gpu_test
__all__ = [
'inference_detector', 'init_detector', 'single_gpu_test',
'show_result_meshlab', 'convert_SyncBN'
]
================================================
FILE: mmdet3d/apis/inference.py
================================================
import mmcv
import torch
from copy import deepcopy
from mmcv.parallel import collate, scatter
from mmcv.runner import load_checkpoint
from os import path as osp
from mmdet3d.core import Box3DMode, show_result
from mmdet3d.core.bbox import get_box_type
from mmdet3d.datasets.pipelines import Compose
from mmdet3d.models import build_detector
def convert_SyncBN(config):
"""Convert config's naiveSyncBN to BN.
Args:
config (str or :obj:`mmcv.Config`): Config file path or the config
object.
"""
if isinstance(config, dict):
for item in config:
if item == 'norm_cfg':
config[item]['type'] = config[item]['type']. \
replace('naiveSyncBN', 'BN')
else:
convert_SyncBN(config[item])
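# e.g. a norm_cfg type string 'naiveSyncBN1d' becomes 'BN1d', so configs
# trained with (naive) SyncBN can be reused for inference here.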
def init_detector(config, checkpoint=None, device='cuda:0'):
"""Initialize a detector from config file.
Args:
config (str or :obj:`mmcv.Config`): Config file path or the config
object.
checkpoint (str, optional): Checkpoint path. If left as None, the model
will not load any weights.
device (str): Device to use.
Returns:
nn.Module: The constructed detector.
"""
if isinstance(config, str):
config = mmcv.Config.fromfile(config)
elif not isinstance(config, mmcv.Config):
raise TypeError('config must be a filename or Config object, '
f'but got {type(config)}')
config.model.pretrained = None
convert_SyncBN(config.model)
config.model.train_cfg = None
model = build_detector(config.model, test_cfg=config.get('test_cfg'))
if checkpoint is not None:
checkpoint = load_checkpoint(model, checkpoint)
if 'CLASSES' in checkpoint['meta']:
model.CLASSES = checkpoint['meta']['CLASSES']
else:
model.CLASSES = config.class_names
model.cfg = config # save the config in the model for convenience
model.to(device)
model.eval()
return model
def inference_detector(model, pcd):
"""Inference point cloud with the detector.
Args:
model (nn.Module): The loaded detector.
pcd (str): Path to the point cloud file.
Returns:
tuple: Predicted results and data from pipeline.
"""
cfg = model.cfg
device = next(model.parameters()).device # model device
# build the data pipeline
test_pipeline = deepcopy(cfg.data.test.pipeline)
test_pipeline = Compose(test_pipeline)
box_type_3d, box_mode_3d = get_box_type(cfg.data.test.box_type_3d)
data = dict(
pts_filename=pcd,
box_type_3d=box_type_3d,
box_mode_3d=box_mode_3d,
sweeps=[],
# set timestamp = 0
timestamp=[0],
img_fields=[],
bbox3d_fields=[],
pts_mask_fields=[],
pts_seg_fields=[],
bbox_fields=[],
mask_fields=[],
seg_fields=[])
data = test_pipeline(data)
data = collate([data], samples_per_gpu=1)
if next(model.parameters()).is_cuda:
# scatter to specified GPU
data = scatter(data, [device.index])[0]
else:
# this is a workaround to avoid the bug of MMDataParallel
data['img_metas'] = data['img_metas'][0].data
data['points'] = data['points'][0].data
# forward the model
with torch.no_grad():
result = model(return_loss=False, rescale=True, **data)
return result, data
def show_result_meshlab(data, result, out_dir):
"""Show result by meshlab.
Args:
data (dict): Contain data from pipeline.
result (dict): Predicted result from model.
out_dir (str): Directory to save visualized result.
"""
points = data['points'][0][0].cpu().numpy()
pts_filename = data['img_metas'][0][0]['pts_filename']
file_name = osp.split(pts_filename)[-1].split('.')[0]
assert out_dir is not None, 'Expect out_dir, got none.'
if 'pts_bbox' in result[0].keys():
pred_bboxes = result[0]['pts_bbox']['boxes_3d'].tensor.numpy()
else:
pred_bboxes = result[0]['boxes_3d'].tensor.numpy()
# for now we convert points into depth mode
if data['img_metas'][0][0]['box_mode_3d'] != Box3DMode.DEPTH:
points = points[..., [1, 0, 2]]
points[..., 0] *= -1
pred_bboxes = Box3DMode.convert(pred_bboxes,
data['img_metas'][0][0]['box_mode_3d'],
Box3DMode.DEPTH)
show_result(points, None, pred_bboxes, out_dir, file_name, show=False)
return out_dir, file_name
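# A minimal end-to-end sketch of this API (paths are illustrative), mirroring
# demo/pcd_demo.py:
#   model = init_detector('path/to/config.py', 'path/to/checkpoint.pth',
#                         device='cuda:0')
#   result, data = inference_detector(model, 'path/to/points.bin')
#   show_result_meshlab(data, result, out_dir='demo')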
================================================
FILE: mmdet3d/apis/test.py
================================================
import mmcv
import torch
def single_gpu_test(model, data_loader, show=False, out_dir=None):
"""Test model with single gpu.
This method tests model with single gpu and gives the 'show' option.
By setting ``show=True``, it saves the visualization results under
``out_dir``.
Args:
model (nn.Module): Model to be tested.
data_loader (DataLoader): PyTorch data loader.
show (bool): Whether to save visualization results.
Default: False.
out_dir (str): The path to save visualization results.
Default: None.
Returns:
list[dict]: The prediction results.
"""
model.eval()
results = []
dataset = data_loader.dataset
prog_bar = mmcv.ProgressBar(len(dataset))
for i, data in enumerate(data_loader):
with torch.no_grad():
result = model(return_loss=False, rescale=True, **data)
if show:
model.module.show_results(data, result, out_dir)
results.extend(result)
batch_size = len(result)
for _ in range(batch_size):
prog_bar.update()
return results
================================================
FILE: mmdet3d/core/__init__.py
================================================
from .anchor import * # noqa: F401, F403
from .bbox import * # noqa: F401, F403
from .evaluation import * # noqa: F401, F403
from .points import * # noqa: F401, F403
from .post_processing import * # noqa: F401, F403
from .utils import * # noqa: F401, F403
from .visualizer import * # noqa: F401, F403
from .voxel import * # noqa: F401, F403
================================================
FILE: mmdet3d/core/anchor/__init__.py
================================================
from mmdet.core.anchor import build_anchor_generator
from .anchor_3d_generator import (AlignedAnchor3DRangeGenerator,
AlignedAnchor3DRangeGeneratorPerCls,
Anchor3DRangeGenerator)
__all__ = [
'AlignedAnchor3DRangeGenerator', 'Anchor3DRangeGenerator',
'build_anchor_generator', 'AlignedAnchor3DRangeGeneratorPerCls'
]
================================================
FILE: mmdet3d/core/anchor/anchor_3d_generator.py
================================================
import mmcv
import torch
from mmdet.core.anchor import ANCHOR_GENERATORS
@ANCHOR_GENERATORS.register_module()
class Anchor3DRangeGenerator(object):
"""3D Anchor Generator by range.
This anchor generator generates anchors by the given range in different
feature levels.
Due to the convention in 3D detection, different anchor sizes are related to
different ranges for different categories. However, we find this setting
does not affect the performance much on some datasets, e.g., nuScenes.
Args:
ranges (list[list[float]]): Ranges of different anchors.
The ranges are the same across different feature levels. But may
vary for different anchor sizes if size_per_range is True.
sizes (list[list[float]]): 3D sizes of anchors.
scales (list[int]): Scales of anchors in different feature levels.
rotations (list[float]): Rotations of anchors in a feature grid.
custom_values (tuple[float]): Customized values of that anchor. For
example, in nuScenes the anchors have velocities.
reshape_out (bool): Whether to reshape the output into
(N, box_code_size).
size_per_range (bool): Whether to use separate ranges for different
sizes. If size_per_range is True, the ranges should have the same
length as the sizes; if not, they will be duplicated.
"""
def __init__(self,
ranges,
sizes=[[1.6, 3.9, 1.56]],
scales=[1],
rotations=[0, 1.5707963],
custom_values=(),
reshape_out=True,
size_per_range=True):
assert mmcv.is_list_of(ranges, list)
if size_per_range:
if len(sizes) != len(ranges):
assert len(ranges) == 1
ranges = ranges * len(sizes)
assert len(ranges) == len(sizes)
else:
assert len(ranges) == 1
assert mmcv.is_list_of(sizes, list)
assert isinstance(scales, list)
self.sizes = sizes
self.scales = scales
self.ranges = ranges
self.rotations = rotations
self.custom_values = custom_values
self.cached_anchors = None
self.reshape_out = reshape_out
self.size_per_range = size_per_range
def __repr__(self):
s = self.__class__.__name__ + '('
s += f'anchor_range={self.ranges},\n'
s += f'scales={self.scales},\n'
s += f'sizes={self.sizes},\n'
s += f'rotations={self.rotations},\n'
s += f'reshape_out={self.reshape_out},\n'
s += f'size_per_range={self.size_per_range})'
return s
@property
def num_base_anchors(self):
"""list[int]: Total number of base anchors in a feature grid."""
num_rot = len(self.rotations)
num_size = torch.tensor(self.sizes).reshape(-1, 3).size(0)
return num_rot * num_size
@property
def num_levels(self):
"""int: Number of feature levels that the generator is applied to."""
return len(self.scales)
def grid_anchors(self, featmap_sizes, device='cuda'):
"""Generate grid anchors in multiple feature levels.
Args:
featmap_sizes (list[tuple]): List of feature map sizes in
multiple feature levels.
device (str): Device where the anchors will be put on.
Returns:
list[torch.Tensor]: Anchors in multiple feature levels. \
The size of each tensor should be [N, box_code_size], where \
N = width * height * num_base_anchors, width and height \
are the sizes of the corresponding feature level, and \
num_base_anchors is the number of anchors for that level.
"""
assert self.num_levels == len(featmap_sizes)
multi_level_anchors = []
for i in range(self.num_levels):
anchors = self.single_level_grid_anchors(
featmap_sizes[i], self.scales[i], device=device)
if self.reshape_out:
anchors = anchors.reshape(-1, anchors.size(-1))
multi_level_anchors.append(anchors)
return multi_level_anchors
def single_level_grid_anchors(self, featmap_size, scale, device='cuda'):
"""Generate grid anchors of a single level feature map.
This function is usually called by method ``self.grid_anchors``.
Args:
featmap_size (tuple[int]): Size of the feature map.
scale (float): Scale factor of the anchors in the current level.
device (str, optional): Device the tensor will be put on.
Defaults to 'cuda'.
Returns:
torch.Tensor: Anchors in the overall feature map.
"""
# We reimplement the anchor generator using torch in cuda
# torch: 0.6975 s for 1000 times
# numpy: 4.3345 s for 1000 times
# which is roughly 6 times faster than the numpy implementation
if not self.size_per_range:
return self.anchors_single_range(
featmap_size,
self.ranges[0],
scale,
self.sizes,
self.rotations,
device=device)
mr_anchors = []
for anchor_range, anchor_size in zip(self.ranges, self.sizes):
mr_anchors.append(
self.anchors_single_range(
featmap_size,
anchor_range,
scale,
anchor_size,
self.rotations,
device=device))
mr_anchors = torch.cat(mr_anchors, dim=-3)
return mr_anchors
def anchors_single_range(self,
feature_size,
anchor_range,
scale=1,
sizes=[[1.6, 3.9, 1.56]],
rotations=[0, 1.5707963],
device='cuda'):
"""Generate anchors in a single range.
Args:
feature_size (list[float] | tuple[float]): Feature map size. It is
either a list or a tuple of [D, H, W] (in order of z, y, and x).
anchor_range (torch.Tensor | list[float]): Range of anchors with
shape [6]. The order is consistent with that of anchors, i.e.,
(x_min, y_min, z_min, x_max, y_max, z_max).
scale (float | int, optional): The scale factor of anchors.
sizes (list[list] | np.ndarray | torch.Tensor): Anchor size with
shape [N, 3], in order of x, y, z.
rotations (list[float] | np.ndarray | torch.Tensor): Rotations of
anchors in a single feature grid.
device (str): Devices that the anchors will be put on.
Returns:
torch.Tensor: Anchors with shape \
[*feature_size, num_sizes, num_rots, 7].
"""
if len(feature_size) == 2:
feature_size = [1, feature_size[0], feature_size[1]]
anchor_range = torch.tensor(anchor_range, device=device)
z_centers = torch.linspace(
anchor_range[2], anchor_range[5], feature_size[0], device=device)
y_centers = torch.linspace(
anchor_range[1], anchor_range[4], feature_size[1], device=device)
x_centers = torch.linspace(
anchor_range[0], anchor_range[3], feature_size[2], device=device)
sizes = torch.tensor(sizes, device=device).reshape(-1, 3) * scale
rotations = torch.tensor(rotations, device=device)
# torch.meshgrid's default indexing is 'ij', np's default is 'xy'
rets = torch.meshgrid(x_centers, y_centers, z_centers, rotations)
# torch.meshgrid returns a tuple rather than list
rets = list(rets)
tile_shape = [1] * 5
tile_shape[-2] = int(sizes.shape[0])
for i in range(len(rets)):
rets[i] = rets[i].unsqueeze(-2).repeat(tile_shape).unsqueeze(-1)
sizes = sizes.reshape([1, 1, 1, -1, 1, 3])
tile_size_shape = list(rets[0].shape)
tile_size_shape[3] = 1
sizes = sizes.repeat(tile_size_shape)
rets.insert(3, sizes)
ret = torch.cat(rets, dim=-1).permute([2, 1, 0, 3, 4, 5])
# [1, 200, 176, N, 2, 7] for kitti after permute
if len(self.custom_values) > 0:
custom_ndim = len(self.custom_values)
custom = ret.new_zeros([*ret.shape[:-1], custom_ndim])
# custom[:] = self.custom_values
ret = torch.cat([ret, custom], dim=-1)
# [1, 200, 176, N, 2, 9] for nus dataset after permute
return ret
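# A minimal usage sketch of Anchor3DRangeGenerator (hypothetical
# KITTI-car-style values; kept as a comment so nothing runs at import time):
#   gen = Anchor3DRangeGenerator(
#       ranges=[[0, -40.0, -1.78, 70.4, 40.0, -1.78]],
#       sizes=[[1.6, 3.9, 1.56]],
#       rotations=[0, 1.5707963])
#   anchors = gen.grid_anchors([(200, 176)], device='cpu')[0]
#   # with reshape_out=True this is a (200 * 176 * 2, 7) tensor:
#   # 2 rotations per location and 7 box parameters (center, size, yaw)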
@ANCHOR_GENERATORS.register_module()
class AlignedAnchor3DRangeGenerator(Anchor3DRangeGenerator):
"""Aligned 3D Anchor Generator by range.
This anchor generator uses a different manner to generate the positions
of anchors' centers from :class:`Anchor3DRangeGenerator`.
Note:
The `align` means that the anchor's center is aligned with the voxel
grid, which is also the feature grid. The previous implementation of
:class:`Anchor3DRangeGenerator` does not generate the anchors' center
according to the voxel grid. Rather, it generates the center by
uniformly distributing the anchors inside the minimum and maximum
anchor ranges according to the feature map sizes.
However, this makes the anchor centers not match the feature grid.
The :class:`AlignedAnchor3DRangeGenerator` adds 1 when using the
feature map sizes to obtain the corners of the voxel grid. Then it
shifts the coordinates to the centers of the voxel grid and uses the
left-up corner to distribute anchors.
Args:
align_corner (bool): Whether to align with the corner of the voxel
grid. By default it is False and the anchor's center will be
the same as the corresponding voxel's center, which is also the
center of the corresponding feature grid.
"""
def __init__(self, align_corner=False, **kwargs):
super(AlignedAnchor3DRangeGenerator, self).__init__(**kwargs)
self.align_corner = align_corner
def anchors_single_range(self,
feature_size,
anchor_range,
scale,
sizes=[[1.6, 3.9, 1.56]],
rotations=[0, 1.5707963],
device='cuda'):
"""Generate anchors in a single range.
Args:
feature_size (list[float] | tuple[float]): Feature map size. It is
either a list or a tuple of [D, H, W] (in order of z, y, and x).
anchor_range (torch.Tensor | list[float]): Range of anchors with
shape [6]. The order is consistent with that of anchors, i.e.,
(x_min, y_min, z_min, x_max, y_max, z_max).
scale (float | int, optional): The scale factor of anchors.
sizes (list[list] | np.ndarray | torch.Tensor): Anchor size with
shape [N, 3], in order of x, y, z.
rotations (list[float] | np.ndarray | torch.Tensor): Rotations of
anchors in a single feature grid.
device (str): Devices that the anchors will be put on.
Returns:
torch.Tensor: Anchors with shape \
[*feature_size, num_sizes, num_rots, 7].
"""
if len(feature_size) == 2:
feature_size = [1, feature_size[0], feature_size[1]]
anchor_range = torch.tensor(anchor_range, device=device)
z_centers = torch.linspace(
anchor_range[2],
anchor_range[5],
feature_size[0] + 1,
device=device)
y_centers = torch.linspace(
anchor_range[1],
anchor_range[4],
feature_size[1] + 1,
device=device)
x_centers = torch.linspace(
anchor_range[0],
anchor_range[3],
feature_size[2] + 1,
device=device)
sizes = torch.tensor(sizes, device=device).reshape(-1, 3) * scale
rotations = torch.tensor(rotations, device=device)
# shift the anchor center
if not self.align_corner:
z_shift = (z_centers[1] - z_centers[0]) / 2
y_shift = (y_centers[1] - y_centers[0]) / 2
x_shift = (x_centers[1] - x_centers[0]) / 2
z_centers += z_shift
y_centers += y_shift
x_centers += x_shift
# torch.meshgrid's default indexing is 'ij', np's default is 'xy'
rets = torch.meshgrid(x_centers[:feature_size[2]],
y_centers[:feature_size[1]],
z_centers[:feature_size[0]], rotations)
# torch.meshgrid returns a tuple rather than list
rets = list(rets)
tile_shape = [1] * 5
tile_shape[-2] = int(sizes.shape[0])
for i in range(len(rets)):
rets[i] = rets[i].unsqueeze(-2).repeat(tile_shape).unsqueeze(-1)
sizes = sizes.reshape([1, 1, 1, -1, 1, 3])
tile_size_shape = list(rets[0].shape)
tile_size_shape[3] = 1
sizes = sizes.repeat(tile_size_shape)
rets.insert(3, sizes)
ret = torch.cat(rets, dim=-1).permute([2, 1, 0, 3, 4, 5])
if len(self.custom_values) > 0:
custom_ndim = len(self.custom_values)
custom = ret.new_zeros([*ret.shape[:-1], custom_ndim])
# TODO: check the support of custom values
# custom[:] = self.custom_values
ret = torch.cat([ret, custom], dim=-1)
return ret
@ANCHOR_GENERATORS.register_module()
class AlignedAnchor3DRangeGeneratorPerCls(AlignedAnchor3DRangeGenerator):
"""3D Anchor Generator by range for per class.
This anchor generator generates anchors by the given range for per class.
Note that feature maps of different classes may be different.
Args:
kwargs (dict): Arguments are the same as those in \
:class:`AlignedAnchor3DRangeGenerator`.
"""
def __init__(self, **kwargs):
super(AlignedAnchor3DRangeGeneratorPerCls, self).__init__(**kwargs)
assert len(self.scales) == 1, 'Multi-scale feature map levels are' + \
' not supported currently in this kind of anchor generator.'
def grid_anchors(self, featmap_sizes, device='cuda'):
"""Generate grid anchors in multiple feature levels.
Args:
featmap_sizes (list[tuple]): List of feature map sizes for \
different classes in a single feature level.
device (str): Device where the anchors will be put on.
Returns:
list[list[torch.Tensor]]: Anchors in multiple feature levels. \
Note that in this anchor generator, we currently only \
support single feature level. The sizes of each tensor \
should be [num_sizes/ranges*num_rots*featmap_size, \
box_code_size].
"""
multi_level_anchors = []
anchors = self.multi_cls_grid_anchors(
featmap_sizes, self.scales[0], device=device)
multi_level_anchors.append(anchors)
return multi_level_anchors
def multi_cls_grid_anchors(self, featmap_sizes, scale, device='cuda'):
"""Generate grid anchors of a single level feature map for multi-class
with different feature map sizes.
This function is usually called by method ``self.grid_anchors``.
Args:
featmap_sizes (list[tuple]): List of feature map sizes for \
different classes in a single feature level.
scale (float): Scale factor of the anchors in the current level.
device (str, optional): Device the tensor will be put on.
Defaults to 'cuda'.
Returns:
torch.Tensor: Anchors in the overall feature map.
"""
assert len(featmap_sizes) == len(self.sizes) == len(self.ranges), \
'The number of different feature map sizes, anchor sizes and ' + \
'ranges should be the same.'
multi_cls_anchors = []
for i in range(len(featmap_sizes)):
anchors = self.anchors_single_range(
featmap_sizes[i],
self.ranges[i],
scale,
self.sizes[i],
self.rotations,
device=device)
# [*featmap_size, num_sizes/ranges, num_rots, box_code_size]
ndim = len(featmap_sizes[i])
anchors = anchors.view(*featmap_sizes[i], -1, anchors.size(-1))
# [*featmap_size, num_sizes/ranges*num_rots, box_code_size]
anchors = anchors.permute(ndim, *range(0, ndim), ndim + 1)
# [num_sizes/ranges*num_rots, *featmap_size, box_code_size]
multi_cls_anchors.append(anchors.reshape(-1, anchors.size(-1)))
# [num_sizes/ranges*num_rots*featmap_size, box_code_size]
return multi_cls_anchors
================================================
FILE: mmdet3d/core/bbox/__init__.py
================================================
from .assigners import AssignResult, BaseAssigner, MaxIoUAssigner
from .coders import DeltaXYZWLHRBBoxCoder
# from .bbox_target import bbox_target
from .iou_calculators import (AxisAlignedBboxOverlaps3D, BboxOverlaps3D,
BboxOverlapsNearest3D,
axis_aligned_bbox_overlaps_3d, bbox_overlaps_3d,
bbox_overlaps_nearest_3d)
from .samplers import (BaseSampler, CombinedSampler,
InstanceBalancedPosSampler, IoUBalancedNegSampler,
PseudoSampler, RandomSampler, SamplingResult)
from .structures import (BaseInstance3DBoxes, Box3DMode, CameraInstance3DBoxes,
Coord3DMode, DepthInstance3DBoxes,
LiDARInstance3DBoxes, get_box_type, limit_period,
points_cam2img, xywhr2xyxyr)
from .transforms import bbox3d2result, bbox3d2roi, bbox3d_mapping_back
__all__ = [
'BaseSampler', 'AssignResult', 'BaseAssigner', 'MaxIoUAssigner',
'PseudoSampler', 'RandomSampler', 'InstanceBalancedPosSampler',
'IoUBalancedNegSampler', 'CombinedSampler', 'SamplingResult',
'DeltaXYZWLHRBBoxCoder', 'BboxOverlapsNearest3D', 'BboxOverlaps3D',
'bbox_overlaps_nearest_3d', 'bbox_overlaps_3d',
'AxisAlignedBboxOverlaps3D', 'axis_aligned_bbox_overlaps_3d', 'Box3DMode',
'LiDARInstance3DBoxes', 'CameraInstance3DBoxes', 'bbox3d2roi',
'bbox3d2result', 'DepthInstance3DBoxes', 'BaseInstance3DBoxes',
'bbox3d_mapping_back', 'xywhr2xyxyr', 'limit_period', 'points_cam2img',
'get_box_type', 'Coord3DMode'
]
================================================
FILE: mmdet3d/core/bbox/assigners/__init__.py
================================================
from mmdet.core.bbox import AssignResult, BaseAssigner, MaxIoUAssigner
from .hungarian_assigner import HungarianAssigner3D, HeuristicAssigner3D, HungarianAssignerView2D, HungarianAssignerViewProj2D, HungarianAssignerCameraBox
__all__ = ['BaseAssigner', 'MaxIoUAssigner', 'AssignResult', 'HungarianAssigner3D', 'HeuristicAssigner3D',
'HungarianAssignerView2D', 'HungarianAssignerViewProj2D', 'HungarianAssignerCameraBox']
================================================
FILE: mmdet3d/core/bbox/assigners/hungarian_assigner.py
================================================
from mmdet.core.bbox.builder import BBOX_ASSIGNERS
from mmdet.core.bbox.assigners import AssignResult, BaseAssigner
from mmdet.core.bbox.match_costs import build_match_cost
from mmdet.core.bbox.match_costs.builder import MATCH_COST
from mmdet.core.bbox.iou_calculators import build_iou_calculator
from mmdet.core.bbox.assigners import HungarianAssigner
from mmdet.core.bbox.transforms import bbox_cxcywh_to_xyxy
import torch
try:
from scipy.optimize import linear_sum_assignment
except ImportError:
linear_sum_assignment = None
@MATCH_COST.register_module()
class BBox3DL1Cost(object):
def __init__(self, weight):
self.weight = weight
def __call__(self, bboxes, gt_bboxes, train_cfg=None):
reg_cost = torch.cdist(bboxes, gt_bboxes, p=1)
return reg_cost * self.weight
@MATCH_COST.register_module()
class BBoxBEVL1Cost(object):
def __init__(self, weight):
self.weight = weight
def __call__(self, bboxes, gt_bboxes, train_cfg):
pc_start = bboxes.new(train_cfg['point_cloud_range'][0:2])
pc_range = bboxes.new(train_cfg['point_cloud_range'][3:5]) - bboxes.new(train_cfg['point_cloud_range'][0:2])
# normalize the box center to [0, 1]
normalized_bboxes_xy = (bboxes[:, :2] - pc_start) / pc_range
normalized_gt_bboxes_xy = (gt_bboxes[:, :2] - pc_start) / pc_range
reg_cost = torch.cdist(normalized_bboxes_xy, normalized_gt_bboxes_xy, p=1)
return reg_cost * self.weight
@MATCH_COST.register_module()
class IoU3DCost(object):
def __init__(self, weight):
self.weight = weight
def __call__(self, iou):
iou_cost = - iou
return iou_cost * self.weight
@BBOX_ASSIGNERS.register_module()
class HeuristicAssigner3D(BaseAssigner):
def __init__(self,
dist_thre=100,
iou_calculator=dict(type='BboxOverlaps3D')
):
self.dist_thre = dist_thre # distance in meter
self.iou_calculator = build_iou_calculator(iou_calculator)
def assign(self, bboxes, gt_bboxes, gt_bboxes_ignore=None, gt_labels=None, query_labels=None):
dist_thre = self.dist_thre
num_gts, num_bboxes = len(gt_bboxes), len(bboxes)
bev_dist = torch.norm(bboxes[:, 0:2][None, :, :] - gt_bboxes[:, 0:2][:, None, :], dim=-1) # [num_gts, num_bboxes]
if query_labels is not None:
# only match the gt box and query with same category
not_same_class = (query_labels[None] != gt_labels[:, None])
bev_dist += not_same_class * dist_thre
# for each gt box, assign it to the nearest pred box
nearest_values, nearest_indices = bev_dist.min(1) # [num_gts]
assigned_gt_inds = torch.ones([num_bboxes, ]).to(bboxes) * 0
assigned_gt_vals = torch.ones([num_bboxes, ]).to(bboxes) * 10000
assigned_gt_labels = torch.ones([num_bboxes, ]).to(bboxes) * -1
for idx_gts in range(num_gts):
# for idx_pred in torch.where(bev_dist[idx_gts] < dist_thre)[0]: # each gt match to all the pred box within some radius
idx_pred = nearest_indices[idx_gts] # each gt only match to the nearest pred box
if bev_dist[idx_gts, idx_pred] <= dist_thre:
if bev_dist[idx_gts, idx_pred] < assigned_gt_vals[idx_pred]: # if this pred box is assigned, then compare
assigned_gt_vals[idx_pred] = bev_dist[idx_gts, idx_pred]
assigned_gt_inds[idx_pred] = idx_gts + 1 # for AssignResult, 0 is negative, -1 is ignore, 1-based indices are positive
assigned_gt_labels[idx_pred] = gt_labels[idx_gts]
max_overlaps = torch.zeros([num_bboxes, ]).to(bboxes)
matched_indices = torch.where(assigned_gt_inds > 0)
matched_iou = self.iou_calculator(gt_bboxes[assigned_gt_inds[matched_indices].long() - 1], bboxes[matched_indices]).diag()
max_overlaps[matched_indices] = matched_iou
return AssignResult(
num_gts, assigned_gt_inds.long(), max_overlaps, labels=assigned_gt_labels
)
@BBOX_ASSIGNERS.register_module()
class HungarianAssigner3D(BaseAssigner):
def __init__(self,
cls_cost=dict(type='ClassificationCost', weight=1.),
reg_cost=dict(type='BBoxBEVL1Cost', weight=1.0),
iou_cost=dict(type='IoU3DCost', weight=1.0),
iou_calculator=dict(type='BboxOverlaps3D')
):
self.cls_cost = build_match_cost(cls_cost)
self.reg_cost = build_match_cost(reg_cost)
self.iou_cost = build_match_cost(iou_cost)
self.iou_calculator = build_iou_calculator(iou_calculator)
def assign(self, bboxes, gt_bboxes, gt_labels, cls_pred, train_cfg):
num_gts, num_bboxes = gt_bboxes.size(0), bboxes.size(0)
# 1. assign -1 by default
assigned_gt_inds = bboxes.new_full((num_bboxes,),
-1,
dtype=torch.long)
assigned_labels = bboxes.new_full((num_bboxes,),
-1,
dtype=torch.long)
if num_gts == 0 or num_bboxes == 0:
# No ground truth or boxes, return empty assignment
if num_gts == 0:
# No ground truth, assign all to background
assigned_gt_inds[:] = 0
return AssignResult(
num_gts, assigned_gt_inds, torch.zeros(assigned_gt_inds.shape[0]).to(assigned_gt_inds.device), labels=assigned_labels)
# return AssignResult(
# num_gts, assigned_gt_inds, None, labels=assigned_labels)
# 2. compute the weighted costs
# see mmdetection/mmdet/core/bbox/match_costs/match_cost.py
cls_cost = self.cls_cost(cls_pred[0].T, gt_labels)
reg_cost = self.reg_cost(bboxes, gt_bboxes, train_cfg)
iou = self.iou_calculator(bboxes, gt_bboxes)
iou_cost = self.iou_cost(iou)
# weighted sum of above three costs
cost = cls_cost + reg_cost + iou_cost
# 3. do Hungarian matching on CPU using linear_sum_assignment
cost = cost.detach().cpu()
if linear_sum_assignment is None:
raise ImportError('Please run "pip install scipy" '
'to install scipy first.')
matched_row_inds, matched_col_inds = linear_sum_assignment(cost)
matched_row_inds = torch.from_numpy(matched_row_inds).to(bboxes.device)
matched_col_inds = torch.from_numpy(matched_col_inds).to(bboxes.device)
# 4. assign backgrounds and foregrounds
# assign all indices to backgrounds first
assigned_gt_inds[:] = 0
# assign foregrounds based on matching results
assigned_gt_inds[matched_row_inds] = matched_col_inds + 1
assigned_labels[matched_row_inds] = gt_labels[matched_col_inds]
max_overlaps = torch.zeros_like(iou.max(1).values)
max_overlaps[matched_row_inds] = iou[matched_row_inds, matched_col_inds]
# max_overlaps = iou.max(1).values
return AssignResult(
num_gts, assigned_gt_inds, max_overlaps, labels=assigned_labels)
@BBOX_ASSIGNERS.register_module()
class HungarianAssignerView2D(HungarianAssigner):
def __init__(self,
cls_cost=dict(type='ClassificationCost', weight=1.),
reg_cost=dict(type='BBoxL1Cost', weight=1.0),
iou_cost=dict(type='IoUCost', iou_mode='giou', weight=1.0)):
super(HungarianAssignerView2D, self).__init__(cls_cost, reg_cost, iou_cost)
self.view_cost = ViewCost()
def assign(self,
bbox_pred,
cls_pred,
view,
gt_bboxes,
gt_labels,
img_w,
img_h,
gt_bboxes_ignore=None,
eps=1e-7):
"""Computes one-to-one matching based on the weighted costs.
This method assigns each query prediction to a ground truth or
background. An `assigned_gt_inds` value of -1 means don't care,
0 means negative sample, and a positive number is the (1-based)
index of the assigned gt.
The assignment is done in the following steps, the order matters.
1. assign every prediction to -1
2. compute the weighted costs
3. do Hungarian matching on CPU based on the costs
4. assign all to 0 (background) first, then for each matched pair
between predictions and gts, treat this prediction as foreground
and assign the corresponding gt index (plus 1) to it.
Args:
bbox_pred (Tensor): Predicted boxes with normalized coordinates
(cx, cy, w, h), which are all in range [0, 1]. Shape
[num_query, 4].
cls_pred (Tensor): Predicted classification logits, shape
[num_query, num_class].
gt_bboxes (Tensor): Ground truth boxes with unnormalized
coordinates (cx, cy, w, h). Shape [num_gt, 4].
gt_labels (Tensor): Label of `gt_bboxes`, shape (num_gt,).
img_w (int): Width of the input image.
img_h (int): Height of the input image.
gt_bboxes_ignore (Tensor, optional): Ground truth bboxes that are
labelled as `ignored`. Default None.
eps (int | float, optional): A value added to the denominator for
numerical stability. Default 1e-7.
Returns:
:obj:`AssignResult`: The assigned result.
"""
assert gt_bboxes_ignore is None, \
'Only case when gt_bboxes_ignore is None is supported.'
num_gts, num_bboxes = gt_bboxes.size(0), bbox_pred.size(0)
gt_bboxes = bbox_cxcywh_to_xyxy(gt_bboxes)
gt_views = gt_labels[..., 1]
gt_labels = gt_labels[..., 0]
# 1. assign -1 by default
assigned_gt_inds = bbox_pred.new_full((num_bboxes, ),
-1,
dtype=torch.long)
assigned_labels = bbox_pred.new_full((num_bboxes, ),
-1,
dtype=torch.long)
if num_gts == 0 or num_bboxes == 0:
# No ground truth or boxes, return empty assignment
if num_gts == 0:
# No ground truth, assign all to background
assigned_gt_inds[:] = 0
return AssignResult(
num_gts, assigned_gt_inds, None, labels=assigned_labels)
factor = gt_bboxes.new_tensor([img_w, img_h, img_w, img_h]).unsqueeze(0)
# 2. compute the weighted costs
# classification and bboxcost.
cls_cost = self.cls_cost(cls_pred, gt_labels)
# regression L1 cost
normalize_gt_bboxes = gt_bboxes / factor
reg_cost = self.reg_cost(bbox_pred, normalize_gt_bboxes)
# regression IoU cost; GIoU is used by default in official DETR.
bboxes = bbox_cxcywh_to_xyxy(bbox_pred) * factor
iou_cost = self.iou_cost(bboxes, gt_bboxes)
iou = -iou_cost / self.iou_cost.weight
view_cost = self.view_cost(view, gt_views)
# weighted sum of above three costs
cost = cls_cost + reg_cost + iou_cost + view_cost
# 3. do Hungarian matching on CPU using linear_sum_assignment
cost = cost.detach().cpu()
if linear_sum_assignment is None:
raise ImportError('Please run "pip install scipy" '
'to install scipy first.')
matched_row_inds, matched_col_inds = linear_sum_assignment(cost)
matched_row_inds = torch.from_numpy(matched_row_inds).to(
bbox_pred.device)
matched_col_inds = torch.from_numpy(matched_col_inds).to(
bbox_pred.device)
# 4. assign backgrounds and foregrounds
# assign all indices to backgrounds first
assigned_gt_inds[:] = 0
# assign foregrounds based on matching results
assigned_gt_inds[matched_row_inds] = matched_col_inds + 1
assigned_labels[matched_row_inds] = gt_labels[matched_col_inds]
max_overlaps = torch.zeros_like(iou.max(1).values)
max_overlaps[matched_row_inds] = iou[matched_row_inds, matched_col_inds]
return AssignResult(
num_gts, assigned_gt_inds, max_overlaps, labels=assigned_labels)
@BBOX_ASSIGNERS.register_module()
class HungarianAssignerViewProj2D(HungarianAssigner):
def __init__(self,
cls_cost=dict(type='ClassificationCost', weight=1.),
reg_cost=dict(type='BBoxL1Cost', weight=1.0),
iou_cost=dict(type='IoUCost', iou_mode='giou', weight=1.0)):
super(HungarianAssignerViewProj2D, self).__init__(cls_cost, reg_cost, iou_cost)
self.view_cost = ViewCost()
def assign(self,
bbox_pred,
cls_pred,
center_pred,
offset_pred,
view,
gt_bboxes,
gt_labels,
gt_centers,
gt_offsets,
img_w,
img_h,
gt_bboxes_ignore=None,
eps=1e-7):
"""Computes one-to-one matching based on the weighted costs.
This method assigns each query prediction to a ground truth or
background. An `assigned_gt_inds` value of -1 means don't care,
0 means negative sample, and a positive number is the (1-based)
index of the assigned gt.
The assignment is done in the following steps, the order matters.
1. assign every prediction to -1
2. compute the weighted costs
3. do Hungarian matching on CPU based on the costs
4. assign all to 0 (background) first, then for each matched pair
between predictions and gts, treat this prediction as foreground
and assign the corresponding gt index (plus 1) to it.
Args:
bbox_pred (Tensor): Predicted boxes with normalized coordinates
(cx, cy, w, h), which are all in range [0, 1]. Shape
[num_query, 4].
cls_pred (Tensor): Predicted classification logits, shape
[num_query, num_class].
gt_bboxes (Tensor): Ground truth boxes with unnormalized
coordinates (cx, cy, w, h). Shape [num_gt, 4].
gt_labels (Tensor): Label of `gt_bboxes`, shape (num_gt,).
img_w (int): Width of the input image.
img_h (int): Height of the input image.
gt_bboxes_ignore (Tensor, optional): Ground truth bboxes that are
labelled as `ignored`. Default None.
eps (int | float, optional): A value added to the denominator for
numerical stability. Default 1e-7.
Returns:
:obj:`AssignResult`: The assigned result.
"""
assert gt_bboxes_ignore is None, \
'Only case when gt_bboxes_ignore is None is supported.'
num_gts, num_bboxes = gt_bboxes.size(0), bbox_pred.size(0)
gt_bboxes = bbox_cxcywh_to_xyxy(gt_bboxes)
gt_views = gt_labels[..., 1]
gt_labels = gt_labels[..., 0]
# 1. assign -1 by default
assigned_gt_inds = bbox_pred.new_full((num_bboxes, ),
-1,
dtype=torch.long)
assigned_labels = bbox_pred.new_full((num_bboxes, ),
-1,
dtype=torch.long)
if num_gts == 0 or num_bboxes == 0:
# No ground truth or boxes, return empty assignment
if num_gts == 0:
# No ground truth, assign all to background
assigned_gt_inds[:] = 0
return AssignResult(
num_gts, assigned_gt_inds, torch.zeros(assigned_gt_inds.shape[0]).to(assigned_gt_inds.device), labels=assigned_labels)
factor = gt_bboxes.new_tensor([img_w, img_h, img_w, img_h]).unsqueeze(0)
# 2. compute the weighted costs
# classification and bboxcost.
cls_cost = self.cls_cost(cls_pred, gt_labels)
# regression L1 cost
# reg_cost = self.reg_cost(bbox_pred, normalize_gt_bboxes)
normalize_gt_centers = gt_centers / factor[:, :2]
reg_cost = self.reg_cost(center_pred, normalize_gt_centers)
normalize_gt_offsets = gt_offsets / factor
# reg_cost = reg_cost + self.reg_cost(offset_pred, normalize_gt_offsets) / 2
reg_cost = reg_cost + self.reg_cost(offset_pred, normalize_gt_offsets)
# regression IoU cost; GIoU is used by default in official DETR.
bboxes = bbox_cxcywh_to_xyxy(bbox_pred) * factor
iou_cost = self.iou_cost(bboxes, gt_bboxes)
iou = -iou_cost / self.iou_cost.weight
view_cost = self.view_cost(view, gt_views)
# weighted sum of above three costs
cost = cls_cost + reg_cost + iou_cost + view_cost
# 3. do Hungarian matching on CPU using linear_sum_assignment
cost = cost.detach().cpu()
if linear_sum_assignment is None:
raise ImportError('Please run "pip install scipy" '
'to install scipy first.')
matched_row_inds, matched_col_inds = linear_sum_assignment(cost)
matched_row_inds = torch.from_numpy(matched_row_inds).to(
bbox_pred.device)
matched_col_inds = torch.from_numpy(matched_col_inds).to(
bbox_pred.device)
# 4. assign backgrounds and foregrounds
# assign all indices to backgrounds first
assigned_gt_inds[:] = 0
# assign foregrounds based on matching results
assigned_gt_inds[matched_row_inds] = matched_col_inds + 1
assigned_labels[matched_row_inds] = gt_labels[matched_col_inds]
max_overlaps = torch.zeros_like(iou.max(1).values)
max_overlaps[matched_row_inds] = iou[matched_row_inds, matched_col_inds]
return AssignResult(
num_gts, assigned_gt_inds, max_overlaps, labels=assigned_labels)
class ViewCost:
def __init__(self, weight=1000):
self.weight = weight
def __call__(self, view_pred, gt_views):
view_cost = torch.cdist(view_pred.unsqueeze(-1).float(), gt_views.unsqueeze(-1).float(), p=1)
view_cost = torch.clamp(view_cost, max=1, min=0)
return view_cost * self.weight
@BBOX_ASSIGNERS.register_module()
class HungarianAssignerCameraBox(BaseAssigner):
def __init__(self,
cls_cost=dict(type='ClassificationCost', weight=1.),
reg_cost=dict(type='BBoxBEVL1Cost', weight=1.0),
iou_cost=dict(type='IoU3DCost', weight=1.0),
iou_calculator=dict(type='BboxOverlaps3D')
):
self.cls_cost = build_match_cost(cls_cost)
self.reg_cost = build_match_cost(reg_cost)
self.iou_cost = build_match_cost(iou_cost)
self.iou_calculator = build_iou_calculator(iou_calculator)
self.view_cost = ViewCost()
def assign(self, bboxes, gt_bboxes, gt_labels, cls_pred, view, train_cfg):
num_gts, num_bboxes = gt_bboxes.size(0), bboxes.size(0)
# 1. assign -1 by default
assigned_gt_inds = bboxes.new_full((num_bboxes,),
-1,
dtype=torch.long)
assigned_labels = bboxes.new_full((num_bboxes,),
-1,
dtype=torch.long)
if num_gts == 0 or num_bboxes == 0:
# No ground truth or boxes, return empty assignment
if num_gts == 0:
# No ground truth, assign all to background
assigned_gt_inds[:] = 0
return AssignResult(
num_gts, assigned_gt_inds, torch.zeros(assigned_gt_inds.shape[0]).to(assigned_gt_inds.device), labels=assigned_labels)
# 2. compute the weighted costs
# see mmdetection/mmdet/core/bbox/match_costs/match_cost.py
gt_views = gt_labels[..., 1]
gt_labels = gt_labels[..., 0]
cls_cost = self.cls_cost(cls_pred[0].T, gt_labels)
reg_cost = self.reg_cost(bboxes, gt_bboxes, train_cfg)
iou = self.iou_calculator(bboxes, gt_bboxes)
iou_cost = self.iou_cost(iou)
view_cost = self.view_cost(view, gt_views)
# weighted sum of above three costs
cost = cls_cost + reg_cost + iou_cost + view_cost
# 3. do Hungarian matching on CPU using linear_sum_assignment
cost = cost.detach().cpu()
if linear_sum_assignment is None:
raise ImportError('Please run "pip install scipy" '
'to install scipy first.')
matched_row_inds, matched_col_inds = linear_sum_assignment(cost)
matched_row_inds = torch.from_numpy(matched_row_inds).to(bboxes.device)
matched_col_inds = torch.from_numpy(matched_col_inds).to(bboxes.device)
# 4. assign backgrounds and foregrounds
# assign all indices to backgrounds first
assigned_gt_inds[:] = 0
# assign foregrounds based on matching results
assigned_gt_inds[matched_row_inds] = matched_col_inds + 1
assigned_labels[matched_row_inds] = gt_labels[matched_col_inds]
max_overlaps = torch.zeros_like(iou.max(1).values)
max_overlaps[matched_row_inds] = iou[matched_row_inds, matched_col_inds]
# max_overlaps = iou.max(1).values
return AssignResult(
num_gts, assigned_gt_inds, max_overlaps, labels=assigned_labels)
================================================
FILE: mmdet3d/core/bbox/box_np_ops.py
================================================
# TODO: clean the functions in this file and move the APIs into box structures
# in the future
import numba
import numpy as np
def camera_to_lidar(points, r_rect, velo2cam):
"""Convert points in camera coordinate to lidar coordinate.
Args:
points (np.ndarray, shape=[N, 3]): Points in camera coordinate.
r_rect (np.ndarray, shape=[4, 4]): Matrix to project points in
specific camera coordinate (e.g. CAM2) to CAM0.
velo2cam (np.ndarray, shape=[4, 4]): Matrix to project points in
camera coordinate to lidar coordinate.
Returns:
np.ndarray, shape=[N, 3]: Points in lidar coordinate.
"""
points_shape = list(points.shape[0:-1])
if points.shape[-1] == 3:
points = np.concatenate([points, np.ones(points_shape + [1])], axis=-1)
lidar_points = points @ np.linalg.inv((r_rect @ velo2cam).T)
return lidar_points[..., :3]
def box_camera_to_lidar(data, r_rect, velo2cam):
"""Covert boxes in camera coordinate to lidar coordinate.
Args:
data (np.ndarray, shape=[N, 7]): Boxes in camera coordinate.
r_rect (np.ndarray, shape=[4, 4]): Matrix to project points in
specific camera coordinate (e.g. CAM2) to CAM0.
velo2cam (np.ndarray, shape=[4, 4]): Matrix to project points in
camera coordinate to lidar coordinate.
Returns:
np.ndarray, shape=[N, 7]: Boxes in lidar coordinate.
"""
xyz = data[:, 0:3]
l, h, w = data[:, 3:4], data[:, 4:5], data[:, 5:6]
r = data[:, 6:7]
xyz_lidar = camera_to_lidar(xyz, r_rect, velo2cam)
return np.concatenate([xyz_lidar, w, l, h, r], axis=1)
def corners_nd(dims, origin=0.5):
"""Generate relative box corners based on length per dim and origin point.
Args:
dims (np.ndarray, shape=[N, ndim]): Array of length per dim
origin (list or array or float): Origin point relative to the smallest point.
Returns:
np.ndarray, shape=[N, 2 ** ndim, ndim]: Returned corners.
point layout example: (2d) x0y0, x0y1, x1y0, x1y1;
(3d) x0y0z0, x0y0z1, x0y1z0, x0y1z1, x1y0z0, x1y0z1, x1y1z0, x1y1z1
where x0 < x1, y0 < y1, z0 < z1.
"""
ndim = int(dims.shape[1])
corners_norm = np.stack(
np.unravel_index(np.arange(2**ndim), [2] * ndim),
axis=1).astype(dims.dtype)
# now corners_norm has format: (2d) x0y0, x0y1, x1y0, x1y1
# (3d) x0y0z0, x0y0z1, x0y1z0, x0y1z1, x1y0z0, x1y0z1, x1y1z0, x1y1z1
# so need to convert to a format which is convenient to do other computing.
# for 2d boxes, format is clockwise start with minimum point
# for 3d boxes, please draw lines by your hand.
if ndim == 2:
# generate clockwise box corners
corners_norm = corners_norm[[0, 1, 3, 2]]
elif ndim == 3:
corners_norm = corners_norm[[0, 1, 3, 2, 4, 5, 7, 6]]
corners_norm = corners_norm - np.array(origin, dtype=dims.dtype)
corners = dims.reshape([-1, 1, ndim]) * corners_norm.reshape(
[1, 2**ndim, ndim])
return corners
def rotation_2d(points, angles):
"""Rotation 2d points based on origin point clockwise when angle positive.
Args:
points (np.ndarray): Points to be rotated with shape \
(N, point_size, 2).
angles (np.ndarray): Rotation angle with shape (N).
Returns:
np.ndarray: Same shape as points.
"""
rot_sin = np.sin(angles)
rot_cos = np.cos(angles)
rot_mat_T = np.stack([[rot_cos, -rot_sin], [rot_sin, rot_cos]])
return np.einsum('aij,jka->aik', points, rot_mat_T)
def center_to_corner_box2d(centers, dims, angles=None, origin=0.5):
"""Convert kitti locations, dimensions and angles to corners.
format: center(xy), dims(xy), angles(clockwise when positive)
Args:
centers (np.ndarray): Locations in kitti label file with shape (N, 2).
dims (np.ndarray): Dimensions in kitti label file with shape (N, 2).
angles (np.ndarray): Rotation_y in kitti label file with shape (N).
Returns:
np.ndarray: Corners with the shape of (N, 4, 2).
"""
# 'length' in kitti format is in x axis.
# xyz(hwl)(kitti label file)<->xyz(lhw)(camera)<->z(-x)(-y)(wlh)(lidar)
# center in kitti format is [0.5, 1.0, 0.5] in xyz.
corners = corners_nd(dims, origin=origin)
# corners: [N, 4, 2]
if angles is not None:
corners = rotation_2d(corners, angles)
corners += centers.reshape([-1, 1, 2])
return corners
@numba.jit(nopython=True)
def depth_to_points(depth, trunc_pixel):
"""Convert depth map to points.
Args:
depth (np.array, shape=[H, W]): Depth map in which
the rows [0~`trunc_pixel`] are truncated.
trunc_pixel (int): The number of truncated rows.
Returns:
np.ndarray: Points in camera coordinates.
"""
num_pts = np.sum(depth[trunc_pixel:, ] > 0.1)
points = np.zeros((num_pts, 3), dtype=depth.dtype)
x = np.array([0, 0, 1], dtype=depth.dtype)
k = 0
for i in range(trunc_pixel, depth.shape[0]):
for j in range(depth.shape[1]):
if depth[i, j] > 0.1:
x = np.array([j, i, 1], dtype=depth.dtype)
points[k] = x * depth[i, j]
k += 1
return points
def depth_to_lidar_points(depth, trunc_pixel, P2, r_rect, velo2cam):
"""Convert depth map to points in lidar coordinate.
Args:
depth (np.array, shape=[H, W]): Depth map in which
the rows [0~`trunc_pixel`] are truncated.
trunc_pixel (int): The number of truncated rows.
P2 (np.ndarray, shape=[4, 4]): Intrinsics of Camera2.
r_rect (np.ndarray, shape=[4, 4]): Matrix to project points in
specific camera coordinate (e.g. CAM2) to CAM0.
velo2cam (np.ndarray, shape=[4, 4]): Matrix to project points in
camera coordinate to lidar coordinate.
Returns:
np.ndarray: Points in lidar coordinates.
"""
pts = depth_to_points(depth, trunc_pixel)
points_shape = list(pts.shape[0:-1])
points = np.concatenate([pts, np.ones(points_shape + [1])], axis=-1)
points = points @ np.linalg.inv(P2.T)
lidar_points = camera_to_lidar(points, r_rect, velo2cam)
return lidar_points
def rotation_3d_in_axis(points, angles, axis=0):
"""Rotate points in specific axis.
Args:
points (np.ndarray, shape=[N, point_size, 3]): Points to be rotated.
angles (np.ndarray, shape=[N]): Rotation angles.
axis (int): Axis to rotate around.
Returns:
np.ndarray: Rotated points.
"""
# points: [N, point_size, 3]
rot_sin = np.sin(angles)
rot_cos = np.cos(angles)
ones = np.ones_like(rot_cos)
zeros = np.zeros_like(rot_cos)
if axis == 1:
rot_mat_T = np.stack([[rot_cos, zeros, -rot_sin], [zeros, ones, zeros],
[rot_sin, zeros, rot_cos]])
elif axis == 2 or axis == -1:
rot_mat_T = np.stack([[rot_cos, -rot_sin, zeros],
[rot_sin, rot_cos, zeros], [zeros, zeros, ones]])
elif axis == 0:
rot_mat_T = np.stack([[zeros, rot_cos, -rot_sin],
[zeros, rot_sin, rot_cos], [ones, zeros, zeros]])
else:
raise ValueError('axis should be in range')
return np.einsum('aij,jka->aik', points, rot_mat_T)
def center_to_corner_box3d(centers,
dims,
angles=None,
origin=(0.5, 1.0, 0.5),
axis=1):
"""Convert kitti locations, dimensions and angles to corners.
Args:
centers (np.ndarray): Locations in kitti label file with shape (N, 3).
dims (np.ndarray): Dimensions in kitti label file with shape (N, 3).
angles (np.ndarray): Rotation_y in kitti label file with shape (N).
origin (list or array or float): Origin point relative to the smallest
point. Use (0.5, 1.0, 0.5) in camera and (0.5, 0.5, 0) in lidar.
axis (int): Rotation axis. 1 for camera and 2 for lidar.
Returns:
np.ndarray: Corners with the shape of (N, 8, 3).
"""
# 'length' in kitti format is in x axis.
# yzx(hwl)(kitti label file)<->xyz(lhw)(camera)<->z(-x)(-y)(wlh)(lidar)
# center in kitti format is [0.5, 1.0, 0.5] in xyz.
corners = corners_nd(dims, origin=origin)
# corners: [N, 8, 3]
if angles is not None:
corners = rotation_3d_in_axis(corners, angles, axis=axis)
corners += centers.reshape([-1, 1, 3])
return corners
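# Usage sketch (hypothetical camera-frame box): with the default
# origin=(0.5, 1.0, 0.5) and axis=1 the eight corners come back as (N, 8, 3).
# >>> import numpy as np
# >>> centers = np.array([[1.0, 1.5, 10.0]])
# >>> dims = np.array([[1.6, 1.5, 3.9]])
# >>> angles = np.array([0.3])
# >>> center_to_corner_box3d(centers, dims, angles).shape
# (1, 8, 3)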
@numba.jit(nopython=True)
def box2d_to_corner_jit(boxes):
"""Convert box2d to corner.
Args:
boxes (np.ndarray, shape=[N, 5]): Boxes2d with rotation.
Returns:
box_corners (np.ndarray, shape=[N, 4, 2]): Box corners.
"""
num_box = boxes.shape[0]
corners_norm = np.zeros((4, 2), dtype=boxes.dtype)
corners_norm[1, 1] = 1.0
corners_norm[2] = 1.0
corners_norm[3, 0] = 1.0
corners_norm -= np.array([0.5, 0.5], dtype=boxes.dtype)
corners = boxes.reshape(num_box, 1, 5)[:, :, 2:4] * corners_norm.reshape(
1, 4, 2)
rot_mat_T = np.zeros((2, 2), dtype=boxes.dtype)
box_corners = np.zeros((num_box, 4, 2), dtype=boxes.dtype)
for i in range(num_box):
rot_sin = np.sin(boxes[i, -1])
rot_cos = np.cos(boxes[i, -1])
rot_mat_T[0, 0] = rot_cos
rot_mat_T[0, 1] = -rot_sin
rot_mat_T[1, 0] = rot_sin
rot_mat_T[1, 1] = rot_cos
box_corners[i] = corners[i] @ rot_mat_T + boxes[i, :2]
return box_corners
@numba.njit
def corner_to_standup_nd_jit(boxes_corner):
"""Convert boxes_corner to aligned (min-max) boxes.
Args:
boxes_corner (np.ndarray, shape=[N, 2**dim, dim]): Boxes corners.
Returns:
np.ndarray, shape=[N, dim*2]: Aligned (min-max) boxes.
"""
num_boxes = boxes_corner.shape[0]
ndim = boxes_corner.shape[-1]
result = np.zeros((num_boxes, ndim * 2), dtype=boxes_corner.dtype)
for i in range(num_boxes):
for j in range(ndim):
result[i, j] = np.min(boxes_corner[i, :, j])
for j in range(ndim):
result[i, j + ndim] = np.max(boxes_corner[i, :, j])
return result
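# Usage sketch: collapsing rotated 2D corners into axis-aligned (min-max)
# boxes; a 2 x 2 box rotated by pi / 4 yields one row of [xmin, ymin, xmax, ymax].
# >>> import numpy as np
# >>> corners = center_to_corner_box2d(
# ...     np.array([[0.0, 0.0]]), np.array([[2.0, 2.0]]), np.array([np.pi / 4]))
# >>> corner_to_standup_nd_jit(corners).shape
# (1, 4)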
@numba.jit(nopython=True)
def corner_to_surfaces_3d_jit(corners):
"""Convert 3d box corners from corner function above to surfaces that
normal vectors all direct to internal.
Args:
corners (np.ndarray): 3d box corners with the shape of (N, 8, 3).
Returns:
np.ndarray: Surfaces with the shape of (N, 6, 4, 3).
"""
# box_corners: [N, 8, 3], must from corner functions in this module
num_boxes = corners.shape[0]
surfaces = np.zeros((num_boxes, 6, 4, 3), dtype=corners.dtype)
corner_idxes = np.array([
0, 1, 2, 3, 7, 6, 5, 4, 0, 3, 7, 4, 1, 5, 6, 2, 0, 4, 5, 1, 3, 2, 6, 7
]).reshape(6, 4)
for i in range(num_boxes):
for j in range(6):
for k in range(4):
surfaces[i, j, k] = corners[i, corner_idxes[j, k]]
return surfaces
def rotation_points_single_angle(points, angle, axis=0):
"""Rotate points with a single angle.
Args:
points (np.ndarray, shape=[N, 3]]):
angles (np.ndarray, shape=[1]]):
axis (int): Axis to rotate at.
Returns:
np.ndarray: Rotated points.
"""
# points: [N, 3]
rot_sin = np.sin(angle)
rot_cos = np.cos(angle)
if axis == 1:
rot_mat_T = np.array(
[[rot_cos, 0, -rot_sin], [0, 1, 0], [rot_sin, 0, rot_cos]],
dtype=points.dtype)
elif axis == 2 or axis == -1:
rot_mat_T = np.array(
[[rot_cos, -rot_sin, 0], [rot_sin, rot_cos, 0], [0, 0, 1]],
dtype=points.dtype)
elif axis == 0:
rot_mat_T = np.array(
[[1, 0, 0], [0, rot_cos, -rot_sin], [0, rot_sin, rot_cos]],
dtype=points.dtype)
else:
raise ValueError('axis should be in range')
return points @ rot_mat_T, rot_mat_T
def points_cam2img(points_3d, proj_mat, with_depth=False):
"""Project points in camera coordinates to image coordinates.
Args:
points_3d (np.ndarray): Points in shape (N, 3).
proj_mat (np.ndarray): Transformation matrix between coordinates.
with_depth (bool, optional): Whether to append depth to the projected
points. Defaults to False.
Returns:
np.ndarray: Points in image coordinates with shape [N, 2], or [N, 3]
with depth appended if ``with_depth`` is True.
"""
points_shape = list(points_3d.shape)
points_shape[-1] = 1
points_4 = np.concatenate([points_3d, np.ones(points_shape)], axis=-1)
assert len(proj_mat.shape) == 2, 'The dimension of the projection'\
f' matrix should be 2 instead of {len(proj_mat.shape)}.'
d1, d2 = proj_mat.shape[:2]
assert (d1 == 3 and d2 == 3) or (d1 == 3 and d2 == 4) or (
d1 == 4 and d2 == 4), 'The shape of the projection matrix'\
f' ({d1}*{d2}) is not supported.'
if d1 == 3:
proj_mat_expanded = np.eye(4, dtype=proj_mat.dtype)
proj_mat_expanded[:d1, :d2] = proj_mat
proj_mat = proj_mat_expanded
point_2d = points_4 @ proj_mat.T
point_2d_res = point_2d[..., :2] / point_2d[..., 2:3]
if with_depth:
points_2d_depth = np.concatenate([point_2d_res, point_2d[..., 2:3]], axis=-1)
return points_2d_depth
return point_2d_res
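# Usage sketch with a hypothetical 3x3 pinhole intrinsic matrix: a point 10 m
# straight ahead of the camera projects to the principal point.
# >>> import numpy as np
# >>> K = np.array([[700., 0., 320.], [0., 700., 240.], [0., 0., 1.]])
# >>> points_cam2img(np.array([[0.0, 0.0, 10.0]]), K)
# array([[320., 240.]])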
def box3d_to_bbox(box3d, P2):
"""Convert box3d in camera coordinates to bbox in image coordinates.
Args:
box3d (np.ndarray, shape=[N, 7]): Boxes in camera coordinate.
P2 (np.array, shape=[4, 4]): Intrinsics of Camera2.
Returns:
np.ndarray, shape=[N, 4]: Boxes 2d in image coordinates.
"""
box_corners = center_to_corner_box3d(
box3d[:, :3], box3d[:, 3:6], box3d[:, 6], [0.5, 1.0, 0.5], axis=1)
box_corners_in_image = points_cam2img(box_corners, P2)
# box_corners_in_image: [N, 8, 2]
minxy = np.min(box_corners_in_image, axis=1)
maxxy = np.max(box_corners_in_image, axis=1)
bbox = np.concatenate([minxy, maxxy], axis=1)
return bbox
def corner_to_surfaces_3d(corners):
"""convert 3d box corners from corner function above to surfaces that
normal vectors all direct to internal.
Args:
corners (np.ndarray): 3D box corners with shape of (N, 8, 3).
Returns:
np.ndarray: Surfaces with the shape of (N, 6, 4, 3).
"""
# box_corners: [N, 8, 3], must from corner functions in this module
surfaces = np.array([
[corners[:, 0], corners[:, 1], corners[:, 2], corners[:, 3]],
[corners[:, 7], corners[:, 6], corners[:, 5], corners[:, 4]],
[corners[:, 0], corners[:, 3], corners[:, 7], corners[:, 4]],
[corners[:, 1], corners[:, 5], corners[:, 6], corners[:, 2]],
[corners[:, 0], corners[:, 4], corners[:, 5], corners[:, 1]],
[corners[:, 3], corners[:, 2], corners[:, 6], corners[:, 7]],
]).transpose([2, 0, 1, 3])
return surfaces
def points_in_rbbox(points, rbbox, z_axis=2, origin=(0.5, 0.5, 0)):
"""Check points in rotated bbox and return indicces.
Args:
points (np.ndarray, shape=[N, 3+dim]): Points to query.
rbbox (np.ndarray, shape=[M, 7]): Boxes3d with rotation.
z_axis (int): Indicate which axis is height.
origin (tuple[int]): Indicate the position of box center.
Returns:
np.ndarray, shape=[N, M]: Boolean mask indicating whether each point
is inside each box.
"""
# TODO: this function is different from PointCloud3D, be careful
# when start to use nuscene, check the input
rbbox_corners = center_to_corner_box3d(
rbbox[:, :3], rbbox[:, 3:6], rbbox[:, 6], origin=origin, axis=z_axis)
surfaces = corner_to_surfaces_3d(rbbox_corners)
indices = points_in_convex_polygon_3d_jit(points[:, :3], surfaces)
return indices
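# Usage sketch (hypothetical LiDAR box, bottom-center origin, yaw about z):
# one point inside and one far away give a boolean mask of shape [N, M].
# >>> import numpy as np
# >>> points = np.array([[0.0, 0.0, 0.5], [5.0, 5.0, 0.5]])
# >>> boxes = np.array([[0.0, 0.0, 0.0, 2.0, 2.0, 2.0, 0.0]])
# >>> points_in_rbbox(points, boxes)
# array([[ True],
#        [False]])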
def minmax_to_corner_2d(minmax_box):
"""Convert minmax box to corners2d.
Args:
minmax_box (np.ndarray, shape=[N, dims]): minmax boxes.
Returns:
np.ndarray: 2d corners of boxes
"""
ndim = minmax_box.shape[-1] // 2
center = minmax_box[..., :ndim]
dims = minmax_box[..., ndim:] - center
return center_to_corner_box2d(center, dims, origin=0.0)
def limit_period(val, offset=0.5, period=np.pi):
"""Limit the value into a period for periodic function.
Args:
val (np.ndarray): The value to be converted.
offset (float, optional): Offset to set the value range. \
Defaults to 0.5.
period (float, optional): Period of the value. Defaults to np.pi.
Returns:
np.ndarray: Value in the range of \
[-offset * period, (1-offset) * period]
"""
return val - np.floor(val / period + offset) * period
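# Usage sketch: with the default offset=0.5 and period=pi the value is wrapped
# into [-pi / 2, pi / 2), e.g. 3/4 * pi maps to -pi / 4.
# >>> import numpy as np
# >>> limit_period(np.array([0.75 * np.pi]))
# array([-0.78539816])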
def create_anchors_3d_range(feature_size,
anchor_range,
sizes=((1.6, 3.9, 1.56), ),
rotations=(0, np.pi / 2),
dtype=np.float32):
"""Create anchors 3d by range.
Args:
feature_size (list[float] | tuple[float]): Feature map size. It is
either a list or a tuple of [D, H, W] (in order of z, y, and x).
anchor_range (torch.Tensor | list[float]): Range of anchors with
shape [6]. The order is consistent with that of anchors, i.e.,
(x_min, y_min, z_min, x_max, y_max, z_max).
sizes (list[list] | np.ndarray | torch.Tensor): Anchor size with
shape [N, 3], in order of x, y, z.
rotations (list[float] | np.ndarray | torch.Tensor): Rotations of
anchors in a single feature grid.
dtype (type): Data type. Default to np.float32.
Returns:
np.ndarray: Range based anchors with shape of \
(*feature_size, num_sizes, num_rots, 7).
"""
anchor_range = np.array(anchor_range, dtype)
z_centers = np.linspace(
anchor_range[2], anchor_range[5], feature_size[0], dtype=dtype)
y_centers = np.linspace(
anchor_range[1], anchor_range[4], feature_size[1], dtype=dtype)
x_centers = np.linspace(
anchor_range[0], anchor_range[3], feature_size[2], dtype=dtype)
sizes = np.reshape(np.array(sizes, dtype=dtype), [-1, 3])
rotations = np.array(rotations, dtype=dtype)
rets = np.meshgrid(
x_centers, y_centers, z_centers, rotations, indexing='ij')
tile_shape = [1] * 5
tile_shape[-2] = int(sizes.shape[0])
for i in range(len(rets)):
rets[i] = np.tile(rets[i][..., np.newaxis, :], tile_shape)
rets[i] = rets[i][..., np.newaxis] # for concat
sizes = np.reshape(sizes, [1, 1, 1, -1, 1, 3])
tile_size_shape = list(rets[0].shape)
tile_size_shape[3] = 1
sizes = np.tile(sizes, tile_size_shape)
rets.insert(3, sizes)
ret = np.concatenate(rets, axis=-1)
return np.transpose(ret, [2, 1, 0, 3, 4, 5])
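# Usage sketch (hypothetical tiny grid): a 1 x 2 x 2 feature map with the
# default single size and two rotations gives anchors of shape
# (*feature_size, num_sizes, num_rots, 7).
# >>> anchors = create_anchors_3d_range(
# ...     feature_size=[1, 2, 2], anchor_range=[0, -40, -3, 70, 40, 1])
# >>> anchors.shape
# (1, 2, 2, 1, 2, 7)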
def center_to_minmax_2d(centers, dims, origin=0.5):
"""Center to minmax.
Args:
centers (np.ndarray): Center points.
dims (np.ndarray): Dimensions.
origin (list or array or float): Origin point relative to the smallest point.
Returns:
np.ndarray: Minmax points.
"""
if origin == 0.5:
return np.concatenate([centers - dims / 2, centers + dims / 2],
axis=-1)
corners = center_to_corner_box2d(centers, dims, origin=origin)
return corners[:, [0, 2]].reshape([-1, 4])
def rbbox2d_to_near_bbox(rbboxes):
"""convert rotated bbox to nearest 'standing' or 'lying' bbox.
Args:
rbboxes (np.ndarray): Rotated bboxes with shape of \
(N, 5(x, y, xdim, ydim, rad)).
Returns:
np.ndarray: Bounding boxes with the shape of
(N, 4(xmin, ymin, xmax, ymax)).
"""
rots = rbboxes[..., -1]
rots_0_pi_div_2 = np.abs(limit_period(rots, 0.5, np.pi))
cond = (rots_0_pi_div_2 > np.pi / 4)[..., np.newaxis]
bboxes_center = np.where(cond, rbboxes[:, [0, 1, 3, 2]], rbboxes[:, :4])
bboxes = center_to_minmax_2d(bboxes_center[:, :2], bboxes_center[:, 2:])
return bboxes
@numba.jit(nopython=True)
def iou_jit(boxes, query_boxes, mode='iou', eps=0.0):
"""Calculate box iou. Note that jit version runs ~10x faster than the
box_overlaps function in mmdet3d.core.evaluation.
Args:
boxes (np.ndarray): Input bounding boxes with shape of (N, 4).
query_boxes (np.ndarray): Query boxes with shape of (K, 4).
Returns:
np.ndarray: Overlap between boxes and query_boxes
with the shape of [N, K].
"""
N = boxes.shape[0]
K = query_boxes.shape[0]
overlaps = np.zeros((N, K), dtype=boxes.dtype)
for k in range(K):
box_area = ((query_boxes[k, 2] - query_boxes[k, 0] + eps) *
(query_boxes[k, 3] - query_boxes[k, 1] + eps))
for n in range(N):
iw = (
min(boxes[n, 2], query_boxes[k, 2]) -
max(boxes[n, 0], query_boxes[k, 0]) + eps)
if iw > 0:
ih = (
min(boxes[n, 3], query_boxes[k, 3]) -
max(boxes[n, 1], query_boxes[k, 1]) + eps)
if ih > 0:
if mode == 'iou':
ua = ((boxes[n, 2] - boxes[n, 0] + eps) *
(boxes[n, 3] - boxes[n, 1] + eps) + box_area -
iw * ih)
else:
ua = ((boxes[n, 2] - boxes[n, 0] + eps) *
(boxes[n, 3] - boxes[n, 1] + eps))
overlaps[n, k] = iw * ih / ua
return overlaps
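# Usage sketch: identical axis-aligned boxes give IoU 1, disjoint boxes give 0
# (values are hypothetical; eps defaults to 0.0 here).
# >>> import numpy as np
# >>> boxes = np.array([[0., 0., 2., 2.]])
# >>> queries = np.array([[0., 0., 2., 2.], [3., 3., 4., 4.]])
# >>> iou_jit(boxes, queries)
# array([[1., 0.]])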
def projection_matrix_to_CRT_kitti(proj):
"""Split projection matrix of kitti.
P = C @ [R|T]
C is upper triangular matrix, so we need to inverse CR and use QR
stable for all kitti camera projection matrix.
Args:
proj (p.array, shape=[4, 4]): Intrinsics of camera.
Returns:
tuple[np.ndarray]: Splited matrix of C, R and T.
"""
CR = proj[0:3, 0:3]
CT = proj[0:3, 3]
RinvCinv = np.linalg.inv(CR)
Rinv, Cinv = np.linalg.qr(RinvCinv)
C = np.linalg.inv(Cinv)
R = np.linalg.inv(Rinv)
T = Cinv @ CT
return C, R, T
def remove_outside_points(points, rect, Trv2c, P2, image_shape):
"""Remove points which are outside of image.
Args:
points (np.ndarray, shape=[N, 3+dims]): Total points.
rect (np.ndarray, shape=[4, 4]): Matrix to project points in
specific camera coordinate (e.g. CAM2) to CAM0.
Trv2c (np.ndarray, shape=[4, 4]): Matrix to project points in
camera coordinate to lidar coordinate.
P2 (np.ndarray, shape=[4, 4]): Intrinsics of Camera2.
image_shape (list[int]): Shape of image.
Returns:
np.ndarray, shape=[N, 3+dims]: Filtered points.
"""
# 5x faster than remove_outside_points_v1(2ms vs 10ms)
C, R, T = projection_matrix_to_CRT_kitti(P2)
image_bbox = [0, 0, image_shape[1], image_shape[0]]
frustum = get_frustum(image_bbox, C)
frustum -= T
frustum = np.linalg.inv(R) @ frustum.T
frustum = camera_to_lidar(frustum.T, rect, Trv2c)
frustum_surfaces = corner_to_surfaces_3d_jit(frustum[np.newaxis, ...])
indices = points_in_convex_polygon_3d_jit(points[:, :3], frustum_surfaces)
points = points[indices.reshape([-1])]
return points
def get_frustum(bbox_image, C, near_clip=0.001, far_clip=100):
"""Get frustum corners in camera coordinates.
Args:
bbox_image (list[int]): box in image coordinates.
C (np.ndarray): Intrinsics.
near_clip (float): Nearest distance of frustum.
far_clip (float): Farthest distance of frustum.
Returns:
np.ndarray, shape=[8, 3]: coordinates of frustum corners.
"""
fku = C[0, 0]
fkv = -C[1, 1]
u0v0 = C[0:2, 2]
z_points = np.array(
[near_clip] * 4 + [far_clip] * 4, dtype=C.dtype)[:, np.newaxis]
b = bbox_image
box_corners = np.array(
[[b[0], b[1]], [b[0], b[3]], [b[2], b[3]], [b[2], b[1]]],
dtype=C.dtype)
near_box_corners = (box_corners - u0v0) / np.array(
[fku / near_clip, -fkv / near_clip], dtype=C.dtype)
far_box_corners = (box_corners - u0v0) / np.array(
[fku / far_clip, -fkv / far_clip], dtype=C.dtype)
ret_xy = np.concatenate([near_box_corners, far_box_corners],
axis=0) # [8, 2]
ret_xyz = np.concatenate([ret_xy, z_points], axis=1)
return ret_xyz
def surface_equ_3d(polygon_surfaces):
"""
Args:
polygon_surfaces (np.ndarray): Polygon surfaces with shape of
[num_polygon, max_num_surfaces, max_num_points_of_surface, 3].
All surfaces' normal vectors must point to the interior.
Max_num_points_of_surface must be at least 3.
Returns:
tuple: Normal vectors and the constant terms d of the plane
equations ax + by + cz + d = 0.
"""
# return [a, b, c], d in ax+by+cz+d=0
# polygon_surfaces: [num_polygon, num_surfaces, num_points_of_polygon, 3]
surface_vec = polygon_surfaces[:, :, :2, :] - \
polygon_surfaces[:, :, 1:3, :]
# normal_vec: [..., 3]
normal_vec = np.cross(surface_vec[:, :, 0, :], surface_vec[:, :, 1, :])
# print(normal_vec.shape, points[..., 0, :].shape)
# d = -np.inner(normal_vec, points[..., 0, :])
d = np.einsum('aij, aij->ai', normal_vec, polygon_surfaces[:, :, 0, :])
return normal_vec, -d
@numba.njit
def _points_in_convex_polygon_3d_jit(points, polygon_surfaces, normal_vec, d,
num_surfaces):
"""
Args:
points (np.ndarray): Input points with shape of (num_points, 3).
polygon_surfaces (np.ndarray): Polygon surfaces with shape of
(num_polygon, max_num_surfaces, max_num_points_of_surface, 3).
All surfaces' normal vectors must point to the interior.
Max_num_points_of_surface must be at least 3.
normal_vec (np.ndarray): Normal vectors of polygon_surfaces.
d (np.ndarray): Constant terms of the surfaces' plane equations.
num_surfaces (np.ndarray): Number of surfaces a polygon contains
shape of (num_polygon).
Returns:
np.ndarray: Result matrix with the shape of [num_points, num_polygon].
"""
max_num_surfaces, max_num_points_of_surface = polygon_surfaces.shape[1:3]
num_points = points.shape[0]
num_polygons = polygon_surfaces.shape[0]
ret = np.ones((num_points, num_polygons), dtype=np.bool_)
sign = 0.0
for i in range(num_points):
for j in range(num_polygons):
for k in range(max_num_surfaces):
if k > num_surfaces[j]:
break
sign = (
points[i, 0] * normal_vec[j, k, 0] +
points[i, 1] * normal_vec[j, k, 1] +
points[i, 2] * normal_vec[j, k, 2] + d[j, k])
if sign >= 0:
ret[i, j] = False
break
return ret
def points_in_convex_polygon_3d_jit(points,
polygon_surfaces,
num_surfaces=None):
"""Check points is in 3d convex polygons.
Args:
points (np.ndarray): Input points with shape of (num_points, 3).
polygon_surfaces (np.ndarray): Polygon surfaces with shape of \
(num_polygon, max_num_surfaces, max_num_points_of_surface, 3). \
All surfaces' normal vectors must point to the interior. \
Max_num_points_of_surface must be at least 3.
num_surfaces (np.ndarray): Number of surfaces a polygon contains \
shape of (num_polygon).
Returns:
np.ndarray: Result matrix with the shape of [num_points, num_polygon].
"""
max_num_surfaces, max_num_points_of_surface = polygon_surfaces.shape[1:3]
# num_points = points.shape[0]
num_polygons = polygon_surfaces.shape[0]
if num_surfaces is None:
num_surfaces = np.full((num_polygons, ), 9999999, dtype=np.int64)
normal_vec, d = surface_equ_3d(polygon_surfaces[:, :, :3, :])
# normal_vec: [num_polygon, max_num_surfaces, 3]
# d: [num_polygon, max_num_surfaces]
return _points_in_convex_polygon_3d_jit(points, polygon_surfaces,
normal_vec, d, num_surfaces)
@numba.jit
def points_in_convex_polygon_jit(points, polygon, clockwise=True):
"""Check points is in 2d convex polygons. True when point in polygon.
Args:
points (np.ndarray): Input points with the shape of [num_points, 2].
polygon (np.ndarray): Input polygon with the shape of
[num_polygon, num_points_of_polygon, 2].
clockwise (bool): Indicate polygon is clockwise.
Returns:
np.ndarray: Result matrix with the shape of [num_points, num_polygon].
"""
# first convert polygon to directed lines
num_points_of_polygon = polygon.shape[1]
num_points = points.shape[0]
num_polygons = polygon.shape[0]
# if clockwise:
# vec1 = polygon - polygon[:, [num_points_of_polygon - 1] +
# list(range(num_points_of_polygon - 1)), :]
# else:
# vec1 = polygon[:, [num_points_of_polygon - 1] +
# list(range(num_points_of_polygon - 1)), :] - polygon
# vec1: [num_polygon, num_points_of_polygon, 2]
vec1 = np.zeros((2), dtype=polygon.dtype)
ret = np.zeros((num_points, num_polygons), dtype=np.bool_)
success = True
cross = 0.0
for i in range(num_points):
for j in range(num_polygons):
success = True
for k in range(num_points_of_polygon):
if clockwise:
vec1 = polygon[j, k] - polygon[j, k - 1]
else:
vec1 = polygon[j, k - 1] - polygon[j, k]
cross = vec1[1] * (polygon[j, k, 0] - points[i, 0])
cross -= vec1[0] * (polygon[j, k, 1] - points[i, 1])
if cross >= 0:
success = False
break
ret[i, j] = success
return ret
def boxes3d_to_corners3d_lidar(boxes3d, bottom_center=True):
"""Convert kitti center boxes to corners.
7 -------- 4
/| /|
6 -------- 5 .
| | | |
. 3 -------- 0
|/ |/
2 -------- 1
Args:
boxes3d (np.ndarray): Boxes with shape of (N, 7) \
[x, y, z, w, l, h, ry] in LiDAR coords, see the definition of ry \
in KITTI dataset.
bottom_center (bool): Whether z is on the bottom center of object.
Returns:
np.ndarray: Box corners with the shape of [N, 8, 3].
"""
boxes_num = boxes3d.shape[0]
w, l, h = boxes3d[:, 3], boxes3d[:, 4], boxes3d[:, 5]
x_corners = np.array(
[w / 2., -w / 2., -w / 2., w / 2., w / 2., -w / 2., -w / 2., w / 2.],
dtype=np.float32).T
y_corners = np.array(
[-l / 2., -l / 2., l / 2., l / 2., -l / 2., -l / 2., l / 2., l / 2.],
dtype=np.float32).T
if bottom_center:
z_corners = np.zeros((boxes_num, 8), dtype=np.float32)
z_corners[:, 4:8] = h.reshape(boxes_num, 1).repeat(4, axis=1) # (N, 8)
else:
z_corners = np.array([
-h / 2., -h / 2., -h / 2., -h / 2., h / 2., h / 2., h / 2., h / 2.
],
dtype=np.float32).T
ry = boxes3d[:, 6]
zeros, ones = np.zeros(
ry.size, dtype=np.float32), np.ones(
ry.size, dtype=np.float32)
rot_list = np.array([[np.cos(ry), -np.sin(ry), zeros],
[np.sin(ry), np.cos(ry), zeros], [zeros, zeros,
ones]]) # (3, 3, N)
R_list = np.transpose(rot_list, (2, 0, 1)) # (N, 3, 3)
temp_corners = np.concatenate((x_corners.reshape(
-1, 8, 1), y_corners.reshape(-1, 8, 1), z_corners.reshape(-1, 8, 1)),
axis=2) # (N, 8, 3)
rotated_corners = np.matmul(temp_corners, R_list) # (N, 8, 3)
x_corners = rotated_corners[:, :, 0]
y_corners = rotated_corners[:, :, 1]
z_corners = rotated_corners[:, :, 2]
x_loc, y_loc, z_loc = boxes3d[:, 0], boxes3d[:, 1], boxes3d[:, 2]
x = x_loc.reshape(-1, 1) + x_corners.reshape(-1, 8)
y = y_loc.reshape(-1, 1) + y_corners.reshape(-1, 8)
z = z_loc.reshape(-1, 1) + z_corners.reshape(-1, 8)
corners = np.concatenate(
(x.reshape(-1, 8, 1), y.reshape(-1, 8, 1), z.reshape(-1, 8, 1)),
axis=2)
return corners.astype(np.float32)
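# Usage sketch (hypothetical unrotated LiDAR box with bottom-center origin):
# the eight corners come back as a float32 array of shape [N, 8, 3], with the
# top four corners at z equal to the box height.
# >>> import numpy as np
# >>> box = np.array([[0., 0., 0., 2., 4., 1.5, 0.]])
# >>> boxes3d_to_corners3d_lidar(box).shape
# (1, 8, 3)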
================================================
FILE: mmdet3d/core/bbox/coders/__init__.py
================================================
from mmdet.core.bbox import build_bbox_coder
from .anchor_free_bbox_coder import AnchorFreeBBoxCoder
from .centerpoint_bbox_coders import CenterPointBBoxCoder
from .delta_xyzwhlr_bbox_coder import DeltaXYZWLHRBBoxCoder
from .partial_bin_based_bbox_coder import PartialBinBasedBBoxCoder
from .transfusion_bbox_coder import TransFusionBBoxCoder
from .camera_bbox_coder import CameraBBoxCoder
__all__ = [
'build_bbox_coder', 'DeltaXYZWLHRBBoxCoder', 'PartialBinBasedBBoxCoder',
'CenterPointBBoxCoder', 'AnchorFreeBBoxCoder', 'TransFusionBBoxCoder',
'CameraBBoxCoder'
]
================================================
FILE: mmdet3d/core/bbox/coders/anchor_free_bbox_coder.py
================================================
import numpy as np
import torch
from mmdet.core.bbox.builder import BBOX_CODERS
from .partial_bin_based_bbox_coder import PartialBinBasedBBoxCoder
@BBOX_CODERS.register_module()
class AnchorFreeBBoxCoder(PartialBinBasedBBoxCoder):
"""Anchor free bbox coder for 3D boxes.
Args:
num_dir_bins (int): Number of bins to encode direction angle.
with_rot (bool): Whether the bbox is with rotation.
"""
def __init__(self, num_dir_bins, with_rot=True):
super(AnchorFreeBBoxCoder, self).__init__(
num_dir_bins, 0, [], with_rot=with_rot)
self.num_dir_bins = num_dir_bins
self.with_rot = with_rot
def encode(self, gt_bboxes_3d, gt_labels_3d):
"""Encode ground truth to prediction targets.
Args:
gt_bboxes_3d (BaseInstance3DBoxes): Ground truth bboxes \
with shape (n, 7).
gt_labels_3d (torch.Tensor): Ground truth classes.
Returns:
tuple: Targets of center, size and direction.
"""
# generate center target
center_target = gt_bboxes_3d.gravity_center
# generate bbox size target
size_res_target = gt_bboxes_3d.dims / 2
# generate dir target
box_num = gt_labels_3d.shape[0]
if self.with_rot:
(dir_class_target,
dir_res_target) = self.angle2class(gt_bboxes_3d.yaw)
dir_res_target /= (2 * np.pi / self.num_dir_bins)
else:
dir_class_target = gt_labels_3d.new_zeros(box_num)
dir_res_target = gt_bboxes_3d.tensor.new_zeros(box_num)
return (center_target, size_res_target, dir_class_target,
dir_res_target)
def decode(self, bbox_out):
"""Decode predicted parts to bbox3d.
Args:
bbox_out (dict): Predictions from model, should contain keys below.
- center: predicted bottom center of bboxes.
- dir_class: predicted bbox direction class.
- dir_res: predicted bbox direction residual.
- size: predicted bbox size.
Returns:
torch.Tensor: Decoded bbox3d with shape (batch, n, 7).
"""
center = bbox_out['center']
batch_size, num_proposal = center.shape[:2]
# decode heading angle
if self.with_rot:
dir_class = torch.argmax(bbox_out['dir_class'], -1)
dir_res = torch.gather(bbox_out['dir_res'], 2,
dir_class.unsqueeze(-1))
dir_res.squeeze_(2)
dir_angle = self.class2angle(dir_class, dir_res).reshape(
batch_size, num_proposal, 1)
else:
dir_angle = center.new_zeros(batch_size, num_proposal, 1)
# decode bbox size
bbox_size = torch.clamp(bbox_out['size'] * 2, min=0.1)
bbox3d = torch.cat([center, bbox_size, dir_angle], dim=-1)
return bbox3d
def split_pred(self, cls_preds, reg_preds, base_xyz):
"""Split predicted features to specific parts.
Args:
cls_preds (torch.Tensor): Class predicted features to split.
reg_preds (torch.Tensor): Regression predicted features to split.
base_xyz (torch.Tensor): Coordinates of points.
Returns:
dict[str, torch.Tensor]: Split results.
"""
results = {}
results['obj_scores'] = cls_preds
start, end = 0, 0
reg_preds_trans = reg_preds.transpose(2, 1)
# decode center
end += 3
# (batch_size, num_proposal, 3)
results['center_offset'] = reg_preds_trans[..., start:end]
results['center'] = base_xyz.detach() + reg_preds_trans[..., start:end]
start = end
# decode size
end += 3
# (batch_size, num_proposal, 3)
results['size'] = reg_preds_trans[..., start:end]
start = end
# decode direction
end += self.num_dir_bins
results['dir_class'] = reg_preds_trans[..., start:end]
start = end
end += self.num_dir_bins
dir_res_norm = reg_preds_trans[..., start:end]
start = end
results['dir_res_norm'] = dir_res_norm
results['dir_res'] = dir_res_norm * (2 * np.pi / self.num_dir_bins)
return results
================================================
FILE: mmdet3d/core/bbox/coders/camera_bbox_coder.py
================================================
import torch
from mmdet.core.bbox import BaseBBoxCoder
from mmdet.core.bbox.builder import BBOX_CODERS
@BBOX_CODERS.register_module()
class CameraBBoxCoder(BaseBBoxCoder):
def __init__(self, code_size=8):
self.code_size = code_size
def encode(self, dst_boxes):
targets = torch.zeros([dst_boxes.shape[0], self.code_size]).to(dst_boxes.device)
targets[:, 3] = dst_boxes[:, 3].log()
targets[:, 4] = dst_boxes[:, 4].log()
targets[:, 5] = dst_boxes[:, 5].log()
targets[:, 6] = torch.sin(dst_boxes[:, 6])
targets[:, 7] = torch.cos(dst_boxes[:, 6])
targets[:, 0] = dst_boxes[:, 0]
targets[:, 1] = dst_boxes[:, 1] - 0.5 * dst_boxes[:, 4]
targets[:, 2] = dst_boxes[:, 2]
if self.code_size == 10:
targets[:, 8:10] = dst_boxes[:, 7:]
return targets
def decode(self, cls, rot, dim, center, vel):
"""Decode bboxes.
Args:
cls (torch.Tensor): Heatmap with the shape of [B, num_cls, num_proposals].
rot (torch.Tensor): Rotation with the shape of
[B, 2, num_proposals].
dim (torch.Tensor): Dim of the boxes with the shape of
[B, 3, num_proposals].
center (torch.Tensor): Center of the boxes with the shape of
[B, 3, num_proposals] (gravity center, in camera coordinates).
vel (torch.Tensor): Velocity with the shape of [B, 2, num_proposals].
Returns:
list[dict]: Decoded boxes.
"""
# class label
final_preds = cls.max(1, keepdims=False).indices
final_scores = cls.max(1, keepdims=False).values
dim[:, 0, :] = dim[:, 0, :].exp()
dim[:, 1, :] = dim[:, 1, :].exp()
dim[:, 2, :] = dim[:, 2, :].exp()
# dim = torch.exp(dim)
rots, rotc = rot[:, 0:1, :], rot[:, 1:2, :]
rot = torch.atan2(rots, rotc)
center = center.clone()
center[:, 1, :] = center[:, 1, :] + 0.5 * dim[:, 1, :]
if vel is None:
final_box_preds = torch.cat([center, dim, rot], dim=1).permute(0, 2, 1)
else:
final_box_preds = torch.cat([center, dim, rot, vel], dim=1).permute(0, 2, 1)
predictions_dicts = []
for i in range(cls.shape[0]):
boxes3d = final_box_preds[i]
scores = final_scores[i]
labels = final_preds[i]
predictions_dict = {
'bboxes': boxes3d,
'scores': scores,
'labels': labels
}
predictions_dicts.append(predictions_dict)
return predictions_dicts
@staticmethod
def decode_yaw(bbox, centers2d, cam2img):
bbox[:, 6] = torch.atan2(centers2d[:, 0] - cam2img[0, 2], cam2img[0, 0]) + bbox[:, 6]
return bbox
================================================
FILE: mmdet3d/core/bbox/coders/centerpoint_bbox_coders.py
================================================
import torch
from mmdet.core.bbox import BaseBBoxCoder
from mmdet.core.bbox.builder import BBOX_CODERS
@BBOX_CODERS.register_module()
class CenterPointBBoxCoder(BaseBBoxCoder):
"""Bbox coder for CenterPoint.
Args:
pc_range (list[float]): Range of point cloud.
out_size_factor (int): Downsample factor of the model.
voxel_size (list[float]): Size of voxel.
post_center_range (list[float]): Limit of the center.
Default: None.
max_num (int): Max number to be kept. Default: 100.
score_threshold (float): Threshold to filter boxes based on score.
Default: None.
code_size (int): Code size of bboxes. Default: 9
"""
def __init__(self,
pc_range,
out_size_factor,
voxel_size,
post_center_range=None,
max_num=100,
score_threshold=None,
code_size=9):
self.pc_range = pc_range
self.out_size_factor = out_size_factor
self.voxel_size = voxel_size
self.post_center_range = post_center_range
self.max_num = max_num
self.score_threshold = score_threshold
self.code_size = code_size
def _gather_feat(self, feats, inds, feat_masks=None):
"""Given feats and indexes, returns the gathered feats.
Args:
feats (torch.Tensor): Features to be transposed and gathered
with the shape of [B, 2, W, H].
inds (torch.Tensor): Indexes with the shape of [B, N].
feat_masks (torch.Tensor): Mask of the feats. Default: None.
Returns:
torch.Tensor: Gathered feats.
"""
dim = feats.size(2)
inds = inds.unsqueeze(2).expand(inds.size(0), inds.size(1), dim)
feats = feats.gather(1, inds)
if feat_masks is not None:
feat_masks = feat_masks.unsqueeze(2).expand_as(feats)
feats = feats[feat_masks]
feats = feats.view(-1, dim)
return feats
def _topk(self, scores, K=80):
"""Get indexes based on scores.
Args:
scores (torch.Tensor): scores with the shape of [B, N, W, H].
K (int): Number to be kept. Defaults to 80.
Returns:
tuple[torch.Tensor]
torch.Tensor: Selected scores with the shape of [B, K].
torch.Tensor: Selected indexes with the shape of [B, K].
torch.Tensor: Selected classes with the shape of [B, K].
torch.Tensor: Selected y coord with the shape of [B, K].
torch.Tensor: Selected x coord with the shape of [B, K].
"""
batch, cat, height, width = scores.size()
topk_scores, topk_inds = torch.topk(scores.view(batch, cat, -1), K)
topk_inds = topk_inds % (height * width)
topk_ys = (topk_inds.float() /
torch.tensor(width, dtype=torch.float)).int().float()
topk_xs = (topk_inds % width).int().float()
topk_score, topk_ind = torch.topk(topk_scores.view(batch, -1), K)
topk_clses = (topk_ind / torch.tensor(K, dtype=torch.float)).int()
topk_inds = self._gather_feat(topk_inds.view(batch, -1, 1),
topk_ind).view(batch, K)
topk_ys = self._gather_feat(topk_ys.view(batch, -1, 1),
topk_ind).view(batch, K)
topk_xs = self._gather_feat(topk_xs.view(batch, -1, 1),
topk_ind).view(batch, K)
return topk_score, topk_inds, topk_clses, topk_ys, topk_xs
def _transpose_and_gather_feat(self, feat, ind):
"""Given feats and indexes, returns the transposed and gathered feats.
Args:
feat (torch.Tensor): Features to be transposed and gathered
with the shape of [B, 2, W, H].
ind (torch.Tensor): Indexes with the shape of [B, N].
Returns:
torch.Tensor: Transposed and gathered feats.
"""
feat = feat.permute(0, 2, 3, 1).contiguous()
feat = feat.view(feat.size(0), -1, feat.size(3))
feat = self._gather_feat(feat, ind)
return feat
def encode(self):
pass
def decode(self,
heat,
rot_sine,
rot_cosine,
hei,
dim,
vel,
reg=None,
task_id=-1):
"""Decode bboxes.
Args:
heat (torch.Tensor): Heatmap with the shape of [B, N, W, H].
rot_sine (torch.Tensor): Sine of rotation with the shape of
[B, 1, W, H].
rot_cosine (torch.Tensor): Cosine of rotation with the shape of
[B, 1, W, H].
hei (torch.Tensor): Height of the boxes with the shape
of [B, 1, W, H].
dim (torch.Tensor): Dim of the boxes with the shape of
[B, 3, W, H].
vel (torch.Tensor): Velocity with the shape of [B, 2, W, H].
reg (torch.Tensor): Regression value of the boxes in 2D with
the shape of [B, 2, W, H]. Default: None.
task_id (int): Index of task. Default: -1.
Returns:
list[dict]: Decoded boxes.
"""
batch, cat, _, _ = heat.size()
scores, inds, clses, ys, xs = self._topk(heat, K=self.max_num)
if reg is not None:
reg = self._transpose_and_gather_feat(reg, inds)
reg = reg.view(batch, self.max_num, 2)
xs = xs.view(batch, self.max_num, 1) + reg[:, :, 0:1]
ys = ys.view(batch, self.max_num, 1) + reg[:, :, 1:2]
else:
xs = xs.view(batch, self.max_num, 1) + 0.5
ys = ys.view(batch, self.max_num, 1) + 0.5
# rotation value and direction label
rot_sine = self._transpose_and_gather_feat(rot_sine, inds)
rot_sine = rot_sine.view(batch, self.max_num, 1)
rot_cosine = self._transpose_and_gather_feat(rot_cosine, inds)
rot_cosine = rot_cosine.view(batch, self.max_num, 1)
rot = torch.atan2(rot_sine, rot_cosine)
# height in the bev
hei = self._transpose_and_gather_feat(hei, inds)
hei = hei.view(batch, self.max_num, 1)
# dim of the box
dim = self._transpose_and_gather_feat(dim, inds)
dim = dim.view(batch, self.max_num, 3)
# class label
clses = clses.view(batch, self.max_num).float()
scores = scores.view(batch, self.max_num)
xs = xs.view(
batch, self.max_num,
1) * self.out_size_factor * self.voxel_size[0] + self.pc_range[0]
ys = ys.view(
batch, self.max_num,
1) * self.out_size_factor * self.voxel_size[1] + self.pc_range[1]
if vel is None: # KITTI FORMAT
final_box_preds = torch.cat([xs, ys, hei, dim, rot], dim=2)
else: # exist velocity, nuscene format
vel = self._transpose_and_gather_feat(vel, inds)
vel = vel.view(batch, self.max_num, 2)
final_box_preds = torch.cat([xs, ys, hei, dim, rot, vel], dim=2)
final_scores = scores
final_preds = clses
# use score threshold
if self.score_threshold is not None:
thresh_mask = final_scores > self.score_threshold
if self.post_center_range is not None:
self.post_center_range = torch.tensor(
self.post_center_range, device=heat.device)
mask = (final_box_preds[..., :3] >=
self.post_center_range[:3]).all(2)
mask &= (final_box_preds[..., :3] <=
self.post_center_range[3:]).all(2)
predictions_dicts = []
for i in range(batch):
cmask = mask[i, :]
if self.score_threshold:
cmask &= thresh_mask[i]
boxes3d = final_box_preds[i, cmask]
scores = final_scores[i, cmask]
labels = final_preds[i, cmask]
predictions_dict = {
'bboxes': boxes3d,
'scores': scores,
'labels': labels
}
predictions_dicts.append(predictions_dict)
else:
raise NotImplementedError(
'Need to reorganize output as a batch, only '
'support post_center_range is not None for now!')
return predictions_dicts
================================================
FILE: mmdet3d/core/bbox/coders/delta_xyzwhlr_bbox_coder.py
================================================
import torch
from mmdet.core.bbox import BaseBBoxCoder
from mmdet.core.bbox.builder import BBOX_CODERS
@BBOX_CODERS.register_module()
class DeltaXYZWLHRBBoxCoder(BaseBBoxCoder):
"""Bbox Coder for 3D boxes.
Args:
code_size (int): The dimension of boxes to be encoded.
"""
def __init__(self, code_size=7):
super(DeltaXYZWLHRBBoxCoder, self).__init__()
self.code_size = code_size
@staticmethod
def encode(src_boxes, dst_boxes):
"""Get box regression transformation deltas (dx, dy, dz, dw, dh, dl,
dr, dv*) that can be used to transform the `src_boxes` into the
`target_boxes`.
Args:
src_boxes (torch.Tensor): source boxes, e.g., object proposals.
dst_boxes (torch.Tensor): target of the transformation, e.g.,
ground-truth boxes.
Returns:
torch.Tensor: Box transformation deltas.
"""
box_ndim = src_boxes.shape[-1]
cas, cgs, cts = [], [], []
if box_ndim > 7:
xa, ya, za, wa, la, ha, ra, *cas = torch.split(
src_boxes, 1, dim=-1)
xg, yg, zg, wg, lg, hg, rg, *cgs = torch.split(
dst_boxes, 1, dim=-1)
cts = [g - a for g, a in zip(cgs, cas)]
else:
xa, ya, za, wa, la, ha, ra = torch.split(src_boxes, 1, dim=-1)
xg, yg, zg, wg, lg, hg, rg = torch.split(dst_boxes, 1, dim=-1)
za = za + ha / 2
zg = zg + hg / 2
diagonal = torch.sqrt(la**2 + wa**2)
xt = (xg - xa) / diagonal
yt = (yg - ya) / diagonal
zt = (zg - za) / ha
lt = torch.log(lg / la)
wt = torch.log(wg / wa)
ht = torch.log(hg / ha)
rt = rg - ra
return torch.cat([xt, yt, zt, wt, lt, ht, rt, *cts], dim=-1)
@staticmethod
def decode(anchors, deltas):
"""Apply transformation `deltas` (dx, dy, dz, dw, dh, dl, dr, dv*) to
`boxes`.
Args:
anchors (torch.Tensor): Parameters of anchors with shape (N, 7).
deltas (torch.Tensor): Encoded boxes with shape
(N, 7+n) [x, y, z, w, l, h, r, velo*].
Returns:
torch.Tensor: Decoded boxes.
"""
cas, cts = [], []
box_ndim = anchors.shape[-1]
if box_ndim > 7:
xa, ya, za, wa, la, ha, ra, *cas = torch.split(anchors, 1, dim=-1)
xt, yt, zt, wt, lt, ht, rt, *cts = torch.split(deltas, 1, dim=-1)
else:
xa, ya, za, wa, la, ha, ra = torch.split(anchors, 1, dim=-1)
xt, yt, zt, wt, lt, ht, rt = torch.split(deltas, 1, dim=-1)
za = za + ha / 2
diagonal = torch.sqrt(la**2 + wa**2)
xg = xt * diagonal + xa
yg = yt * diagonal + ya
zg = zt * ha + za
lg = torch.exp(lt) * la
wg = torch.exp(wt) * wa
hg = torch.exp(ht) * ha
rg = rt + ra
zg = zg - hg / 2
cgs = [t + a for t, a in zip(cts, cas)]
return torch.cat([xg, yg, zg, wg, lg, hg, rg, *cgs], dim=-1)
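# Usage sketch (hypothetical anchor and ground-truth box): encoding a target
# against an anchor and decoding the deltas recovers the original box, so
# decode is the exact inverse of encode.
# >>> import torch
# >>> anchors = torch.tensor([[0., 0., -1., 1.6, 3.9, 1.56, 0.]])
# >>> gt = torch.tensor([[0.5, 0.2, -0.9, 1.7, 4.1, 1.5, 0.1]])
# >>> deltas = DeltaXYZWLHRBBoxCoder.encode(anchors, gt)
# >>> torch.allclose(DeltaXYZWLHRBBoxCoder.decode(anchors, deltas), gt)
# True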
================================================
FILE: mmdet3d/core/bbox/coders/partial_bin_based_bbox_coder.py
================================================
import numpy as np
import torch
from mmdet.core.bbox import BaseBBoxCoder
from mmdet.core.bbox.builder import BBOX_CODERS
@BBOX_CODERS.register_module()
class PartialBinBasedBBoxCoder(BaseBBoxCoder):
"""Partial bin based bbox coder.
Args:
num_dir_bins (int): Number of bins to encode direction angle.
num_sizes (int): Number of size clusters.
mean_sizes (list[list[int]]): Mean size of bboxes in each class.
with_rot (bool): Whether the bbox is with rotation.
"""
def __init__(self, num_dir_bins, num_sizes, mean_sizes, with_rot=True):
super(PartialBinBasedBBoxCoder, self).__init__()
assert len(mean_sizes) == num_sizes
self.num_dir_bins = num_dir_bins
self.num_sizes = num_sizes
self.mean_sizes = mean_sizes
self.with_rot = with_rot
def encode(self, gt_bboxes_3d, gt_labels_3d):
"""Encode ground truth to prediction targets.
Args:
gt_bboxes_3d (BaseInstance3DBoxes): Ground truth bboxes \
with shape (n, 7).
gt_labels_3d (torch.Tensor): Ground truth classes.
Returns:
tuple: Targets of center, size and direction.
"""
# generate center target
center_target = gt_bboxes_3d.gravity_center
# generate bbox size target
size_class_target = gt_labels_3d
size_res_target = gt_bboxes_3d.dims - gt_bboxes_3d.tensor.new_tensor(
self.mean_sizes)[size_class_target]
# generate dir target
box_num = gt_labels_3d.shape[0]
if self.with_rot:
(dir_class_target,
dir_res_target) = self.angle2class(gt_bboxes_3d.yaw)
else:
dir_class_target = gt_labels_3d.new_zeros(box_num)
dir_res_target = gt_bboxes_3d.tensor.new_zeros(box_num)
return (center_target, size_class_target, size_res_target,
dir_class_target, dir_res_target)
def decode(self, bbox_out, suffix=''):
"""Decode predicted parts to bbox3d.
Args:
bbox_out (dict): Predictions from model, should contain keys below.
- center: predicted bottom center of bboxes.
- dir_class: predicted bbox direction class.
- dir_res: predicted bbox direction residual.
- size_class: predicted bbox size class.
- size_res: predicted bbox size residual.
suffix (str): Decode predictions with specific suffix.
Returns:
torch.Tensor: Decoded bbox3d with shape (batch, n, 7).
"""
center = bbox_out['center' + suffix]
batch_size, num_proposal = center.shape[:2]
# decode heading angle
if self.with_rot:
dir_class = torch.argmax(bbox_out['dir_class' + suffix], -1)
dir_res = torch.gather(bbox_out['dir_res' + suffix], 2,
dir_class.unsqueeze(-1))
dir_res.squeeze_(2)
dir_angle = self.class2angle(dir_class, dir_res).reshape(
batch_size, num_proposal, 1)
else:
dir_angle = center.new_zeros(batch_size, num_proposal, 1)
# decode bbox size
size_class = torch.argmax(
bbox_out['size_class' + suffix], -1, keepdim=True)
size_res = torch.gather(bbox_out['size_res' + suffix], 2,
size_class.unsqueeze(-1).repeat(1, 1, 1, 3))
mean_sizes = center.new_tensor(self.mean_sizes)
size_base = torch.index_select(mean_sizes, 0, size_class.reshape(-1))
bbox_size = size_base.reshape(batch_size, num_proposal,
-1) + size_res.squeeze(2)
bbox3d = torch.cat([center, bbox_size, dir_angle], dim=-1)
return bbox3d
def decode_corners(self, center, size_res, size_class):
"""Decode center, size residuals and class to corners. Only useful for
axis-aligned bounding boxes, so angle isn't considered.
Args:
center (torch.Tensor): Shape [B, N, 3]
size_res (torch.Tensor): Shape [B, N, 3] or [B, N, C, 3]
size_class (torch.Tensor): Shape: [B, N] or [B, N, 1]
or [B, N, C, 3]
Returns:
torch.Tensor: Corners with shape [B, N, 6]
"""
if len(size_class.shape) == 2 or size_class.shape[-1] == 1:
batch_size, proposal_num = size_class.shape[:2]
one_hot_size_class = size_res.new_zeros(
(batch_size, proposal_num, self.num_sizes))
if len(size_class.shape) == 2:
size_class = size_class.unsqueeze(-1)
one_hot_size_class.scatter_(2, size_class, 1)
one_hot_size_class_expand = one_hot_size_class.unsqueeze(
-1).repeat(1, 1, 1, 3).contiguous()
else:
one_hot_size_class_expand = size_class
if len(size_res.shape) == 4:
size_res = torch.sum(size_res * one_hot_size_class_expand, 2)
mean_sizes = size_res.new_tensor(self.mean_sizes)
mean_sizes = torch.sum(mean_sizes * one_hot_size_class_expand, 2)
size_full = (size_res + 1) * mean_sizes
size_full = torch.clamp(size_full, 0)
half_size_full = size_full / 2
corner1 = center - half_size_full
corner2 = center + half_size_full
corners = torch.cat([corner1, corner2], dim=-1)
return corners
def split_pred(self, cls_preds, reg_preds, base_xyz):
"""Split predicted features to specific parts.
Args:
cls_preds (torch.Tensor): Class predicted features to split.
reg_preds (torch.Tensor): Regression predicted features to split.
base_xyz (torch.Tensor): Coordinates of points.
Returns:
dict[str, torch.Tensor]: Split results.
"""
results = {}
start, end = 0, 0
cls_preds_trans = cls_preds.transpose(2, 1)
reg_preds_trans = reg_preds.transpose(2, 1)
# decode center
end += 3
# (batch_size, num_proposal, 3)
results['center'] = base_xyz + \
reg_preds_trans[..., start:end].contiguous()
start = end
# decode direction
end += self.num_dir_bins
results['dir_class'] = reg_preds_trans[..., start:end].contiguous()
start = end
end += self.num_dir_bins
dir_res_norm = reg_preds_trans[..., start:end].contiguous()
start = end
results['dir_res_norm'] = dir_res_norm
results['dir_res'] = dir_res_norm * (np.pi / self.num_dir_bins)
# decode size
end += self.num_sizes
results['size_class'] = reg_preds_trans[..., start:end].contiguous()
start = end
end += self.num_sizes * 3
size_res_norm = reg_preds_trans[..., start:end]
batch_size, num_proposal = reg_preds_trans.shape[:2]
size_res_norm = size_res_norm.view(
[batch_size, num_proposal, self.num_sizes, 3])
start = end
results['size_res_norm'] = size_res_norm.contiguous()
mean_sizes = reg_preds.new_tensor(self.mean_sizes)
results['size_res'] = (
size_res_norm * mean_sizes.unsqueeze(0).unsqueeze(0))
# decode objectness score
start = 0
end = 2
results['obj_scores'] = cls_preds_trans[..., start:end].contiguous()
start = end
# decode semantic score
results['sem_scores'] = cls_preds_trans[..., start:].contiguous()
return results
def angle2class(self, angle):
"""Convert continuous angle to a discrete class and a residual.
Convert continuous angle to a discrete class and a small
regression number from class center angle to current angle.
Args:
angle (torch.Tensor): Angle is from 0-2pi (or -pi~pi),
class center at 0, 1*(2pi/N), 2*(2pi/N) ... (N-1)*(2pi/N).
Returns:
tuple: Encoded discrete class and residual.
"""
angle = angle % (2 * np.pi)
angle_per_class = 2 * np.pi / float(self.num_dir_bins)
shifted_angle = (angle + angle_per_class / 2) % (2 * np.pi)
angle_cls = shifted_angle // angle_per_class
angle_res = shifted_angle - (
angle_cls * angle_per_class + angle_per_class / 2)
return angle_cls.long(), angle_res
def class2angle(self, angle_cls, angle_res, limit_period=True):
"""Inverse function to angle2class.
Args:
angle_cls (torch.Tensor): Angle class to decode.
angle_res (torch.Tensor): Angle residual to decode.
limit_period (bool): Whether to limit angle to [-pi, pi].
Returns:
torch.Tensor: Angle decoded from angle_cls and angle_res.
"""
angle_per_class = 2 * np.pi / float(self.num_dir_bins)
angle_center = angle_cls.float() * angle_per_class
angle = angle_center + angle_res
if limit_period:
angle[angle > np.pi] -= 2 * np.pi
return angle
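# Usage sketch (hypothetical coder with 12 direction bins and no size
# clusters): angle2class / class2angle round-trip back to the original yaw.
# >>> import torch
# >>> coder = PartialBinBasedBBoxCoder(
# ...     num_dir_bins=12, num_sizes=0, mean_sizes=[], with_rot=True)
# >>> cls, res = coder.angle2class(torch.tensor([0.3, -1.2]))
# >>> torch.allclose(coder.class2angle(cls, res), torch.tensor([0.3, -1.2]))
# True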
================================================
FILE: mmdet3d/core/bbox/coders/transfusion_bbox_coder.py
================================================
import torch
from mmdet.core.bbox import BaseBBoxCoder
from mmdet.core.bbox.builder import BBOX_CODERS
@BBOX_CODERS.register_module()
class TransFusionBBoxCoder(BaseBBoxCoder):
def __init__(self,
pc_range,
out_size_factor,
voxel_size,
post_center_range=None,
score_threshold=None,
code_size=8,
):
self.pc_range = pc_range
self.out_size_factor = out_size_factor
self.voxel_size = voxel_size
self.post_center_range = post_center_range
self.score_threshold = score_threshold
self.code_size = code_size
def encode(self, dst_boxes):
targets = torch.zeros([dst_boxes.shape[0], self.code_size]).to(dst_boxes.device)
targets[:, 0] = (dst_boxes[:, 0] - self.pc_range[0]) / (self.out_size_factor * self.voxel_size[0])
targets[:, 1] = (dst_boxes[:, 1] - self.pc_range[1]) / (self.out_size_factor * self.voxel_size[1])
# targets[:, 2] = (dst_boxes[:, 2] - self.post_center_range[2]) / (self.post_center_range[5] - self.post_center_range[2])
targets[:, 3] = dst_boxes[:, 3].log()
targets[:, 4] = dst_boxes[:, 4].log()
targets[:, 5] = dst_boxes[:, 5].log()
targets[:, 2] = dst_boxes[:, 2] + dst_boxes[:, 5] * 0.5 # bottom center to gravity center
targets[:, 6] = torch.sin(dst_boxes[:, 6])
targets[:, 7] = torch.cos(dst_boxes[:, 6])
if self.code_size == 10:
targets[:, 8:10] = dst_boxes[:, 7:]
return targets
def decode(self, heatmap, rot, dim, center, height, vel, filter=False):
"""Decode bboxes.
Args:
heatmap (torch.Tensor): Heatmap with the shape of
[B, num_cls, num_proposals].
rot (torch.Tensor): Rotation with the shape of
[B, 2, num_proposals].
dim (torch.Tensor): Dim of the boxes with the shape of
[B, 3, num_proposals].
center (torch.Tensor): BEV center of the boxes with the shape of
[B, 2, num_proposals] (in feature map metric).
height (torch.Tensor): Height of the boxes with the shape of
[B, 1, num_proposals] (in real world metric).
vel (torch.Tensor): Velocity with the shape of [B, 2, num_proposals].
filter (bool): If False, return all boxes without checking score and center range.
Returns:
list[dict]: Decoded boxes.
"""
# class label
final_preds = heatmap.max(1, keepdims=False).indices
final_scores = heatmap.max(1, keepdims=False).values
# change size to real world metric
center[:, 0, :] = center[:, 0, :] * self.out_size_factor * self.voxel_size[0] + self.pc_range[0]
center[:, 1, :] = center[:, 1, :] * self.out_size_factor * self.voxel_size[1] + self.pc_range[1]
# center[:, 2, :] = center[:, 2, :] * (self.post_center_range[5] - self.post_center_range[2]) + self.post_center_range[2]
dim[:, 0, :] = dim[:, 0, :].exp()
dim[:, 1, :] = dim[:, 1, :].exp()
dim[:, 2, :] = dim[:, 2, :].exp()
height = height - dim[:, 2:3, :] * 0.5 # gravity center to bottom center
rots, rotc = rot[:, 0:1, :], rot[:, 1:2, :]
rot = torch.atan2(rots, rotc)
if vel is None:
final_box_preds = torch.cat([center, height, dim, rot], dim=1).permute(0, 2, 1)
else:
final_box_preds = torch.cat([center, height, dim, rot, vel], dim=1).permute(0, 2, 1)
predictions_dicts = []
for i in range(heatmap.shape[0]):
boxes3d = final_box_preds[i]
scores = final_scores[i]
labels = final_preds[i]
predictions_dict = {
'bboxes': boxes3d,
'scores': scores,
'labels': labels
}
predictions_dicts.append(predictions_dict)
if filter is False:
return predictions_dicts
# use score threshold
if self.score_threshold is not None:
thresh_mask = final_scores > self.score_threshold
if self.post_center_range is not None:
self.post_center_range = torch.tensor(
self.post_center_range, device=heatmap.device)
mask = (final_box_preds[..., :3] >=
self.post_center_range[:3]).all(2)
mask &= (final_box_preds[..., :3] <=
self.post_center_range[3:]).all(2)
predictions_dicts = []
for i in range(heatmap.shape[0]):
cmask = mask[i, :]
if self.score_threshold:
cmask &= thresh_mask[i]
boxes3d = final_box_preds[i, cmask]
scores = final_scores[i, cmask]
labels = final_preds[i, cmask]
predictions_dict = {
'bboxes': boxes3d,
'scores': scores,
'labels': labels,
'cmask': cmask
}
predictions_dicts.append(predictions_dict)
else:
raise NotImplementedError(
'Need to reorganize output as a batch, only '
'support post_center_range is not None for now!')
return predictions_dicts
================================================
FILE: mmdet3d/core/bbox/iou_calculators/__init__.py
================================================
from .iou3d_calculator import (AxisAlignedBboxOverlaps3D, BboxOverlaps3D,
BboxOverlapsNearest3D,
axis_aligned_bbox_overlaps_3d, bbox_overlaps_3d,
bbox_overlaps_nearest_3d)
__all__ = [
'BboxOverlapsNearest3D', 'BboxOverlaps3D', 'bbox_overlaps_nearest_3d',
'bbox_overlaps_3d', 'AxisAlignedBboxOverlaps3D',
'axis_aligned_bbox_overlaps_3d'
]
================================================
FILE: mmdet3d/core/bbox/iou_calculators/iou3d_calculator.py
================================================
import torch
from mmdet.core.bbox import bbox_overlaps
from mmdet.core.bbox.iou_calculators.builder import IOU_CALCULATORS
from ..structures import get_box_type
@IOU_CALCULATORS.register_module()
class BboxOverlapsNearest3D(object):
"""Nearest 3D IoU Calculator.
Note:
This IoU calculator first finds the nearest 2D boxes in bird eye view
(BEV), and then calculates the 2D IoU using :meth:`bbox_overlaps`.
Args:
coordinate (str): 'camera', 'lidar', or 'depth' coordinate system.
"""
def __init__(self, coordinate='lidar'):
assert coordinate in ['camera', 'lidar', 'depth']
self.coordinate = coordinate
def __call__(self, bboxes1, bboxes2, mode='iou', is_aligned=False):
"""Calculate nearest 3D IoU.
Note:
If ``is_aligned`` is ``False``, then it calculates the ious between
each bbox of bboxes1 and bboxes2, otherwise it calculates the ious
between each aligned pair of bboxes1 and bboxes2.
Args:
bboxes1 (torch.Tensor): shape (N, 7+C) [x, y, z, h, w, l, ry, v].
bboxes2 (torch.Tensor): shape (M, 7+C) [x, y, z, h, w, l, ry, v].
mode (str): "iou" (intersection over union) or iof
(intersection over foreground).
is_aligned (bool): Whether the calculation is aligned.
Return:
torch.Tensor: If ``is_aligned`` is ``False``, return the ious between \
each bbox of bboxes1 and bboxes2 with shape (N, M). If \
``is_aligned`` is ``True``, the returned shape is (N, ).
"""
return bbox_overlaps_nearest_3d(bboxes1, bboxes2, mode, is_aligned,
self.coordinate)
def __repr__(self):
"""str: Return a string that describes the module."""
repr_str = self.__class__.__name__
repr_str += f'(coordinate={self.coordinate}'
return repr_str
@IOU_CALCULATORS.register_module()
class BboxOverlaps3D(object):
"""3D IoU Calculator.
Args:
coordinate (str): The coordinate system, valid options are
'camera', 'lidar', and 'depth'.
"""
def __init__(self, coordinate):
assert coordinate in ['camera', 'lidar', 'depth']
self.coordinate = coordinate
def __call__(self, bboxes1, bboxes2, mode='iou'):
"""Calculate 3D IoU using cuda implementation.
Note:
This function calculate the IoU of 3D boxes based on their volumes.
IoU calculator ``:class:BboxOverlaps3D`` uses this function to
calculate the actual 3D IoUs of boxes.
Args:
bboxes1 (torch.Tensor): shape (N, 7+C) [x, y, z, h, w, l, ry].
bboxes2 (torch.Tensor): shape (M, 7+C) [x, y, z, h, w, l, ry].
mode (str): "iou" (intersection over union) or
iof (intersection over foreground).
Return:
torch.Tensor: Bbox overlaps results of bboxes1 and bboxes2 \
with shape (M, N) (aligned mode is not supported currently).
"""
return bbox_overlaps_3d(bboxes1, bboxes2, mode, self.coordinate)
def __repr__(self):
"""str: return a string that describes the module"""
repr_str = self.__class__.__name__
repr_str += f'(coordinate={self.coordinate}'
return repr_str
def bbox_overlaps_nearest_3d(bboxes1,
bboxes2,
mode='iou',
is_aligned=False,
coordinate='lidar'):
"""Calculate nearest 3D IoU.
Note:
This function first finds the nearest 2D boxes in bird eye view
(BEV), and then calculates the 2D IoU using :meth:`bbox_overlaps`.
The IoU calculator :class:`BboxOverlapsNearest3D` uses this
function to calculate IoUs of boxes.
If ``is_aligned`` is ``False``, then it calculates the ious between
each bbox of bboxes1 and bboxes2, otherwise the ious between each
aligned pair of bboxes1 and bboxes2.
Args:
bboxes1 (torch.Tensor): shape (N, 7+C) [x, y, z, h, w, l, ry, v].
bboxes2 (torch.Tensor): shape (M, 7+C) [x, y, z, h, w, l, ry, v].
mode (str): "iou" (intersection over union) or iof
(intersection over foreground).
is_aligned (bool): Whether the calculation is aligned
Return:
torch.Tensor: If ``is_aligned`` is ``False``, return the ious between \
each bbox of bboxes1 and bboxes2 with shape (N, M). If \
``is_aligned`` is ``True``, the returned shape is (N, ).
"""
assert bboxes1.size(-1) == bboxes2.size(-1) >= 7
box_type, _ = get_box_type(coordinate)
bboxes1 = box_type(bboxes1, box_dim=bboxes1.shape[-1])
bboxes2 = box_type(bboxes2, box_dim=bboxes2.shape[-1])
# Change the bboxes to bev
# box conversion and iou calculation in torch version on CUDA
# is 10x faster than that in numpy version
bboxes1_bev = bboxes1.nearest_bev
bboxes2_bev = bboxes2.nearest_bev
ret = bbox_overlaps(
bboxes1_bev, bboxes2_bev, mode=mode, is_aligned=is_aligned)
return ret
def bbox_overlaps_3d(bboxes1, bboxes2, mode='iou', coordinate='camera'):
"""Calculate 3D IoU using cuda implementation.
Note:
This function calculates the IoU of 3D boxes based on their volumes.
IoU calculator :class:`BboxOverlaps3D` uses this function to
calculate the actual IoUs of boxes.
Args:
bboxes1 (torch.Tensor): shape (N, 7+C) [x, y, z, h, w, l, ry].
bboxes2 (torch.Tensor): shape (M, 7+C) [x, y, z, h, w, l, ry].
mode (str): "iou" (intersection over union) or
iof (intersection over foreground).
coordinate (str): 'camera' or 'lidar' coordinate system.
Return:
torch.Tensor: Bbox overlaps results of bboxes1 and bboxes2 \
with shape (M, N) (aligned mode is not supported currently).
"""
assert bboxes1.size(-1) == bboxes2.size(-1) >= 7
box_type, _ = get_box_type(coordinate)
bboxes1 = box_type(bboxes1, box_dim=bboxes1.shape[-1])
bboxes2 = box_type(bboxes2, box_dim=bboxes2.shape[-1])
return bboxes1.overlaps(bboxes1, bboxes2, mode=mode)
@IOU_CALCULATORS.register_module()
class AxisAlignedBboxOverlaps3D(object):
"""Axis-aligned 3D Overlaps (IoU) Calculator."""
def __call__(self, bboxes1, bboxes2, mode='iou', is_aligned=False):
"""Calculate IoU between 2D bboxes.
Args:
bboxes1 (Tensor): shape (B, m, 6) in
(x1, y1, z1, x2, y2, z2) format or empty.
bboxes2 (Tensor): shape (B, n, 6) in
(x1, y1, z1, x2, y2, z2) format or empty.
B indicates the batch dim, in shape (B1, B2, ..., Bn).
If ``is_aligned `` is ``True``, then m and n must be equal.
mode (str): "iou" (intersection over union) or "giou" (generalized
intersection over union).
is_aligned (bool, optional): If True, then m and n must be equal.
Default False.
Returns:
Tensor: shape (m, n) if ``is_aligned `` is False else shape (m,)
"""
assert bboxes1.size(-1) == bboxes2.size(-1) == 6
return axis_aligned_bbox_overlaps_3d(bboxes1, bboxes2, mode,
is_aligned)
def __repr__(self):
"""str: a string describing the module"""
repr_str = self.__class__.__name__ + '()'
return repr_str
def axis_aligned_bbox_overlaps_3d(bboxes1,
bboxes2,
mode='iou',
is_aligned=False,
eps=1e-6):
"""Calculate overlap between two set of axis aligned 3D bboxes. If
``is_aligned `` is ``False``, then calculate the overlaps between each bbox
of bboxes1 and bboxes2, otherwise the overlaps between each aligned pair of
bboxes1 and bboxes2.
Args:
bboxes1 (Tensor): shape (B, m, 6) in
(x1, y1, z1, x2, y2, z2) format or empty.
bboxes2 (Tensor): shape (B, n, 6) in
(x1, y1, z1, x2, y2, z2) format or empty.
B indicates the batch dim, in shape (B1, B2, ..., Bn).
If ``is_aligned `` is ``True``, then m and n must be equal.
mode (str): "iou" (intersection over union) or "giou" (generalized
intersection over union).
is_aligned (bool, optional): If True, then m and n must be equal.
Default False.
eps (float, optional): A value added to the denominator for numerical
stability. Default 1e-6.
Returns:
Tensor: shape (m, n) if ``is_aligned `` is False else shape (m,)
Example:
>>> bboxes1 = torch.FloatTensor([
>>> [0, 0, 0, 10, 10, 10],
>>> [10, 10, 10, 20, 20, 20],
>>> [32, 32, 32, 38, 40, 42],
>>> ])
>>> bboxes2 = torch.FloatTensor([
>>> [0, 0, 0, 10, 20, 20],
>>> [0, 10, 10, 10, 19, 20],
>>> [10, 10, 10, 20, 20, 20],
>>> ])
>>> overlaps = axis_aligned_bbox_overlaps_3d(bboxes1, bboxes2)
>>> assert overlaps.shape == (3, 3)
>>> overlaps = bbox_overlaps(bboxes1, bboxes2, is_aligned=True)
>>> assert overlaps.shape == (3, )
Example:
>>> empty = torch.empty(0, 6)
>>> nonempty = torch.FloatTensor([[0, 0, 0, 10, 9, 10]])
>>> assert tuple(axis_aligned_bbox_overlaps_3d(empty, nonempty).shape) == (0, 1)
>>> assert tuple(axis_aligned_bbox_overlaps_3d(nonempty, empty).shape) == (1, 0)
>>> assert tuple(axis_aligned_bbox_overlaps_3d(empty, empty).shape) == (0, 0)
"""
assert mode in ['iou', 'giou'], f'Unsupported mode {mode}'
# Either the boxes are empty or the length of the boxes' last dimension is 6
assert (bboxes1.size(-1) == 6 or bboxes1.size(0) == 0)
assert (bboxes2.size(-1) == 6 or bboxes2.size(0) == 0)
# Batch dim must be the same
# Batch dim: (B1, B2, ... Bn)
assert bboxes1.shape[:-2] == bboxes2.shape[:-2]
batch_shape = bboxes1.shape[:-2]
rows = bboxes1.size(-2)
cols = bboxes2.size(-2)
if is_aligned:
assert rows == cols
if rows * cols == 0:
if is_aligned:
return bboxes1.new(batch_shape + (rows, ))
else:
return bboxes1.new(batch_shape + (rows, cols))
area1 = (bboxes1[..., 3] -
bboxes1[..., 0]) * (bboxes1[..., 4] - bboxes1[..., 1]) * (
bboxes1[..., 5] - bboxes1[..., 2])
area2 = (bboxes2[..., 3] -
bboxes2[..., 0]) * (bboxes2[..., 4] - bboxes2[..., 1]) * (
bboxes2[..., 5] - bboxes2[..., 2])
if is_aligned:
lt = torch.max(bboxes1[..., :3], bboxes2[..., :3]) # [B, rows, 3]
rb = torch.min(bboxes1[..., 3:], bboxes2[..., 3:]) # [B, rows, 3]
wh = (rb - lt).clamp(min=0) # [B, rows, 3]
overlap = wh[..., 0] * wh[..., 1] * wh[..., 2]
if mode in ['iou', 'giou']:
union = area1 + area2 - overlap
else:
union = area1
if mode == 'giou':
enclosed_lt = torch.min(bboxes1[..., :3], bboxes2[..., :3])
enclosed_rb = torch.max(bboxes1[..., 3:], bboxes2[..., 3:])
else:
lt = torch.max(bboxes1[..., :, None, :3],
bboxes2[..., None, :, :3]) # [B, rows, cols, 3]
rb = torch.min(bboxes1[..., :, None, 3:],
bboxes2[..., None, :, 3:]) # [B, rows, cols, 3]
wh = (rb - lt).clamp(min=0) # [B, rows, cols, 3]
overlap = wh[..., 0] * wh[..., 1] * wh[..., 2]
if mode in ['iou', 'giou']:
union = area1[..., None] + area2[..., None, :] - overlap
if mode == 'giou':
enclosed_lt = torch.min(bboxes1[..., :, None, :3],
bboxes2[..., None, :, :3])
enclosed_rb = torch.max(bboxes1[..., :, None, 3:],
bboxes2[..., None, :, 3:])
eps = union.new_tensor([eps])
union = torch.max(union, eps)
ious = overlap / union
if mode in ['iou']:
return ious
# calculate gious
enclose_wh = (enclosed_rb - enclosed_lt).clamp(min=0)
enclose_area = enclose_wh[..., 0] * enclose_wh[..., 1] * enclose_wh[..., 2]
enclose_area = torch.max(enclose_area, eps)
gious = ious - (enclose_area - union) / enclose_area
return gious
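# Illustrative worked example (not part of the original file) of the GIoU
# branch above, using two hypothetical axis-aligned boxes:
#   b1 = [0, 0, 0, 10, 10, 10]             # volume 1000
#   b2 = [5, 5, 5, 15, 15, 15]             # volume 1000
#   overlap = 5 * 5 * 5 = 125, union = 1000 + 1000 - 125 = 1875
#   iou = 125 / 1875 ~= 0.0667
#   enclosing box = [0, 0, 0, 15, 15, 15]  -> enclose_area = 3375
#   giou = 0.0667 - (3375 - 1875) / 3375 ~= -0.378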
================================================
FILE: mmdet3d/core/bbox/samplers/__init__.py
================================================
from mmdet.core.bbox.samplers import (BaseSampler, CombinedSampler,
InstanceBalancedPosSampler,
IoUBalancedNegSampler, OHEMSampler,
PseudoSampler, RandomSampler,
SamplingResult)
from .iou_neg_piecewise_sampler import IoUNegPiecewiseSampler
__all__ = [
'BaseSampler', 'PseudoSampler', 'RandomSampler',
'InstanceBalancedPosSampler', 'IoUBalancedNegSampler', 'CombinedSampler',
'OHEMSampler', 'SamplingResult', 'IoUNegPiecewiseSampler'
]
================================================
FILE: mmdet3d/core/bbox/samplers/iou_neg_piecewise_sampler.py
================================================
import torch
from mmdet.core.bbox.builder import BBOX_SAMPLERS
from . import RandomSampler, SamplingResult
@BBOX_SAMPLERS.register_module()
class IoUNegPiecewiseSampler(RandomSampler):
"""IoU Piece-wise Sampling.
Sampling negtive proposals according to a list of IoU thresholds.
The negtive proposals are divided into several pieces according
to `neg_iou_piece_thrs`. And the ratio of each piece is indicated
by `neg_piece_fractions`.
Args:
num (int): Number of proposals.
pos_fraction (float): The fraction of positive proposals.
neg_piece_fractions (list): A list contains fractions that indicates
the ratio of each piece of total negtive samplers.
neg_iou_piece_thrs (list): A list contains IoU thresholds that
indicate the upper bound of this piece.
neg_pos_ub (float): The total ratio to limit the upper bound
number of negtive samples.
add_gt_as_proposals (bool): Whether to add gt as proposals.
"""
def __init__(self,
num,
pos_fraction=None,
neg_piece_fractions=None,
neg_iou_piece_thrs=None,
neg_pos_ub=-1,
add_gt_as_proposals=False,
return_iou=False):
super(IoUNegPiecewiseSampler,
self).__init__(num, pos_fraction, neg_pos_ub,
add_gt_as_proposals)
assert isinstance(neg_piece_fractions, list)
assert len(neg_piece_fractions) == len(neg_iou_piece_thrs)
self.neg_piece_fractions = neg_piece_fractions
self.neg_iou_thr = neg_iou_piece_thrs
self.return_iou = return_iou
self.neg_piece_num = len(self.neg_piece_fractions)
def _sample_pos(self, assign_result, num_expected, **kwargs):
"""Randomly sample some positive samples."""
pos_inds = torch.nonzero(assign_result.gt_inds > 0, as_tuple=False)
if pos_inds.numel() != 0:
pos_inds = pos_inds.squeeze(1)
if pos_inds.numel() <= num_expected:
return pos_inds
else:
return self.random_choice(pos_inds, num_expected)
def _sample_neg(self, assign_result, num_expected, **kwargs):
"""Randomly sample some negative samples."""
neg_inds = torch.nonzero(assign_result.gt_inds == 0, as_tuple=False)
if neg_inds.numel() != 0:
neg_inds = neg_inds.squeeze(1)
if len(neg_inds) <= num_expected:
return neg_inds
else:
neg_inds_choice = neg_inds.new_zeros([0])
extend_num = 0
max_overlaps = assign_result.max_overlaps[neg_inds]
for piece_inds in range(self.neg_piece_num):
if piece_inds == self.neg_piece_num - 1: # for the last piece
piece_expected_num = num_expected - len(neg_inds_choice)
min_iou_thr = 0
else:
# if the number of negative samples in previous
# pieces is less than the expected number, sample
# the shortfall from the current piece in addition.
piece_expected_num = int(
num_expected *
self.neg_piece_fractions[piece_inds]) + extend_num
min_iou_thr = self.neg_iou_thr[piece_inds + 1]
max_iou_thr = self.neg_iou_thr[piece_inds]
piece_neg_inds = torch.nonzero(
(max_overlaps >= min_iou_thr)
& (max_overlaps < max_iou_thr),
as_tuple=False).view(-1)
if len(piece_neg_inds) < piece_expected_num:
neg_inds_choice = torch.cat(
[neg_inds_choice, neg_inds[piece_neg_inds]], dim=0)
extend_num += piece_expected_num - len(piece_neg_inds)
else:
piece_choice = self.random_choice(piece_neg_inds,
piece_expected_num)
neg_inds_choice = torch.cat(
[neg_inds_choice, neg_inds[piece_choice]], dim=0)
extend_num = 0
return neg_inds_choice
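# Illustrative sketch of the piece-wise logic above (not part of the
# original file; the numbers are hypothetical): with num_expected=100,
# neg_piece_fractions=[0.8, 0.2] and neg_iou_piece_thrs=[0.55, 0.1], the
# first piece tries to draw 80 negatives with IoU in [0.1, 0.55). If only
# 70 such candidates exist, all of them are kept and the shortfall of 10 is
# carried over, so the last piece (IoU in [0, 0.1)) fills the remaining 30.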
def sample(self,
assign_result,
bboxes,
gt_bboxes,
gt_labels=None,
**kwargs):
"""Sample positive and negative bboxes.
This is a simple implementation of bbox sampling given candidates,
assigning results and ground truth bboxes.
Args:
assign_result (:obj:`AssignResult`): Bbox assigning results.
bboxes (torch.Tensor): Boxes to be sampled from.
gt_bboxes (torch.Tensor): Ground truth bboxes.
gt_labels (torch.Tensor, optional): Class labels of ground truth \
bboxes.
Returns:
:obj:`SamplingResult`: Sampling result.
"""
if len(bboxes.shape) < 2:
bboxes = bboxes[None, :]
gt_flags = bboxes.new_zeros((bboxes.shape[0], ), dtype=torch.bool)
if self.add_gt_as_proposals and len(gt_bboxes) > 0:
if gt_labels is None:
raise ValueError(
'gt_labels must be given when add_gt_as_proposals is True')
bboxes = torch.cat([gt_bboxes, bboxes], dim=0)
assign_result.add_gt_(gt_labels)
gt_ones = bboxes.new_ones(gt_bboxes.shape[0], dtype=torch.bool)
gt_flags = torch.cat([gt_ones, gt_flags])
num_expected_pos = int(self.num * self.pos_fraction)
pos_inds = self.pos_sampler._sample_pos(
assign_result, num_expected_pos, bboxes=bboxes, **kwargs)
# We found that sampled indices have duplicated items occasionally.
# (may be a bug of PyTorch)
pos_inds = pos_inds.unique()
num_sampled_pos = pos_inds.numel()
num_expected_neg = self.num - num_sampled_pos
if self.neg_pos_ub >= 0:
_pos = max(1, num_sampled_pos)
neg_upper_bound = int(self.neg_pos_ub * _pos)
if num_expected_neg > neg_upper_bound:
num_expected_neg = neg_upper_bound
neg_inds = self.neg_sampler._sample_neg(
assign_result, num_expected_neg, bboxes=bboxes, **kwargs)
neg_inds = neg_inds.unique()
sampling_result = SamplingResult(pos_inds, neg_inds, bboxes, gt_bboxes,
assign_result, gt_flags)
if self.return_iou:
# PartA2 needs the IoU score for regression.
sampling_result.iou = assign_result.max_overlaps[torch.cat(
[pos_inds, neg_inds])]
sampling_result.iou.detach_()
return sampling_result
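# Illustrative usage sketch (not part of the original file; all values are
# hypothetical and would normally come from a model config):
#   sampler = IoUNegPiecewiseSampler(
#       num=128,
#       pos_fraction=0.5,
#       neg_piece_fractions=[0.8, 0.2],
#       neg_iou_piece_thrs=[0.55, 0.1],
#       return_iou=True)
#   sampling_result = sampler.sample(assign_result, proposals, gt_bboxes)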
================================================
FILE: mmdet3d/core/bbox/structures/__init__.py
================================================
from .base_box3d import BaseInstance3DBoxes
from .box_3d_mode import Box3DMode
from .cam_box3d import CameraInstance3DBoxes
from .coord_3d_mode import Coord3DMode
from .depth_box3d import DepthInstance3DBoxes
from .lidar_box3d import LiDARInstance3DBoxes
from .utils import (get_box_type, limit_period, points_cam2img,
rotation_3d_in_axis, xywhr2xyxyr)
__all__ = [
'Box3DMode', 'BaseInstance3DBoxes', 'LiDARInstance3DBoxes',
'CameraInstance3DBoxes', 'DepthInstance3DBoxes', 'xywhr2xyxyr',
'get_box_type', 'rotation_3d_in_axis', 'limit_period', 'points_cam2img',
'Coord3DMode'
]
================================================
FILE: mmdet3d/core/bbox/structures/base_box3d.py
================================================
import numpy as np
import torch
from abc import abstractmethod
from mmdet3d.ops.iou3d import iou3d_cuda
from .utils import limit_period, xywhr2xyxyr
class BaseInstance3DBoxes(object):
"""Base class for 3D Boxes.
Note:
The box is bottom centered, i.e. the relative position of origin in
the box is (0.5, 0.5, 0).
Args:
tensor (torch.Tensor | np.ndarray | list): an N x box_dim matrix.
box_dim (int): Number of the dimension of a box.
Each row is (x, y, z, x_size, y_size, z_size, yaw).
Default to 7.
with_yaw (bool): Whether the box is with yaw rotation.
If False, the value of yaw will be set to 0 as minmax boxes.
Default to True.
origin (tuple[float]): The relative position of origin in the box.
Default to (0.5, 0.5, 0). This will guide the box to be converted to
(0.5, 0.5, 0) mode.
Attributes:
tensor (torch.Tensor): Float matrix of N x box_dim.
box_dim (int): Integer indicating the dimension of a box.
Each row is (x, y, z, x_size, y_size, z_size, yaw, ...).
with_yaw (bool): If False, the value of yaw will be set to 0 as minmax
boxes.
"""
def __init__(self, tensor, box_dim=7, with_yaw=True, origin=(0.5, 0.5, 0)):
if isinstance(tensor, torch.Tensor):
device = tensor.device
else:
device = torch.device('cpu')
tensor = torch.as_tensor(tensor, dtype=torch.float32, device=device)
if tensor.numel() == 0:
# Use reshape, so we don't end up creating a new tensor that
# does not depend on the inputs (and consequently confuses jit)
tensor = tensor.reshape((0, box_dim)).to(
dtype=torch.float32, device=device)
assert tensor.dim() == 2 and tensor.size(-1) == box_dim, tensor.size()
if tensor.shape[-1] == 6:
# If the dimension of boxes is 6, we expand box_dim by padding
# 0 as a fake yaw and set with_yaw to False.
assert box_dim == 6
fake_rot = tensor.new_zeros(tensor.shape[0], 1)
tensor = torch.cat((tensor, fake_rot), dim=-1)
self.box_dim = box_dim + 1
self.with_yaw = False
else:
self.box_dim = box_dim
self.with_yaw = with_yaw
self.tensor = tensor.clone()
if origin != (0.5, 0.5, 0):
dst = self.tensor.new_tensor((0.5, 0.5, 0))
src = self.tensor.new_tensor(origin)
self.tensor[:, :3] += self.tensor[:, 3:6] * (dst - src)
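# Illustrative sketch of the origin handling above (not part of the
# original file): passing origin=(0.5, 0.5, 0.5) tells the class that the
# input centers are gravity centers, so they are shifted down by half the
# box height to match the (0.5, 0.5, 0) bottom-center convention, e.g.
#   raw = torch.tensor([[0., 0., 1., 2., 2., 2., 0.]])  # gravity center z=1
#   boxes = BaseInstance3DBoxes(raw, origin=(0.5, 0.5, 0.5))
#   # boxes.tensor[0, 2] == 0.  (bottom center)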
@property
def volume(self):
"""torch.Tensor: A vector with volume of each box."""
return self.tensor[:, 3] * self.tensor[:, 4] * self.tensor[:, 5]
@property
def dims(self):
"""torch.Tensor: Corners of each box with size (N, 8, 3)."""
return self.tensor[:, 3:6]
@property
def yaw(self):
"""torch.Tensor: A vector with yaw of each box."""
return self.tensor[:, 6]
@property
def height(self):
"""torch.Tensor: A vector with height of each box."""
return self.tensor[:, 5]
@property
def top_height(self):
"""torch.Tensor: A vector with the top height of each box."""
return self.bottom_height + self.height
@property
def bottom_height(self):
"""torch.Tensor: A vector with bottom's height of each box."""
return self.tensor[:, 2]
@property
def center(self):
"""Calculate the center of all the boxes.
Note:
In the MMDetection3D's convention, the bottom center is
usually taken as the default center.
The relative position of the centers in different kinds of
boxes are different, e.g., the relative center of a box is
(0.5, 1.0, 0.5) in camera and (0.5, 0.5, 0) in lidar.
It is recommended to use ``bottom_center`` or ``gravity_center``
for clearer usage.
Returns:
torch.Tensor: A tensor with center of each box.
"""
return self.bottom_center
@property
def bottom_center(self):
"""torch.Tensor: A tensor with center of each box."""
return self.tensor[:, :3]
@property
def gravity_center(self):
"""torch.Tensor: A tensor with center of each box."""
pass
@property
def corners(self):
"""torch.Tensor: a tensor with 8 corners of each box."""
pass
@abstractmethod
def rotate(self, angles, axis=0):
"""Calculate whether the points are in any of the boxes.
Args:
angles (float): Rotation angles.
axis (int): The axis to rotate the boxes.
"""
pass
@abstractmethod
def flip(self, bev_direction='horizontal'):
"""Flip the boxes in BEV along given BEV direction."""
pass
def translate(self, trans_vector):
"""Calculate whether the points are in any of the boxes.
Args:
trans_vector (torch.Tensor): Translation vector of size 1x3.
"""
if not isinstance(trans_vector, torch.Tensor):
trans_vector = self.tensor.new_tensor(trans_vector)
self.tensor[:, :3] += trans_vector
def in_range_3d(self, box_range):
"""Check whether the boxes are in the given range.
Args:
box_range (list | torch.Tensor): The range of box
(x_min, y_min, z_min, x_max, y_max, z_max)
Note:
In the original implementation of SECOND, a box is considered
in range if its points lie in a convex polygon; here we simplify
the check to reduce the burden for common cases.
Returns:
torch.Tensor: A binary vector indicating whether each box is \
inside the reference range.
"""
in_range_flags = ((self.tensor[:, 0] > box_range[0])
& (self.tensor[:, 1] > box_range[1])
& (self.tensor[:, 2] > box_range[2])
& (self.tensor[:, 0] < box_range[3])
& (self.tensor[:, 1] < box_range[4])
& (self.tensor[:, 2] < box_range[5]))
return in_range_flags
@abstractmethod
def in_range_bev(self, box_range):
"""Check whether the boxes are in the given range.
Args:
box_range (list | torch.Tensor): The range of box
in order of (x_min, y_min, x_max, y_max).
Returns:
torch.Tensor: Indicating whether each box is inside \
the reference range.
"""
pass
@abstractmethod
def convert_to(self, dst, rt_mat=None):
"""Convert self to ``dst`` mode.
Args:
dst (:obj:`BoxMode`): The target Box mode.
rt_mat (np.ndarray | torch.Tensor): The rotation and translation
matrix between different coordinates. Defaults to None.
The conversion from `src` coordinates to `dst` coordinates
usually comes along the change of sensors, e.g., from camera
to LiDAR. This requires a transformation matrix.
Returns:
:obj:`BaseInstance3DBoxes`: The converted box of the same type \
in the `dst` mode.
"""
pass
def scale(self, scale_factor):
"""Scale the box with horizontal and vertical scaling factors.
Args:
scale_factor (float): Scale factor to scale the boxes.
"""
self.tensor[:, :6] *= scale_factor
self.tensor[:, 7:] *= scale_factor
def limit_yaw(self, offset=0.5, period=np.pi):
"""Limit the yaw to a given period and offset.
Args:
offset (float): The offset of the yaw.
period (float): The expected period.
"""
self.tensor[:, 6] = limit_period(self.tensor[:, 6], offset, period)
def nonempty(self, threshold: float = 0.0):
"""Find boxes that are non-empty.
A box is considered empty
if any of its sides is no larger than the threshold.
Args:
threshold (float): The threshold of minimal sizes.
Returns:
torch.Tensor: A binary vector which represents whether each \
box is empty (False) or non-empty (True).
"""
box = self.tensor
size_x = box[..., 3]
size_y = box[..., 4]
size_z = box[..., 5]
keep = ((size_x > threshold)
& (size_y > threshold) & (size_z > threshold))
return keep
def __getitem__(self, item):
"""
Note:
The following usage are allowed:
1. `new_boxes = boxes[3]`:
return a `Boxes` that contains only one box.
2. `new_boxes = boxes[2:10]`:
return a slice of boxes.
3. `new_boxes = boxes[vector]`:
where vector is a torch.BoolTensor with `length = len(boxes)`.
Nonzero elements in the vector will be selected.
Note that the returned Boxes might share storage with this Boxes,
subject to PyTorch's indexing semantics.
Returns:
:obj:`BaseInstances3DBoxes`: A new object of \
:class:`BaseInstances3DBoxes` after indexing.
"""
original_type = type(self)
if isinstance(item, int):
return original_type(
self.tensor[item].view(1, -1),
box_dim=self.box_dim,
with_yaw=self.with_yaw)
b = self.tensor[item]
assert b.dim() == 2, \
f'Indexing on Boxes with {item} failed to return a matrix!'
return original_type(b, box_dim=self.box_dim, with_yaw=self.with_yaw)
def __len__(self):
"""int: Number of boxes in the current object."""
return self.tensor.shape[0]
def __repr__(self):
"""str: Return a strings that describes the object."""
return self.__class__.__name__ + '(\n ' + str(self.tensor) + ')'
@classmethod
def cat(cls, boxes_list):
"""Concatenate a list of Boxes into a single Boxes.
Args:
boxes_list (list[:obj:`BaseInstances3DBoxes`]): List of boxes.
Returns:
:obj:`BaseInstances3DBoxes`: The concatenated Boxes.
"""
assert isinstance(boxes_list, (list, tuple))
if len(boxes_list) == 0:
return cls(torch.empty(0))
assert all(isinstance(box, cls) for box in boxes_list)
# use torch.cat (v.s. layers.cat)
# so the returned boxes never share storage with input
cat_boxes = cls(
torch.cat([b.tensor for b in boxes_list], dim=0),
box_dim=boxes_list[0].tensor.shape[1],
with_yaw=boxes_list[0].with_yaw)
return cat_boxes
def to(self, device):
"""Convert current boxes to a specific device.
Args:
device (str | :obj:`torch.device`): The name of the device.
Returns:
:obj:`BaseInstance3DBoxes`: A new boxes object on the \
specific device.
"""
original_type = type(self)
return original_type(
self.tensor.to(device),
box_dim=self.box_dim,
with_yaw=self.with_yaw)
def clone(self):
"""Clone the Boxes.
Returns:
:obj:`BaseInstance3DBoxes`: Box object with the same properties \
as self.
"""
original_type = type(self)
return original_type(
self.tensor.clone(), box_dim=self.box_dim, with_yaw=self.with_yaw)
@property
def device(self):
"""str: The device of the boxes are on."""
return self.tensor.device
def __iter__(self):
"""Yield a box as a Tensor of shape (4,) at a time.
Returns:
torch.Tensor: A box of shape (4,).
"""
yield from self.tensor
@classmethod
def height_overlaps(cls, boxes1, boxes2, mode='iou'):
"""Calculate height overlaps of two boxes.
Note:
This function calculates the height overlaps between boxes1 and
boxes2, boxes1 and boxes2 should be in the same type.
Args:
boxes1 (:obj:`BaseInstanceBoxes`): Boxes 1 contain N boxes.
boxes2 (:obj:`BaseInstanceBoxes`): Boxes 2 contain M boxes.
mode (str, optional): Mode of iou calculation. Defaults to 'iou'.
Returns:
torch.Tensor: Calculated height overlap of the boxes.
"""
assert isinstance(boxes1, BaseInstance3DBoxes)
assert isinstance(boxes2, BaseInstance3DBoxes)
assert type(boxes1) == type(boxes2), '"boxes1" and "boxes2" should ' \
f'be in the same type, got {type(boxes1)} and {type(boxes2)}.'
boxes1_top_height = boxes1.top_height.view(-1, 1)
boxes1_bottom_height = boxes1.bottom_height.view(-1, 1)
boxes2_top_height = boxes2.top_height.view(1, -1)
boxes2_bottom_height = boxes2.bottom_height.view(1, -1)
highest_of_bottom = torch.max(boxes1_bottom_height,
boxes2_bottom_height)
lowest_of_top = torch.min(boxes1_top_height, boxes2_top_height)
overlaps_h = torch.clamp(lowest_of_top - highest_of_bottom, min=0)
return overlaps_h
@classmethod
def overlaps(cls, boxes1, boxes2, mode='iou'):
"""Calculate 3D overlaps of two boxes.
Note:
This function calculates the overlaps between ``boxes1`` and
``boxes2``, ``boxes1`` and ``boxes2`` should be in the same type.
Args:
boxes1 (:obj:`BaseInstanceBoxes`): Boxes 1 contain N boxes.
boxes2 (:obj:`BaseInstanceBoxes`): Boxes 2 contain M boxes.
mode (str, optional): Mode of iou calculation. Defaults to 'iou'.
Returns:
torch.Tensor: Calculated 3D overlaps (IoU/IoF) of the boxes.
"""
assert isinstance(boxes1, BaseInstance3DBoxes)
assert isinstance(boxes2, BaseInstance3DBoxes)
assert type(boxes1) == type(boxes2), '"boxes1" and "boxes2" should ' \
f'be in the same type, got {type(boxes1)} and {type(boxes2)}.'
assert mode in ['iou', 'iof']
rows = len(boxes1)
cols = len(boxes2)
if rows * cols == 0:
return boxes1.tensor.new(rows, cols)
# height overlap
overlaps_h = cls.height_overlaps(boxes1, boxes2)
# obtain BEV boxes in XYXYR format
boxes1_bev = xywhr2xyxyr(boxes1.bev)
boxes2_bev = xywhr2xyxyr(boxes2.bev)
# bev overlap
overlaps_bev = boxes1_bev.new_zeros(
(boxes1_bev.shape[0], boxes2_bev.shape[0])).cuda() # (N, M)
iou3d_cuda.boxes_overlap_bev_gpu(boxes1_bev.contiguous().cuda(),
boxes2_bev.contiguous().cuda(),
overlaps_bev)
# 3d overlaps
overlaps_3d = overlaps_bev.to(boxes1.device) * overlaps_h
volume1 = boxes1.volume.view(-1, 1)
volume2 = boxes2.volume.view(1, -1)
if mode == 'iou':
# the clamp func is used to avoid division of 0
iou3d = overlaps_3d / torch.clamp(
volume1 + volume2 - overlaps_3d, min=1e-8)
else:
iou3d = overlaps_3d / torch.clamp(volume1, min=1e-8)
return iou3d
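# Illustrative usage sketch (not part of the original file; requires a
# CUDA build of the iou3d op, and the boxes below are hypothetical):
#   boxes_a = LiDARInstance3DBoxes(torch.tensor([[0., 0., 0., 4., 2., 1.5, 0.]]))
#   boxes_b = LiDARInstance3DBoxes(torch.tensor([[1., 0., 0., 4., 2., 1.5, 0.]]))
#   iou = BaseInstance3DBoxes.overlaps(boxes_a, boxes_b)  # shape (1, 1)
#   # BEV overlap 3 x 2 = 6, height overlap 1.5 -> 3D overlap 9,
#   # so the expected IoU is 9 / (12 + 12 - 9) = 0.6.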
def new_box(self, data):
"""Create a new box object with data.
The new box and its tensor have similar properties \
as self and self.tensor, respectively.
Args:
data (torch.Tensor | numpy.array | list): Data to be copied.
Returns:
:obj:`BaseInstance3DBoxes`: A new bbox object with ``data``, \
the object's other properties are similar to ``self``.
"""
new_tensor = self.tensor.new_tensor(data) \
if not isinstance(data, torch.Tensor) else data.to(self.device)
original_type = type(self)
return original_type(
new_tensor, box_dim=self.box_dim, with_yaw=self.with_yaw)
================================================
FILE: mmdet3d/core/bbox/structures/box_3d_mode.py
================================================
import numpy as np
import torch
from enum import IntEnum, unique
from .base_box3d import BaseInstance3DBoxes
from .cam_box3d import CameraInstance3DBoxes
from .depth_box3d import DepthInstance3DBoxes
from .lidar_box3d import LiDARInstance3DBoxes
@unique
class Box3DMode(IntEnum):
r"""Enum of different ways to represent a box.
Coordinates in LiDAR:
.. code-block:: none
up z
^ x front
| /
| /
left y <------ 0
The relative coordinate of bottom center in a LiDAR box is (0.5, 0.5, 0),
and the yaw is around the z axis, thus the rotation axis=2.
Coordinates in camera:
.. code-block:: none
z front
/
/
0 ------> x right
|
|
v
down y
The relative coordinate of bottom center in a CAM box is [0.5, 1.0, 0.5],
and the yaw is around the y axis, thus the rotation axis=1.
Coordinates in Depth mode:
.. code-block:: none
up z
^ y front
| /
| /
0 ------> x right
The relative coordinate of bottom center in a DEPTH box is (0.5, 0.5, 0),
and the yaw is around the z axis, thus the rotation axis=2.
"""
LIDAR = 0
CAM = 1
DEPTH = 2
@staticmethod
def convert(box, src, dst, rt_mat=None):
"""Convert boxes from `src` mode to `dst` mode.
Args:
box (tuple | list | np.ndarray |
torch.Tensor | BaseInstance3DBoxes):
Can be a k-tuple, k-list or an Nxk array/tensor, where k >= 7.
src (:obj:`BoxMode`): The src Box mode.
dst (:obj:`BoxMode`): The target Box mode.
rt_mat (np.ndarray | torch.Tensor): The rotation and translation
matrix between different coordinates. Defaults to None.
The conversion from `src` coordinates to `dst` coordinates
usually comes along the change of sensors, e.g., from camera
to LiDAR. This requires a transformation matrix.
Returns:
(tuple | list | np.ndarray | torch.Tensor | BaseInstance3DBoxes): \
The converted box of the same type.
"""
if src == dst:
return box
is_numpy = isinstance(box, np.ndarray)
is_Instance3DBoxes = isinstance(box, BaseInstance3DBoxes)
single_box = isinstance(box, (list, tuple))
if single_box:
assert len(box) >= 7, (
'BoxMode.convert takes either a k-tuple/list or '
'an Nxk array/tensor, where k >= 7')
arr = torch.tensor(box)[None, :]
else:
# avoid modifying the input box
if is_numpy:
arr = torch.from_numpy(np.asarray(box)).clone()
elif is_Instance3DBoxes:
arr = box.tensor.clone()
else:
arr = box.clone()
# convert box from `src` mode to `dst` mode.
x_size, y_size, z_size = arr[..., 3:4], arr[..., 4:5], arr[..., 5:6]
if src == Box3DMode.LIDAR and dst == Box3DMode.CAM:
if rt_mat is None:
rt_mat = arr.new_tensor([[0, -1, 0], [0, 0, -1], [1, 0, 0]])
xyz_size = torch.cat([y_size, z_size, x_size], dim=-1)
elif src == Box3DMode.CAM and dst == Box3DMode.LIDAR:
if rt_mat is None:
rt_mat = arr.new_tensor([[0, 0, 1], [-1, 0, 0], [0, -1, 0]])
xyz_size = torch.cat([z_size, x_size, y_size], dim=-1)
elif src == Box3DMode.DEPTH and dst == Box3DMode.CAM:
if rt_mat is None:
rt_mat = arr.new_tensor([[1, 0, 0], [0, 0, 1], [0, -1, 0]])
xyz_size = torch.cat([x_size, z_size, y_size], dim=-1)
elif src == Box3DMode.CAM and dst == Box3DMode.DEPTH:
if rt_mat is None:
rt_mat = arr.new_tensor([[1, 0, 0], [0, 0, -1], [0, 1, 0]])
xyz_size = torch.cat([x_size, z_size, y_size], dim=-1)
elif src == Box3DMode.LIDAR and dst == Box3DMode.DEPTH:
if rt_mat is None:
rt_mat = arr.new_tensor([[0, -1, 0], [1, 0, 0], [0, 0, 1]])
xyz_size = torch.cat([y_size, x_size, z_size], dim=-1)
elif src == Box3DMode.DEPTH and dst == Box3DMode.LIDAR:
if rt_mat is None:
rt_mat = arr.new_tensor([[0, 1, 0], [-1, 0, 0], [0, 0, 1]])
xyz_size = torch.cat([y_size, x_size, z_size], dim=-1)
else:
raise NotImplementedError(
f'Conversion from Box3DMode {src} to {dst} '
'is not supported yet')
if not isinstance(rt_mat, torch.Tensor):
rt_mat = arr.new_tensor(rt_mat)
if rt_mat.size(1) == 4:
extended_xyz = torch.cat(
[arr[:, :3], arr.new_ones(arr.size(0), 1)], dim=-1)
xyz = extended_xyz @ rt_mat.t()
else:
xyz = arr[:, :3] @ rt_mat.t()
remains = arr[..., 6:]
arr = torch.cat([xyz[:, :3], xyz_size, remains], dim=-1)
# convert arr to the original type
original_type = type(box)
if single_box:
return original_type(arr.flatten().tolist())
if is_numpy:
return arr.numpy()
elif is_Instance3DBoxes:
if dst == Box3DMode.CAM:
target_type = CameraInstance3DBoxes
elif dst == Box3DMode.LIDAR:
target_type = LiDARInstance3DBoxes
elif dst == Box3DMode.DEPTH:
target_type = DepthInstance3DBoxes
else:
raise NotImplementedError(
f'Conversion to {dst} through {original_type}'
' is not supported yet')
return target_type(
arr, box_dim=arr.size(-1), with_yaw=box.with_yaw)
else:
return arr
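# Illustrative sketch (not part of the original file; the box values are
# hypothetical): converting a single LiDAR box to the camera coordinate
# system with the default rt_mat above.
#   lidar_box = [10.0, 2.0, -1.0, 4.0, 2.0, 1.5, 0.3]
#   cam_box = Box3DMode.convert(lidar_box, Box3DMode.LIDAR, Box3DMode.CAM)
#   # With the default rt_mat, (x, y, z) maps to (-y, -z, x) and the sizes
#   # are re-ordered to (y_size, z_size, x_size).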
================================================
FILE: mmdet3d/core/bbox/structures/cam_box3d.py
================================================
import numpy as np
import torch
from mmdet3d.core.points import BasePoints
from .base_box3d import BaseInstance3DBoxes
from .utils import limit_period, rotation_3d_in_axis
class CameraInstance3DBoxes(BaseInstance3DBoxes):
"""3D boxes of instances in CAM coordinates.
Coordinates in camera:
.. code-block:: none
z front (yaw=0.5*pi)
/
/
0 ------> x right (yaw=0)
|
|
v
down y
The relative coordinate of bottom center in a CAM box is (0.5, 1.0, 0.5),
and the yaw is around the y axis, thus the rotation axis=1.
The yaw is 0 at the positive direction of x axis, and increases from
the positive direction of x to the positive direction of z.
Attributes:
tensor (torch.Tensor): Float matrix of N x box_dim.
box_dim (int): Integer indicates the dimension of a box
Each row is (x, y, z, x_size, y_size, z_size, yaw, ...).
with_yaw (bool): If False, the value of yaw will be set to 0 as minmax
boxes.
"""
def __init__(self,
tensor,
box_dim=7,
with_yaw=True,
origin=(0.5, 1.0, 0.5)):
if isinstance(tensor, torch.Tensor):
device = tensor.device
else:
device = torch.device('cpu')
tensor = torch.as_tensor(tensor, dtype=torch.float32, device=device)
if tensor.numel() == 0:
# Use reshape, so we don't end up creating a new tensor that
# does not depend on the inputs (and consequently confuses jit)
tensor = tensor.reshape((0, box_dim)).to(
dtype=torch.float32, device=device)
assert tensor.dim() == 2 and tensor.size(-1) == box_dim, tensor.size()
if tensor.shape[-1] == 6:
# If the dimension of boxes is 6, we expand box_dim by padding
# 0 as a fake yaw and set with_yaw to False.
assert box_dim == 6
fake_rot = tensor.new_zeros(tensor.shape[0], 1)
tensor = torch.cat((tensor, fake_rot), dim=-1)
self.box_dim = box_dim + 1
self.with_yaw = False
else:
self.box_dim = box_dim
self.with_yaw = with_yaw
self.tensor = tensor.clone()
if origin != (0.5, 1.0, 0.5):
dst = self.tensor.new_tensor((0.5, 1.0, 0.5))
src = self.tensor.new_tensor(origin)
self.tensor[:, :3] += self.tensor[:, 3:6] * (dst - src)
@property
def height(self):
"""torch.Tensor: A vector with height of each box."""
return self.tensor[:, 4]
@property
def top_height(self):
"""torch.Tensor: A vector with the top height of each box."""
# the positive direction is down rather than up
return self.bottom_height - self.height
@property
def bottom_height(self):
"""torch.Tensor: A vector with bottom's height of each box."""
return self.tensor[:, 1]
@property
def gravity_center(self):
"""torch.Tensor: A tensor with center of each box."""
bottom_center = self.bottom_center
gravity_center = torch.zeros_like(bottom_center)
gravity_center[:, [0, 2]] = bottom_center[:, [0, 2]]
gravity_center[:, 1] = bottom_center[:, 1] - self.tensor[:, 4] * 0.5
return gravity_center
@property
def corners(self):
"""torch.Tensor: Coordinates of corners of all the boxes in
shape (N, 8, 3).
Convert the boxes to corners in clockwise order, in the form of
(x0y0z0, x0y0z1, x0y1z1, x0y1z0, x1y0z0, x1y0z1, x1y1z1, x1y1z0)
.. code-block:: none
front z
/
/
(x0, y0, z1) + ----------- + (x1, y0, z1)
/| / |
/ | / |
(x0, y0, z0) + ----------- + + (x1, y1, z1)
| / . | /
| / origin | /
(x0, y1, z0) + ----------- + -------> x right
| (x1, y1, z0)
|
v
down y
"""
# TODO: rotation_3d_in_axis function does not support
# empty tensors currently.
assert len(self.tensor) != 0
dims = self.dims
corners_norm = torch.from_numpy(
np.stack(np.unravel_index(np.arange(8), [2] * 3), axis=1)).to(
device=dims.device, dtype=dims.dtype)
corners_norm = corners_norm[[0, 1, 3, 2, 4, 5, 7, 6]]
# use relative origin [0.5, 1, 0.5]
corners_norm = corners_norm - dims.new_tensor([0.5, 1, 0.5])
corners = dims.view([-1, 1, 3]) * corners_norm.reshape([1, 8, 3])
# rotate around y axis
corners = rotation_3d_in_axis(corners, self.tensor[:, 6], axis=1)
corners += self.tensor[:, :3].view(-1, 1, 3)
return corners
@property
def bev(self):
"""torch.Tensor: A n x 5 tensor of 2D BEV box of each box
with rotation in XYWHR format."""
return self.tensor[:, [0, 2, 3, 5, 6]]
@property
def nearest_bev(self):
"""torch.Tensor: A tensor of 2D BEV box of each box
without rotation."""
# Obtain BEV boxes with rotation in XZWHR format
bev_rotated_boxes = self.bev
# convert the rotation to a valid range
rotations = bev_rotated_boxes[:, -1]
normed_rotations = torch.abs(limit_period(rotations, 0.5, np.pi))
# find the center of boxes
conditions = (normed_rotations > np.pi / 4)[..., None]
bboxes_xywh = torch.where(conditions, bev_rotated_boxes[:,
[0, 1, 3, 2]],
bev_rotated_boxes[:, :4])
centers = bboxes_xywh[:, :2]
dims = bboxes_xywh[:, 2:]
bev_boxes = torch.cat([centers - dims / 2, centers + dims / 2], dim=-1)
return bev_boxes
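# Illustrative sketch of the nearest_bev logic above (not part of the
# original file): a BEV box centered at (1, 2) with dims (4, 2) and a yaw
# close to pi/2 has its dims swapped to (2, 4) before being converted to
# the axis-aligned corner form, yielding (x1, y1, x2, y2) = (0, 0, 2, 4).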
def rotate(self, angle, points=None):
"""Rotate boxes with points (optional) with the given angle.
Args:
angle (float, torch.Tensor): Rotation angle.
points (torch.Tensor, numpy.ndarray, :obj:`BasePoints`, optional):
Points to rotate. Defaults to None.
Returns:
tuple or None: When ``points`` is None, the function returns \
None, otherwise it returns the rotated points and the \
rotation matrix ``rot_mat_T``.
"""
if not isinstance(angle, torch.Tensor):
angle = self.tensor.new_tensor(angle)
rot_sin = torch.sin(angle)
rot_cos = torch.cos(angle)
rot_mat_T = self.tensor.new_tensor([[rot_cos, 0, -rot_sin], [0, 1, 0],
[rot_sin, 0, rot_cos]])
self.tensor[:, :3] = self.tensor[:, :3] @ rot_mat_T
self.tensor[:, 6] += angle
if points is not None:
if isinstance(points, torch.Tensor):
points[:, :3] = points[:, :3] @ rot_mat_T
elif isinstance(points, np.ndarray):
rot_mat_T = rot_mat_T.numpy()
points[:, :3] = np.dot(points[:, :3], rot_mat_T)
elif isinstance(points, BasePoints):
# clockwise
points.rotate(-angle)
else:
raise ValueError
return points, rot_mat_T
def flip(self, bev_direction='horizontal', points=None):
"""Flip the boxes in BEV along given BEV direction.
In CAM coordinates, it flips the x (horizontal) or z (vertical) axis.
Args:
bev_direction (str): Flip direction (horizontal or vertical).
points (torch.Tensor, numpy.ndarray, :obj:`BasePoints`, None):
Points to flip. Defaults to None.
Returns:
torch.Tensor, numpy.ndarray or None: Flipped points.
"""
assert bev_direction in ('horizontal', 'vertical')
if bev_direction == 'horizontal':
self.tensor[:, 0::7] = -self.tensor[:, 0::7]
if self.with_yaw:
self.tensor[:, 6] = -self.tensor[:, 6] + np.pi
elif bev_direction == 'vertical':
self.tensor[:, 2::7] = -self.tensor[:, 2::7]
if self.with_yaw:
self.tensor[:, 6] = -self.tensor[:, 6]
if points is not None:
assert isinstance(points, (torch.Tensor, np.ndarray, BasePoints))
if isinstance(points, (torch.Tensor, np.ndarray)):
if bev_direction == 'horizontal':
points[:, 0] = -points[:, 0]
elif bev_direction == 'vertical':
points[:, 2] = -points[:, 2]
elif isinstance(points, BasePoints):
points.flip(bev_direction)
return points
def in_range_bev(self, box_range):
"""Check whether the boxes are in the given range.
Args:
box_range (list | torch.Tensor): The range of box
(x_min, z_min, x_max, z_max).
Note:
The original implementation of SECOND checks whether a box is in
range by checking whether its points are in a convex polygon;
here we reduce the burden for simpler cases.
Returns:
torch.Tensor: Indicating whether each box is inside \
the reference range.
"""
in_range_flags = ((self.tensor[:, 0] > box_range[0])
& (self.tensor[:, 2] > box_range[1])
& (self.tensor[:, 0] < box_range[2])
& (self.tensor[:, 2] < box_range[3]))
return in_range_flags
@classmethod
def height_overlaps(cls, boxes1, boxes2, mode='iou'):
"""Calculate height overlaps of two boxes.
This function calculates the height overlaps between ``boxes1`` and
``boxes2``, where ``boxes1`` and ``boxes2`` should be in the same type.
Args:
boxes1 (:obj:`CameraInstance3DBoxes`): Boxes 1 contain N boxes.
boxes2 (:obj:`CameraInstance3DBoxes`): Boxes 2 contain M boxes.
mode (str, optional): Mode of iou calculation. Defaults to 'iou'.
Returns:
torch.Tensor: Calculated iou of boxes' heights.
"""
assert isinstance(boxes1, CameraInstance3DBoxes)
assert isinstance(boxes2, CameraInstance3DBoxes)
boxes1_top_height = boxes1.top_height.view(-1, 1)
boxes1_bottom_height = boxes1.bottom_height.view(-1, 1)
boxes2_top_height = boxes2.top_height.view(1, -1)
boxes2_bottom_height = boxes2.bottom_height.view(1, -1)
# In the camera coordinate system, the positive direction
# points downward, so min/max are swapped here.
highest_of_bottom = torch.min(boxes1_bottom_height,
boxes2_bottom_height)
lowest_of_top = torch.max(boxes1_top_height, boxes2_top_height)
overlaps_h = torch.clamp(highest_of_bottom - lowest_of_top, min=0)
return overlaps_h
def convert_to(self, dst, rt_mat=None):
"""Convert self to ``dst`` mode.
Args:
dst (:obj:`BoxMode`): The target Box mode.
rt_mat (np.ndarray | torch.Tensor): The rotation and translation
matrix between different coordinates. Defaults to None.
The conversion from ``src`` coordinates to ``dst`` coordinates
usually comes along the change of sensors, e.g., from camera
to LiDAR. This requires a transformation matrix.
Returns:
:obj:`BaseInstance3DBoxes`: \
The converted box of the same type in the ``dst`` mode.
"""
from .box_3d_mode import Box3DMode
return Box3DMode.convert(
box=self, src=Box3DMode.CAM, dst=dst, rt_mat=rt_mat)
================================================
FILE: mmdet3d/core/bbox/structures/coord_3d_mode.py
================================================
import numpy as np
import torch
from enum import IntEnum, unique
from mmdet3d.core.points import (BasePoints, CameraPoints, DepthPoints,
LiDARPoints)
from .base_box3d import BaseInstance3DBoxes
from .cam_box3d import CameraInstance3DBoxes
from .depth_box3d import DepthInstance3DBoxes
from .lidar_box3d import LiDARInstance3DBoxes
@unique
class Coord3DMode(IntEnum):
r"""Enum of different ways to represent a box
and point cloud.
Coordinates in LiDAR:
.. code-block:: none
up z
^ x front
| /
| /
left y <------ 0
The relative coordinate of bottom center in a LiDAR box is (0.5, 0.5, 0),
and the yaw is around the z axis, thus the rotation axis=2.
Coordinates in camera:
.. code-block:: none
z front
/
/
0 ------> x right
|
|
v
down y
The relative coordinate of bottom center in a CAM box is [0.5, 1.0, 0.5],
and the yaw is around the y axis, thus the rotation axis=1.
Coordinates in Depth mode:
.. code-block:: none
up z
^ y front
| /
| /
0 ------> x right
The relative coordinate of bottom center in a DEPTH box is (0.5, 0.5, 0),
and the yaw is around the z axis, thus the rotation axis=2.
"""
LIDAR = 0
CAM = 1
DEPTH = 2
@staticmethod
def convert(input, src, dst, rt_mat=None):
"""Convert boxes or points from `src` mode to `dst` mode."""
if isinstance(input, BaseInstance3DBoxes):
return Coord3DMode.convert_box(input, src, dst, rt_mat=rt_mat)
elif isinstance(input, BasePoints):
return Coord3DMode.convert_point(input, src, dst, rt_mat=rt_mat)
else:
raise NotImplementedError
@staticmethod
def convert_box(box, src, dst, rt_mat=None):
"""Convert boxes from `src` mode to `dst` mode.
Args:
box (tuple | list | np.ndarray |
torch.Tensor | BaseInstance3DBoxes):
Can be a k-tuple, k-list or an Nxk array/tensor, where k >= 7.
src (:obj:`CoordMode`): The src Box mode.
dst (:obj:`CoordMode`): The target Box mode.
rt_mat (np.ndarray | torch.Tensor): The rotation and translation
matrix between different coordinates. Defaults to None.
The conversion from `src` coordinates to `dst` coordinates
usually comes along the change of sensors, e.g., from camera
to LiDAR. This requires a transformation matrix.
Returns:
(tuple | list | np.ndarray | torch.Tensor | BaseInstance3DBoxes): \
The converted box of the same type.
"""
if src == dst:
return box
is_numpy = isinstance(box, np.ndarray)
is_Instance3DBoxes = isinstance(box, BaseInstance3DBoxes)
single_box = isinstance(box, (list, tuple))
if single_box:
assert len(box) >= 7, (
'CoordMode.convert takes either a k-tuple/list or '
'an Nxk array/tensor, where k >= 7')
arr = torch.tensor(box)[None, :]
else:
# avoid modifying the input box
if is_numpy:
arr = torch.from_numpy(np.asarray(box)).clone()
elif is_Instance3DBoxes:
arr = box.tensor.clone()
else:
arr = box.clone()
# convert box from `src` mode to `dst` mode.
x_size, y_size, z_size = arr[..., 3:4], arr[..., 4:5], arr[..., 5:6]
if src == Coord3DMode.LIDAR and dst == Coord3DMode.CAM:
if rt_mat is None:
rt_mat = arr.new_tensor([[0, -1, 0], [0, 0, -1], [1, 0, 0]])
xyz_size = torch.cat([y_size, z_size, x_size], dim=-1)
elif src == Coord3DMode.CAM and dst == Coord3DMode.LIDAR:
if rt_mat is None:
rt_mat = arr.new_tensor([[0, 0, 1], [-1, 0, 0], [0, -1, 0]])
xyz_size = torch.cat([z_size, x_size, y_size], dim=-1)
elif src == Coord3DMode.DEPTH and dst == Coord3DMode.CAM:
if rt_mat is None:
rt_mat = arr.new_tensor([[1, 0, 0], [0, 0, 1], [0, -1, 0]])
xyz_size = torch.cat([x_size, z_size, y_size], dim=-1)
elif src == Coord3DMode.CAM and dst == Coord3DMode.DEPTH:
if rt_mat is None:
rt_mat = arr.new_tensor([[1, 0, 0], [0, 0, -1], [0, 1, 0]])
xyz_size = torch.cat([x_size, z_size, y_size], dim=-1)
elif src == Coord3DMode.LIDAR and dst == Coord3DMode.DEPTH:
if rt_mat is None:
rt_mat = arr.new_tensor([[0, -1, 0], [1, 0, 0], [0, 0, 1]])
xyz_size = torch.cat([y_size, x_size, z_size], dim=-1)
elif src == Coord3DMode.DEPTH and dst == Coord3DMode.LIDAR:
if rt_mat is None:
rt_mat = arr.new_tensor([[0, 1, 0], [-1, 0, 0], [0, 0, 1]])
xyz_size = torch.cat([y_size, x_size, z_size], dim=-1)
else:
raise NotImplementedError(
f'Conversion from Coord3DMode {src} to {dst} '
'is not supported yet')
if not isinstance(rt_mat, torch.Tensor):
rt_mat = arr.new_tensor(rt_mat)
if rt_mat.size(1) == 4:
extended_xyz = torch.cat(
[arr[:, :3], arr.new_ones(arr.size(0), 1)], dim=-1)
xyz = extended_xyz @ rt_mat.t()
else:
xyz = arr[:, :3] @ rt_mat.t()
remains = arr[..., 6:]
arr = torch.cat([xyz[:, :3], xyz_size, remains], dim=-1)
# convert arr to the original type
original_type = type(box)
if single_box:
return original_type(arr.flatten().tolist())
if is_numpy:
return arr.numpy()
elif is_Instance3DBoxes:
if dst == Coord3DMode.CAM:
target_type = CameraInstance3DBoxes
elif dst == Coord3DMode.LIDAR:
target_type = LiDARInstance3DBoxes
elif dst == Coord3DMode.DEPTH:
target_type = DepthInstance3DBoxes
else:
raise NotImplementedError(
f'Conversion to {dst} through {original_type}'
' is not supported yet')
return target_type(
arr, box_dim=arr.size(-1), with_yaw=box.with_yaw)
else:
return arr
@staticmethod
def convert_point(point, src, dst, rt_mat=None):
"""Convert points from `src` mode to `dst` mode.
Args:
point (tuple | list | np.ndarray |
torch.Tensor | BasePoints):
Can be a k-tuple, k-list or an Nxk array/tensor.
src (:obj:`CoordMode`): The src Point mode.
dst (:obj:`CoordMode`): The target Point mode.
rt_mat (np.ndarray | torch.Tensor): The rotation and translation
matrix between different coordinates. Defaults to None.
The conversion from `src` coordinates to `dst` coordinates
usually comes along the change of sensors, e.g., from camera
to LiDAR. This requires a transformation matrix.
Returns:
(tuple | list | np.ndarray | torch.Tensor | BasePoints): \
The converted point of the same type.
"""
if src == dst:
return point
is_numpy = isinstance(point, np.ndarray)
is_InstancePoints = isinstance(point, BasePoints)
single_point = isinstance(point, (list, tuple))
if single_point:
assert len(point) >= 3, (
'CoordMode.convert takes either a k-tuple/list or '
'an Nxk array/tensor, where k >= 3')
arr = torch.tensor(point)[None, :]
else:
# avoid modifying the input point
if is_numpy:
arr = torch.from_numpy(np.asarray(point)).clone()
elif is_InstancePoints:
arr = point.tensor.clone()
else:
arr = point.clone()
# convert point from `src` mode to `dst` mode.
# TODO: a user-provided rt_mat is currently only honored for
# cam <-> depth conversions; LiDAR conversions use fixed matrices.
if src == Coord3DMode.LIDAR and dst == Coord3DMode.CAM:
rt_mat = arr.new_tensor([[0, -1, 0], [0, 0, -1], [1, 0, 0]])
elif src == Coord3DMode.CAM and dst == Coord3DMode.LIDAR:
rt_mat = arr.new_tensor([[0, 0, 1], [-1, 0, 0], [0, -1, 0]])
elif src == Coord3DMode.DEPTH and dst == Coord3DMode.CAM:
if rt_mat is None:
rt_mat = arr.new_tensor([[1, 0, 0], [0, 0, -1], [0, 1, 0]])
else:
rt_mat = rt_mat.new_tensor(
[[1, 0, 0], [0, 0, -1], [0, 1, 0]]) @ \
rt_mat.transpose(1, 0)
elif src == Coord3DMode.CAM and dst == Coord3DMode.DEPTH:
if rt_mat is None:
rt_mat = arr.new_tensor([[1, 0, 0], [0, 0, 1], [0, -1, 0]])
else:
rt_mat = rt_mat @ rt_mat.new_tensor([[1, 0, 0], [0, 0, 1],
[0, -1, 0]])
elif src == Coord3DMode.LIDAR and dst == Coord3DMode.DEPTH:
rt_mat = arr.new_tensor([[0, -1, 0], [1, 0, 0], [0, 0, 1]])
elif src == Coord3DMode.DEPTH and dst == Coord3DMode.LIDAR:
rt_mat = arr.new_tensor([[0, 1, 0], [-1, 0, 0], [0, 0, 1]])
else:
raise NotImplementedError(
f'Conversion from Coord3DMode {src} to {dst} '
'is not supported yet')
if rt_mat.size(1) == 4:
extended_xyz = torch.cat(
[arr[:, :3], arr.new_ones(arr.size(0), 1)], dim=-1)
xyz = extended_xyz @ rt_mat.t()
else:
xyz = arr[:, :3] @ rt_mat.t()
remains = arr[:, 3:]
arr = torch.cat([xyz[:, :3], remains], dim=-1)
# convert arr to the original type
original_type = type(point)
if single_point:
return original_type(arr.flatten().tolist())
if is_numpy:
return arr.numpy()
elif is_InstancePoints:
if dst == Coord3DMode.CAM:
target_type = CameraPoints
elif dst == Coord3DMode.LIDAR:
target_type = LiDARPoints
elif dst == Coord3DMode.DEPTH:
target_type = DepthPoints
else:
raise NotImplementedError(
f'Conversion to {dst} through {original_type}'
' is not supported yet')
return target_type(
arr,
points_dim=arr.size(-1),
attribute_dims=point.attribute_dims)
else:
return arr
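# Illustrative sketch (not part of the original file; the point values are
# hypothetical): converting a single point from Depth to camera coordinates
# with the default matrix above, (x, y, z) maps to (x, -z, y).
#   depth_point = [1.0, 2.0, 0.5]
#   cam_point = Coord3DMode.convert_point(
#       depth_point, Coord3DMode.DEPTH, Coord3DMode.CAM)
#   # cam_point is approximately [1.0, -0.5, 2.0]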
================================================
FILE: mmdet3d/core/bbox/structures/depth_box3d.py
================================================
import numpy as np
import torch
from mmdet3d.core.points import BasePoints
from mmdet3d.ops import points_in_boxes_batch
from .base_box3d import BaseInstance3DBoxes
from .utils import limit_period, rotation_3d_in_axis
class DepthInstance3DBoxes(BaseInstance3DBoxes):
"""3D boxes of instances in Depth coordinates.
Coordinates in Depth:
.. code-block:: none
up z y front (yaw=0.5*pi)
^ ^
| /
| /
0 ------> x right (yaw=0)
The relative coordinate of bottom center in a Depth box is (0.5, 0.5, 0),
and the yaw is around the z axis, thus the rotation axis=2.
The yaw is 0 at the positive direction of x axis, and increases from
the positive direction of x to the positive direction of y.
Attributes:
tensor (torch.Tensor): Float matrix of N x box_dim.
box_dim (int): Integer indicates the dimension of a box
Each row is (x, y, z, x_size, y_size, z_size, yaw, ...).
with_yaw (bool): If False, the value of yaw will be set to 0 as minmax
boxes.
"""
@property
def gravity_center(self):
"""torch.Tensor: A tensor with center of each box."""
bottom_center = self.bottom_center
gravity_center = torch.zeros_like(bottom_center)
gravity_center[:, :2] = bottom_center[:, :2]
gravity_center[:, 2] = bottom_center[:, 2] + self.tensor[:, 5] * 0.5
return gravity_center
@property
def corners(self):
"""torch.Tensor: Coordinates of corners of all the boxes
in shape (N, 8, 3).
Convert the boxes to corners in clockwise order, in form of
``(x0y0z0, x0y0z1, x0y1z1, x0y1z0, x1y0z0, x1y0z1, x1y1z1, x1y1z0)``
.. code-block:: none
up z
front y ^
/ |
/ |
(x0, y1, z1) + ----------- + (x1, y1, z1)
/| / |
/ | / |
(x0, y0, z1) + ----------- + + (x1, y1, z0)
| / . | /
| / origin | /
(x0, y0, z0) + ----------- + --------> right x
(x1, y0, z0)
"""
# TODO: rotation_3d_in_axis function does not support
# empty tensors currently.
assert len(self.tensor) != 0
dims = self.dims
corners_norm = torch.from_numpy(
np.stack(np.unravel_index(np.arange(8), [2] * 3), axis=1)).to(
device=dims.device, dtype=dims.dtype)
corners_norm = corners_norm[[0, 1, 3, 2, 4, 5, 7, 6]]
# use relative origin (0.5, 0.5, 0)
corners_norm = corners_norm - dims.new_tensor([0.5, 0.5, 0])
corners = dims.view([-1, 1, 3]) * corners_norm.reshape([1, 8, 3])
# rotate around z axis
corners = rotation_3d_in_axis(corners, self.tensor[:, 6], axis=2)
corners += self.tensor[:, :3].view(-1, 1, 3)
return corners
@property
def bev(self):
"""torch.Tensor: A n x 5 tensor of 2D BEV box of each box
in XYWHR format."""
return self.tensor[:, [0, 1, 3, 4, 6]]
@property
def nearest_bev(self):
"""torch.Tensor: A tensor of 2D BEV box of each box
without rotation."""
# Obtain BEV boxes with rotation in XYWHR format
bev_rotated_boxes = self.bev
# convert the rotation to a valid range
rotations = bev_rotated_boxes[:, -1]
normed_rotations = torch.abs(limit_period(rotations, 0.5, np.pi))
# find the center of boxes
conditions = (normed_rotations > np.pi / 4)[..., None]
bboxes_xywh = torch.where(conditions, bev_rotated_boxes[:,
[0, 1, 3, 2]],
bev_rotated_boxes[:, :4])
centers = bboxes_xywh[:, :2]
dims = bboxes_xywh[:, 2:]
bev_boxes = torch.cat([centers - dims / 2, centers + dims / 2], dim=-1)
return bev_boxes
def rotate(self, angle, points=None):
"""Rotate boxes with points (optional) with the given angle.
Args:
angle (float, torch.Tensor): Rotation angle.
points (torch.Tensor, numpy.ndarray, :obj:`BasePoints`, optional):
Points to rotate. Defaults to None.
Returns:
tuple or None: When ``points`` is None, the function returns \
None, otherwise it returns the rotated points and the \
rotation matrix ``rot_mat_T``.
"""
if not isinstance(angle, torch.Tensor):
angle = self.tensor.new_tensor(angle)
rot_sin = torch.sin(angle)
rot_cos = torch.cos(angle)
rot_mat_T = self.tensor.new_tensor([[rot_cos, -rot_sin, 0],
[rot_sin, rot_cos, 0], [0, 0,
1]]).T
self.tensor[:, 0:3] = self.tensor[:, 0:3] @ rot_mat_T
if self.with_yaw:
self.tensor[:, 6] -= angle
else:
corners_rot = self.corners @ rot_mat_T
new_x_size = corners_rot[..., 0].max(
dim=1, keepdim=True)[0] - corners_rot[..., 0].min(
dim=1, keepdim=True)[0]
new_y_size = corners_rot[..., 1].max(
dim=1, keepdim=True)[0] - corners_rot[..., 1].min(
dim=1, keepdim=True)[0]
self.tensor[:, 3:5] = torch.cat((new_x_size, new_y_size), dim=-1)
if points is not None:
if isinstance(points, torch.Tensor):
points[:, :3] = points[:, :3] @ rot_mat_T
elif isinstance(points, np.ndarray):
rot_mat_T = rot_mat_T.numpy()
points[:, :3] = np.dot(points[:, :3], rot_mat_T)
elif isinstance(points, BasePoints):
# anti-clockwise
points.rotate(angle)
else:
raise ValueError
return points, rot_mat_T
def flip(self, bev_direction='horizontal', points=None):
"""Flip the boxes in BEV along given BEV direction.
In Depth coordinates, it flips x (horizontal) or y (vertical) axis.
Args:
bev_direction (str): Flip direction (horizontal or vertical).
points (torch.Tensor, numpy.ndarray, :obj:`BasePoints`, None):
Points to flip. Defaults to None.
Returns:
torch.Tensor, numpy.ndarray or None: Flipped points.
"""
assert bev_direction in ('horizontal', 'vertical')
if bev_direction == 'horizontal':
self.tensor[:, 0::7] = -self.tensor[:, 0::7]
if self.with_yaw:
self.tensor[:, 6] = -self.tensor[:, 6] + np.pi
elif bev_direction == 'vertical':
self.tensor[:, 1::7] = -self.tensor[:, 1::7]
if self.with_yaw:
self.tensor[:, 6] = -self.tensor[:, 6]
if points is not None:
assert isinstance(points, (torch.Tensor, np.ndarray, BasePoints))
if isinstance(points, (torch.Tensor, np.ndarray)):
if bev_direction == 'horizontal':
points[:, 0] = -points[:, 0]
elif bev_direction == 'vertical':
points[:, 1] = -points[:, 1]
elif isinstance(points, BasePoints):
points.flip(bev_direction)
return points
def in_range_bev(self, box_range):
"""Check whether the boxes are in the given range.
Args:
box_range (list | torch.Tensor): The range of box
(x_min, y_min, x_max, y_max).
Note:
In the original implementation of SECOND, a box is considered
in range if its points lie in a convex polygon; here we reduce
the burden for simpler cases.
Returns:
torch.Tensor: Indicating whether each box is inside \
the reference range.
"""
in_range_flags = ((self.tensor[:, 0] > box_range[0])
& (self.tensor[:, 1] > box_range[1])
& (self.tensor[:, 0] < box_range[2])
& (self.tensor[:, 1] < box_range[3]))
return in_range_flags
def convert_to(self, dst, rt_mat=None):
"""Convert self to ``dst`` mode.
Args:
dst (:obj:`BoxMode`): The target Box mode.
rt_mat (np.ndarray | torch.Tensor): The rotation and translation
matrix between different coordinates. Defaults to None.
The conversion from ``src`` coordinates to ``dst`` coordinates
usually comes along the change of sensors, e.g., from camera
to LiDAR. This requires a transformation matrix.
Returns:
:obj:`DepthInstance3DBoxes`: \
The converted box of the same type in the ``dst`` mode.
"""
from .box_3d_mode import Box3DMode
return Box3DMode.convert(
box=self, src=Box3DMode.DEPTH, dst=dst, rt_mat=rt_mat)
def points_in_boxes(self, points):
"""Find points that are in boxes (CUDA).
Args:
points (torch.Tensor): Points in shape [1, M, 3] or [M, 3], \
3 dimensions are [x, y, z] in Depth coordinates; they are converted to LiDAR coordinates internally.
Returns:
torch.Tensor: The index of boxes each point lies in with shape \
of (B, M, T).
"""
from .box_3d_mode import Box3DMode
# to lidar
points_lidar = points.clone()
points_lidar = points_lidar[..., [1, 0, 2]]
points_lidar[..., 1] *= -1
if points.dim() == 2:
points_lidar = points_lidar.unsqueeze(0)
else:
assert points.dim() == 3 and points_lidar.shape[0] == 1
boxes_lidar = self.convert_to(Box3DMode.LIDAR).tensor
boxes_lidar = boxes_lidar.to(points.device).unsqueeze(0)
box_idxs_of_pts = points_in_boxes_batch(points_lidar, boxes_lidar)
return box_idxs_of_pts.squeeze(0)
def enlarged_box(self, extra_width):
"""Enlarge the length, width and height boxes.
Args:
extra_width (float | torch.Tensor): Extra width to enlarge the box.
Returns:
:obj:`DepthInstance3DBoxes`: Enlarged boxes.
"""
enlarged_boxes = self.tensor.clone()
enlarged_boxes[:, 3:6] += extra_width * 2
# bottom center z minus extra_width
enlarged_boxes[:, 2] -= extra_width
return self.new_box(enlarged_boxes)
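# Illustrative sketch (not part of the original file; values hypothetical):
#   boxes = DepthInstance3DBoxes(torch.tensor([[0., 0., 0., 2., 2., 1., 0.]]))
#   bigger = boxes.enlarged_box(0.2)
#   # dims become (2.4, 2.4, 1.4) and the bottom center z drops to -0.2,
#   # so the box is enlarged symmetrically in all directions.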
def get_surface_line_center(self):
"""Compute surface and line center of bounding boxes.
Returns:
tuple(torch.Tensor): Surface and line centers of the bounding boxes.
"""
obj_size = self.dims
center = self.gravity_center.view(-1, 1, 3)
batch_size = center.shape[0]
rot_sin = torch.sin(-self.yaw)
rot_cos = torch.cos(-self.yaw)
rot_mat_T = self.yaw.new_zeros(tuple(list(self.yaw.shape) + [3, 3]))
rot_mat_T[..., 0, 0] = rot_cos
rot_mat_T[..., 0, 1] = -rot_sin
rot_mat_T[..., 1, 0] = rot_sin
rot_mat_T[..., 1, 1] = rot_cos
rot_mat_T[..., 2, 2] = 1
# Get the object surface center
offset = obj_size.new_tensor([[0, 0, 1], [0, 0, -1], [0, 1, 0],
[0, -1, 0], [1, 0, 0], [-1, 0, 0]])
offset = offset.view(1, 6, 3) / 2
surface_3d = (offset *
obj_size.view(batch_size, 1, 3).repeat(1, 6, 1)).reshape(
-1, 3)
# Get the object line center
offset = obj_size.new_tensor([[1, 0, 1], [-1, 0, 1], [0, 1, 1],
[0, -1, 1], [1, 0, -1], [-1, 0, -1],
[0, 1, -1], [0, -1, -1], [1, 1, 0],
[1, -1, 0], [-1, 1, 0], [-1, -1, 0]])
offset = offset.view(1, 12, 3) / 2
line_3d = (offset *
obj_size.view(batch_size, 1, 3).repeat(1, 12, 1)).reshape(
-1, 3)
surface_rot = rot_mat_T.repeat(6, 1, 1)
surface_3d = torch.matmul(
surface_3d.unsqueeze(-2), surface_rot.transpose(2, 1)).squeeze(-2)
surface_center = center.repeat(1, 6, 1).reshape(-1, 3) + surface_3d
line_rot = rot_mat_T.repeat(12, 1, 1)
line_3d = torch.matmul(
line_3d.unsqueeze(-2), line_rot.transpose(2, 1)).squeeze(-2)
line_center = center.repeat(1, 12, 1).reshape(-1, 3) + line_3d
return surface_center, line_center
================================================
FILE: mmdet3d/core/bbox/structures/lidar_box3d.py
================================================
import numpy as np
import torch
from mmdet3d.core.points import BasePoints
from mmdet3d.ops.roiaware_pool3d import points_in_boxes_gpu
from .base_box3d import BaseInstance3DBoxes
from .utils import limit_period, rotation_3d_in_axis
class LiDARInstance3DBoxes(BaseInstance3DBoxes):
"""3D boxes of instances in LIDAR coordinates.
Coordinates in LiDAR:
.. code-block:: none
up z x front (yaw=0.5*pi)
^ ^
| /
| /
(yaw=pi) left y <------ 0
The relative coordinate of bottom center in a LiDAR box is (0.5, 0.5, 0),
and the yaw is around the z axis, thus the rotation axis=2.
The yaw is 0 at the negative direction of y axis, and increases from
the negative direction of y to the positive direction of x.
Attributes:
tensor (torch.Tensor): Float matrix of N x box_dim.
box_dim (int): Integer indicating the dimension of a box.
Each row is (x, y, z, x_size, y_size, z_size, yaw, ...).
with_yaw (bool): If False, the value of yaw will be set to 0 as minmax
boxes.
"""
@property
def gravity_center(self):
"""torch.Tensor: A tensor with center of each box."""
bottom_center = self.bottom_center
gravity_center = torch.zeros_like(bottom_center)
gravity_center[:, :2] = bottom_center[:, :2]
gravity_center[:, 2] = bottom_center[:, 2] + self.tensor[:, 5] * 0.5
return gravity_center
@property
def corners(self):
"""torch.Tensor: Coordinates of corners of all the boxes
in shape (N, 8, 3).
Convert the boxes to corners in clockwise order, in form of
``(x0y0z0, x0y0z1, x0y1z1, x0y1z0, x1y0z0, x1y0z1, x1y1z1, x1y1z0)``
.. code-block:: none
up z
front x ^
/ |
/ |
(x1, y0, z1) + ----------- + (x1, y1, z1)
/| / |
/ | / |
(x0, y0, z1) + ----------- + + (x1, y1, z0)
| / . | /
| / origin | /
left y<-------- + ----------- + (x0, y1, z0)
(x0, y0, z0)
"""
# TODO: rotation_3d_in_axis function does not support
# empty tensors currently.
assert len(self.tensor) != 0
dims = self.dims
corners_norm = torch.from_numpy(
np.stack(np.unravel_index(np.arange(8), [2] * 3), axis=1)).to(
device=dims.device, dtype=dims.dtype)
corners_norm = corners_norm[[0, 1, 3, 2, 4, 5, 7, 6]]
# use relative origin [0.5, 0.5, 0]
corners_norm = corners_norm - dims.new_tensor([0.5, 0.5, 0])
corners = dims.view([-1, 1, 3]) * corners_norm.reshape([1, 8, 3])
# rotate around z axis
corners = rotation_3d_in_axis(corners, self.tensor[:, 6], axis=2)
corners += self.tensor[:, :3].view(-1, 1, 3)
return corners
@property
def bev(self):
"""torch.Tensor: 2D BEV box of each box with rotation
in XYWHR format."""
return self.tensor[:, [0, 1, 3, 4, 6]]
@property
def nearest_bev(self):
"""torch.Tensor: A tensor of 2D BEV box of each box
without rotation."""
# Obtain BEV boxes with rotation in XYWHR format
bev_rotated_boxes = self.bev
# convert the rotation to a valid range
rotations = bev_rotated_boxes[:, -1]
normed_rotations = torch.abs(limit_period(rotations, 0.5, np.pi))
# find the center of boxes
conditions = (normed_rotations > np.pi / 4)[..., None]
bboxes_xywh = torch.where(conditions, bev_rotated_boxes[:,
[0, 1, 3, 2]],
bev_rotated_boxes[:, :4])
centers = bboxes_xywh[:, :2]
dims = bboxes_xywh[:, 2:]
bev_boxes = torch.cat([centers - dims / 2, centers + dims / 2], dim=-1)
return bev_boxes
def rotate(self, angle, points=None):
"""Rotate boxes with points (optional) with the given angle.
Args:
angle (float | torch.Tensor): Rotation angle.
points (torch.Tensor, numpy.ndarray, :obj:`BasePoints`, optional):
Points to rotate. Defaults to None.
Returns:
tuple or None: When ``points`` is None, the function returns \
None, otherwise it returns the rotated points and the \
rotation matrix ``rot_mat_T``.
"""
if not isinstance(angle, torch.Tensor):
angle = self.tensor.new_tensor(angle)
rot_sin = torch.sin(angle)
rot_cos = torch.cos(angle)
rot_mat_T = self.tensor.new_tensor([[rot_cos, -rot_sin, 0],
[rot_sin, rot_cos, 0], [0, 0, 1]])
self.tensor[:, :3] = self.tensor[:, :3] @ rot_mat_T
self.tensor[:, 6] += angle
if self.tensor.shape[1] == 9:
# rotate velo vector
self.tensor[:, 7:9] = self.tensor[:, 7:9] @ rot_mat_T[:2, :2]
if points is not None:
if isinstance(points, torch.Tensor):
points[:, :3] = points[:, :3] @ rot_mat_T
elif isinstance(points, np.ndarray):
rot_mat_T = rot_mat_T.numpy()
points[:, :3] = np.dot(points[:, :3], rot_mat_T)
elif isinstance(points, BasePoints):
# clockwise
points.rotate(-angle)
else:
raise ValueError
return points, rot_mat_T
def flip(self, bev_direction='horizontal', points=None):
"""Flip the boxes in BEV along given BEV direction.
In LIDAR coordinates, it flips the y (horizontal) or x (vertical) axis.
Args:
bev_direction (str): Flip direction (horizontal or vertical).
points (torch.Tensor, numpy.ndarray, :obj:`BasePoints`, None):
Points to flip. Defaults to None.
Returns:
torch.Tensor, numpy.ndarray or None: Flipped points.
"""
assert bev_direction in ('horizontal', 'vertical')
if bev_direction == 'horizontal':
self.tensor[:, 1::7] = -self.tensor[:, 1::7]
if self.with_yaw:
self.tensor[:, 6] = -self.tensor[:, 6] + np.pi
elif bev_direction == 'vertical':
self.tensor[:, 0::7] = -self.tensor[:, 0::7]
if self.with_yaw:
self.tensor[:, 6] = -self.tensor[:, 6]
if points is not None:
assert isinstance(points, (torch.Tensor, np.ndarray, BasePoints))
if isinstance(points, (torch.Tensor, np.ndarray)):
if bev_direction == 'horizontal':
points[:, 1] = -points[:, 1]
elif bev_direction == 'vertical':
points[:, 0] = -points[:, 0]
elif isinstance(points, BasePoints):
points.flip(bev_direction)
return points
def in_range_bev(self, box_range):
"""Check whether the boxes are in the given range.
Args:
box_range (list | torch.Tensor): the range of box
(x_min, y_min, x_max, y_max)
Note:
The original implementation of SECOND checks whether boxes are in
a range by checking whether the points are in a convex
polygon; here we reduce the burden for simpler cases.
Returns:
torch.Tensor: Whether each box is inside the reference range.
"""
in_range_flags = ((self.tensor[:, 0] > box_range[0])
& (self.tensor[:, 1] > box_range[1])
& (self.tensor[:, 0] < box_range[2])
& (self.tensor[:, 1] < box_range[3]))
return in_range_flags
def convert_to(self, dst, rt_mat=None):
"""Convert self to ``dst`` mode.
Args:
dst (:obj:`BoxMode`): the target Box mode
rt_mat (np.ndarray | torch.Tensor): The rotation and translation
matrix between different coordinates. Defaults to None.
The conversion from ``src`` coordinates to ``dst`` coordinates
usually comes along the change of sensors, e.g., from camera
to LiDAR. This requires a transformation matrix.
Returns:
:obj:`BaseInstance3DBoxes`: \
The converted box of the same type in the ``dst`` mode.
"""
from .box_3d_mode import Box3DMode
return Box3DMode.convert(
box=self, src=Box3DMode.LIDAR, dst=dst, rt_mat=rt_mat)
def enlarged_box(self, extra_width):
"""Enlarge the length, width and height boxes.
Args:
extra_width (float | torch.Tensor): Extra width to enlarge the box.
Returns:
:obj:`LiDARInstance3DBoxes`: Enlarged boxes.
"""
enlarged_boxes = self.tensor.clone()
enlarged_boxes[:, 3:6] += extra_width * 2
# bottom center z minus extra_width
enlarged_boxes[:, 2] -= extra_width
return self.new_box(enlarged_boxes)
def points_in_boxes(self, points):
"""Find the box which the points are in.
Args:
points (torch.Tensor): Points in shape (N, 3).
Returns:
torch.Tensor: The index of the box each point is in.
"""
box_idx = points_in_boxes_gpu(
points.unsqueeze(0),
self.tensor.unsqueeze(0).to(points.device)).squeeze(0)
return box_idx
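# --- Editor's note: a minimal usage sketch, not part of the upstream file. ---
# Assuming a (N, 7) tensor of (x, y, z, x_size, y_size, z_size, yaw) rows,
# the class can be exercised roughly as follows:
#   boxes = LiDARInstance3DBoxes(
#       torch.tensor([[0., 0., -1., 4., 2., 1.5, 0.]]))
#   boxes.gravity_center  # bottom center lifted by half the height (z = -0.25)
#   boxes.corners         # (1, 8, 3) corner coordinates
#   boxes.bev             # (1, 5) boxes in XYWHR format
#   boxes.rotate(0.1)     # rotates boxes in place (pass points to rotate them too)
# Note that ``corners`` asserts a non-empty tensor, and ``points_in_boxes``
# requires CUDA because it calls points_in_boxes_gpu.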
================================================
FILE: mmdet3d/core/bbox/structures/utils.py
================================================
import numpy as np
import torch
def limit_period(val, offset=0.5, period=np.pi):
"""Limit the value into a period for periodic function.
Args:
val (torch.Tensor): The value to be converted.
offset (float, optional): Offset to set the value range. \
Defaults to 0.5.
period ([type], optional): Period of the value. Defaults to np.pi.
Returns:
torch.Tensor: Value in the range of \
[-offset * period, (1-offset) * period]
"""
return val - torch.floor(val / period + offset) * period
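# Editor's note: a small worked example (not part of the upstream file).
# With the defaults (offset=0.5, period=np.pi), values are wrapped into
# [-0.5 * pi, 0.5 * pi); e.g. limit_period(torch.tensor([1.2 * np.pi]))
# gives approximately 0.2 * pi, and with period=2 * np.pi the result lies
# in [-pi, pi). This is what ``nearest_bev`` relies on when it normalizes
# box rotations before choosing the closer axis-aligned orientation.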
def rotation_3d_in_axis(points, angles, axis=0):
"""Rotate points by angles according to axis.
Args:
points (torch.Tensor): Points of shape (N, M, 3).
angles (torch.Tensor): Vector of angles in shape (N,)
axis (int, optional): The axis to be rotated. Defaults to 0.
Raises:
ValueError: when the axis is not in range [0, 1, 2], it will \
raise value error.
Returns:
torch.Tensor: Rotated points in shape (N, M, 3)
"""
rot_sin = torch.sin(angles)
rot_cos = torch.cos(angles)
ones = torch.ones_like(rot_cos)
zeros = torch.zeros_like(rot_cos)
if axis == 1:
rot_mat_T = torch.stack([
torch.stack([rot_cos, zeros, -rot_sin]),
torch.stack([zeros, ones, zeros]),
torch.stack([rot_sin, zeros, rot_cos])
])
elif axis == 2 or axis == -1:
rot_mat_T = torch.stack([
torch.stack([rot_cos, -rot_sin, zeros]),
torch.stack([rot_sin, rot_cos, zeros]),
torch.stack([zeros, zeros, ones])
])
elif axis == 0:
rot_mat_T = torch.stack([
torch.stack([zeros, rot_cos, -rot_sin]),
torch.stack([zeros, rot_sin, rot_cos]),
torch.stack([ones, zeros, zeros])
])
else:
raise ValueError(f'axis should in range [0, 1, 2], got {axis}')
return torch.einsum('aij,jka->aik', (points, rot_mat_T))
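# Editor's note: a small worked example (not part of the upstream file).
# ``points`` is (N, M, 3) and ``angles`` is (N,); the einsum multiplies each
# batch of M points by its own 3x3 matrix. For instance, with
#   points = torch.tensor([[[1., 0., 0.]]]) and
#   angles = torch.tensor([np.pi / 2]), axis=2,
# the result is approximately [[[0., -1., 0.]]], i.e. under this matrix
# convention a positive angle rotates points clockwise about the z axis.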
def xywhr2xyxyr(boxes_xywhr):
"""Convert a rotated boxes in XYWHR format to XYXYR format.
Args:
boxes_xywhr (torch.Tensor): Rotated boxes in XYWHR format.
Returns:
torch.Tensor: Converted boxes in XYXYR format.
"""
boxes = torch.zeros_like(boxes_xywhr)
half_w = boxes_xywhr[:, 2] / 2
half_h = boxes_xywhr[:, 3] / 2
boxes[:, 0] = boxes_xywhr[:, 0] - half_w
boxes[:, 1] = boxes_xywhr[:, 1] - half_h
boxes[:, 2] = boxes_xywhr[:, 0] + half_w
boxes[:, 3] = boxes_xywhr[:, 1] + half_h
boxes[:, 4] = boxes_xywhr[:, 4]
return boxes
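# Editor's note: a small worked example (not part of the upstream file).
# A box (cx=2, cy=3, w=4, h=2, r=0.5) in XYWHR format becomes
# (0, 2, 4, 4, 0.5) in XYXYR format: the center is expanded by half the
# width/height on each side while the rotation value is carried through.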
def get_box_type(box_type):
"""Get the type and mode of box structure.
Args:
box_type (str): The type of box structure.
The valid values are "LiDAR", "Camera", or "Depth".
Returns:
tuple: Box type and box mode.
"""
from .box_3d_mode import (Box3DMode, CameraInstance3DBoxes,
DepthInstance3DBoxes, LiDARInstance3DBoxes)
box_type_lower = box_type.lower()
if box_type_lower == 'lidar':
box_type_3d = LiDARInstance3DBoxes
box_mode_3d = Box3DMode.LIDAR
elif box_type_lower == 'camera':
box_type_3d = CameraInstance3DBoxes
box_mode_3d = Box3DMode.CAM
elif box_type_lower == 'depth':
box_type_3d = DepthInstance3DBoxes
box_mode_3d = Box3DMode.DEPTH
else:
raise ValueError('Only "box_type" of "camera", "lidar", "depth"'
f' are supported, got {box_type}')
return box_type_3d, box_mode_3d
def points_cam2img(points_3d, proj_mat):
"""Project points from camera coordicates to image coordinates.
Args:
points_3d (torch.Tensor): Points in shape (N, 3)
proj_mat (torch.Tensor): Transformation matrix between coordinates.
Returns:
torch.Tensor: Points in image coordinates with shape [N, 2].
"""
points_num = list(points_3d.shape)[:-1]
points_shape = np.concatenate([points_num, [1]], axis=0).tolist()
assert len(proj_mat.shape) == 2, 'The dimension of the projection ' \
f'matrix should be 2 instead of {len(proj_mat.shape)}.'
d1, d2 = proj_mat.shape[:2]
assert (d1 == 3 and d2 == 3) or (d1 == 3 and d2 == 4) or (
d1 == 4 and d2 == 4), f'The shape of the projection matrix'\
f' ({d1}*{d2}) is not supported.'
if d1 == 3:
proj_mat_expanded = torch.eye(
4, device=proj_mat.device, dtype=proj_mat.dtype)
proj_mat_expanded[:d1, :d2] = proj_mat
proj_mat = proj_mat_expanded
# the previous implementation used new_zeros; new_ones yields better results
points_4 = torch.cat(
[points_3d, points_3d.new_ones(*points_shape)], dim=-1)
point_2d = torch.matmul(points_4, proj_mat.t())
point_2d_res = point_2d[..., :2] / point_2d[..., 2:3]
return point_2d_res
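# Editor's note: a small worked example (not part of the upstream file).
# With a hypothetical 3x3 intrinsic matrix K = [[1000, 0, 960],
# [0, 1000, 540], [0, 0, 1]] and a camera-frame point (1, 2, 10),
# points_cam2img returns (1000 * 1 / 10 + 960, 1000 * 2 / 10 + 540)
# = (1060, 740). A 3x4 or 4x4 projection matrix is handled the same way
# after being padded to 4x4.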
================================================
FILE: mmdet3d/core/bbox/transforms.py
================================================
import torch
def bbox3d_mapping_back(bboxes, scale_factor, flip_horizontal, flip_vertical):
"""Map bboxes from testing scale to original image scale.
Args:
bboxes (:obj:`BaseInstance3DBoxes`): Boxes to be mapped back.
scale_factor (float): Scale factor.
flip_horizontal (bool): Whether to flip horizontally.
flip_vertical (bool): Whether to flip vertically.
Returns:
:obj:`BaseInstance3DBoxes`: Boxes mapped back.
"""
new_bboxes = bboxes.clone()
if flip_horizontal:
new_bboxes.flip('horizontal')
if flip_vertical:
new_bboxes.flip('vertical')
new_bboxes.scale(1 / scale_factor)
return new_bboxes
def bbox3d2roi(bbox_list):
"""Convert a list of bounding boxes to roi format.
Args:
bbox_list (list[torch.Tensor]): A list of bounding boxes
corresponding to a batch of images.
Returns:
torch.Tensor: Region of interests in shape (n, c), where \
the channels are in order of [batch_ind, x, y ...].
"""
rois_list = []
for img_id, bboxes in enumerate(bbox_list):
if bboxes.size(0) > 0:
img_inds = bboxes.new_full((bboxes.size(0), 1), img_id)
rois = torch.cat([img_inds, bboxes], dim=-1)
else:
rois = torch.zeros_like(bboxes)
rois_list.append(rois)
rois = torch.cat(rois_list, 0)
return rois
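# Editor's note: a small usage sketch (not part of the upstream file).
# Given per-image box tensors of shape (N_i, C), bbox3d2roi prepends a batch
# index column, e.g. two boxes from image 0 and one box from image 1 yield a
# (3, C + 1) tensor whose first column is [0, 0, 1].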
def bbox3d2result(bboxes, scores, labels):
"""Convert detection results to a list of numpy arrays.
Args:
bboxes (torch.Tensor): Bounding boxes with shape of (n, 5).
scores (torch.Tensor): Scores with shape of (n, ).
labels (torch.Tensor): Labels with shape of (n, ).
Returns:
dict[str, torch.Tensor]: Bounding box results in cpu mode.
- boxes_3d (torch.Tensor): 3D boxes.
- scores (torch.Tensor): Prediction scores.
- labels_3d (torch.Tensor): Box labels.
"""
return dict(
boxes_3d=bboxes.to('cpu'),
scores_3d=scores.cpu(),
labels_3d=labels.cpu())
================================================
FILE: mmdet3d/core/evaluation/__init__.py
================================================
from .indoor_eval import indoor_eval
from .kitti_utils import kitti_eval, kitti_eval_coco_style
from .lyft_eval import lyft_eval
from .seg_eval import seg_eval
__all__ = [
'kitti_eval_coco_style', 'kitti_eval', 'indoor_eval', 'lyft_eval',
'seg_eval'
]
================================================
FILE: mmdet3d/core/evaluation/indoor_eval.py
================================================
import numpy as np
import torch
from mmcv.utils import print_log
from terminaltables import AsciiTable
def average_precision(recalls, precisions, mode='area'):
"""Calculate average precision (for single or multiple scales).
Args:
recalls (np.ndarray): Recalls with shape of (num_scales, num_dets) \
or (num_dets, ).
precisions (np.ndarray): Precisions with shape of \
(num_scales, num_dets) or (num_dets, ).
mode (str): 'area' or '11points', 'area' means calculating the area
under precision-recall curve, '11points' means calculating
the average precision of recalls at [0, 0.1, ..., 1]
Returns:
float or np.ndarray: Calculated average precision.
"""
if recalls.ndim == 1:
recalls = recalls[np.newaxis, :]
precisions = precisions[np.newaxis, :]
assert recalls.shape == precisions.shape
assert recalls.ndim == 2
num_scales = recalls.shape[0]
ap = np.zeros(num_scales, dtype=np.float32)
if mode == 'area':
zeros = np.zeros((num_scales, 1), dtype=recalls.dtype)
ones = np.ones((num_scales, 1), dtype=recalls.dtype)
mrec = np.hstack((zeros, recalls, ones))
mpre = np.hstack((zeros, precisions, zeros))
for i in range(mpre.shape[1] - 1, 0, -1):
mpre[:, i - 1] = np.maximum(mpre[:, i - 1], mpre[:, i])
for i in range(num_scales):
ind = np.where(mrec[i, 1:] != mrec[i, :-1])[0]
ap[i] = np.sum(
(mrec[i, ind + 1] - mrec[i, ind]) * mpre[i, ind + 1])
elif mode == '11points':
for i in range(num_scales):
for thr in np.arange(0, 1 + 1e-3, 0.1):
precs = precisions[i, recalls[i, :] >= thr]
prec = precs.max() if precs.size > 0 else 0
ap[i] += prec
ap /= 11
else:
raise ValueError(
'Unrecognized mode, only "area" and "11points" are supported')
return ap
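# Editor's note: a small worked example (not part of the upstream file).
# In 'area' mode, recalls = [0.5, 1.0] with precisions = [1.0, 0.5] gives
# AP = 0.5 * 1.0 + 0.5 * 0.5 = 0.75: the precision envelope is made
# monotonically non-increasing and integrated over recall.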
def eval_det_cls(pred, gt, iou_thr=None):
"""Generic functions to compute precision/recall for object detection for a
single class.
Args:
pred (dict): Predictions mapping from image id to bounding boxes \
and scores.
gt (dict): Ground truths mapping from image id to bounding boxes.
iou_thr (list[float]): A list of iou thresholds.
Return:
tuple (np.ndarray, np.ndarray, float): Recalls, precisions and \
average precision.
"""
# {img_id: {'bbox': box structure, 'det': matched list}}
class_recs = {}
npos = 0
for img_id in gt.keys():
cur_gt_num = len(gt[img_id])
if cur_gt_num != 0:
gt_cur = torch.zeros([cur_gt_num, 7], dtype=torch.float32)
for i in range(cur_gt_num):
gt_cur[i] = gt[img_id][i].tensor
bbox = gt[img_id][0].new_box(gt_cur)
else:
bbox = gt[img_id]
det = [[False] * len(bbox) for i in iou_thr]
npos += len(bbox)
class_recs[img_id] = {'bbox': bbox, 'det': det}
# construct dets
image_ids = []
confidence = []
ious = []
for img_id in pred.keys():
cur_num = len(pred[img_id])
if cur_num == 0:
continue
pred_cur = torch.zeros((cur_num, 7), dtype=torch.float32)
box_idx = 0
for box, score in pred[img_id]:
image_ids.append(img_id)
confidence.append(score)
pred_cur[box_idx] = box.tensor
box_idx += 1
pred_cur = box.new_box(pred_cur)
gt_cur = class_recs[img_id]['bbox']
if len(gt_cur) > 0:
# calculate iou in each image
iou_cur = pred_cur.overlaps(pred_cur, gt_cur)
for i in range(cur_num):
ious.append(iou_cur[i])
else:
for i in range(cur_num):
ious.append(np.zeros(1))
confidence = np.array(confidence)
# sort by confidence
sorted_ind = np.argsort(-confidence)
image_ids = [image_ids[x] for x in sorted_ind]
ious = [ious[x] for x in sorted_ind]
# go down dets and mark TPs and FPs
nd = len(image_ids)
tp_thr = [np.zeros(nd) for i in iou_thr]
fp_thr = [np.zeros(nd) for i in iou_thr]
for d in range(nd):
R = class_recs[image_ids[d]]
iou_max = -np.inf
BBGT = R['bbox']
cur_iou = ious[d]
if len(BBGT) > 0:
# compute overlaps
for j in range(len(BBGT)):
# iou = get_iou_main(get_iou_func, (bb, BBGT[j,...]))
iou = cur_iou[j]
if iou > iou_max:
iou_max = iou
jmax = j
for iou_idx, thresh in enumerate(iou_thr):
if iou_max > thresh:
if not R['det'][iou_idx][jmax]:
tp_thr[iou_idx][d] = 1.
R['det'][iou_idx][jmax] = 1
else:
fp_thr[iou_idx][d] = 1.
else:
fp_thr[iou_idx][d] = 1.
ret = []
for iou_idx, thresh in enumerate(iou_thr):
# compute precision recall
fp = np.cumsum(fp_thr[iou_idx])
tp = np.cumsum(tp_thr[iou_idx])
recall = tp / float(npos)
# avoid divide by zero in case the first detection matches a difficult
# ground truth
precision = tp / np.maximum(tp + fp, np.finfo(np.float64).eps)
ap = average_precision(recall, precision)
ret.append((recall, precision, ap))
return ret
def eval_map_recall(pred, gt, ovthresh=None):
"""Evaluate mAP and recall.
Generic functions to compute precision/recall for object detection
for multiple classes.
Args:
pred (dict): Information of detection results,
which maps class_id and predictions.
gt (dict): Information of ground truths, which maps class_id and \
ground truths.
ovthresh (list[float]): iou threshold.
Default: None.
Return:
tuple[dict]: dict results of recall, AP, and precision for all classes.
"""
ret_values = {}
for classname in gt.keys():
if classname in pred:
ret_values[classname] = eval_det_cls(pred[classname],
gt[classname], ovthresh)
recall = [{} for i in ovthresh]
precision = [{} for i in ovthresh]
ap = [{} for i in ovthresh]
for label in gt.keys():
for iou_idx, thresh in enumerate(ovthresh):
if label in pred:
recall[iou_idx][label], precision[iou_idx][label], ap[iou_idx][
label] = ret_values[label][iou_idx]
else:
recall[iou_idx][label] = np.zeros(1)
precision[iou_idx][label] = np.zeros(1)
ap[iou_idx][label] = np.zeros(1)
return recall, precision, ap
def indoor_eval(gt_annos,
dt_annos,
metric,
label2cat,
logger=None,
box_type_3d=None,
box_mode_3d=None):
"""Indoor Evaluation.
Evaluate the result of the detection.
Args:
gt_annos (list[dict]): Ground truth annotations.
dt_annos (list[dict]): Detection annotations. Each dict
includes the following keys:
- labels_3d (torch.Tensor): Labels of boxes.
- boxes_3d (:obj:`BaseInstance3DBoxes`): \
3D bounding boxes in Depth coordinate.
- scores_3d (torch.Tensor): Scores of boxes.
metric (list[float]): IoU thresholds for computing average precisions.
label2cat (dict): Map from label to category.
logger (logging.Logger | str | None): The way to print the mAP
summary. See `mmdet.utils.print_log()` for details. Default: None.
Return:
dict[str, float]: Dict of results.
"""
assert len(dt_annos) == len(gt_annos)
pred = {} # map {class_id: pred}
gt = {} # map {class_id: gt}
for img_id in range(len(dt_annos)):
# parse detected annotations
det_anno = dt_annos[img_id]
for i in range(len(det_anno['labels_3d'])):
label = det_anno['labels_3d'].numpy()[i]
bbox = det_anno['boxes_3d'].convert_to(box_mode_3d)[i]
score = det_anno['scores_3d'].numpy()[i]
if label not in pred:
pred[int(label)] = {}
if img_id not in pred[label]:
pred[int(label)][img_id] = []
if label not in gt:
gt[int(label)] = {}
if img_id not in gt[label]:
gt[int(label)][img_id] = []
pred[int(label)][img_id].append((bbox, score))
# parse gt annotations
gt_anno = gt_annos[img_id]
if gt_anno['gt_num'] != 0:
gt_boxes = box_type_3d(
gt_anno['gt_boxes_upright_depth'],
box_dim=gt_anno['gt_boxes_upright_depth'].shape[-1],
origin=(0.5, 0.5, 0.5)).convert_to(box_mode_3d)
labels_3d = gt_anno['class']
else:
gt_boxes = box_type_3d(np.array([], dtype=np.float32))
labels_3d = np.array([], dtype=np.int64)
for i in range(len(labels_3d)):
label = labels_3d[i]
bbox = gt_boxes[i]
if label not in gt:
gt[label] = {}
if img_id not in gt[label]:
gt[label][img_id] = []
gt[label][img_id].append(bbox)
rec, prec, ap = eval_map_recall(pred, gt, metric)
ret_dict = dict()
header = ['classes']
table_columns = [[label2cat[label]
for label in ap[0].keys()] + ['Overall']]
for i, iou_thresh in enumerate(metric):
header.append(f'AP_{iou_thresh:.2f}')
header.append(f'AR_{iou_thresh:.2f}')
rec_list = []
for label in ap[i].keys():
ret_dict[f'{label2cat[label]}_AP_{iou_thresh:.2f}'] = float(
ap[i][label][0])
ret_dict[f'mAP_{iou_thresh:.2f}'] = float(
np.mean(list(ap[i].values())))
table_columns.append(list(map(float, list(ap[i].values()))))
table_columns[-1] += [ret_dict[f'mAP_{iou_thresh:.2f}']]
table_columns[-1] = [f'{x:.4f}' for x in table_columns[-1]]
for label in rec[i].keys():
ret_dict[f'{label2cat[label]}_rec_{iou_thresh:.2f}'] = float(
rec[i][label][-1])
rec_list.append(rec[i][label][-1])
ret_dict[f'mAR_{iou_thresh:.2f}'] = float(np.mean(rec_list))
table_columns.append(list(map(float, rec_list)))
table_columns[-1] += [ret_dict[f'mAR_{iou_thresh:.2f}']]
table_columns[-1] = [f'{x:.4f}' for x in table_columns[-1]]
table_data = [header]
table_rows = list(zip(*table_columns))
table_data += table_rows
table = AsciiTable(table_data)
table.inner_footing_row_border = True
print_log('\n' + table.table, logger=logger)
return ret_dict
================================================
FILE: mmdet3d/core/evaluation/kitti_utils/__init__.py
================================================
from .eval import kitti_eval, kitti_eval_coco_style
__all__ = ['kitti_eval', 'kitti_eval_coco_style']
================================================
FILE: mmdet3d/core/evaluation/kitti_utils/eval.py
================================================
import gc
import io as sysio
import numba
import numpy as np
@numba.jit
def get_thresholds(scores: np.ndarray, num_gt, num_sample_pts=41):
scores.sort()
scores = scores[::-1]
current_recall = 0
thresholds = []
for i, score in enumerate(scores):
l_recall = (i + 1) / num_gt
if i < (len(scores) - 1):
r_recall = (i + 2) / num_gt
else:
r_recall = l_recall
if (((r_recall - current_recall) < (current_recall - l_recall))
and (i < (len(scores) - 1))):
continue
# recall = l_recall
thresholds.append(score)
current_recall += 1 / (num_sample_pts - 1.0)
return thresholds
def clean_data(gt_anno, dt_anno, current_class, difficulty):
CLASS_NAMES = ['car', 'pedestrian', 'cyclist']
MIN_HEIGHT = [40, 25, 25]
MAX_OCCLUSION = [0, 1, 2]
MAX_TRUNCATION = [0.15, 0.3, 0.5]
dc_bboxes, ignored_gt, ignored_dt = [], [], []
current_cls_name = CLASS_NAMES[current_class].lower()
num_gt = len(gt_anno['name'])
num_dt = len(dt_anno['name'])
num_valid_gt = 0
for i in range(num_gt):
bbox = gt_anno['bbox'][i]
gt_name = gt_anno['name'][i].lower()
height = bbox[3] - bbox[1]
valid_class = -1
if (gt_name == current_cls_name):
valid_class = 1
elif (current_cls_name == 'Pedestrian'.lower()
and 'Person_sitting'.lower() == gt_name):
valid_class = 0
elif (current_cls_name == 'Car'.lower() and 'Van'.lower() == gt_name):
valid_class = 0
else:
valid_class = -1
ignore = False
if ((gt_anno['occluded'][i] > MAX_OCCLUSION[difficulty])
or (gt_anno['truncated'][i] > MAX_TRUNCATION[difficulty])
or (height <= MIN_HEIGHT[difficulty])):
ignore = True
if valid_class == 1 and not ignore:
ignored_gt.append(0)
num_valid_gt += 1
elif (valid_class == 0 or (ignore and (valid_class == 1))):
ignored_gt.append(1)
else:
ignored_gt.append(-1)
# for i in range(num_gt):
if gt_anno['name'][i] == 'DontCare':
dc_bboxes.append(gt_anno['bbox'][i])
for i in range(num_dt):
if (dt_anno['name'][i].lower() == current_cls_name):
valid_class = 1
else:
valid_class = -1
height = abs(dt_anno['bbox'][i, 3] - dt_anno['bbox'][i, 1])
if height < MIN_HEIGHT[difficulty]:
ignored_dt.append(1)
elif valid_class == 1:
ignored_dt.append(0)
else:
ignored_dt.append(-1)
return num_valid_gt, ignored_gt, ignored_dt, dc_bboxes
@numba.jit(nopython=True)
def image_box_overlap(boxes, query_boxes, criterion=-1):
N = boxes.shape[0]
K = query_boxes.shape[0]
overlaps = np.zeros((N, K), dtype=boxes.dtype)
for k in range(K):
qbox_area = ((query_boxes[k, 2] - query_boxes[k, 0]) *
(query_boxes[k, 3] - query_boxes[k, 1]))
for n in range(N):
iw = (
min(boxes[n, 2], query_boxes[k, 2]) -
max(boxes[n, 0], query_boxes[k, 0]))
if iw > 0:
ih = (
min(boxes[n, 3], query_boxes[k, 3]) -
max(boxes[n, 1], query_boxes[k, 1]))
if ih > 0:
if criterion == -1:
ua = ((boxes[n, 2] - boxes[n, 0]) *
(boxes[n, 3] - boxes[n, 1]) + qbox_area -
iw * ih)
elif criterion == 0:
ua = ((boxes[n, 2] - boxes[n, 0]) *
(boxes[n, 3] - boxes[n, 1]))
elif criterion == 1:
ua = qbox_area
else:
ua = 1.0
overlaps[n, k] = iw * ih / ua
return overlaps
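# Editor's note: a small worked example (not part of the upstream file).
# For axis-aligned image boxes [0, 0, 2, 2] and [1, 1, 3, 3] the intersection
# is 1 x 1 = 1 and, with the default criterion=-1 (union), the overlap is
# 1 / (4 + 4 - 1) = 1/7. criterion=0 and criterion=1 divide by the first or
# second box area instead.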
def bev_box_overlap(boxes, qboxes, criterion=-1):
from .rotate_iou import rotate_iou_gpu_eval
riou = rotate_iou_gpu_eval(boxes, qboxes, criterion)
return riou
@numba.jit(nopython=True, parallel=True)
def d3_box_overlap_kernel(boxes, qboxes, rinc, criterion=-1):
# ONLY supports overlap in CAMERA coordinates, not LiDAR.
# TODO: change to use prange for parallel mode; should check the difference
N, K = boxes.shape[0], qboxes.shape[0]
for i in numba.prange(N):
for j in numba.prange(K):
if rinc[i, j] > 0:
# iw = (min(boxes[i, 1] + boxes[i, 4], qboxes[j, 1] +
# qboxes[j, 4]) - max(boxes[i, 1], qboxes[j, 1]))
iw = (
min(boxes[i, 1], qboxes[j, 1]) -
max(boxes[i, 1] - boxes[i, 4],
qboxes[j, 1] - qboxes[j, 4]))
if iw > 0:
area1 = boxes[i, 3] * boxes[i, 4] * boxes[i, 5]
area2 = qboxes[j, 3] * qboxes[j, 4] * qboxes[j, 5]
inc = iw * rinc[i, j]
if criterion == -1:
ua = (area1 + area2 - inc)
elif criterion == 0:
ua = area1
elif criterion == 1:
ua = area2
else:
ua = inc
rinc[i, j] = inc / ua
else:
rinc[i, j] = 0.0
def d3_box_overlap(boxes, qboxes, criterion=-1):
from .rotate_iou import rotate_iou_gpu_eval
rinc = rotate_iou_gpu_eval(boxes[:, [0, 2, 3, 5, 6]],
qboxes[:, [0, 2, 3, 5, 6]], 2)
d3_box_overlap_kernel(boxes, qboxes, rinc, criterion)
return rinc
@numba.jit(nopython=True)
def compute_statistics_jit(overlaps,
gt_datas,
dt_datas,
ignored_gt,
ignored_det,
dc_bboxes,
metric,
min_overlap,
thresh=0,
compute_fp=False,
compute_aos=False):
det_size = dt_datas.shape[0]
gt_size = gt_datas.shape[0]
dt_scores = dt_datas[:, -1]
dt_alphas = dt_datas[:, 4]
gt_alphas = gt_datas[:, 4]
dt_bboxes = dt_datas[:, :4]
# gt_bboxes = gt_datas[:, :4]
assigned_detection = [False] * det_size
ignored_threshold = [False] * det_size
if compute_fp:
for i in range(det_size):
if (dt_scores[i] < thresh):
ignored_threshold[i] = True
NO_DETECTION = -10000000
tp, fp, fn, similarity = 0, 0, 0, 0
# thresholds = [0.0]
# delta = [0.0]
thresholds = np.zeros((gt_size, ))
thresh_idx = 0
delta = np.zeros((gt_size, ))
delta_idx = 0
for i in range(gt_size):
if ignored_gt[i] == -1:
continue
det_idx = -1
valid_detection = NO_DETECTION
max_overlap = 0
assigned_ignored_det = False
for j in range(det_size):
if (ignored_det[j] == -1):
continue
if (assigned_detection[j]):
continue
if (ignored_threshold[j]):
continue
overlap = overlaps[j, i]
dt_score = dt_scores[j]
if (not compute_fp and (overlap > min_overlap)
and dt_score > valid_detection):
det_idx = j
valid_detection = dt_score
elif (compute_fp and (overlap > min_overlap)
and (overlap > max_overlap or assigned_ignored_det)
and ignored_det[j] == 0):
max_overlap = overlap
det_idx = j
valid_detection = 1
assigned_ignored_det = False
elif (compute_fp and (overlap > min_overlap)
and (valid_detection == NO_DETECTION)
and ignored_det[j] == 1):
det_idx = j
valid_detection = 1
assigned_ignored_det = True
if (valid_detection == NO_DETECTION) and ignored_gt[i] == 0:
fn += 1
elif ((valid_detection != NO_DETECTION)
and (ignored_gt[i] == 1 or ignored_det[det_idx] == 1)):
assigned_detection[det_idx] = True
elif valid_detection != NO_DETECTION:
tp += 1
# thresholds.append(dt_scores[det_idx])
thresholds[thresh_idx] = dt_scores[det_idx]
thresh_idx += 1
if compute_aos:
# delta.append(gt_alphas[i] - dt_alphas[det_idx])
delta[delta_idx] = gt_alphas[i] - dt_alphas[det_idx]
delta_idx += 1
assigned_detection[det_idx] = True
if compute_fp:
for i in range(det_size):
if (not (assigned_detection[i] or ignored_det[i] == -1
or ignored_det[i] == 1 or ignored_threshold[i])):
fp += 1
nstuff = 0
if metric == 0:
overlaps_dt_dc = image_box_overlap(dt_bboxes, dc_bboxes, 0)
for i in range(dc_bboxes.shape[0]):
for j in range(det_size):
if (assigned_detection[j]):
continue
if (ignored_det[j] == -1 or ignored_det[j] == 1):
continue
if (ignored_threshold[j]):
continue
if overlaps_dt_dc[j, i] > min_overlap:
assigned_detection[j] = True
nstuff += 1
fp -= nstuff
if compute_aos:
tmp = np.zeros((fp + delta_idx, ))
# tmp = [0] * fp
for i in range(delta_idx):
tmp[i + fp] = (1.0 + np.cos(delta[i])) / 2.0
# tmp.append((1.0 + np.cos(delta[i])) / 2.0)
# assert len(tmp) == fp + tp
# assert len(delta) == tp
if tp > 0 or fp > 0:
similarity = np.sum(tmp)
else:
similarity = -1
return tp, fp, fn, similarity, thresholds[:thresh_idx]
def get_split_parts(num, num_part):
same_part = num // num_part
remain_num = num % num_part
if remain_num == 0:
return [same_part] * num_part
else:
return [same_part] * num_part + [remain_num]
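# Editor's note: a small worked example (not part of the upstream file).
# get_split_parts(10, 3) returns [3, 3, 3, 1] and get_split_parts(8, 4)
# returns [2, 2, 2, 2]; the parts are later used to slice gt/dt annotations
# into chunks for the partitioned IoU computation.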
@numba.jit(nopython=True)
def fused_compute_statistics(overlaps,
pr,
gt_nums,
dt_nums,
dc_nums,
gt_datas,
dt_datas,
dontcares,
ignored_gts,
ignored_dets,
metric,
min_overlap,
thresholds,
compute_aos=False):
gt_num = 0
dt_num = 0
dc_num = 0
for i in range(gt_nums.shape[0]):
for t, thresh in enumerate(thresholds):
overlap = overlaps[dt_num:dt_num + dt_nums[i],
gt_num:gt_num + gt_nums[i]]
gt_data = gt_datas[gt_num:gt_num + gt_nums[i]]
dt_data = dt_datas[dt_num:dt_num + dt_nums[i]]
ignored_gt = ignored_gts[gt_num:gt_num + gt_nums[i]]
ignored_det = ignored_dets[dt_num:dt_num + dt_nums[i]]
dontcare = dontcares[dc_num:dc_num + dc_nums[i]]
tp, fp, fn, similarity, _ = compute_statistics_jit(
overlap,
gt_data,
dt_data,
ignored_gt,
ignored_det,
dontcare,
metric,
min_overlap=min_overlap,
thresh=thresh,
compute_fp=True,
compute_aos=compute_aos)
pr[t, 0] += tp
pr[t, 1] += fp
pr[t, 2] += fn
if similarity != -1:
pr[t, 3] += similarity
gt_num += gt_nums[i]
dt_num += dt_nums[i]
dc_num += dc_nums[i]
def calculate_iou_partly(gt_annos, dt_annos, metric, num_parts=50):
"""Fast iou algorithm. this function can be used independently to do result
analysis. Must be used in CAMERA coordinate system.
Args:
gt_annos (dict): Must from get_label_annos() in kitti_common.py.
dt_annos (dict): Must from get_label_annos() in kitti_common.py.
metric (int): Eval type. 0: bbox, 1: bev, 2: 3d.
num_parts (int): A parameter for fast calculate algorithm.
"""
assert len(gt_annos) == len(dt_annos)
total_dt_num = np.stack([len(a['name']) for a in dt_annos], 0)
total_gt_num = np.stack([len(a['name']) for a in gt_annos], 0)
num_examples = len(gt_annos)
split_parts = get_split_parts(num_examples, num_parts)
parted_overlaps = []
example_idx = 0
for num_part in split_parts:
gt_annos_part = gt_annos[example_idx:example_idx + num_part]
dt_annos_part = dt_annos[example_idx:example_idx + num_part]
if metric == 0:
gt_boxes = np.concatenate([a['bbox'] for a in gt_annos_part], 0)
dt_boxes = np.concatenate([a['bbox'] for a in dt_annos_part], 0)
overlap_part = image_box_overlap(gt_boxes, dt_boxes)
elif metric == 1:
loc = np.concatenate(
[a['location'][:, [0, 2]] for a in gt_annos_part], 0)
dims = np.concatenate(
[a['dimensions'][:, [0, 2]] for a in gt_annos_part], 0)
rots = np.concatenate([a['rotation_y'] for a in gt_annos_part], 0)
gt_boxes = np.concatenate([loc, dims, rots[..., np.newaxis]],
axis=1)
loc = np.concatenate(
[a['location'][:, [0, 2]] for a in dt_annos_part], 0)
dims = np.concatenate(
[a['dimensions'][:, [0, 2]] for a in dt_annos_part], 0)
rots = np.concatenate([a['rotation_y'] for a in dt_annos_part], 0)
dt_boxes = np.concatenate([loc, dims, rots[..., np.newaxis]],
axis=1)
overlap_part = bev_box_overlap(gt_boxes,
dt_boxes).astype(np.float64)
elif metric == 2:
loc = np.concatenate([a['location'] for a in gt_annos_part], 0)
dims = np.concatenate([a['dimensions'] for a in gt_annos_part], 0)
rots = np.concatenate([a['rotation_y'] for a in gt_annos_part], 0)
gt_boxes = np.concatenate([loc, dims, rots[..., np.newaxis]],
axis=1)
loc = np.concatenate([a['location'] for a in dt_annos_part], 0)
dims = np.concatenate([a['dimensions'] for a in dt_annos_part], 0)
rots = np.concatenate([a['rotation_y'] for a in dt_annos_part], 0)
dt_boxes = np.concatenate([loc, dims, rots[..., np.newaxis]],
axis=1)
overlap_part = d3_box_overlap(gt_boxes,
dt_boxes).astype(np.float64)
else:
raise ValueError('unknown metric')
parted_overlaps.append(overlap_part)
example_idx += num_part
overlaps = []
example_idx = 0
for j, num_part in enumerate(split_parts):
gt_annos_part = gt_annos[example_idx:example_idx + num_part]
dt_annos_part = dt_annos[example_idx:example_idx + num_part]
gt_num_idx, dt_num_idx = 0, 0
for i in range(num_part):
gt_box_num = total_gt_num[example_idx + i]
dt_box_num = total_dt_num[example_idx + i]
overlaps.append(
parted_overlaps[j][gt_num_idx:gt_num_idx + gt_box_num,
dt_num_idx:dt_num_idx + dt_box_num])
gt_num_idx += gt_box_num
dt_num_idx += dt_box_num
example_idx += num_part
return overlaps, parted_overlaps, total_gt_num, total_dt_num
def _prepare_data(gt_annos, dt_annos, current_class, difficulty):
gt_datas_list = []
dt_datas_list = []
total_dc_num = []
ignored_gts, ignored_dets, dontcares = [], [], []
total_num_valid_gt = 0
for i in range(len(gt_annos)):
rets = clean_data(gt_annos[i], dt_annos[i], current_class, difficulty)
num_valid_gt, ignored_gt, ignored_det, dc_bboxes = rets
ignored_gts.append(np.array(ignored_gt, dtype=np.int64))
ignored_dets.append(np.array(ignored_det, dtype=np.int64))
if len(dc_bboxes) == 0:
dc_bboxes = np.zeros((0, 4)).astype(np.float64)
else:
dc_bboxes = np.stack(dc_bboxes, 0).astype(np.float64)
total_dc_num.append(dc_bboxes.shape[0])
dontcares.append(dc_bboxes)
total_num_valid_gt += num_valid_gt
gt_datas = np.concatenate(
[gt_annos[i]['bbox'], gt_annos[i]['alpha'][..., np.newaxis]], 1)
dt_datas = np.concatenate([
dt_annos[i]['bbox'], dt_annos[i]['alpha'][..., np.newaxis],
dt_annos[i]['score'][..., np.newaxis]
], 1)
gt_datas_list.append(gt_datas)
dt_datas_list.append(dt_datas)
total_dc_num = np.stack(total_dc_num, axis=0)
return (gt_datas_list, dt_datas_list, ignored_gts, ignored_dets, dontcares,
total_dc_num, total_num_valid_gt)
def eval_class(gt_annos,
dt_annos,
current_classes,
difficultys,
metric,
min_overlaps,
compute_aos=False,
num_parts=200):
"""Kitti eval. support 2d/bev/3d/aos eval. support 0.5:0.05:0.95 coco AP.
Args:
gt_annos (dict): Must from get_label_annos() in kitti_common.py.
dt_annos (dict): Must from get_label_annos() in kitti_common.py.
current_classes (list[int]): 0: car, 1: pedestrian, 2: cyclist.
difficultys (list[int]): Eval difficulty, 0: easy, 1: normal, 2: hard
metric (int): Eval type. 0: bbox, 1: bev, 2: 3d
min_overlaps (float): Min overlap. format:
[num_overlap, metric, class].
num_parts (int): A parameter for fast calculate algorithm
Returns:
dict[str, np.ndarray]: recall, precision and aos
"""
assert len(gt_annos) == len(dt_annos)
num_examples = len(gt_annos)
if num_examples < num_parts:
num_parts = num_examples
split_parts = get_split_parts(num_examples, num_parts)
rets = calculate_iou_partly(dt_annos, gt_annos, metric, num_parts)
overlaps, parted_overlaps, total_dt_num, total_gt_num = rets
N_SAMPLE_PTS = 41
num_minoverlap = len(min_overlaps)
num_class = len(current_classes)
num_difficulty = len(difficultys)
precision = np.zeros(
[num_class, num_difficulty, num_minoverlap, N_SAMPLE_PTS])
recall = np.zeros(
[num_class, num_difficulty, num_minoverlap, N_SAMPLE_PTS])
aos = np.zeros([num_class, num_difficulty, num_minoverlap, N_SAMPLE_PTS])
for m, current_class in enumerate(current_classes):
for idx_l, difficulty in enumerate(difficultys):
rets = _prepare_data(gt_annos, dt_annos, current_class, difficulty)
(gt_datas_list, dt_datas_list, ignored_gts, ignored_dets,
dontcares, total_dc_num, total_num_valid_gt) = rets
for k, min_overlap in enumerate(min_overlaps[:, metric, m]):
thresholdss = []
for i in range(len(gt_annos)):
rets = compute_statistics_jit(
overlaps[i],
gt_datas_list[i],
dt_datas_list[i],
ignored_gts[i],
ignored_dets[i],
dontcares[i],
metric,
min_overlap=min_overlap,
thresh=0.0,
compute_fp=False)
tp, fp, fn, similarity, thresholds = rets
thresholdss += thresholds.tolist()
thresholdss = np.array(thresholdss)
thresholds = get_thresholds(thresholdss, total_num_valid_gt)
thresholds = np.array(thresholds)
pr = np.zeros([len(thresholds), 4])
idx = 0
for j, num_part in enumerate(split_parts):
gt_datas_part = np.concatenate(
gt_datas_list[idx:idx + num_part], 0)
dt_datas_part = np.concatenate(
dt_datas_list[idx:idx + num_part], 0)
dc_datas_part = np.concatenate(
dontcares[idx:idx + num_part], 0)
ignored_dets_part = np.concatenate(
ignored_dets[idx:idx + num_part], 0)
ignored_gts_part = np.concatenate(
ignored_gts[idx:idx + num_part], 0)
fused_compute_statistics(
parted_overlaps[j],
pr,
total_gt_num[idx:idx + num_part],
total_dt_num[idx:idx + num_part],
total_dc_num[idx:idx + num_part],
gt_datas_part,
dt_datas_part,
dc_datas_part,
ignored_gts_part,
ignored_dets_part,
metric,
min_overlap=min_overlap,
thresholds=thresholds,
compute_aos=compute_aos)
idx += num_part
for i in range(len(thresholds)):
recall[m, idx_l, k, i] = pr[i, 0] / (pr[i, 0] + pr[i, 2])
precision[m, idx_l, k, i] = pr[i, 0] / (
pr[i, 0] + pr[i, 1])
if compute_aos:
aos[m, idx_l, k, i] = pr[i, 3] / (pr[i, 0] + pr[i, 1])
for i in range(len(thresholds)):
precision[m, idx_l, k, i] = np.max(
precision[m, idx_l, k, i:], axis=-1)
recall[m, idx_l, k, i] = np.max(
recall[m, idx_l, k, i:], axis=-1)
if compute_aos:
aos[m, idx_l, k, i] = np.max(
aos[m, idx_l, k, i:], axis=-1)
ret_dict = {
'recall': recall,
'precision': precision,
'orientation': aos,
}
# clean temp variables
del overlaps
del parted_overlaps
gc.collect()
return ret_dict
def get_mAP(prec):
sums = 0
for i in range(0, prec.shape[-1], 4):
sums = sums + prec[..., i]
return sums / 11 * 100
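# Editor's note (not part of the upstream file): ``prec`` has 41 recall sample
# points in its last dimension; taking every 4th index yields the 11-point
# interpolated AP, averaged and scaled to a percentage.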
def print_str(value, *arg, sstream=None):
if sstream is None:
sstream = sysio.StringIO()
sstream.truncate(0)
sstream.seek(0)
print(value, *arg, file=sstream)
return sstream.getvalue()
def do_eval(gt_annos,
dt_annos,
current_classes,
min_overlaps,
eval_types=['bbox', 'bev', '3d']):
# min_overlaps: [num_minoverlap, metric, num_class]
difficultys = [0, 1, 2]
mAP_bbox = None
mAP_aos = None
if 'bbox' in eval_types:
ret = eval_class(
gt_annos,
dt_annos,
current_classes,
difficultys,
0,
min_overlaps,
compute_aos=('aos' in eval_types))
# ret: [num_class, num_diff, num_minoverlap, num_sample_points]
mAP_bbox = get_mAP(ret['precision'])
if 'aos' in eval_types:
mAP_aos = get_mAP(ret['orientation'])
mAP_bev = None
if 'bev' in eval_types:
ret = eval_class(gt_annos, dt_annos, current_classes, difficultys, 1,
min_overlaps)
mAP_bev = get_mAP(ret['precision'])
mAP_3d = None
if '3d' in eval_types:
ret = eval_class(gt_annos, dt_annos, current_classes, difficultys, 2,
min_overlaps)
mAP_3d = get_mAP(ret['precision'])
return mAP_bbox, mAP_bev, mAP_3d, mAP_aos
def do_coco_style_eval(gt_annos, dt_annos, current_classes, overlap_ranges,
compute_aos):
# overlap_ranges: [range, metric, num_class]
min_overlaps = np.zeros([10, *overlap_ranges.shape[1:]])
for i in range(overlap_ranges.shape[1]):
for j in range(overlap_ranges.shape[2]):
min_overlaps[:, i, j] = np.linspace(*overlap_ranges[:, i, j])
mAP_bbox, mAP_bev, mAP_3d, mAP_aos = do_eval(gt_annos, dt_annos,
current_classes, min_overlaps,
compute_aos)
# ret: [num_class, num_diff, num_minoverlap]
mAP_bbox = mAP_bbox.mean(-1)
mAP_bev = mAP_bev.mean(-1)
mAP_3d = mAP_3d.mean(-1)
if mAP_aos is not None:
mAP_aos = mAP_aos.mean(-1)
return mAP_bbox, mAP_bev, mAP_3d, mAP_aos
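# Editor's note: a small worked example (not part of the upstream file).
# An overlap range [0.5, 0.95, 10] is expanded by np.linspace into the ten
# thresholds 0.5, 0.55, ..., 0.95 (the COCO-style 0.5:0.05:0.95 sweep), and
# the resulting APs are then averaged over that last axis.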
def kitti_eval(gt_annos,
dt_annos,
current_classes,
eval_types=['bbox', 'bev', '3d']):
"""KITTI evaluation.
Args:
gt_annos (list[dict]): Contains GT information of each sample.
dt_annos (list[dict]): Contains detection information of each sample.
current_classes (list[str]): Classes to be evaluated.
eval_types (list[str], optional): Types to eval.
Defaults to ['bbox', 'bev', '3d'].
Returns:
tuple: String and dict of evaluation results.
"""
assert len(eval_types) > 0, 'must contain at least one evaluation type'
if 'aos' in eval_types:
assert 'bbox' in eval_types, 'must evaluate bbox when evaluating aos'
overlap_0_7 = np.array([[0.7, 0.5, 0.5, 0.7,
0.5], [0.7, 0.5, 0.5, 0.7, 0.5],
[0.7, 0.5, 0.5, 0.7, 0.5]])
overlap_0_5 = np.array([[0.7, 0.5, 0.5, 0.7, 0.5],
[0.5, 0.25, 0.25, 0.5, 0.25],
[0.5, 0.25, 0.25, 0.5, 0.25]])
min_overlaps = np.stack([overlap_0_7, overlap_0_5], axis=0) # [2, 3, 5]
class_to_name = {
0: 'Car',
1: 'Pedestrian',
2: 'Cyclist',
3: 'Van',
4: 'Person_sitting',
}
name_to_class = {v: n for n, v in class_to_name.items()}
if not isinstance(current_classes, (list, tuple)):
current_classes = [current_classes]
current_classes_int = []
for curcls in current_classes:
if isinstance(curcls, str):
current_classes_int.append(name_to_class[curcls])
else:
current_classes_int.append(curcls)
current_classes = current_classes_int
min_overlaps = min_overlaps[:, :, current_classes]
result = ''
# check whether alpha is valid
compute_aos = False
pred_alpha = False
valid_alpha_gt = False
for anno in dt_annos:
if anno['alpha'].shape[0] != 0:
pred_alpha = True
break
for anno in gt_annos:
if anno['alpha'][0] != -10:
valid_alpha_gt = True
break
compute_aos = (pred_alpha and valid_alpha_gt)
if compute_aos:
# avoid mutating the caller's (or the default) eval_types list in place
eval_types = eval_types + ['aos']
mAPbbox, mAPbev, mAP3d, mAPaos = do_eval(gt_annos, dt_annos,
current_classes, min_overlaps,
eval_types)
ret_dict = {}
difficulty = ['easy', 'moderate', 'hard']
for j, curcls in enumerate(current_classes):
# mAP threshold array: [num_minoverlap, metric, class]
# mAP result: [num_class, num_diff, num_minoverlap]
curcls_name = class_to_name[curcls]
for i in range(min_overlaps.shape[0]):
# prepare results for print
result += ('{} AP@{:.2f}, {:.2f}, {:.2f}:\n'.format(
curcls_name, *min_overlaps[i, :, j]))
if mAPbbox is not None:
result += 'bbox AP:{:.4f}, {:.4f}, {:.4f}\n'.format(
*mAPbbox[j, :, i])
if mAPbev is not None:
result += 'bev AP:{:.4f}, {:.4f}, {:.4f}\n'.format(
*mAPbev[j, :, i])
if mAP3d is not None:
result += '3d AP:{:.4f}, {:.4f}, {:.4f}\n'.format(
*mAP3d[j, :, i])
if compute_aos:
result += 'aos AP:{:.2f}, {:.2f}, {:.2f}\n'.format(
*mAPaos[j, :, i])
# prepare results for logger
for idx in range(3):
if i == 0:
postfix = f'{difficulty[idx]}_strict'
else:
postfix = f'{difficulty[idx]}_loose'
prefix = f'KITTI/{curcls_name}'
if mAP3d is not None:
ret_dict[f'{prefix}_3D_{postfix}'] = mAP3d[j, idx, i]
if mAPbev is not None:
ret_dict[f'{prefix}_BEV_{postfix}'] = mAPbev[j, idx, i]
if mAPbbox is not None:
ret_dict[f'{prefix}_2D_{postfix}'] = mAPbbox[j, idx, i]
# calculate mAP over all classes if there are multiple classes
if len(current_classes) > 1:
# prepare results for print
result += ('\nOverall AP@{}, {}, {}:\n'.format(*difficulty))
if mAPbbox is not None:
mAPbbox = mAPbbox.mean(axis=0)
result += 'bbox AP:{:.4f}, {:.4f}, {:.4f}\n'.format(*mAPbbox[:, 0])
if mAPbev is not None:
mAPbev = mAPbev.mean(axis=0)
result += 'bev AP:{:.4f}, {:.4f}, {:.4f}\n'.format(*mAPbev[:, 0])
if mAP3d is not None:
mAP3d = mAP3d.mean(axis=0)
result += '3d AP:{:.4f}, {:.4f}, {:.4f}\n'.format(*mAP3d[:, 0])
if compute_aos:
mAPaos = mAPaos.mean(axis=0)
result += 'aos AP:{:.2f}, {:.2f}, {:.2f}\n'.format(*mAPaos[:, 0])
# prepare results for logger
for idx in range(3):
postfix = f'{difficulty[idx]}'
if mAP3d is not None:
ret_dict[f'KITTI/Overall_3D_{postfix}'] = mAP3d[idx, 0]
if mAPbev is not None:
ret_dict[f'KITTI/Overall_BEV_{postfix}'] = mAPbev[idx, 0]
if mAPbbox is not None:
ret_dict[f'KITTI/Overall_2D_{postfix}'] = mAPbbox[idx, 0]
return result, ret_dict
def kitti_eval_coco_style(gt_annos, dt_annos, current_classes):
"""coco style evaluation of kitti.
Args:
gt_annos (list[dict]): Contain gt information of each sample.
dt_annos (list[dict]): Contain detected information of each sample.
current_classes (list[str]): Classes to evaluation.
Returns:
string: Evaluation results.
"""
class_to_name = {
0: 'Car',
1: 'Pedestrian',
2: 'Cyclist',
3: 'Van',
4: 'Person_sitting',
}
class_to_range = {
0: [0.5, 0.95, 10],
1: [0.25, 0.7, 10],
2: [0.25, 0.7, 10],
3: [0.5, 0.95, 10],
4: [0.25, 0.7, 10],
}
name_to_class = {v: n for n, v in class_to_name.items()}
if not isinstance(current_classes, (list, tuple)):
current_classes = [current_classes]
current_classes_int = []
for curcls in current_classes:
if isinstance(curcls, str):
current_classes_int.append(name_to_class[curcls])
else:
current_classes_int.append(curcls)
current_classes = current_classes_int
overlap_ranges = np.zeros([3, 3, len(current_classes)])
for i, curcls in enumerate(current_classes):
overlap_ranges[:, :, i] = np.array(class_to_range[curcls])[:,
np.newaxis]
result = ''
# check whether alpha is valid
compute_aos = False
for anno in dt_annos:
if anno['alpha'].shape[0] != 0:
if anno['alpha'][0] != -10:
compute_aos = True
break
mAPbbox, mAPbev, mAP3d, mAPaos = do_coco_style_eval(
gt_annos, dt_annos, current_classes, overlap_ranges, compute_aos)
for j, curcls in enumerate(current_classes):
# mAP threshold array: [num_minoverlap, metric, class]
# mAP result: [num_class, num_diff, num_minoverlap]
o_range = np.array(class_to_range[curcls])[[0, 2, 1]]
o_range[1] = (o_range[2] - o_range[0]) / (o_range[1] - 1)
result += print_str((f'{class_to_name[curcls]} '
'coco AP@{:.2f}:{:.2f}:{:.2f}:'.format(*o_range)))
result += print_str((f'bbox AP:{mAPbbox[j, 0]:.2f}, '
f'{mAPbbox[j, 1]:.2f}, '
f'{mAPbbox[j, 2]:.2f}'))
result += print_str((f'bev AP:{mAPbev[j, 0]:.2f}, '
f'{mAPbev[j, 1]:.2f}, '
f'{mAPbev[j, 2]:.2f}'))
result += print_str((f'3d AP:{mAP3d[j, 0]:.2f}, '
f'{mAP3d[j, 1]:.2f}, '
f'{mAP3d[j, 2]:.2f}'))
if compute_aos:
result += print_str((f'aos AP:{mAPaos[j, 0]:.2f}, '
f'{mAPaos[j, 1]:.2f}, '
f'{mAPaos[j, 2]:.2f}'))
return result
================================================
FILE: mmdet3d/core/evaluation/kitti_utils/rotate_iou.py
================================================
#####################
# Based on https://github.com/hongzhenwang/RRPN-revise
# Licensed under The MIT License
# Author: yanyan, scrin@foxmail.com
#####################
import math
import numba
import numpy as np
from numba import cuda
@numba.jit(nopython=True)
def div_up(m, n):
return m // n + (m % n > 0)
@cuda.jit('(float32[:], float32[:], float32[:])', device=True, inline=True)
def trangle_area(a, b, c):
return ((a[0] - c[0]) * (b[1] - c[1]) - (a[1] - c[1]) *
(b[0] - c[0])) / 2.0
@cuda.jit('(float32[:], int32)', device=True, inline=True)
def area(int_pts, num_of_inter):
area_val = 0.0
for i in range(num_of_inter - 2):
area_val += abs(
trangle_area(int_pts[:2], int_pts[2 * i + 2:2 * i + 4],
int_pts[2 * i + 4:2 * i + 6]))
return area_val
@cuda.jit('(float32[:], int32)', device=True, inline=True)
def sort_vertex_in_convex_polygon(int_pts, num_of_inter):
if num_of_inter > 0:
center = cuda.local.array((2, ), dtype=numba.float32)
center[:] = 0.0
for i in range(num_of_inter):
center[0] += int_pts[2 * i]
center[1] += int_pts[2 * i + 1]
center[0] /= num_of_inter
center[1] /= num_of_inter
v = cuda.local.array((2, ), dtype=numba.float32)
vs = cuda.local.array((16, ), dtype=numba.float32)
for i in range(num_of_inter):
v[0] = int_pts[2 * i] - center[0]
v[1] = int_pts[2 * i + 1] - center[1]
d = math.sqrt(v[0] * v[0] + v[1] * v[1])
v[0] = v[0] / d
v[1] = v[1] / d
if v[1] < 0:
v[0] = -2 - v[0]
vs[i] = v[0]
j = 0
temp = 0
for i in range(1, num_of_inter):
if vs[i - 1] > vs[i]:
temp = vs[i]
tx = int_pts[2 * i]
ty = int_pts[2 * i + 1]
j = i
while j > 0 and vs[j - 1] > temp:
vs[j] = vs[j - 1]
int_pts[j * 2] = int_pts[j * 2 - 2]
int_pts[j * 2 + 1] = int_pts[j * 2 - 1]
j -= 1
vs[j] = temp
int_pts[j * 2] = tx
int_pts[j * 2 + 1] = ty
@cuda.jit(
'(float32[:], float32[:], int32, int32, float32[:])',
device=True,
inline=True)
def line_segment_intersection(pts1, pts2, i, j, temp_pts):
A = cuda.local.array((2, ), dtype=numba.float32)
B = cuda.local.array((2, ), dtype=numba.float32)
C = cuda.local.array((2, ), dtype=numba.float32)
D = cuda.local.array((2, ), dtype=numba.float32)
A[0] = pts1[2 * i]
A[1] = pts1[2 * i + 1]
B[0] = pts1[2 * ((i + 1) % 4)]
B[1] = pts1[2 * ((i + 1) % 4) + 1]
C[0] = pts2[2 * j]
C[1] = pts2[2 * j + 1]
D[0] = pts2[2 * ((j + 1) % 4)]
D[1] = pts2[2 * ((j + 1) % 4) + 1]
BA0 = B[0] - A[0]
BA1 = B[1] - A[1]
DA0 = D[0] - A[0]
CA0 = C[0] - A[0]
DA1 = D[1] - A[1]
CA1 = C[1] - A[1]
acd = DA1 * CA0 > CA1 * DA0
bcd = (D[1] - B[1]) * (C[0] - B[0]) > (C[1] - B[1]) * (D[0] - B[0])
if acd != bcd:
abc = CA1 * BA0 > BA1 * CA0
abd = DA1 * BA0 > BA1 * DA0
if abc != abd:
DC0 = D[0] - C[0]
DC1 = D[1] - C[1]
ABBA = A[0] * B[1] - B[0] * A[1]
CDDC = C[0] * D[1] - D[0] * C[1]
DH = BA1 * DC0 - BA0 * DC1
Dx = ABBA * DC0 - BA0 * CDDC
Dy = ABBA * DC1 - BA1 * CDDC
temp_pts[0] = Dx / DH
temp_pts[1] = Dy / DH
return True
return False
@cuda.jit(
'(float32[:], float32[:], int32, int32, float32[:])',
device=True,
inline=True)
def line_segment_intersection_v1(pts1, pts2, i, j, temp_pts):
a = cuda.local.array((2, ), dtype=numba.float32)
b = cuda.local.array((2, ), dtype=numba.float32)
c = cuda.local.array((2, ), dtype=numba.float32)
d = cuda.local.array((2, ), dtype=numba.float32)
a[0] = pts1[2 * i]
a[1] = pts1[2 * i + 1]
b[0] = pts1[2 * ((i + 1) % 4)]
b[1] = pts1[2 * ((i + 1) % 4) + 1]
c[0] = pts2[2 * j]
c[1] = pts2[2 * j + 1]
d[0] = pts2[2 * ((j + 1) % 4)]
d[1] = pts2[2 * ((j + 1) % 4) + 1]
area_abc = trangle_area(a, b, c)
area_abd = trangle_area(a, b, d)
if area_abc * area_abd >= 0:
return False
area_cda = trangle_area(c, d, a)
area_cdb = area_cda + area_abc - area_abd
if area_cda * area_cdb >= 0:
return False
t = area_cda / (area_abd - area_abc)
dx = t * (b[0] - a[0])
dy = t * (b[1] - a[1])
temp_pts[0] = a[0] + dx
temp_pts[1] = a[1] + dy
return True
@cuda.jit('(float32, float32, float32[:])', device=True, inline=True)
def point_in_quadrilateral(pt_x, pt_y, corners):
ab0 = corners[2] - corners[0]
ab1 = corners[3] - corners[1]
ad0 = corners[6] - corners[0]
ad1 = corners[7] - corners[1]
ap0 = pt_x - corners[0]
ap1 = pt_y - corners[1]
abab = ab0 * ab0 + ab1 * ab1
abap = ab0 * ap0 + ab1 * ap1
adad = ad0 * ad0 + ad1 * ad1
adap = ad0 * ap0 + ad1 * ap1
return abab >= abap and abap >= 0 and adad >= adap and adap >= 0
@cuda.jit('(float32[:], float32[:], float32[:])', device=True, inline=True)
def quadrilateral_intersection(pts1, pts2, int_pts):
num_of_inter = 0
for i in range(4):
if point_in_quadrilateral(pts1[2 * i], pts1[2 * i + 1], pts2):
int_pts[num_of_inter * 2] = pts1[2 * i]
int_pts[num_of_inter * 2 + 1] = pts1[2 * i + 1]
num_of_inter += 1
if point_in_quadrilateral(pts2[2 * i], pts2[2 * i + 1], pts1):
int_pts[num_of_inter * 2] = pts2[2 * i]
int_pts[num_of_inter * 2 + 1] = pts2[2 * i + 1]
num_of_inter += 1
temp_pts = cuda.local.array((2, ), dtype=numba.float32)
for i in range(4):
for j in range(4):
has_pts = line_segment_intersection(pts1, pts2, i, j, temp_pts)
if has_pts:
int_pts[num_of_inter * 2] = temp_pts[0]
int_pts[num_of_inter * 2 + 1] = temp_pts[1]
num_of_inter += 1
return num_of_inter
@cuda.jit('(float32[:], float32[:])', device=True, inline=True)
def rbbox_to_corners(corners, rbbox):
# generate clockwise corners and rotate them clockwise
angle = rbbox[4]
a_cos = math.cos(angle)
a_sin = math.sin(angle)
center_x = rbbox[0]
center_y = rbbox[1]
x_d = rbbox[2]
y_d = rbbox[3]
corners_x = cuda.local.array((4, ), dtype=numba.float32)
corners_y = cuda.local.array((4, ), dtype=numba.float32)
corners_x[0] = -x_d / 2
corners_x[1] = -x_d / 2
corners_x[2] = x_d / 2
corners_x[3] = x_d / 2
corners_y[0] = -y_d / 2
corners_y[1] = y_d / 2
corners_y[2] = y_d / 2
corners_y[3] = -y_d / 2
for i in range(4):
corners[2 * i] = a_cos * corners_x[i] + a_sin * corners_y[i] + center_x
corners[2 * i +
1] = -a_sin * corners_x[i] + a_cos * corners_y[i] + center_y
@cuda.jit('(float32[:], float32[:])', device=True, inline=True)
def inter(rbbox1, rbbox2):
"""Compute intersection of two rotated boxes.
Args:
rbox1 (np.ndarray, shape=[5]): Rotated 2d box.
rbox2 (np.ndarray, shape=[5]): Rotated 2d box.
Returns:
float: Intersection of two rotated boxes.
"""
corners1 = cuda.local.array((8, ), dtype=numba.float32)
corners2 = cuda.local.array((8, ), dtype=numba.float32)
intersection_corners = cuda.local.array((16, ), dtype=numba.float32)
rbbox_to_corners(corners1, rbbox1)
rbbox_to_corners(corners2, rbbox2)
num_intersection = quadrilateral_intersection(corners1, corners2,
intersection_corners)
sort_vertex_in_convex_polygon(intersection_corners, num_intersection)
# print(intersection_corners.reshape([-1, 2])[:num_intersection])
return area(intersection_corners, num_intersection)
@cuda.jit('(float32[:], float32[:], int32)', device=True, inline=True)
def devRotateIoUEval(rbox1, rbox2, criterion=-1):
"""Compute rotated iou on device.
Args:
rbox1 (np.ndarray, shape=[5]): Rotated 2d box.
rbox2 (np.ndarray, shape=[5]): Rotated 2d box.
criterion (int, optional): Indicate different type of iou.
-1 indicate `area_inter / (area1 + area2 - area_inter)`,
0 indicate `area_inter / area1`,
1 indicate `area_inter / area2`.
Returns:
float: iou between two input boxes.
"""
area1 = rbox1[2] * rbox1[3]
area2 = rbox2[2] * rbox2[3]
area_inter = inter(rbox1, rbox2)
if criterion == -1:
return area_inter / (area1 + area2 - area_inter)
elif criterion == 0:
return area_inter / area1
elif criterion == 1:
return area_inter / area2
else:
return area_inter
@cuda.jit(
'(int64, int64, float32[:], float32[:], float32[:], int32)',
fastmath=False)
def rotate_iou_kernel_eval(N,
K,
dev_boxes,
dev_query_boxes,
dev_iou,
criterion=-1):
"""Kernel of computing rotated iou.
Args:
N (int): The number of boxes.
K (int): The number of query boxes.
dev_boxes (np.ndarray): Boxes on device.
dev_query_boxes (np.ndarray): Query boxes on device.
dev_iou (np.ndarray): Computed iou to return.
criterion (int, optional): Indicate different type of iou.
-1 indicate `area_inter / (area1 + area2 - area_inter)`,
0 indicate `area_inter / area1`,
1 indicate `area_inter / area2`.
"""
threadsPerBlock = 8 * 8
row_start = cuda.blockIdx.x
col_start = cuda.blockIdx.y
tx = cuda.threadIdx.x
row_size = min(N - row_start * threadsPerBlock, threadsPerBlock)
col_size = min(K - col_start * threadsPerBlock, threadsPerBlock)
block_boxes = cuda.shared.array(shape=(64 * 5, ), dtype=numba.float32)
block_qboxes = cuda.shared.array(shape=(64 * 5, ), dtype=numba.float32)
dev_query_box_idx = threadsPerBlock * col_start + tx
dev_box_idx = threadsPerBlock * row_start + tx
if (tx < col_size):
block_qboxes[tx * 5 + 0] = dev_query_boxes[dev_query_box_idx * 5 + 0]
block_qboxes[tx * 5 + 1] = dev_query_boxes[dev_query_box_idx * 5 + 1]
block_qboxes[tx * 5 + 2] = dev_query_boxes[dev_query_box_idx * 5 + 2]
block_qboxes[tx * 5 + 3] = dev_query_boxes[dev_query_box_idx * 5 + 3]
block_qboxes[tx * 5 + 4] = dev_query_boxes[dev_query_box_idx * 5 + 4]
if (tx < row_size):
block_boxes[tx * 5 + 0] = dev_boxes[dev_box_idx * 5 + 0]
block_boxes[tx * 5 + 1] = dev_boxes[dev_box_idx * 5 + 1]
block_boxes[tx * 5 + 2] = dev_boxes[dev_box_idx * 5 + 2]
block_boxes[tx * 5 + 3] = dev_boxes[dev_box_idx * 5 + 3]
block_boxes[tx * 5 + 4] = dev_boxes[dev_box_idx * 5 + 4]
cuda.syncthreads()
if tx < row_size:
for i in range(col_size):
offset = (
row_start * threadsPerBlock * K + col_start * threadsPerBlock +
tx * K + i)
dev_iou[offset] = devRotateIoUEval(block_qboxes[i * 5:i * 5 + 5],
block_boxes[tx * 5:tx * 5 + 5],
criterion)
def rotate_iou_gpu_eval(boxes, query_boxes, criterion=-1, device_id=0):
"""Rotated box iou running in gpu. 500x faster than cpu version (take 5ms
in one example with numba.cuda code). convert from [this project](
https://github.com/hongzhenwang/RRPN-revise/tree/master/lib/rotation).
Args:
boxes (torch.Tensor): rbboxes. format: centers, dims,
angles(clockwise when positive) with the shape of [N, 5].
query_boxes (float tensor: [K, 5]): rbboxes to compute iou with boxes.
device_id (int, optional): Defaults to 0. Device to use.
criterion (int, optional): Indicate different type of iou.
-1 indicate `area_inter / (area1 + area2 - area_inter)`,
0 indicate `area_inter / area1`,
1 indicate `area_inter / area2`.
Returns:
np.ndarray: IoU results.
"""
boxes = boxes.astype(np.float32)
query_boxes = query_boxes.astype(np.float32)
N = boxes.shape[0]
K = query_boxes.shape[0]
iou = np.zeros((N, K), dtype=np.float32)
if N == 0 or K == 0:
return iou
threadsPerBlock = 8 * 8
cuda.select_device(device_id)
blockspergrid = (div_up(N, threadsPerBlock), div_up(K, threadsPerBlock))
stream = cuda.stream()
with stream.auto_synchronize():
boxes_dev = cuda.to_device(boxes.reshape([-1]), stream)
query_boxes_dev = cuda.to_device(query_boxes.reshape([-1]), stream)
iou_dev = cuda.to_device(iou.reshape([-1]), stream)
rotate_iou_kernel_eval[blockspergrid, threadsPerBlock,
stream](N, K, boxes_dev, query_boxes_dev,
iou_dev, criterion)
iou_dev.copy_to_host(iou.reshape([-1]), stream=stream)
return iou.astype(boxes.dtype)
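# Editor's note: a small usage sketch (not part of the upstream file);
# it assumes a CUDA-capable device selected via ``device_id``.
#   boxes = np.array([[0., 0., 2., 2., 0.]], dtype=np.float32)
#   qboxes = np.array([[1., 0., 2., 2., 0.]], dtype=np.float32)
#   rotate_iou_gpu_eval(boxes, qboxes)  # ~0.333: inter 2, union 4 + 4 - 2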
================================================
FILE: mmdet3d/core/evaluation/lyft_eval.py
================================================
import mmcv
import numpy as np
from lyft_dataset_sdk.eval.detection.mAP_evaluation import (Box3D, get_ap,
get_class_names,
get_ious,
group_by_key,
wrap_in_box)
from mmcv.utils import print_log
from os import path as osp
from terminaltables import AsciiTable
def load_lyft_gts(lyft, data_root, eval_split, logger=None):
"""Loads ground truth boxes from database.
Args:
lyft (:obj:`LyftDataset`): Lyft class in the sdk.
data_root (str): Root of data for reading splits.
eval_split (str): Name of the split for evaluation.
logger (logging.Logger | str | None): Logger used for printing
related information during evaluation. Default: None.
Returns:
list[dict]: List of annotation dictionaries.
"""
split_scenes = mmcv.list_from_file(
osp.join(data_root, f'{eval_split}.txt'))
# Read out all sample_tokens in DB.
sample_tokens_all = [s['token'] for s in lyft.sample]
assert len(sample_tokens_all) > 0, 'Error: Database has no samples!'
if eval_split == 'test':
# Check that you aren't trying to cheat :)
assert len(lyft.sample_annotation) > 0, \
'Error: You are trying to evaluate on the test set \
but you do not have the annotations!'
sample_tokens = []
for sample_token in sample_tokens_all:
scene_token = lyft.get('sample', sample_token)['scene_token']
scene_record = lyft.get('scene', scene_token)
if scene_record['name'] in split_scenes:
sample_tokens.append(sample_token)
all_annotations = []
print_log('Loading ground truth annotations...', logger=logger)
# Load annotations and filter predictions and annotations.
for sample_token in mmcv.track_iter_progress(sample_tokens):
sample = lyft.get('sample', sample_token)
sample_annotation_tokens = sample['anns']
for sample_annotation_token in sample_annotation_tokens:
# Get label name in detection task and filter unused labels.
sample_annotation = \
lyft.get('sample_annotation', sample_annotation_token)
detection_name = sample_annotation['category_name']
if detection_name is None:
continue
annotation = {
'sample_token': sample_token,
'translation': sample_annotation['translation'],
'size': sample_annotation['size'],
'rotation': sample_annotation['rotation'],
'name': detection_name,
}
all_annotations.append(annotation)
return all_annotations
def load_lyft_predictions(res_path):
"""Load Lyft predictions from json file.
Args:
res_path (str): Path of result json file recording detections.
Returns:
list[dict]: List of prediction dictionaries.
"""
predictions = mmcv.load(res_path)
predictions = predictions['results']
all_preds = []
for sample_token in predictions.keys():
all_preds.extend(predictions[sample_token])
return all_preds
def lyft_eval(lyft, data_root, res_path, eval_set, output_dir, logger=None):
"""Evaluation API for Lyft dataset.
Args:
lyft (:obj:`LyftDataset`): Lyft class in the sdk.
data_root (str): Root of data for reading splits.
res_path (str): Path of result json file recording detections.
eval_set (str): Name of the split for evaluation.
output_dir (str): Output directory for output json files.
logger (logging.Logger | str | None): Logger used for printing
related information during evaluation. Default: None.
Returns:
dict[str, float]: The evaluation results.
"""
# evaluate by lyft metrics
gts = load_lyft_gts(lyft, data_root, eval_set, logger)
predictions = load_lyft_predictions(res_path)
class_names = get_class_names(gts)
print('Calculating mAP@0.5:0.95...')
iou_thresholds = [0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95]
metrics = {}
average_precisions = \
get_classwise_aps(gts, predictions, class_names, iou_thresholds)
APs_data = [['IOU', 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95]]
mAPs = np.mean(average_precisions, axis=0)
mAPs_cate = np.mean(average_precisions, axis=1)
final_mAP = np.mean(mAPs)
metrics['average_precisions'] = average_precisions.tolist()
metrics['mAPs'] = mAPs.tolist()
metrics['Final mAP'] = float(final_mAP)
metrics['class_names'] = class_names
metrics['mAPs_cate'] = mAPs_cate.tolist()
APs_data = [['class', 'mAP@0.5:0.95']]
for i in range(len(class_names)):
row = [class_names[i], round(mAPs_cate[i], 3)]
APs_data.append(row)
APs_data.append(['Overall', round(final_mAP, 3)])
APs_table = AsciiTable(APs_data, title='mAPs@0.5:0.95')
APs_table.inner_footing_row_border = True
print_log(APs_table.table, logger=logger)
res_path = osp.join(output_dir, 'lyft_metrics.json')
mmcv.dump(metrics, res_path)
return metrics
def get_classwise_aps(gt, predictions, class_names, iou_thresholds):
"""Returns an array with an average precision per class.
Note: Ground truth and predictions should have the following format.
.. code-block::
gt = [{
'sample_token': '0f0e3ce89d2324d8b45aa55a7b4f8207
fbb039a550991a5149214f98cec136ac',
'translation': [974.2811881299899, 1714.6815014457964,
-23.689857123368846],
'size': [1.796, 4.488, 1.664],
'rotation': [0.14882026466054782, 0, 0, 0.9888642620837121],
'name': 'car'
}]
predictions = [{
'sample_token': '0f0e3ce89d2324d8b45aa55a7b4f8207
fbb039a550991a5149214f98cec136ac',
'translation': [971.8343488872263, 1713.6816097857359,
-25.82534357061308],
'size': [2.519726579986132, 7.810161372666739, 3.483438286096803],
'rotation': [0.10913582721095375, 0.04099572636992043,
0.01927712319721745, 1.029328402625659],
'name': 'car',
'score': 0.3077029437237213
}]
Args:
        gt (list[dict]): list of dictionaries in the format described above.
        predictions (list[dict]): list of dictionaries in the format
            described above.
class_names (list[str]): list of the class names.
iou_thresholds (list[float]): IOU thresholds used to calculate
TP / FN
Returns:
np.ndarray: an array with an average precision per class.
"""
assert all([0 <= iou_th <= 1 for iou_th in iou_thresholds])
gt_by_class_name = group_by_key(gt, 'name')
pred_by_class_name = group_by_key(predictions, 'name')
average_precisions = np.zeros((len(class_names), len(iou_thresholds)))
for class_id, class_name in enumerate(class_names):
if class_name in pred_by_class_name:
recalls, precisions, average_precision = get_single_class_aps(
gt_by_class_name[class_name], pred_by_class_name[class_name],
iou_thresholds)
average_precisions[class_id, :] = average_precision
return average_precisions
def get_single_class_aps(gt, predictions, iou_thresholds):
"""Compute recall and precision for all iou thresholds. Adapted from
LyftDatasetDevkit.
Args:
gt (list[dict]): list of dictionaries in the format described above.
        predictions (list[dict]): list of dictionaries in the format \
            described above.
iou_thresholds (list[float]): IOU thresholds used to calculate \
TP / FN
Returns:
tuple[np.ndarray]: Returns (recalls, precisions, average precisions)
for each class.
"""
num_gts = len(gt)
image_gts = group_by_key(gt, 'sample_token')
image_gts = wrap_in_box(image_gts)
sample_gt_checked = {
sample_token: np.zeros((len(boxes), len(iou_thresholds)))
for sample_token, boxes in image_gts.items()
}
predictions = sorted(predictions, key=lambda x: x['score'], reverse=True)
# go down dets and mark TPs and FPs
num_predictions = len(predictions)
tps = np.zeros((num_predictions, len(iou_thresholds)))
fps = np.zeros((num_predictions, len(iou_thresholds)))
for prediction_index, prediction in enumerate(predictions):
predicted_box = Box3D(**prediction)
sample_token = prediction['sample_token']
max_overlap = -np.inf
jmax = -1
if sample_token in image_gts:
gt_boxes = image_gts[sample_token]
# gt_boxes per sample
gt_checked = sample_gt_checked[sample_token]
# gt flags per sample
else:
gt_boxes = []
gt_checked = None
if len(gt_boxes) > 0:
overlaps = get_ious(gt_boxes, predicted_box)
max_overlap = np.max(overlaps)
jmax = np.argmax(overlaps)
for i, iou_threshold in enumerate(iou_thresholds):
if max_overlap > iou_threshold:
if gt_checked[jmax, i] == 0:
tps[prediction_index, i] = 1.0
gt_checked[jmax, i] = 1
else:
fps[prediction_index, i] = 1.0
else:
fps[prediction_index, i] = 1.0
# compute precision recall
fps = np.cumsum(fps, axis=0)
tps = np.cumsum(tps, axis=0)
recalls = tps / float(num_gts)
# avoid divide by zero in case the first detection
# matches a difficult ground truth
precisions = tps / np.maximum(tps + fps, np.finfo(np.float64).eps)
aps = []
for i in range(len(iou_thresholds)):
recall = recalls[:, i]
precision = precisions[:, i]
assert np.all(0 <= recall) & np.all(recall <= 1)
assert np.all(0 <= precision) & np.all(precision <= 1)
ap = get_ap(recall, precision)
aps.append(ap)
aps = np.array(aps)
return recalls, precisions, aps
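# Minimal illustrative sketch (not part of the original file): shows how the
# per-threshold TP/FP flags computed above turn into recall and precision via
# cumulative sums, using hypothetical flags for five ranked predictions and
# three ground truth boxes.
if __name__ == '__main__':
    toy_tps = np.array([[1.0], [0.0], [1.0], [0.0], [1.0]])
    toy_fps = 1.0 - toy_tps
    toy_num_gts = 3
    cum_tps = np.cumsum(toy_tps, axis=0)
    cum_fps = np.cumsum(toy_fps, axis=0)
    toy_recalls = cum_tps / float(toy_num_gts)
    toy_precisions = cum_tps / np.maximum(cum_tps + cum_fps,
                                          np.finfo(np.float64).eps)
    print(toy_recalls.ravel(), toy_precisions.ravel())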
================================================
FILE: mmdet3d/core/evaluation/seg_eval.py
================================================
import numpy as np
from mmcv.utils import print_log
from terminaltables import AsciiTable
def fast_hist(preds, labels, num_classes):
"""Compute the confusion matrix for every batch.
Args:
preds (np.ndarray): Prediction labels of points with shape of
(num_points, ).
labels (np.ndarray): Ground truth labels of points with shape of
(num_points, ).
num_classes (int): number of classes
Returns:
np.ndarray: Calculated confusion matrix.
"""
k = (labels >= 0) & (labels < num_classes)
bin_count = np.bincount(
num_classes * labels[k].astype(int) + preds[k],
minlength=num_classes**2)
return bin_count[:num_classes**2].reshape(num_classes, num_classes)
def per_class_iou(hist):
"""Compute the per class iou.
Args:
        hist (np.ndarray): Overall confusion matrix
            (num_classes, num_classes).
Returns:
np.ndarray: Calculated per class iou
"""
return np.diag(hist) / (hist.sum(1) + hist.sum(0) - np.diag(hist))
def get_acc(hist):
"""Compute the overall accuracy.
Args:
        hist (np.ndarray): Overall confusion matrix
            (num_classes, num_classes).
Returns:
float: Calculated overall acc
"""
return np.diag(hist).sum() / hist.sum()
def get_acc_cls(hist):
"""Compute the class average accuracy.
Args:
        hist (np.ndarray): Overall confusion matrix
            (num_classes, num_classes).
Returns:
float: Calculated class average acc
"""
return np.nanmean(np.diag(hist) / hist.sum(axis=1))
def seg_eval(gt_labels, seg_preds, label2cat, logger=None):
"""Semantic Segmentation Evaluation.
Evaluate the result of the Semantic Segmentation.
Args:
gt_labels (list[torch.Tensor]): Ground truth labels.
        seg_preds (list[torch.Tensor]): Predictions of semantic segmentation.
label2cat (dict): Map from label to category.
logger (logging.Logger | str | None): The way to print the mAP
summary. See `mmdet.utils.print_log()` for details. Default: None.
    Returns:
dict[str, float]: Dict of results.
"""
assert len(seg_preds) == len(gt_labels)
hist_list = []
for i in range(len(seg_preds)):
hist_list.append(
fast_hist(seg_preds[i].numpy().astype(int),
gt_labels[i].numpy().astype(int), len(label2cat)))
iou = per_class_iou(sum(hist_list))
miou = np.nanmean(iou)
acc = get_acc(sum(hist_list))
acc_cls = get_acc_cls(sum(hist_list))
header = ['classes']
for i in range(len(label2cat)):
header.append(label2cat[i])
header.extend(['miou', 'acc', 'acc_cls'])
ret_dict = dict()
table_columns = [['results']]
for i in range(len(label2cat)):
ret_dict[label2cat[i]] = float(iou[i])
table_columns.append([f'{iou[i]:.4f}'])
ret_dict['miou'] = float(miou)
ret_dict['acc'] = float(acc)
ret_dict['acc_cls'] = float(acc_cls)
table_columns.append([f'{miou:.4f}'])
table_columns.append([f'{acc:.4f}'])
table_columns.append([f'{acc_cls:.4f}'])
table_data = [header]
table_rows = list(zip(*table_columns))
table_data += table_rows
table = AsciiTable(table_data)
table.inner_footing_row_border = True
print_log('\n' + table.table, logger=logger)
return ret_dict
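# Minimal usage sketch (not part of the original file): evaluates one toy
# point cloud with three hypothetical classes to show the returned dict and
# the printed table.
if __name__ == '__main__':
    import torch
    toy_gt = [torch.tensor([0, 0, 1, 1, 2, 2])]
    toy_pred = [torch.tensor([0, 1, 1, 1, 2, 0])]
    toy_label2cat = {0: 'class_a', 1: 'class_b', 2: 'class_c'}
    print(seg_eval(toy_gt, toy_pred, toy_label2cat))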
================================================
FILE: mmdet3d/core/evaluation/waymo_utils/prediction_kitti_to_waymo.py
================================================
r"""Adapted from `Waymo to KITTI converter
`_.
"""
try:
from waymo_open_dataset import dataset_pb2 as open_dataset
except ImportError:
raise ImportError(
'Please run "pip install waymo-open-dataset-tf-2-1-0==1.2.0" '
'to install the official devkit first.')
import mmcv
import numpy as np
import tensorflow as tf
from glob import glob
from os.path import join
from waymo_open_dataset import label_pb2
from waymo_open_dataset.protos import metrics_pb2
class KITTI2Waymo(object):
"""KITTI predictions to Waymo converter.
This class serves as the converter to change predictions from KITTI to
Waymo format.
Args:
kitti_result_files (list[dict]): Predictions in KITTI format.
waymo_tfrecords_dir (str): Directory to load waymo raw data.
waymo_results_save_dir (str): Directory to save converted predictions
in waymo format (.bin files).
waymo_results_final_path (str): Path to save combined
predictions in waymo format (.bin file), like 'a/b/c.bin'.
prefix (str): Prefix of filename. In general, 0 for training, 1 for
validation and 2 for testing.
        workers (int): Number of parallel processes.
"""
def __init__(self,
kitti_result_files,
waymo_tfrecords_dir,
waymo_results_save_dir,
waymo_results_final_path,
prefix,
workers=64):
self.kitti_result_files = kitti_result_files
self.waymo_tfrecords_dir = waymo_tfrecords_dir
self.waymo_results_save_dir = waymo_results_save_dir
self.waymo_results_final_path = waymo_results_final_path
self.prefix = prefix
self.workers = int(workers)
self.name2idx = {}
for idx, result in enumerate(kitti_result_files):
if len(result['sample_idx']) > 0:
self.name2idx[str(result['sample_idx'][0])] = idx
# turn on eager execution for older tensorflow versions
if int(tf.__version__.split('.')[0]) < 2:
tf.enable_eager_execution()
self.k2w_cls_map = {
'Car': label_pb2.Label.TYPE_VEHICLE,
'Pedestrian': label_pb2.Label.TYPE_PEDESTRIAN,
'Sign': label_pb2.Label.TYPE_SIGN,
'Cyclist': label_pb2.Label.TYPE_CYCLIST,
}
self.T_ref_to_front_cam = np.array([[0.0, 0.0, 1.0, 0.0],
[-1.0, 0.0, 0.0, 0.0],
[0.0, -1.0, 0.0, 0.0],
[0.0, 0.0, 0.0, 1.0]])
self.get_file_names()
self.create_folder()
def get_file_names(self):
"""Get file names of waymo raw data."""
self.waymo_tfrecord_pathnames = sorted(
glob(join(self.waymo_tfrecords_dir, '*.tfrecord')))
print(len(self.waymo_tfrecord_pathnames), 'tfrecords found.')
def create_folder(self):
"""Create folder for data conversion."""
mmcv.mkdir_or_exist(self.waymo_results_save_dir)
def parse_objects(self, kitti_result, T_k2w, context_name,
frame_timestamp_micros):
"""Parse one prediction with several instances in kitti format and
convert them to `Object` proto.
Args:
kitti_result (dict): Predictions in kitti format.
- name (np.ndarray): Class labels of predictions.
- dimensions (np.ndarray): Height, width, length of boxes.
- location (np.ndarray): Bottom center of boxes (x, y, z).
- rotation_y (np.ndarray): Orientation of boxes.
- score (np.ndarray): Scores of predictions.
T_k2w (np.ndarray): Transformation matrix from kitti to waymo.
context_name (str): Context name of the frame.
frame_timestamp_micros (int): Frame timestamp.
Returns:
:obj:`Object`: Predictions in waymo dataset Object proto.
"""
def parse_one_object(instance_idx):
"""Parse one instance in kitti format and convert them to `Object`
proto.
Args:
instance_idx (int): Index of the instance to be converted.
Returns:
:obj:`Object`: Predicted instance in waymo dataset \
Object proto.
"""
cls = kitti_result['name'][instance_idx]
length = round(kitti_result['dimensions'][instance_idx, 0], 4)
height = round(kitti_result['dimensions'][instance_idx, 1], 4)
width = round(kitti_result['dimensions'][instance_idx, 2], 4)
x = round(kitti_result['location'][instance_idx, 0], 4)
y = round(kitti_result['location'][instance_idx, 1], 4)
z = round(kitti_result['location'][instance_idx, 2], 4)
rotation_y = round(kitti_result['rotation_y'][instance_idx], 4)
score = round(kitti_result['score'][instance_idx], 4)
# y: downwards; move box origin from bottom center (kitti) to
# true center (waymo)
y -= height / 2
# frame transformation: kitti -> waymo
x, y, z = self.transform(T_k2w, x, y, z)
# different conventions
heading = -(rotation_y + np.pi / 2)
while heading < -np.pi:
heading += 2 * np.pi
while heading > np.pi:
heading -= 2 * np.pi
box = label_pb2.Label.Box()
box.center_x = x
box.center_y = y
box.center_z = z
box.length = length
box.width = width
box.height = height
box.heading = heading
o = metrics_pb2.Object()
o.object.box.CopyFrom(box)
o.object.type = self.k2w_cls_map[cls]
o.score = score
o.context_name = context_name
o.frame_timestamp_micros = frame_timestamp_micros
return o
objects = metrics_pb2.Objects()
for instance_idx in range(len(kitti_result['name'])):
o = parse_one_object(instance_idx)
objects.objects.append(o)
return objects
def convert_one(self, file_idx):
"""Convert action for single file.
Args:
file_idx (int): Index of the file to be converted.
"""
file_pathname = self.waymo_tfrecord_pathnames[file_idx]
file_data = tf.data.TFRecordDataset(file_pathname, compression_type='')
for frame_num, frame_data in enumerate(file_data):
frame = open_dataset.Frame()
frame.ParseFromString(bytearray(frame_data.numpy()))
filename = f'{self.prefix}{file_idx:03d}{frame_num:03d}'
for camera in frame.context.camera_calibrations:
# FRONT = 1, see dataset.proto for details
if camera.name == 1:
T_front_cam_to_vehicle = np.array(
camera.extrinsic.transform).reshape(4, 4)
T_k2w = T_front_cam_to_vehicle @ self.T_ref_to_front_cam
context_name = frame.context.name
frame_timestamp_micros = frame.timestamp_micros
if filename in self.name2idx:
kitti_result = \
self.kitti_result_files[self.name2idx[filename]]
objects = self.parse_objects(kitti_result, T_k2w, context_name,
frame_timestamp_micros)
else:
print(filename, 'not found.')
objects = metrics_pb2.Objects()
with open(
join(self.waymo_results_save_dir, f'{filename}.bin'),
'wb') as f:
f.write(objects.SerializeToString())
def convert(self):
"""Convert action."""
print('Start converting ...')
mmcv.track_parallel_progress(self.convert_one, range(len(self)),
self.workers)
print('\nFinished ...')
# combine all files into one .bin
pathnames = sorted(glob(join(self.waymo_results_save_dir, '*.bin')))
combined = self.combine(pathnames)
with open(self.waymo_results_final_path, 'wb') as f:
f.write(combined.SerializeToString())
def __len__(self):
"""Length of the filename list."""
return len(self.waymo_tfrecord_pathnames)
def transform(self, T, x, y, z):
"""Transform the coordinates with matrix T.
Args:
T (np.ndarray): Transformation matrix.
            x (float): Coordinate in x axis.
            y (float): Coordinate in y axis.
            z (float): Coordinate in z axis.
Returns:
list: Coordinates after transformation.
"""
pt_bef = np.array([x, y, z, 1.0]).reshape(4, 1)
pt_aft = np.matmul(T, pt_bef)
return pt_aft[:3].flatten().tolist()
def combine(self, pathnames):
"""Combine predictions in waymo format for each sample together.
Args:
            pathnames (list[str]): Paths of the prediction files to combine.
Returns:
:obj:`Objects`: Combined predictions in Objects proto.
"""
combined = metrics_pb2.Objects()
for pathname in pathnames:
objects = metrics_pb2.Objects()
with open(pathname, 'rb') as f:
objects.ParseFromString(f.read())
for o in objects.objects:
combined.objects.append(o)
return combined
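# Illustrative sketch (not part of the original file): the KITTI rotation_y ->
# Waymo heading conversion used in parse_one_object, applied to a hypothetical
# yaw value and wrapped back into [-pi, pi].
if __name__ == '__main__':
    example_rotation_y = 2.5
    heading = -(example_rotation_y + np.pi / 2)
    while heading < -np.pi:
        heading += 2 * np.pi
    while heading > np.pi:
        heading -= 2 * np.pi
    print(heading)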
================================================
FILE: mmdet3d/core/points/__init__.py
================================================
from .base_points import BasePoints
from .cam_points import CameraPoints
from .depth_points import DepthPoints
from .lidar_points import LiDARPoints
__all__ = ['BasePoints', 'CameraPoints', 'DepthPoints', 'LiDARPoints']
def get_points_type(points_type):
"""Get the class of points according to coordinate type.
Args:
points_type (str): The type of points coordinate.
            The valid values are "CAMERA", "LIDAR", or "DEPTH".
Returns:
class: Points type.
"""
if points_type == 'CAMERA':
points_cls = CameraPoints
elif points_type == 'LIDAR':
points_cls = LiDARPoints
elif points_type == 'DEPTH':
points_cls = DepthPoints
else:
raise ValueError('Only "points_type" of "CAMERA", "LIDAR", or "DEPTH"'
f' are supported, got {points_type}')
return points_cls
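# Minimal usage sketch (not part of the original file): picks the LiDAR points
# class and wraps a small hypothetical point array that carries one extra
# intensity dimension.
if __name__ == '__main__':
    import torch
    points_cls = get_points_type('LIDAR')
    toy_points = points_cls(
        torch.rand(4, 4), points_dim=4, attribute_dims=dict(intensity=3))
    print(type(toy_points).__name__, toy_points.shape)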
================================================
FILE: mmdet3d/core/points/base_points.py
================================================
import numpy as np
import torch
from abc import abstractmethod
class BasePoints(object):
"""Base class for Points.
Args:
tensor (torch.Tensor | np.ndarray | list): a N x points_dim matrix.
points_dim (int): Number of the dimension of a point.
Each row is (x, y, z). Default to 3.
attribute_dims (dict): Dictionary to indicate the meaning of extra
dimension. Default to None.
Attributes:
tensor (torch.Tensor): Float matrix of N x points_dim.
points_dim (int): Integer indicating the dimension of a point.
Each row is (x, y, z, ...).
        attribute_dims (dict): Dictionary to indicate the meaning of extra
            dimension. Default to None.
rotation_axis (int): Default rotation axis for points rotation.
"""
def __init__(self, tensor, points_dim=3, attribute_dims=None):
if isinstance(tensor, torch.Tensor):
device = tensor.device
else:
device = torch.device('cpu')
tensor = torch.as_tensor(tensor, dtype=torch.float32, device=device)
if tensor.numel() == 0:
# Use reshape, so we don't end up creating a new tensor that
# does not depend on the inputs (and consequently confuses jit)
tensor = tensor.reshape((0, points_dim)).to(
dtype=torch.float32, device=device)
assert tensor.dim() == 2 and tensor.size(-1) == \
points_dim, tensor.size()
self.tensor = tensor
self.points_dim = points_dim
self.attribute_dims = attribute_dims
self.rotation_axis = 0
@property
def coord(self):
"""torch.Tensor: Coordinates of each point with size (N, 3)."""
return self.tensor[:, :3]
@property
def height(self):
"""torch.Tensor: A vector with height of each point."""
if self.attribute_dims is not None and \
'height' in self.attribute_dims.keys():
return self.tensor[:, self.attribute_dims['height']]
else:
return None
@property
def color(self):
"""torch.Tensor: A vector with color of each point."""
if self.attribute_dims is not None and \
'color' in self.attribute_dims.keys():
return self.tensor[:, self.attribute_dims['color']]
else:
return None
@property
def shape(self):
"""torch.Shape: Shape of points."""
return self.tensor.shape
def shuffle(self):
"""Shuffle the points."""
self.tensor = self.tensor[torch.randperm(
self.__len__(), device=self.tensor.device)]
def rotate(self, rotation, axis=None):
"""Rotate points with the given rotation matrix or angle.
Args:
rotation (float, np.ndarray, torch.Tensor): Rotation matrix
or angle.
axis (int): Axis to rotate at. Defaults to None.
"""
if not isinstance(rotation, torch.Tensor):
rotation = self.tensor.new_tensor(rotation)
assert rotation.shape == torch.Size([3, 3]) or \
rotation.numel() == 1
if axis is None:
axis = self.rotation_axis
if rotation.numel() == 1:
rot_sin = torch.sin(rotation)
rot_cos = torch.cos(rotation)
if axis == 1:
rot_mat_T = rotation.new_tensor([[rot_cos, 0, -rot_sin],
[0, 1, 0],
[rot_sin, 0, rot_cos]])
elif axis == 2 or axis == -1:
rot_mat_T = rotation.new_tensor([[rot_cos, -rot_sin, 0],
[rot_sin, rot_cos, 0],
[0, 0, 1]])
elif axis == 0:
rot_mat_T = rotation.new_tensor([[0, rot_cos, -rot_sin],
[0, rot_sin, rot_cos],
[1, 0, 0]])
else:
                raise ValueError('axis should be in range')
rot_mat_T = rot_mat_T.T
elif rotation.numel() == 9:
rot_mat_T = rotation
else:
raise NotImplementedError
self.tensor[:, :3] = self.tensor[:, :3] @ rot_mat_T
@abstractmethod
def flip(self, bev_direction='horizontal'):
"""Flip the points in BEV along given BEV direction."""
pass
def translate(self, trans_vector):
"""Translate points with the given translation vector.
Args:
trans_vector (np.ndarray, torch.Tensor): Translation
vector of size 3 or nx3.
"""
if not isinstance(trans_vector, torch.Tensor):
trans_vector = self.tensor.new_tensor(trans_vector)
trans_vector = trans_vector.squeeze(0)
if trans_vector.dim() == 1:
assert trans_vector.shape[0] == 3
elif trans_vector.dim() == 2:
assert trans_vector.shape[0] == self.tensor.shape[0] and \
trans_vector.shape[1] == 3
else:
raise NotImplementedError(
'Unsupported translation vector of shape {}'.format(
trans_vector.shape))
self.tensor[:, :3] += trans_vector
def in_range_3d(self, point_range):
"""Check whether the points are in the given range.
Args:
point_range (list | torch.Tensor): The range of point
(x_min, y_min, z_min, x_max, y_max, z_max)
Note:
In the original implementation of SECOND, checking whether
a box in the range checks whether the points are in a convex
polygon, we try to reduce the burden for simpler cases.
Returns:
torch.Tensor: A binary vector indicating whether each point is \
inside the reference range.
"""
in_range_flags = ((self.tensor[:, 0] > point_range[0])
& (self.tensor[:, 1] > point_range[1])
& (self.tensor[:, 2] > point_range[2])
& (self.tensor[:, 0] < point_range[3])
& (self.tensor[:, 1] < point_range[4])
& (self.tensor[:, 2] < point_range[5]))
return in_range_flags
@abstractmethod
def in_range_bev(self, point_range):
"""Check whether the points are in the given range.
Args:
point_range (list | torch.Tensor): The range of point
in order of (x_min, y_min, x_max, y_max).
Returns:
torch.Tensor: Indicating whether each point is inside \
the reference range.
"""
pass
@abstractmethod
def convert_to(self, dst, rt_mat=None):
"""Convert self to ``dst`` mode.
Args:
            dst (:obj:`CoordMode`): The target Point mode.
rt_mat (np.ndarray | torch.Tensor): The rotation and translation
matrix between different coordinates. Defaults to None.
The conversion from `src` coordinates to `dst` coordinates
usually comes along the change of sensors, e.g., from camera
to LiDAR. This requires a transformation matrix.
Returns:
            :obj:`BasePoints`: The converted points of the same type \
                in the `dst` mode.
"""
pass
def scale(self, scale_factor):
"""Scale the points with horizontal and vertical scaling factors.
Args:
            scale_factor (float): Scale factor to scale the points.
"""
self.tensor[:, :3] *= scale_factor
def __getitem__(self, item):
"""
Note:
The following usage are allowed:
1. `new_points = points[3]`:
return a `Points` that contains only one point.
2. `new_points = points[2:10]`:
return a slice of points.
3. `new_points = points[vector]`:
where vector is a torch.BoolTensor with `length = len(points)`.
Nonzero elements in the vector will be selected.
4. `new_points = points[3:11, vector]`:
return a slice of points and attribute dims.
Note that the returned Points might share storage with this Points,
subject to Pytorch's indexing semantics.
Returns:
:obj:`BasePoints`: A new object of \
:class:`BasePoints` after indexing.
"""
original_type = type(self)
if isinstance(item, int):
return original_type(
self.tensor[item].view(1, -1),
points_dim=self.points_dim,
attribute_dims=self.attribute_dims)
elif isinstance(item, tuple) and len(item) == 2:
if isinstance(item[1], slice):
start = 0 if item[1].start is None else item[1].start
stop = self.tensor.shape[1] + \
1 if item[1].stop is None else item[1].stop
step = 1 if item[1].step is None else item[1].step
item = list(item)
item[1] = list(range(start, stop, step))
item = tuple(item)
p = self.tensor[item[0], item[1]]
keep_dims = list(
set(item[1]).intersection(set(range(3, self.tensor.shape[1]))))
if self.attribute_dims is not None:
attribute_dims = self.attribute_dims.copy()
for key in self.attribute_dims.keys():
cur_attribute_dim = attribute_dims[key]
if isinstance(cur_attribute_dim, int):
cur_attribute_dims = [cur_attribute_dim]
intersect_attr = list(
set(cur_attribute_dims).intersection(set(keep_dims)))
if len(intersect_attr) == 1:
attribute_dims[key] = intersect_attr[0]
elif len(intersect_attr) > 1:
attribute_dims[key] = intersect_attr
else:
attribute_dims.pop(key)
else:
attribute_dims = None
elif isinstance(item, (slice, np.ndarray, torch.Tensor)):
p = self.tensor[item]
attribute_dims = self.attribute_dims
else:
raise NotImplementedError(f'Invalid slice {item}!')
assert p.dim() == 2, \
f'Indexing on Points with {item} failed to return a matrix!'
return original_type(
p, points_dim=p.shape[1], attribute_dims=attribute_dims)
def __len__(self):
"""int: Number of points in the current object."""
return self.tensor.shape[0]
def __repr__(self):
"""str: Return a strings that describes the object."""
return self.__class__.__name__ + '(\n ' + str(self.tensor) + ')'
@classmethod
def cat(cls, points_list):
"""Concatenate a list of Points into a single Points.
Args:
points_list (list[:obj:`BasePoints`]): List of points.
Returns:
:obj:`BasePoints`: The concatenated Points.
"""
assert isinstance(points_list, (list, tuple))
if len(points_list) == 0:
return cls(torch.empty(0))
assert all(isinstance(points, cls) for points in points_list)
# use torch.cat (v.s. layers.cat)
# so the returned points never share storage with input
cat_points = cls(
torch.cat([p.tensor for p in points_list], dim=0),
points_dim=points_list[0].tensor.shape[1],
attribute_dims=points_list[0].attribute_dims)
return cat_points
def to(self, device):
"""Convert current points to a specific device.
Args:
device (str | :obj:`torch.device`): The name of the device.
Returns:
            :obj:`BasePoints`: A new points object on the \
                specific device.
"""
original_type = type(self)
return original_type(
self.tensor.to(device),
points_dim=self.points_dim,
attribute_dims=self.attribute_dims)
def clone(self):
"""Clone the Points.
Returns:
            :obj:`BasePoints`: Point object with the same properties \
                as self.
"""
original_type = type(self)
return original_type(
self.tensor.clone(),
points_dim=self.points_dim,
attribute_dims=self.attribute_dims)
@property
def device(self):
"""str: The device of the points are on."""
return self.tensor.device
def __iter__(self):
"""Yield a point as a Tensor of shape (4,) at a time.
Returns:
torch.Tensor: A point of shape (4,).
"""
yield from self.tensor
def new_point(self, data):
"""Create a new point object with data.
        The new point and its tensor have similar properties \
as self and self.tensor, respectively.
Args:
data (torch.Tensor | numpy.array | list): Data to be copied.
Returns:
:obj:`BasePoints`: A new point object with ``data``, \
the object's other properties are similar to ``self``.
"""
new_tensor = self.tensor.new_tensor(data) \
if not isinstance(data, torch.Tensor) else data.to(self.device)
original_type = type(self)
return original_type(
new_tensor,
points_dim=self.points_dim,
attribute_dims=self.attribute_dims)
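# Minimal usage sketch (not part of the original file): builds a small
# hypothetical point set with a height attribute, then translates and slices
# it to show how attribute_dims is carried along.
if __name__ == '__main__':
    toy_points = BasePoints(
        torch.tensor([[0.0, 0.0, 0.0, 1.5], [1.0, 2.0, 3.0, 2.0]]),
        points_dim=4,
        attribute_dims=dict(height=3))
    toy_points.translate([1.0, 0.0, 0.0])
    print(toy_points.height)
    print(len(toy_points[0:1]))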
================================================
FILE: mmdet3d/core/points/cam_points.py
================================================
from .base_points import BasePoints
class CameraPoints(BasePoints):
"""Points of instances in CAM coordinates.
Args:
tensor (torch.Tensor | np.ndarray | list): a N x points_dim matrix.
points_dim (int): Number of the dimension of a point.
Each row is (x, y, z). Default to 3.
attribute_dims (dict): Dictionary to indicate the meaning of extra
dimension. Default to None.
Attributes:
tensor (torch.Tensor): Float matrix of N x points_dim.
points_dim (int): Integer indicating the dimension of a point.
Each row is (x, y, z, ...).
        attribute_dims (dict): Dictionary to indicate the meaning of extra
            dimension. Default to None.
rotation_axis (int): Default rotation axis for points rotation.
"""
def __init__(self, tensor, points_dim=3, attribute_dims=None):
super(CameraPoints, self).__init__(
tensor, points_dim=points_dim, attribute_dims=attribute_dims)
self.rotation_axis = 1
def flip(self, bev_direction='horizontal'):
"""Flip the boxes in BEV along given BEV direction."""
if bev_direction == 'horizontal':
self.tensor[:, 0] = -self.tensor[:, 0]
elif bev_direction == 'vertical':
self.tensor[:, 2] = -self.tensor[:, 2]
def in_range_bev(self, point_range):
"""Check whether the points are in the given range.
Args:
point_range (list | torch.Tensor): The range of point
in order of (x_min, y_min, x_max, y_max).
Returns:
torch.Tensor: Indicating whether each point is inside \
the reference range.
"""
in_range_flags = ((self.tensor[:, 0] > point_range[0])
& (self.tensor[:, 2] > point_range[1])
& (self.tensor[:, 0] < point_range[2])
& (self.tensor[:, 2] < point_range[3]))
return in_range_flags
def convert_to(self, dst, rt_mat=None):
"""Convert self to ``dst`` mode.
Args:
dst (:obj:`CoordMode`): The target Point mode.
rt_mat (np.ndarray | torch.Tensor): The rotation and translation
matrix between different coordinates. Defaults to None.
The conversion from `src` coordinates to `dst` coordinates
usually comes along the change of sensors, e.g., from camera
to LiDAR. This requires a transformation matrix.
Returns:
:obj:`BasePoints`: The converted point of the same type \
in the `dst` mode.
"""
from mmdet3d.core.bbox import Coord3DMode
return Coord3DMode.convert_point(
point=self, src=Coord3DMode.CAM, dst=dst, rt_mat=rt_mat)
================================================
FILE: mmdet3d/core/points/depth_points.py
================================================
from .base_points import BasePoints
class DepthPoints(BasePoints):
"""Points of instances in DEPTH coordinates.
Args:
tensor (torch.Tensor | np.ndarray | list): a N x points_dim matrix.
points_dim (int): Number of the dimension of a point.
Each row is (x, y, z). Default to 3.
attribute_dims (dict): Dictionary to indicate the meaning of extra
dimension. Default to None.
Attributes:
tensor (torch.Tensor): Float matrix of N x points_dim.
points_dim (int): Integer indicating the dimension of a point.
Each row is (x, y, z, ...).
        attribute_dims (dict): Dictionary to indicate the meaning of extra
            dimension. Default to None.
rotation_axis (int): Default rotation axis for points rotation.
"""
def __init__(self, tensor, points_dim=3, attribute_dims=None):
super(DepthPoints, self).__init__(
tensor, points_dim=points_dim, attribute_dims=attribute_dims)
self.rotation_axis = 2
def flip(self, bev_direction='horizontal'):
"""Flip the boxes in BEV along given BEV direction."""
if bev_direction == 'horizontal':
self.tensor[:, 0] = -self.tensor[:, 0]
elif bev_direction == 'vertical':
self.tensor[:, 1] = -self.tensor[:, 1]
def in_range_bev(self, point_range):
"""Check whether the points are in the given range.
Args:
point_range (list | torch.Tensor): The range of point
in order of (x_min, y_min, x_max, y_max).
Returns:
torch.Tensor: Indicating whether each point is inside \
the reference range.
"""
in_range_flags = ((self.tensor[:, 0] > point_range[0])
& (self.tensor[:, 1] > point_range[1])
& (self.tensor[:, 0] < point_range[2])
& (self.tensor[:, 1] < point_range[3]))
return in_range_flags
def convert_to(self, dst, rt_mat=None):
"""Convert self to ``dst`` mode.
Args:
dst (:obj:`CoordMode`): The target Point mode.
rt_mat (np.ndarray | torch.Tensor): The rotation and translation
matrix between different coordinates. Defaults to None.
The conversion from `src` coordinates to `dst` coordinates
usually comes along the change of sensors, e.g., from camera
to LiDAR. This requires a transformation matrix.
Returns:
:obj:`BasePoints`: The converted point of the same type \
in the `dst` mode.
"""
from mmdet3d.core.bbox import Coord3DMode
return Coord3DMode.convert_point(
point=self, src=Coord3DMode.DEPTH, dst=dst, rt_mat=rt_mat)
================================================
FILE: mmdet3d/core/points/lidar_points.py
================================================
from .base_points import BasePoints
class LiDARPoints(BasePoints):
"""Points of instances in LIDAR coordinates.
Args:
tensor (torch.Tensor | np.ndarray | list): a N x points_dim matrix.
points_dim (int): Number of the dimension of a point.
Each row is (x, y, z). Default to 3.
attribute_dims (dict): Dictionary to indicate the meaning of extra
dimension. Default to None.
Attributes:
tensor (torch.Tensor): Float matrix of N x points_dim.
points_dim (int): Integer indicating the dimension of a point.
Each row is (x, y, z, ...).
        attribute_dims (dict): Dictionary to indicate the meaning of extra
            dimension. Default to None.
rotation_axis (int): Default rotation axis for points rotation.
"""
def __init__(self, tensor, points_dim=3, attribute_dims=None):
super(LiDARPoints, self).__init__(
tensor, points_dim=points_dim, attribute_dims=attribute_dims)
self.rotation_axis = 2
def flip(self, bev_direction='horizontal'):
"""Flip the boxes in BEV along given BEV direction."""
if bev_direction == 'horizontal':
self.tensor[:, 1] = -self.tensor[:, 1]
elif bev_direction == 'vertical':
self.tensor[:, 0] = -self.tensor[:, 0]
def in_range_bev(self, point_range):
"""Check whether the points are in the given range.
Args:
point_range (list | torch.Tensor): The range of point
in order of (x_min, y_min, x_max, y_max).
Returns:
torch.Tensor: Indicating whether each point is inside \
the reference range.
"""
in_range_flags = ((self.tensor[:, 0] > point_range[0])
& (self.tensor[:, 1] > point_range[1])
& (self.tensor[:, 0] < point_range[2])
& (self.tensor[:, 1] < point_range[3]))
return in_range_flags
def convert_to(self, dst, rt_mat=None):
"""Convert self to ``dst`` mode.
Args:
dst (:obj:`CoordMode`): The target Point mode.
rt_mat (np.ndarray | torch.Tensor): The rotation and translation
matrix between different coordinates. Defaults to None.
The conversion from `src` coordinates to `dst` coordinates
usually comes along the change of sensors, e.g., from camera
to LiDAR. This requires a transformation matrix.
Returns:
:obj:`BasePoints`: The converted point of the same type \
in the `dst` mode.
"""
from mmdet3d.core.bbox import Coord3DMode
return Coord3DMode.convert_point(
point=self, src=Coord3DMode.LIDAR, dst=dst, rt_mat=rt_mat)
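# Minimal usage sketch (not part of the original file): in LiDAR coordinates a
# horizontal BEV flip negates the y axis, shown here on one hypothetical point.
if __name__ == '__main__':
    import torch
    toy_points = LiDARPoints(torch.tensor([[1.0, 2.0, 3.0]]))
    toy_points.flip('horizontal')
    print(toy_points.tensor)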
================================================
FILE: mmdet3d/core/post_processing/__init__.py
================================================
from mmdet.core.post_processing import (merge_aug_bboxes, merge_aug_masks,
merge_aug_proposals, merge_aug_scores,
multiclass_nms)
from .box3d_nms import aligned_3d_nms, box3d_multiclass_nms, circle_nms
from .merge_augs import merge_aug_bboxes_3d
__all__ = [
'multiclass_nms', 'merge_aug_proposals', 'merge_aug_bboxes',
'merge_aug_scores', 'merge_aug_masks', 'box3d_multiclass_nms',
'aligned_3d_nms', 'merge_aug_bboxes_3d', 'circle_nms'
]
================================================
FILE: mmdet3d/core/post_processing/box3d_nms.py
================================================
import numba
import numpy as np
import torch
from mmdet3d.ops.iou3d.iou3d_utils import nms_gpu, nms_normal_gpu
def box3d_multiclass_nms(mlvl_bboxes,
mlvl_bboxes_for_nms,
mlvl_scores,
score_thr,
max_num,
cfg,
mlvl_dir_scores=None):
"""Multi-class nms for 3D boxes.
Args:
mlvl_bboxes (torch.Tensor): Multi-level boxes with shape (N, M).
M is the dimensions of boxes.
        mlvl_bboxes_for_nms (torch.Tensor): Multi-level boxes with shape
            (N, 5) ([x1, y1, x2, y2, ry]). N is the number of boxes.
        mlvl_scores (torch.Tensor): Multi-level scores with shape
            (N, C + 1). N is the number of boxes. C is the number of classes.
        score_thr (float): Score threshold to filter boxes with low
            confidence.
        max_num (int): Maximum number of boxes to be kept.
cfg (dict): Configuration dict of NMS.
mlvl_dir_scores (torch.Tensor, optional): Multi-level scores
of direction classifier. Defaults to None.
Returns:
tuple[torch.Tensor]: Return results after nms, including 3D \
bounding boxes, scores, labels and direction scores.
"""
# do multi class nms
# the fg class id range: [0, num_classes-1]
num_classes = mlvl_scores.shape[1] - 1
bboxes = []
scores = []
labels = []
dir_scores = []
for i in range(0, num_classes):
# get bboxes and scores of this class
cls_inds = mlvl_scores[:, i] > score_thr
if not cls_inds.any():
continue
_scores = mlvl_scores[cls_inds, i]
_bboxes_for_nms = mlvl_bboxes_for_nms[cls_inds, :]
if cfg.use_rotate_nms:
nms_func = nms_gpu
else:
nms_func = nms_normal_gpu
selected = nms_func(_bboxes_for_nms, _scores, cfg.nms_thr)
_mlvl_bboxes = mlvl_bboxes[cls_inds, :]
bboxes.append(_mlvl_bboxes[selected])
scores.append(_scores[selected])
cls_label = mlvl_bboxes.new_full((len(selected), ),
i,
dtype=torch.long)
labels.append(cls_label)
if mlvl_dir_scores is not None:
_mlvl_dir_scores = mlvl_dir_scores[cls_inds]
dir_scores.append(_mlvl_dir_scores[selected])
if bboxes:
bboxes = torch.cat(bboxes, dim=0)
scores = torch.cat(scores, dim=0)
labels = torch.cat(labels, dim=0)
if mlvl_dir_scores is not None:
dir_scores = torch.cat(dir_scores, dim=0)
if bboxes.shape[0] > max_num:
_, inds = scores.sort(descending=True)
inds = inds[:max_num]
bboxes = bboxes[inds, :]
labels = labels[inds]
scores = scores[inds]
if mlvl_dir_scores is not None:
dir_scores = dir_scores[inds]
else:
bboxes = mlvl_scores.new_zeros((0, mlvl_bboxes.size(-1)))
scores = mlvl_scores.new_zeros((0, ))
labels = mlvl_scores.new_zeros((0, ), dtype=torch.long)
dir_scores = mlvl_scores.new_zeros((0, ))
return bboxes, scores, labels, dir_scores
def aligned_3d_nms(boxes, scores, classes, thresh):
"""3d nms for aligned boxes.
Args:
boxes (torch.Tensor): Aligned box with shape [n, 6].
scores (torch.Tensor): Scores of each box.
classes (torch.Tensor): Class of each box.
thresh (float): Iou threshold for nms.
Returns:
torch.Tensor: Indices of selected boxes.
"""
x1 = boxes[:, 0]
y1 = boxes[:, 1]
z1 = boxes[:, 2]
x2 = boxes[:, 3]
y2 = boxes[:, 4]
z2 = boxes[:, 5]
area = (x2 - x1) * (y2 - y1) * (z2 - z1)
zero = boxes.new_zeros(1, )
score_sorted = torch.argsort(scores)
pick = []
while (score_sorted.shape[0] != 0):
last = score_sorted.shape[0]
i = score_sorted[-1]
pick.append(i)
xx1 = torch.max(x1[i], x1[score_sorted[:last - 1]])
yy1 = torch.max(y1[i], y1[score_sorted[:last - 1]])
zz1 = torch.max(z1[i], z1[score_sorted[:last - 1]])
xx2 = torch.min(x2[i], x2[score_sorted[:last - 1]])
yy2 = torch.min(y2[i], y2[score_sorted[:last - 1]])
zz2 = torch.min(z2[i], z2[score_sorted[:last - 1]])
classes1 = classes[i]
classes2 = classes[score_sorted[:last - 1]]
inter_l = torch.max(zero, xx2 - xx1)
inter_w = torch.max(zero, yy2 - yy1)
inter_h = torch.max(zero, zz2 - zz1)
inter = inter_l * inter_w * inter_h
iou = inter / (area[i] + area[score_sorted[:last - 1]] - inter)
iou = iou * (classes1 == classes2).float()
score_sorted = score_sorted[torch.nonzero(
iou <= thresh, as_tuple=False).flatten()]
indices = boxes.new_tensor(pick, dtype=torch.long)
return indices
@numba.jit(nopython=True)
def circle_nms(dets, thresh, socre_thre=0, post_max_size=83):
"""Circular NMS.
    An object is only kept if no other center with a higher confidence
    exists within a radius r, using a bird's-eye view distance metric.
    Args:
        dets (np.ndarray): Detection results with the shape of [N, 3],
            each row being (x, y, score).
        thresh (float): Squared center-distance threshold below which a
            lower-scoring detection is suppressed.
        socre_thre (float): Minimum score gap between the kept detection
            and its neighbor required for suppression. Defaults to 0.
        post_max_size (int): Max number of predictions to be kept. Defaults
            to 83.
    Returns:
        list[int]: Indexes of the detections to be kept.
"""
x1 = dets[:, 0]
y1 = dets[:, 1]
scores = dets[:, 2]
order = scores.argsort()[::-1].astype(np.int32) # highest->lowest
ndets = dets.shape[0]
suppressed = np.zeros((ndets), dtype=np.int32)
keep = []
for _i in range(ndets):
i = order[_i] # start with highest score box
if suppressed[
i] == 1: # if any box have enough iou with this, remove it
continue
keep.append(i)
for _j in range(_i + 1, ndets):
j = order[_j]
if suppressed[j] == 1:
continue
# calculate center distance between i and j box
dist = (x1[i] - x1[j])**2 + (y1[i] - y1[j])**2
# ovr = inter / areas[j]
if dist <= thresh and scores[i] - scores[j] > socre_thre:
suppressed[j] = 1
return keep[:post_max_size]
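# Minimal usage sketch (not part of the original file): runs circle_nms on a
# few hypothetical BEV centers given as (x, y, score). With a squared-distance
# threshold of 1.0, the lower-scoring second center is suppressed by the first.
if __name__ == '__main__':
    toy_dets = np.array([[0.0, 0.0, 0.9],
                         [0.5, 0.0, 0.6],
                         [5.0, 5.0, 0.8]], dtype=np.float32)
    print(circle_nms(toy_dets, 1.0))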
================================================
FILE: mmdet3d/core/post_processing/merge_augs.py
================================================
import torch
from mmdet3d.ops.iou3d.iou3d_utils import nms_gpu, nms_normal_gpu
from ..bbox import bbox3d2result, bbox3d_mapping_back, xywhr2xyxyr
def merge_aug_bboxes_3d(aug_results, img_metas, test_cfg):
"""Merge augmented detection 3D bboxes and scores.
Args:
aug_results (list[dict]): The dict of detection results.
The dict contains the following keys
- boxes_3d (:obj:`BaseInstance3DBoxes`): Detection bbox.
- scores_3d (torch.Tensor): Detection scores.
- labels_3d (torch.Tensor): Predicted box labels.
img_metas (list[dict]): Meta information of each sample.
test_cfg (dict): Test config.
Returns:
dict: Bounding boxes results in cpu mode, containing merged results.
- boxes_3d (:obj:`BaseInstance3DBoxes`): Merged detection bbox.
- scores_3d (torch.Tensor): Merged detection scores.
- labels_3d (torch.Tensor): Merged predicted box labels.
"""
assert len(aug_results) == len(img_metas), \
'"aug_results" should have the same length as "img_metas", got len(' \
f'aug_results)={len(aug_results)} and len(img_metas)={len(img_metas)}'
recovered_bboxes = []
recovered_scores = []
recovered_labels = []
for bboxes, img_info in zip(aug_results, img_metas):
scale_factor = img_info[0]['pcd_scale_factor']
pcd_horizontal_flip = img_info[0]['pcd_horizontal_flip']
pcd_vertical_flip = img_info[0]['pcd_vertical_flip']
recovered_scores.append(bboxes['scores_3d'])
recovered_labels.append(bboxes['labels_3d'])
bboxes = bbox3d_mapping_back(bboxes['boxes_3d'], scale_factor,
pcd_horizontal_flip, pcd_vertical_flip)
recovered_bboxes.append(bboxes)
aug_bboxes = recovered_bboxes[0].cat(recovered_bboxes)
aug_bboxes_for_nms = xywhr2xyxyr(aug_bboxes.bev)
aug_scores = torch.cat(recovered_scores, dim=0)
aug_labels = torch.cat(recovered_labels, dim=0)
    # TODO: use a more elegant way to deal with nms
if test_cfg.use_rotate_nms:
nms_func = nms_gpu
else:
nms_func = nms_normal_gpu
merged_bboxes = []
merged_scores = []
merged_labels = []
# Apply multi-class nms when merge bboxes
if len(aug_labels) == 0:
return bbox3d2result(aug_bboxes, aug_scores, aug_labels)
for class_id in range(torch.max(aug_labels).item() + 1):
class_inds = (aug_labels == class_id)
bboxes_i = aug_bboxes[class_inds]
bboxes_nms_i = aug_bboxes_for_nms[class_inds, :]
scores_i = aug_scores[class_inds]
labels_i = aug_labels[class_inds]
if len(bboxes_nms_i) == 0:
continue
selected = nms_func(bboxes_nms_i, scores_i, test_cfg.nms_thr)
merged_bboxes.append(bboxes_i[selected, :])
merged_scores.append(scores_i[selected])
merged_labels.append(labels_i[selected])
merged_bboxes = merged_bboxes[0].cat(merged_bboxes)
merged_scores = torch.cat(merged_scores, dim=0)
merged_labels = torch.cat(merged_labels, dim=0)
_, order = merged_scores.sort(0, descending=True)
num = min(test_cfg.max_num, len(aug_bboxes))
order = order[:num]
merged_bboxes = merged_bboxes[order]
merged_scores = merged_scores[order]
merged_labels = merged_labels[order]
return bbox3d2result(merged_bboxes, merged_scores, merged_labels)
================================================
FILE: mmdet3d/core/utils/__init__.py
================================================
from .gaussian import draw_heatmap_gaussian, gaussian_2d, gaussian_radius
__all__ = ['gaussian_2d', 'gaussian_radius', 'draw_heatmap_gaussian']
================================================
FILE: mmdet3d/core/utils/gaussian.py
================================================
import numpy as np
import torch
def gaussian_2d(shape, sigma=1):
"""Generate gaussian map.
Args:
shape (list[int]): Shape of the map.
sigma (float): Sigma to generate gaussian map.
Defaults to 1.
Returns:
np.ndarray: Generated gaussian map.
"""
m, n = [(ss - 1.) / 2. for ss in shape]
y, x = np.ogrid[-m:m + 1, -n:n + 1]
h = np.exp(-(x * x + y * y) / (2 * sigma * sigma))
h[h < np.finfo(h.dtype).eps * h.max()] = 0
return h
def draw_heatmap_gaussian(heatmap, center, radius, k=1):
"""Get gaussian masked heatmap.
Args:
heatmap (torch.Tensor): Heatmap to be masked.
center (torch.Tensor): Center coord of the heatmap.
        radius (int): Radius of gaussian.
        k (int): Multiple of masked_gaussian. Defaults to 1.
Returns:
torch.Tensor: Masked heatmap.
"""
diameter = 2 * radius + 1
gaussian = gaussian_2d((diameter, diameter), sigma=diameter / 6)
x, y = int(center[0]), int(center[1])
height, width = heatmap.shape[0:2]
left, right = min(x, radius), min(width - x, radius + 1)
top, bottom = min(y, radius), min(height - y, radius + 1)
masked_heatmap = heatmap[y - top:y + bottom, x - left:x + right]
masked_gaussian = torch.from_numpy(
gaussian[radius - top:radius + bottom,
radius - left:radius + right]).to(heatmap.device,
torch.float32)
if min(masked_gaussian.shape) > 0 and min(masked_heatmap.shape) > 0:
torch.max(masked_heatmap, masked_gaussian * k, out=masked_heatmap)
return heatmap
def gaussian_radius(det_size, min_overlap=0.5):
"""Get radius of gaussian.
Args:
det_size (tuple[torch.Tensor]): Size of the detection result.
min_overlap (float): Gaussian_overlap. Defaults to 0.5.
Returns:
torch.Tensor: Computed radius.
"""
height, width = det_size
a1 = 1
b1 = (height + width)
c1 = width * height * (1 - min_overlap) / (1 + min_overlap)
sq1 = torch.sqrt(b1**2 - 4 * a1 * c1)
r1 = (b1 + sq1) / 2
a2 = 4
b2 = 2 * (height + width)
c2 = (1 - min_overlap) * width * height
sq2 = torch.sqrt(b2**2 - 4 * a2 * c2)
r2 = (b2 + sq2) / 2
a3 = 4 * min_overlap
b3 = -2 * min_overlap * (height + width)
c3 = (min_overlap - 1) * width * height
sq3 = torch.sqrt(b3**2 - 4 * a3 * c3)
r3 = (b3 + sq3) / 2
return min(r1, r2, r3)
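# Minimal usage sketch (not part of the original file): picks a radius for a
# hypothetical 10 x 6 object with gaussian_radius, then splats a gaussian onto
# an empty heatmap around a hypothetical center.
if __name__ == '__main__':
    toy_radius = max(
        1, int(gaussian_radius((torch.tensor(10.0), torch.tensor(6.0)))))
    toy_heatmap = torch.zeros(32, 32)
    draw_heatmap_gaussian(toy_heatmap, torch.tensor([16, 16]), toy_radius)
    print(toy_heatmap.max())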
================================================
FILE: mmdet3d/core/visualizer/__init__.py
================================================
from .show_result import show_result
__all__ = ['show_result']
================================================
FILE: mmdet3d/core/visualizer/open3d_vis.py
================================================
import cv2
import numpy as np
import torch
from matplotlib import pyplot as plt
try:
import open3d as o3d
from open3d import geometry
except ImportError:
raise ImportError(
'Please run "pip install open3d" to install open3d first.')
def _draw_points(points,
vis,
points_size=2,
point_color=(0.5, 0.5, 0.5),
mode='xyz'):
"""Draw points on visualizer.
Args:
points (numpy.array | torch.tensor, shape=[N, 3+C]):
points to visualize.
vis (:obj:`open3d.visualization.Visualizer`): open3d visualizer.
points_size (int): the size of points to show on visualizer.
Default: 2.
point_color (tuple[float]): the color of points.
Default: (0.5, 0.5, 0.5).
        mode (str): indicate the type of the input points, available modes
            ['xyz', 'xyzrgb']. Default: 'xyz'.
Returns:
tuple: points, color of each point.
"""
vis.get_render_option().point_size = points_size # set points size
if isinstance(points, torch.Tensor):
points = points.cpu().numpy()
points = points.copy()
pcd = geometry.PointCloud()
if mode == 'xyz':
pcd.points = o3d.utility.Vector3dVector(points[:, :3])
points_colors = np.tile(np.array(point_color), (points.shape[0], 1))
elif mode == 'xyzrgb':
pcd.points = o3d.utility.Vector3dVector(points[:, :3])
points_colors = points[:, 3:6]
else:
raise NotImplementedError
pcd.colors = o3d.utility.Vector3dVector(points_colors)
vis.add_geometry(pcd)
return pcd, points_colors
def _draw_bboxes(bbox3d,
vis,
points_colors,
pcd=None,
bbox_color=(0, 1, 0),
points_in_box_color=(1, 0, 0),
rot_axis=2,
center_mode='lidar_bottom',
mode='xyz'):
"""Draw bbox on visualizer and change the color of points inside bbox3d.
Args:
bbox3d (numpy.array | torch.tensor, shape=[M, 7]):
3d bbox (x, y, z, dx, dy, dz, yaw) to visualize.
vis (:obj:`open3d.visualization.Visualizer`): open3d visualizer.
        points_colors (numpy.array): color of each point.
pcd (:obj:`open3d.geometry.PointCloud`): point cloud. Default: None.
bbox_color (tuple[float]): the color of bbox. Default: (0, 1, 0).
points_in_box_color (tuple[float]):
the color of points inside bbox3d. Default: (1, 0, 0).
rot_axis (int): rotation axis of bbox. Default: 2.
        center_mode (str): indicate whether the center of bbox is the bottom
            center or the gravity center, available modes
            ['lidar_bottom', 'camera_bottom']. Default: 'lidar_bottom'.
        mode (str): indicate the type of the input points, available modes
            ['xyz', 'xyzrgb']. Default: 'xyz'.
"""
if isinstance(bbox3d, torch.Tensor):
bbox3d = bbox3d.cpu().numpy()
bbox3d = bbox3d.copy()
in_box_color = np.array(points_in_box_color)
for i in range(len(bbox3d)):
center = bbox3d[i, 0:3]
dim = bbox3d[i, 3:6]
yaw = np.zeros(3)
yaw[rot_axis] = -bbox3d[i, 6]
rot_mat = geometry.get_rotation_matrix_from_xyz(yaw)
if center_mode == 'lidar_bottom':
center[rot_axis] += dim[
rot_axis] / 2 # bottom center to gravity center
elif center_mode == 'camera_bottom':
center[rot_axis] -= dim[
rot_axis] / 2 # bottom center to gravity center
box3d = geometry.OrientedBoundingBox(center, rot_mat, dim)
line_set = geometry.LineSet.create_from_oriented_bounding_box(box3d)
line_set.paint_uniform_color(bbox_color)
# draw bboxes on visualizer
vis.add_geometry(line_set)
# change the color of points which are in box
if pcd is not None and mode == 'xyz':
indices = box3d.get_point_indices_within_bounding_box(pcd.points)
points_colors[indices] = in_box_color
# update points colors
if pcd is not None:
pcd.colors = o3d.utility.Vector3dVector(points_colors)
vis.update_geometry(pcd)
def show_pts_boxes(points,
bbox3d=None,
show=True,
save_path=None,
points_size=2,
point_color=(0.5, 0.5, 0.5),
bbox_color=(0, 1, 0),
points_in_box_color=(1, 0, 0),
rot_axis=2,
center_mode='lidar_bottom',
mode='xyz'):
"""Draw bbox and points on visualizer.
Args:
points (numpy.array | torch.tensor, shape=[N, 3+C]):
points to visualize.
bbox3d (numpy.array | torch.tensor, shape=[M, 7]):
3d bbox (x, y, z, dx, dy, dz, yaw) to visualize. Default: None.
show (bool): whether to show the visualization results. Default: True.
save_path (str): path to save visualized results. Default: None.
points_size (int): the size of points to show on visualizer.
Default: 2.
point_color (tuple[float]): the color of points.
Default: (0.5, 0.5, 0.5).
bbox_color (tuple[float]): the color of bbox. Default: (0, 1, 0).
points_in_box_color (tuple[float]):
the color of points which are in bbox3d. Default: (1, 0, 0).
rot_axis (int): rotation axis of bbox. Default: 2.
        center_mode (str): indicate whether the center of bbox is the bottom
            center or the gravity center, available modes
            ['lidar_bottom', 'camera_bottom']. Default: 'lidar_bottom'.
        mode (str): indicate the type of the input points, available modes
            ['xyz', 'xyzrgb']. Default: 'xyz'.
"""
# TODO: support score and class info
assert 0 <= rot_axis <= 2
# init visualizer
vis = o3d.visualization.Visualizer()
vis.create_window()
mesh_frame = geometry.TriangleMesh.create_coordinate_frame(
size=1, origin=[0, 0, 0]) # create coordinate frame
vis.add_geometry(mesh_frame)
# draw points
pcd, points_colors = _draw_points(points, vis, points_size, point_color,
mode)
# draw boxes
if bbox3d is not None:
_draw_bboxes(bbox3d, vis, points_colors, pcd, bbox_color,
points_in_box_color, rot_axis, center_mode, mode)
if show:
vis.run()
if save_path is not None:
vis.capture_screen_image(save_path)
vis.destroy_window()
def _draw_bboxes_ind(bbox3d,
vis,
indices,
points_colors,
pcd=None,
bbox_color=(0, 1, 0),
points_in_box_color=(1, 0, 0),
rot_axis=2,
center_mode='lidar_bottom',
mode='xyz'):
"""Draw bbox on visualizer and change the color or points inside bbox3d
with indices.
Args:
bbox3d (numpy.array | torch.tensor, shape=[M, 7]):
3d bbox (x, y, z, dx, dy, dz, yaw) to visualize.
vis (:obj:`open3d.visualization.Visualizer`): open3d visualizer.
indices (numpy.array | torch.tensor, shape=[N, M]):
indicate which bbox3d that each point lies in.
        points_colors (numpy.array): color of each point.
pcd (:obj:`open3d.geometry.PointCloud`): point cloud. Default: None.
bbox_color (tuple[float]): the color of bbox. Default: (0, 1, 0).
points_in_box_color (tuple[float]):
the color of points which are in bbox3d. Default: (1, 0, 0).
rot_axis (int): rotation axis of bbox. Default: 2.
        center_mode (str): indicate whether the center of bbox is the bottom
            center or the gravity center, available modes
            ['lidar_bottom', 'camera_bottom']. Default: 'lidar_bottom'.
        mode (str): indicate the type of the input points, available modes
            ['xyz', 'xyzrgb']. Default: 'xyz'.
"""
if isinstance(bbox3d, torch.Tensor):
bbox3d = bbox3d.cpu().numpy()
if isinstance(indices, torch.Tensor):
indices = indices.cpu().numpy()
bbox3d = bbox3d.copy()
in_box_color = np.array(points_in_box_color)
for i in range(len(bbox3d)):
center = bbox3d[i, 0:3]
dim = bbox3d[i, 3:6]
yaw = np.zeros(3)
# TODO: fix problem of current coordinate system
# dim[0], dim[1] = dim[1], dim[0] # for current coordinate
# yaw[rot_axis] = -(bbox3d[i, 6] - 0.5 * np.pi)
yaw[rot_axis] = -bbox3d[i, 6]
rot_mat = geometry.get_rotation_matrix_from_xyz(yaw)
if center_mode == 'lidar_bottom':
center[rot_axis] += dim[
rot_axis] / 2 # bottom center to gravity center
elif center_mode == 'camera_bottom':
center[rot_axis] -= dim[
rot_axis] / 2 # bottom center to gravity center
box3d = geometry.OrientedBoundingBox(center, rot_mat, dim)
line_set = geometry.LineSet.create_from_oriented_bounding_box(box3d)
line_set.paint_uniform_color(bbox_color)
# draw bboxes on visualizer
vis.add_geometry(line_set)
# change the color of points which are in box
if pcd is not None and mode == 'xyz':
            points_colors[indices[:, i].astype(bool)] = in_box_color
# update points colors
if pcd is not None:
pcd.colors = o3d.utility.Vector3dVector(points_colors)
vis.update_geometry(pcd)
def show_pts_index_boxes(points,
bbox3d=None,
show=True,
indices=None,
save_path=None,
points_size=2,
point_color=(0.5, 0.5, 0.5),
bbox_color=(0, 1, 0),
points_in_box_color=(1, 0, 0),
rot_axis=2,
center_mode='lidar_bottom',
mode='xyz'):
"""Draw bbox and points on visualizer with indices that indicate which
bbox3d that each point lies in.
Args:
points (numpy.array | torch.tensor, shape=[N, 3+C]):
points to visualize.
bbox3d (numpy.array | torch.tensor, shape=[M, 7]):
3d bbox (x, y, z, dx, dy, dz, yaw) to visualize. Default: None.
show (bool): whether to show the visualization results. Default: True.
indices (numpy.array | torch.tensor, shape=[N, M]):
indicate which bbox3d that each point lies in. Default: None.
save_path (str): path to save visualized results. Default: None.
points_size (int): the size of points to show on visualizer.
Default: 2.
point_color (tuple[float]): the color of points.
Default: (0.5, 0.5, 0.5).
bbox_color (tuple[float]): the color of bbox. Default: (0, 1, 0).
points_in_box_color (tuple[float]):
the color of points which are in bbox3d. Default: (1, 0, 0).
rot_axis (int): rotation axis of bbox. Default: 2.
        center_mode (str): indicate whether the center of bbox is the bottom
            center or the gravity center, available modes
            ['lidar_bottom', 'camera_bottom']. Default: 'lidar_bottom'.
        mode (str): indicate the type of the input points, available modes
            ['xyz', 'xyzrgb']. Default: 'xyz'.
"""
# TODO: support score and class info
assert 0 <= rot_axis <= 2
# init visualizer
vis = o3d.visualization.Visualizer()
vis.create_window()
mesh_frame = geometry.TriangleMesh.create_coordinate_frame(
size=1, origin=[0, 0, 0]) # create coordinate frame
vis.add_geometry(mesh_frame)
# draw points
pcd, points_colors = _draw_points(points, vis, points_size, point_color,
mode)
# draw boxes
if bbox3d is not None:
_draw_bboxes_ind(bbox3d, vis, indices, points_colors, pcd, bbox_color,
points_in_box_color, rot_axis, center_mode, mode)
if show:
vis.run()
if save_path is not None:
vis.capture_screen_image(save_path)
vis.destroy_window()
def project_pts_on_img(points,
raw_img,
lidar2img_rt,
max_distance=70,
thickness=-1):
"""Project the 3D points cloud on 2D image.
Args:
points (numpy.array): 3D points cloud (x, y, z) to visualize.
raw_img (numpy.array): The numpy array of image.
lidar2img_rt (numpy.array, shape=[4, 4]): The projection matrix
according to the camera intrinsic parameters.
max_distance (float): the max distance of the points cloud.
Default: 70.
thickness (int, optional): The thickness of 2D points. Default: -1.
"""
img = raw_img.copy()
num_points = points.shape[0]
pts_4d = np.concatenate([points[:, :3], np.ones((num_points, 1))], axis=-1)
pts_2d = pts_4d @ lidar2img_rt.T
# cam_points is Tensor of Nx4 whose last column is 1
# transform camera coordinate to image coordinate
pts_2d[:, 2] = np.clip(pts_2d[:, 2], a_min=1e-5, a_max=99999)
pts_2d[:, 0] /= pts_2d[:, 2]
pts_2d[:, 1] /= pts_2d[:, 2]
fov_inds = ((pts_2d[:, 0] < img.shape[1])
& (pts_2d[:, 0] >= 0)
& (pts_2d[:, 1] < img.shape[0])
& (pts_2d[:, 1] >= 0))
imgfov_pts_2d = pts_2d[fov_inds, :3] # u, v, d
cmap = plt.cm.get_cmap('hsv', 256)
cmap = np.array([cmap(i) for i in range(256)])[:, :3] * 255
for i in range(imgfov_pts_2d.shape[0]):
depth = imgfov_pts_2d[i, 2]
color = cmap[np.clip(int(max_distance * 10 / depth), 0, 255), :]
cv2.circle(
img,
center=(int(np.round(imgfov_pts_2d[i, 0])),
int(np.round(imgfov_pts_2d[i, 1]))),
radius=1,
color=tuple(color),
thickness=thickness,
)
cv2.imshow('project_pts_img', img)
cv2.waitKey(100)
def project_bbox3d_on_img(bboxes3d,
raw_img,
lidar2img_rt,
color=(0, 255, 0),
thickness=1):
"""Project the 3D bbox on 2D image.
Args:
bboxes3d (numpy.array, shape=[M, 7]):
3d bbox (x, y, z, dx, dy, dz, yaw) to visualize.
raw_img (numpy.array): The numpy array of image.
lidar2img_rt (numpy.array, shape=[4, 4]): The projection matrix
according to the camera intrinsic parameters.
color (tuple[int]): the color to draw bboxes. Default: (0, 255, 0).
thickness (int, optional): The thickness of bboxes. Default: 1.
"""
img = raw_img.copy()
corners_3d = bboxes3d.corners
num_bbox = corners_3d.shape[0]
pts_4d = np.concatenate(
[corners_3d.reshape(-1, 3),
np.ones((num_bbox * 8, 1))], axis=-1)
pts_2d = pts_4d @ lidar2img_rt.T
pts_2d[:, 2] = np.clip(pts_2d[:, 2], a_min=1e-5, a_max=1e5)
pts_2d[:, 0] /= pts_2d[:, 2]
pts_2d[:, 1] /= pts_2d[:, 2]
imgfov_pts_2d = pts_2d[..., :2].reshape(num_bbox, 8, 2)
line_indices = ((0, 1), (0, 3), (0, 4), (1, 2), (1, 5), (3, 2), (3, 7),
(4, 5), (4, 7), (2, 6), (5, 6), (6, 7))
for i in range(num_bbox):
corners = imgfov_pts_2d[i].astype(int)
for start, end in line_indices:
cv2.line(img, (corners[start, 0], corners[start, 1]),
(corners[end, 0], corners[end, 1]), color, thickness,
cv2.LINE_AA)
cv2.imshow('project_bbox3d_img', img)
cv2.waitKey(0)
class Visualizer(object):
r"""Online visualizer implemented with Open3d.
Args:
points (numpy.array, shape=[N, 3+C]): Points to visualize. The point
cloud is in mode of Coord3DMode.DEPTH (please refer to
core.structures.coord_3d_mode).
bbox3d (numpy.array, shape=[M, 7]): 3d bbox (x, y, z, dx, dy, dz, yaw)
to visualize. The 3d bbox is in mode of Box3DMode.DEPTH with
gravity_center (please refer to core.structures.box_3d_mode).
Default: None.
save_path (str): path to save visualized results. Default: None.
points_size (int): the size of points to show on visualizer.
Default: 2.
point_color (tuple[float]): the color of points.
Default: (0.5, 0.5, 0.5).
bbox_color (tuple[float]): the color of bbox. Default: (0, 1, 0).
points_in_box_color (tuple[float]):
the color of points which are in bbox3d. Default: (1, 0, 0).
rot_axis (int): rotation axis of bbox. Default: 2.
center_mode (str): indicate whether the center of bbox is the bottom center
or the gravity center. Available modes are
['lidar_bottom', 'camera_bottom']. Default: 'lidar_bottom'.
mode (str): indicate the type of the input points. Available modes are
['xyz', 'xyzrgb']. Default: 'xyz'.
"""
def __init__(self,
points,
bbox3d=None,
save_path=None,
points_size=2,
point_color=(0.5, 0.5, 0.5),
bbox_color=(0, 1, 0),
points_in_box_color=(1, 0, 0),
rot_axis=2,
center_mode='lidar_bottom',
mode='xyz'):
super(Visualizer, self).__init__()
assert 0 <= rot_axis <= 2
# init visualizer
self.o3d_visualizer = o3d.visualization.Visualizer()
self.o3d_visualizer.create_window()
mesh_frame = geometry.TriangleMesh.create_coordinate_frame(
size=1, origin=[0, 0, 0]) # create coordinate frame
self.o3d_visualizer.add_geometry(mesh_frame)
self.points_size = points_size
self.point_color = point_color
self.bbox_color = bbox_color
self.points_in_box_color = points_in_box_color
self.rot_axis = rot_axis
self.center_mode = center_mode
self.mode = mode
# draw points
if points is not None:
self.pcd, self.points_colors = _draw_points(
points, self.o3d_visualizer, points_size, point_color, mode)
# draw boxes
if bbox3d is not None:
_draw_bboxes(bbox3d, self.o3d_visualizer, self.points_colors,
self.pcd, bbox_color, points_in_box_color, rot_axis,
center_mode, mode)
def add_bboxes(self, bbox3d, bbox_color=None, points_in_box_color=None):
"""Add bounding box to visualizer.
Args:
bbox3d (numpy.array, shape=[M, 7]):
3D bbox (x, y, z, dx, dy, dz, yaw) to be visualized.
The 3d bbox is in mode of Box3DMode.DEPTH with
gravity_center (please refer to core.structures.box_3d_mode).
bbox_color (tuple[float]): the color of bbox. Default: None.
points_in_box_color (tuple[float]): the color of points which
are in bbox3d. Default: None.
"""
if bbox_color is None:
bbox_color = self.bbox_color
if points_in_box_color is None:
points_in_box_color = self.points_in_box_color
_draw_bboxes(bbox3d, self.o3d_visualizer, self.points_colors, self.pcd,
bbox_color, points_in_box_color, self.rot_axis,
self.center_mode, self.mode)
def show(self, save_path=None):
"""Visualize the points cloud.
Args:
save_path (str): path to save image. Default: None.
"""
self.o3d_visualizer.run()
if save_path is not None:
self.o3d_visualizer.capture_screen_image(save_path)
self.o3d_visualizer.destroy_window()
return
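# ----------------------------------------------------------------------------
# A minimal usage sketch (not part of the original module): draw a random
# point cloud together with one dummy (x, y, z, dx, dy, dz, yaw) box using the
# ``Visualizer`` above. The numbers are illustrative only and an Open3D
# window is required, so it only runs when the module is executed directly.
if __name__ == '__main__':
    _demo_points = np.random.rand(1000, 3).astype(np.float32) * 10
    _demo_box = np.array([[5.0, 5.0, 1.0, 2.0, 1.0, 1.5, 0.3]],
                         dtype=np.float32)
    _demo_vis = Visualizer(_demo_points, bbox3d=_demo_box)
    _demo_vis.show()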
================================================
FILE: mmdet3d/core/visualizer/show_result.py
================================================
import mmcv
import numpy as np
import trimesh
from os import path as osp
def _write_ply(points, out_filename):
"""Write points into ``ply`` format for meshlab visualization.
Args:
points (np.ndarray): Points in shape (N, dim).
out_filename (str): Filename to be saved.
"""
N = points.shape[0]
fout = open(out_filename, 'w')
for i in range(N):
if points.shape[1] == 6:
c = points[i, 3:].astype(int)
fout.write(
'v %f %f %f %d %d %d\n' %
(points[i, 0], points[i, 1], points[i, 2], c[0], c[1], c[2]))
else:
fout.write('v %f %f %f\n' %
(points[i, 0], points[i, 1], points[i, 2]))
fout.close()
def _write_oriented_bbox(scene_bbox, out_filename):
"""Export oriented (around Z axis) scene bbox to meshes.
Args:
scene_bbox(list[ndarray] or ndarray): xyz pos of center and
3 lengths (dx,dy,dz) and heading angle around Z axis.
Y forward, X right, Z upward. heading angle of positive X is 0,
heading angle of positive Y is 90 degrees.
out_filename(str): Filename.
"""
def heading2rotmat(heading_angle):
rotmat = np.zeros((3, 3))
rotmat[2, 2] = 1
cosval = np.cos(heading_angle)
sinval = np.sin(heading_angle)
rotmat[0:2, 0:2] = np.array([[cosval, -sinval], [sinval, cosval]])
return rotmat
def convert_oriented_box_to_trimesh_fmt(box):
ctr = box[:3]
lengths = box[3:6]
trns = np.eye(4)
trns[0:3, 3] = ctr
trns[3, 3] = 1.0
trns[0:3, 0:3] = heading2rotmat(box[6])
box_trimesh_fmt = trimesh.creation.box(lengths, trns)
return box_trimesh_fmt
if len(scene_bbox) == 0:
scene_bbox = np.zeros((1, 7))
scene = trimesh.scene.Scene()
for box in scene_bbox:
scene.add_geometry(convert_oriented_box_to_trimesh_fmt(box))
mesh_list = trimesh.util.concatenate(scene.dump())
# save to ply file
trimesh.io.export.export_mesh(mesh_list, out_filename, file_type='ply')
return
def show_result(points, gt_bboxes, pred_bboxes, out_dir, filename, show=True):
"""Convert results into format that is directly readable for meshlab.
Args:
points (np.ndarray): Points.
gt_bboxes (np.ndarray): Ground truth boxes.
pred_bboxes (np.ndarray): Predicted boxes.
out_dir (str): Path of output directory
filename (str): Filename of the current frame.
show (bool): Visualize the results online.
"""
if show:
from .open3d_vis import Visualizer
vis = Visualizer(points)
if pred_bboxes is not None:
vis.add_bboxes(bbox3d=pred_bboxes)
if gt_bboxes is not None:
vis.add_bboxes(bbox3d=gt_bboxes, bbox_color=(0, 0, 1))
vis.show()
result_path = osp.join(out_dir, filename)
mmcv.mkdir_or_exist(result_path)
if points is not None:
_write_ply(points, osp.join(result_path, f'{filename}_points.obj'))
if gt_bboxes is not None:
# bottom center to gravity center
gt_bboxes[..., 2] += gt_bboxes[..., 5] / 2
# the positive direction for yaw in meshlab is clockwise
gt_bboxes[:, 6] *= -1
_write_oriented_bbox(gt_bboxes,
osp.join(result_path, f'{filename}_gt.ply'))
if pred_bboxes is not None:
# bottom center to gravity center
pred_bboxes[..., 2] += pred_bboxes[..., 5] / 2
# the positive direction for yaw in meshlab is clockwise
pred_bboxes[:, 6] *= -1
_write_oriented_bbox(pred_bboxes,
osp.join(result_path, f'{filename}_pred.ply'))
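# ----------------------------------------------------------------------------
# A minimal usage sketch (not part of the original module): dump one toy frame
# to ``./demo_vis`` without opening an online viewer. The boxes below are
# dummy (x, y, z, dx, dy, dz, yaw) values with bottom-center origin.
if __name__ == '__main__':
    _demo_points = np.random.rand(500, 3).astype(np.float32) * 20
    _demo_gt = np.array([[10.0, 5.0, 0.0, 4.0, 2.0, 1.5, 0.0]],
                        dtype=np.float32)
    _demo_pred = np.array([[10.2, 5.1, 0.0, 4.1, 1.9, 1.6, 0.1]],
                          dtype=np.float32)
    show_result(_demo_points, _demo_gt, _demo_pred, './demo_vis', 'demo',
                show=False)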
================================================
FILE: mmdet3d/core/voxel/__init__.py
================================================
from .builder import build_voxel_generator
from .voxel_generator import VoxelGenerator
__all__ = ['build_voxel_generator', 'VoxelGenerator']
================================================
FILE: mmdet3d/core/voxel/builder.py
================================================
import mmcv
from . import voxel_generator
def build_voxel_generator(cfg, **kwargs):
"""Builder of voxel generator."""
if isinstance(cfg, voxel_generator.VoxelGenerator):
return cfg
elif isinstance(cfg, dict):
return mmcv.runner.obj_from_dict(
cfg, voxel_generator, default_args=kwargs)
else:
raise TypeError('Invalid type {} for building a sampler'.format(
type(cfg)))
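# ----------------------------------------------------------------------------
# A minimal usage sketch (not part of the original module): build a generator
# from a config dict. The voxel size and range below are illustrative
# KITTI-style values, not tuned settings.
if __name__ == '__main__':
    _demo_cfg = dict(
        type='VoxelGenerator',
        voxel_size=[0.05, 0.05, 0.1],
        point_cloud_range=[0, -40, -3, 70.4, 40, 1],
        max_num_points=5,
        max_voxels=20000)
    print(build_voxel_generator(_demo_cfg))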
================================================
FILE: mmdet3d/core/voxel/voxel_generator.py
================================================
import numba
import numpy as np
class VoxelGenerator(object):
"""Voxel generator in numpy implementation.
Args:
voxel_size (list[float]): Size of a single voxel
point_cloud_range (list[float]): Range of points
max_num_points (int): Maximum number of points in a single voxel
max_voxels (int, optional): Maximum number of voxels.
Defaults to 20000.
"""
def __init__(self,
voxel_size,
point_cloud_range,
max_num_points,
max_voxels=20000):
point_cloud_range = np.array(point_cloud_range, dtype=np.float32)
# [0, -40, -3, 70.4, 40, 1]
voxel_size = np.array(voxel_size, dtype=np.float32)
grid_size = (point_cloud_range[3:] -
point_cloud_range[:3]) / voxel_size
grid_size = np.round(grid_size).astype(np.int64)
self._voxel_size = voxel_size
self._point_cloud_range = point_cloud_range
self._max_num_points = max_num_points
self._max_voxels = max_voxels
self._grid_size = grid_size
def generate(self, points):
"""Generate voxels given points."""
return points_to_voxel(points, self._voxel_size,
self._point_cloud_range, self._max_num_points,
True, self._max_voxels)
@property
def voxel_size(self):
"""list[float]: Size of a single voxel."""
return self._voxel_size
@property
def max_num_points_per_voxel(self):
"""int: Maximum number of points per voxel."""
return self._max_num_points
@property
def point_cloud_range(self):
"""list[float]: Range of point cloud."""
return self._point_cloud_range
@property
def grid_size(self):
"""np.ndarray: The size of grids."""
return self._grid_size
def __repr__(self):
"""str: Return a string that describes the module."""
repr_str = self.__class__.__name__
indent = ' ' * (len(repr_str) + 1)
repr_str += f'(voxel_size={self._voxel_size},\n'
repr_str += indent + 'point_cloud_range='
repr_str += f'{self._point_cloud_range.tolist()},\n'
repr_str += indent + f'max_num_points={self._max_num_points},\n'
repr_str += indent + f'max_voxels={self._max_voxels},\n'
repr_str += indent + f'grid_size={self._grid_size.tolist()}'
repr_str += ')'
return repr_str
def points_to_voxel(points,
voxel_size,
coors_range,
max_points=35,
reverse_index=True,
max_voxels=20000):
"""convert kitti points(N, >=3) to voxels.
Args:
points (np.ndarray): [N, ndim]. points[:, :3] contain xyz points and \
points[:, 3:] contain other information such as reflectivity.
voxel_size (list, tuple, np.ndarray): [3] xyz, indicate voxel size
coors_range (list[float | tuple[float] | ndarray]): Voxel range. \
format: xyzxyz, minmax
max_points (int): Indicate maximum points contained in a voxel.
reverse_index (bool): Whether return reversed coordinates. \
if points has xyz format and reverse_index is True, output \
coordinates will be zyx format, but points in features always \
xyz format.
max_voxels (int): Maximum number of voxels this function creates. \
For second, 20000 is a good choice. Points should be shuffled for \
randomness before this function because max_voxels drops points.
Returns:
tuple[np.ndarray]:
voxels: [M, max_points, ndim] float tensor. only contain points.
coordinates: [M, 3] int32 tensor.
num_points_per_voxel: [M] int32 tensor.
"""
if not isinstance(voxel_size, np.ndarray):
voxel_size = np.array(voxel_size, dtype=points.dtype)
if not isinstance(coors_range, np.ndarray):
coors_range = np.array(coors_range, dtype=points.dtype)
voxelmap_shape = (coors_range[3:] - coors_range[:3]) / voxel_size
voxelmap_shape = tuple(np.round(voxelmap_shape).astype(np.int32).tolist())
if reverse_index:
voxelmap_shape = voxelmap_shape[::-1]
# don't create large array in jit(nopython=True) code.
num_points_per_voxel = np.zeros(shape=(max_voxels, ), dtype=np.int32)
coor_to_voxelidx = -np.ones(shape=voxelmap_shape, dtype=np.int32)
voxels = np.zeros(
shape=(max_voxels, max_points, points.shape[-1]), dtype=points.dtype)
coors = np.zeros(shape=(max_voxels, 3), dtype=np.int32)
if reverse_index:
voxel_num = _points_to_voxel_reverse_kernel(
points, voxel_size, coors_range, num_points_per_voxel,
coor_to_voxelidx, voxels, coors, max_points, max_voxels)
else:
voxel_num = _points_to_voxel_kernel(points, voxel_size, coors_range,
num_points_per_voxel,
coor_to_voxelidx, voxels, coors,
max_points, max_voxels)
coors = coors[:voxel_num]
voxels = voxels[:voxel_num]
num_points_per_voxel = num_points_per_voxel[:voxel_num]
return voxels, coors, num_points_per_voxel
@numba.jit(nopython=True)
def _points_to_voxel_reverse_kernel(points,
voxel_size,
coors_range,
num_points_per_voxel,
coor_to_voxelidx,
voxels,
coors,
max_points=35,
max_voxels=20000):
"""convert kitti points(N, >=3) to voxels.
Args:
points (np.ndarray): [N, ndim]. points[:, :3] contain xyz points and \
points[:, 3:] contain other information such as reflectivity.
voxel_size (list, tuple, np.ndarray): [3] xyz, indicate voxel size \
coors_range (list[float | tuple[float] | ndarray]): Range of voxels. \
format: xyzxyz, minmax
num_points_per_voxel (np.ndarray): Number of points in each voxel.
coor_to_voxelidx (np.ndarray): A voxel grid of shape (D, H, W), \
which has the same shape as the complete voxel map. It indicates \
the index of each corresponding voxel.
voxels (np.ndarray): Created empty voxels.
coors (np.ndarray): Created coordinates of each voxel.
max_points (int): Indicate maximum points contained in a voxel.
max_voxels (int): Maximum number of voxels this function creates. \
For SECOND, 20000 is a good choice. Points should be shuffled for \
randomness before this function because max_voxels drops points.
Returns:
tuple[np.ndarray]:
voxels: Shape [M, max_points, ndim], only contain points.
coordinates: Shape [M, 3].
num_points_per_voxel: Shape [M].
"""
# put all computations to one loop.
# we shouldn't create large array in main jit code, otherwise
# reduce performance
N = points.shape[0]
# ndim = points.shape[1] - 1
ndim = 3
ndim_minus_1 = ndim - 1
grid_size = (coors_range[3:] - coors_range[:3]) / voxel_size
# np.round(grid_size)
# grid_size = np.round(grid_size).astype(np.int64)(np.int32)
grid_size = np.round(grid_size, 0, grid_size).astype(np.int32)
coor = np.zeros(shape=(3, ), dtype=np.int32)
voxel_num = 0
failed = False
for i in range(N):
failed = False
for j in range(ndim):
c = np.floor((points[i, j] - coors_range[j]) / voxel_size[j])
if c < 0 or c >= grid_size[j]:
failed = True
break
coor[ndim_minus_1 - j] = c
if failed:
continue
voxelidx = coor_to_voxelidx[coor[0], coor[1], coor[2]]
if voxelidx == -1:
voxelidx = voxel_num
if voxel_num >= max_voxels:
break
voxel_num += 1
coor_to_voxelidx[coor[0], coor[1], coor[2]] = voxelidx
coors[voxelidx] = coor
num = num_points_per_voxel[voxelidx]
if num < max_points:
voxels[voxelidx, num] = points[i]
num_points_per_voxel[voxelidx] += 1
return voxel_num
@numba.jit(nopython=True)
def _points_to_voxel_kernel(points,
voxel_size,
coors_range,
num_points_per_voxel,
coor_to_voxelidx,
voxels,
coors,
max_points=35,
max_voxels=20000):
"""convert kitti points(N, >=3) to voxels.
Args:
points (np.ndarray): [N, ndim]. points[:, :3] contain xyz points and \
points[:, 3:] contain other information such as reflectivity.
voxel_size (list, tuple, np.ndarray): [3] xyz, indicate voxel size.
coors_range (list[float | tuple[float] | ndarray]): Range of voxels. \
format: xyzxyz, minmax
num_points_per_voxel (np.ndarray): Number of points in each voxel.
coor_to_voxelidx (np.ndarray): A voxel grid of shape (D, H, W), \
which has the same shape as the complete voxel map. It indicates \
the index of each corresponding voxel.
voxels (np.ndarray): Created empty voxels.
coors (np.ndarray): Created coordinates of each voxel.
max_points (int): Indicate maximum points contained in a voxel.
max_voxels (int): Maximum number of voxels this function creates. \
For SECOND, 20000 is a good choice. Points should be shuffled for \
randomness before this function because max_voxels drops points.
Returns:
tuple[np.ndarray]:
voxels: Shape [M, max_points, ndim], only contain points.
coordinates: Shape [M, 3].
num_points_per_voxel: Shape [M].
"""
N = points.shape[0]
# ndim = points.shape[1] - 1
ndim = 3
grid_size = (coors_range[3:] - coors_range[:3]) / voxel_size
# grid_size = np.round(grid_size).astype(np.int64)(np.int32)
grid_size = np.round(grid_size, 0, grid_size).astype(np.int32)
# lower_bound = coors_range[:3]
# upper_bound = coors_range[3:]
coor = np.zeros(shape=(3, ), dtype=np.int32)
voxel_num = 0
failed = False
for i in range(N):
failed = False
for j in range(ndim):
c = np.floor((points[i, j] - coors_range[j]) / voxel_size[j])
if c < 0 or c >= grid_size[j]:
failed = True
break
coor[j] = c
if failed:
continue
voxelidx = coor_to_voxelidx[coor[0], coor[1], coor[2]]
if voxelidx == -1:
voxelidx = voxel_num
if voxel_num >= max_voxels:
break
voxel_num += 1
coor_to_voxelidx[coor[0], coor[1], coor[2]] = voxelidx
coors[voxelidx] = coor
num = num_points_per_voxel[voxelidx]
if num < max_points:
voxels[voxelidx, num] = points[i]
num_points_per_voxel[voxelidx] += 1
return voxel_num
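# ----------------------------------------------------------------------------
# A minimal usage sketch (not part of the original module): voxelize a random
# point cloud with ``VoxelGenerator``. The sizes and ranges are illustrative
# only; real configs use values tuned per dataset.
if __name__ == '__main__':
    _generator = VoxelGenerator(
        voxel_size=[0.2, 0.2, 0.4],
        point_cloud_range=[0, -40, -3, 70.4, 40, 1],
        max_num_points=5,
        max_voxels=20000)
    _points = np.random.rand(10000, 4).astype(np.float32)
    _points[:, 0] *= 70.4                      # x in [0, 70.4)
    _points[:, 1] = _points[:, 1] * 80 - 40    # y in [-40, 40)
    _points[:, 2] = _points[:, 2] * 4 - 3      # z in [-3, 1)
    _voxels, _coors, _num_pts = _generator.generate(_points)
    print(_voxels.shape, _coors.shape, _num_pts.shape)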
================================================
FILE: mmdet3d/datasets/__init__.py
================================================
from mmdet.datasets.builder import build_dataloader
from .builder import DATASETS, build_dataset
from .custom_3d import Custom3DDataset
from .kitti_dataset import KittiDataset
from .lyft_dataset import LyftDataset
from .nuscenes_dataset import NuScenesDataset
from .pipelines import (BackgroundPointsFilter, GlobalRotScaleTrans,
IndoorPointSample, LoadAnnotations3D,
LoadPointsFromFile, LoadPointsFromMultiSweeps,
NormalizePointsColor, ObjectNoise, ObjectRangeFilter,
ObjectSample, PointShuffle, PointsRangeFilter,
RandomFlip3D, VoxelBasedPointSampler)
from .scannet_dataset import ScanNetDataset
from .semantickitti_dataset import SemanticKITTIDataset
from .sunrgbd_dataset import SUNRGBDDataset
from .waymo_dataset import WaymoDataset
from .nuscenes_dataset_viewInfo import NuScenesDataset_ViewInfo
__all__ = [
'KittiDataset', 'GroupSampler', 'DistributedGroupSampler',
'build_dataloader', 'RepeatFactorDataset', 'DATASETS', 'build_dataset',
'CocoDataset', 'NuScenesDataset', 'LyftDataset', 'ObjectSample',
'RandomFlip3D', 'ObjectNoise', 'GlobalRotScaleTrans', 'PointShuffle',
'ObjectRangeFilter', 'PointsRangeFilter', 'Collect3D',
'LoadPointsFromFile', 'NormalizePointsColor', 'IndoorPointSample',
'LoadAnnotations3D', 'SUNRGBDDataset', 'ScanNetDataset',
'SemanticKITTIDataset', 'Custom3DDataset', 'LoadPointsFromMultiSweeps',
'WaymoDataset', 'BackgroundPointsFilter', 'VoxelBasedPointSampler',
'NuScenesDataset_ViewInfo'
]
================================================
FILE: mmdet3d/datasets/builder.py
================================================
import platform
from mmcv.utils import build_from_cfg
from mmdet.datasets import DATASETS
from mmdet.datasets.builder import _concat_dataset
if platform.system() != 'Windows':
# https://github.com/pytorch/pytorch/issues/973
import resource
rlimit = resource.getrlimit(resource.RLIMIT_NOFILE)
hard_limit = rlimit[1]
soft_limit = min(4096, hard_limit)
resource.setrlimit(resource.RLIMIT_NOFILE, (soft_limit, hard_limit))
def build_dataset(cfg, default_args=None):
from mmdet3d.datasets.dataset_wrappers import CBGSDataset
from mmdet.datasets.dataset_wrappers import (ClassBalancedDataset,
ConcatDataset, RepeatDataset)
if isinstance(cfg, (list, tuple)):
dataset = ConcatDataset([build_dataset(c, default_args) for c in cfg])
elif cfg['type'] == 'ConcatDataset':
dataset = ConcatDataset(
[build_dataset(c, default_args) for c in cfg['datasets']],
cfg.get('separate_eval', True))
elif cfg['type'] == 'RepeatDataset':
dataset = RepeatDataset(
build_dataset(cfg['dataset'], default_args), cfg['times'])
elif cfg['type'] == 'ClassBalancedDataset':
dataset = ClassBalancedDataset(
build_dataset(cfg['dataset'], default_args), cfg['oversample_thr'])
elif cfg['type'] == 'CBGSDataset':
dataset = CBGSDataset(build_dataset(cfg['dataset'], default_args))
elif isinstance(cfg.get('ann_file'), (list, tuple)):
dataset = _concat_dataset(cfg, default_args)
else:
dataset = build_from_cfg(cfg, DATASETS, default_args)
return dataset
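# ----------------------------------------------------------------------------
# A minimal config sketch (not part of the original module): wrapping a
# dataset with ``CBGSDataset`` through this builder. The inner dataset config
# is a placeholder; see the configs under ``configs/_base_/datasets`` for
# complete examples.
#
#     cbgs_cfg = dict(
#         type='CBGSDataset',
#         dataset=dict(type='NuScenesDataset', ...))
#     dataset = build_dataset(cbgs_cfg)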
================================================
FILE: mmdet3d/datasets/custom_3d.py
================================================
import mmcv
import numpy as np
import tempfile
from os import path as osp
from torch.utils.data import Dataset
from mmdet.datasets import DATASETS
from ..core.bbox import get_box_type
from .pipelines import Compose
@DATASETS.register_module()
class Custom3DDataset(Dataset):
"""Customized 3D dataset.
This is the base dataset of SUNRGB-D, ScanNet, nuScenes, and KITTI
dataset.
Args:
data_root (str): Path of dataset root.
ann_file (str): Path of annotation file.
pipeline (list[dict], optional): Pipeline used for data processing.
Defaults to None.
classes (tuple[str], optional): Classes used in the dataset.
Defaults to None.
modality (dict, optional): Modality to specify the sensor data used
as input. Defaults to None.
box_type_3d (str, optional): Type of 3D box of this dataset.
Based on the `box_type_3d`, the dataset will encapsulate the box
in its original format and then convert it to `box_type_3d`.
Defaults to 'LiDAR'. Available options include
- 'LiDAR': Box in LiDAR coordinates.
- 'Depth': Box in depth coordinates, usually for indoor dataset.
- 'Camera': Box in camera coordinates.
filter_empty_gt (bool, optional): Whether to filter empty GT.
Defaults to True.
test_mode (bool, optional): Whether the dataset is in test mode.
Defaults to False.
"""
def __init__(self,
data_root,
ann_file,
pipeline=None,
classes=None,
modality=None,
box_type_3d='LiDAR',
filter_empty_gt=True,
test_mode=False):
super().__init__()
self.data_root = data_root
self.ann_file = ann_file
self.test_mode = test_mode
self.modality = modality
self.filter_empty_gt = filter_empty_gt
self.box_type_3d, self.box_mode_3d = get_box_type(box_type_3d)
self.CLASSES = self.get_classes(classes)
self.cat2id = {name: i for i, name in enumerate(self.CLASSES)}
self.data_infos = self.load_annotations(self.ann_file)
if pipeline is not None:
self.pipeline = Compose(pipeline)
# set group flag for the sampler
if not self.test_mode:
self._set_group_flag()
def load_annotations(self, ann_file):
"""Load annotations from ann_file.
Args:
ann_file (str): Path of the annotation file.
Returns:
list[dict]: List of annotations.
"""
return mmcv.load(ann_file)
def get_data_info(self, index):
"""Get data info according to the given index.
Args:
index (int): Index of the sample data to get.
Returns:
dict: Data information that will be passed to the data \
preprocessing pipelines. It includes the following keys:
- sample_idx (str): Sample index.
- pts_filename (str): Filename of point clouds.
- file_name (str): Filename of point clouds.
- ann_info (dict): Annotation info.
"""
info = self.data_infos[index]
sample_idx = info['point_cloud']['lidar_idx']
pts_filename = osp.join(self.data_root, info['pts_path'])
input_dict = dict(
pts_filename=pts_filename,
sample_idx=sample_idx,
file_name=pts_filename)
if not self.test_mode:
annos = self.get_ann_info(index)
input_dict['ann_info'] = annos
if self.filter_empty_gt and ~(annos['gt_labels_3d'] != -1).any():
return None
return input_dict
def pre_pipeline(self, results):
"""Initialization before data preparation.
Args:
results (dict): Dict before data preprocessing.
- img_fields (list): Image fields.
- bbox3d_fields (list): 3D bounding boxes fields.
- pts_mask_fields (list): Mask fields of points.
- pts_seg_fields (list): Mask fields of point segments.
- bbox_fields (list): Fields of bounding boxes.
- mask_fields (list): Fields of masks.
- seg_fields (list): Segment fields.
- box_type_3d (str): 3D box type.
- box_mode_3d (str): 3D box mode.
"""
results['img_fields'] = []
results['bbox3d_fields'] = []
results['pts_mask_fields'] = []
results['pts_seg_fields'] = []
results['bbox_fields'] = []
results['mask_fields'] = []
results['seg_fields'] = []
results['box_type_3d'] = self.box_type_3d
results['box_mode_3d'] = self.box_mode_3d
def prepare_train_data(self, index):
"""Training data preparation.
Args:
index (int): Index for accessing the target data.
Returns:
dict: Training data dict of the corresponding index.
"""
input_dict = self.get_data_info(index)
if input_dict is None:
return None
self.pre_pipeline(input_dict)
example = self.pipeline(input_dict)
if self.filter_empty_gt and \
(example is None or
~(example['gt_labels_3d']._data != -1).any()):
return None
return example
def prepare_test_data(self, index):
"""Prepare data for testing.
Args:
index (int): Index for accessing the target data.
Returns:
dict: Testing data dict of the corresponding index.
"""
input_dict = self.get_data_info(index)
self.pre_pipeline(input_dict)
example = self.pipeline(input_dict)
return example
@classmethod
def get_classes(cls, classes=None):
"""Get class names of current dataset.
Args:
classes (Sequence[str] | str | None): If classes is None, use
default CLASSES defined by builtin dataset. If classes is a
string, take it as a file name. The file contains the name of
classes where each line contains one class name. If classes is
a tuple or list, override the CLASSES defined by the dataset.
Returns:
list[str]: A list of class names.
"""
if classes is None:
return cls.CLASSES
if isinstance(classes, str):
# take it as a file path
class_names = mmcv.list_from_file(classes)
elif isinstance(classes, (tuple, list)):
class_names = classes
else:
raise ValueError(f'Unsupported type {type(classes)} of classes.')
return class_names
def format_results(self,
outputs,
pklfile_prefix=None,
submission_prefix=None):
"""Format the results to pkl file.
Args:
outputs (list[dict]): Testing results of the dataset.
pklfile_prefix (str | None): The prefix of pkl files. It includes
the file path and the prefix of filename, e.g., "a/b/prefix".
If not specified, a temp file will be created. Default: None.
Returns:
tuple: (outputs, tmp_dir), outputs is the detection results, \
tmp_dir is the temporary directory created for saving pkl \
files when ``pklfile_prefix`` is not specified.
"""
if pklfile_prefix is None:
tmp_dir = tempfile.TemporaryDirectory()
pklfile_prefix = osp.join(tmp_dir.name, 'results')
out = f'{pklfile_prefix}.pkl'
mmcv.dump(outputs, out)
return outputs, tmp_dir
def evaluate(self,
results,
metric=None,
iou_thr=(0.25, 0.5),
logger=None,
show=False,
out_dir=None):
"""Evaluate.
Evaluation in indoor protocol.
Args:
results (list[dict]): List of results.
metric (str | list[str]): Metrics to be evaluated.
iou_thr (list[float]): AP IoU thresholds.
show (bool): Whether to visualize.
Default: False.
out_dir (str): Path to save the visualization results.
Default: None.
Returns:
dict: Evaluation results.
"""
from mmdet3d.core.evaluation import indoor_eval
assert isinstance(
results, list), f'Expect results to be list, got {type(results)}.'
assert len(results) > 0, 'Expect length of results > 0.'
assert len(results) == len(self.data_infos)
assert isinstance(
results[0], dict
), f'Expect elements in results to be dict, got {type(results[0])}.'
gt_annos = [info['annos'] for info in self.data_infos]
label2cat = {i: cat_id for i, cat_id in enumerate(self.CLASSES)}
ret_dict = indoor_eval(
gt_annos,
results,
iou_thr,
label2cat,
logger=logger,
box_type_3d=self.box_type_3d,
box_mode_3d=self.box_mode_3d)
if show:
self.show(results, out_dir)
return ret_dict
def __len__(self):
"""Return the length of data infos.
Returns:
int: Length of data infos.
"""
return len(self.data_infos)
def _rand_another(self, idx):
"""Randomly get another item with the same flag.
Returns:
int: Another index of item with the same flag.
"""
pool = np.where(self.flag == self.flag[idx])[0]
return np.random.choice(pool)
def __getitem__(self, idx):
"""Get item from infos according to the given index.
Returns:
dict: Data dictionary of the corresponding index.
"""
if self.test_mode:
return self.prepare_test_data(idx)
while True:
data = self.prepare_train_data(idx)
if data is None:
idx = self._rand_another(idx)
continue
return data
def _set_group_flag(self):
"""Set flag according to image aspect ratio.
Images with aspect ratio greater than 1 will be set as group 1,
otherwise group 0. In 3D datasets, they are all the same, thus are all
zeros.
"""
self.flag = np.zeros(len(self), dtype=np.uint8)
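# ----------------------------------------------------------------------------
# A minimal config sketch (not part of the original module): how a subclass of
# ``Custom3DDataset`` is typically instantiated through ``build_dataset``.
# The paths, pipeline and concrete dataset type below are placeholders.
#
#     example_cfg = dict(
#         type='ScanNetDataset',
#         data_root='data/scannet/',
#         ann_file='data/scannet/scannet_infos_train.pkl',
#         pipeline=[...],
#         box_type_3d='Depth',
#         filter_empty_gt=True,
#         test_mode=False)
#     dataset = build_dataset(example_cfg)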
================================================
FILE: mmdet3d/datasets/dataset_wrappers.py
================================================
import numpy as np
from .builder import DATASETS
@DATASETS.register_module()
class CBGSDataset(object):
"""A wrapper of class sampled dataset with ann_file path. Implementation of
paper `Class-balanced Grouping and Sampling for Point Cloud 3D Object
Detection `_.
Balance the number of scenes under different classes.
Args:
dataset (:obj:`CustomDataset`): The dataset to be class sampled.
"""
def __init__(self, dataset):
self.dataset = dataset
self.CLASSES = dataset.CLASSES
self.cat2id = {name: i for i, name in enumerate(self.CLASSES)}
self.sample_indices = self._get_sample_indices()
# self.dataset.data_infos = self.data_infos
if hasattr(self.dataset, 'flag'):
self.flag = np.array(
[self.dataset.flag[ind] for ind in self.sample_indices],
dtype=np.uint8)
def _get_sample_indices(self):
"""Load annotations from ann_file.
Args:
ann_file (str): Path of the annotation file.
Returns:
list[dict]: List of annotations after class sampling.
"""
class_sample_idxs = {cat_id: [] for cat_id in self.cat2id.values()}
for idx in range(len(self.dataset)):
sample_cat_ids = self.dataset.get_cat_ids(idx)
for cat_id in sample_cat_ids:
class_sample_idxs[cat_id].append(idx)
duplicated_samples = sum(
[len(v) for _, v in class_sample_idxs.items()])
class_distribution = {
k: len(v) / duplicated_samples
for k, v in class_sample_idxs.items()
}
sample_indices = []
frac = 1.0 / len(self.CLASSES)
ratios = [frac / v for v in class_distribution.values()]
for cls_inds, ratio in zip(list(class_sample_idxs.values()), ratios):
sample_indices += np.random.choice(cls_inds,
int(len(cls_inds) *
ratio)).tolist()
return sample_indices
def __getitem__(self, idx):
"""Get item from infos according to the given index.
Returns:
dict: Data dictionary of the corresponding index.
"""
ori_idx = self.sample_indices[idx]
return self.dataset[ori_idx]
def __len__(self):
"""Return the length of data infos.
Returns:
int: Length of data infos.
"""
return len(self.sample_indices)
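# ----------------------------------------------------------------------------
# A small numeric sketch (not part of the original module) of the resampling
# rule in ``_get_sample_indices``: with ``ratio = (1 / num_classes) /
# class_fraction``, every class ends up contributing roughly
# ``total / num_classes`` samples. The class counts below are made up.
if __name__ == '__main__':
    _counts = {'car': 800, 'pedestrian': 150, 'cyclist': 50}
    _total = sum(_counts.values())
    _frac = 1.0 / len(_counts)
    for _name, _n in _counts.items():
        _ratio = _frac / (_n / _total)
        print(_name, _n, '->', int(_n * _ratio))  # each class -> ~333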
================================================
FILE: mmdet3d/datasets/kitti2d_dataset.py
================================================
import mmcv
import numpy as np
from mmdet.datasets import DATASETS, CustomDataset
@DATASETS.register_module()
class Kitti2DDataset(CustomDataset):
r"""KITTI 2D Dataset.
This class serves as the API for experiments on the `KITTI Dataset
`_.
Args:
data_root (str): Path of dataset root.
ann_file (str): Path of annotation file.
pipeline (list[dict], optional): Pipeline used for data processing.
Defaults to None.
classes (tuple[str], optional): Classes used in the dataset.
Defaults to None.
modality (dict, optional): Modality to specify the sensor data used
as input. Defaults to None.
box_type_3d (str, optional): Type of 3D box of this dataset.
Based on the `box_type_3d`, the dataset will encapsulate the box
in its original format and then convert it to `box_type_3d`.
Defaults to 'LiDAR'. Available options include
- 'LiDAR': Box in LiDAR coordinates.
- 'Depth': Box in depth coordinates, usually for indoor dataset.
- 'Camera': Box in camera coordinates.
filter_empty_gt (bool, optional): Whether to filter empty GT.
Defaults to True.
test_mode (bool, optional): Whether the dataset is in test mode.
Defaults to False.
"""
CLASSES = ('car', 'pedestrian', 'cyclist')
"""
Annotation format:
[
{
'image': {
'image_idx': 0,
'image_path': 'training/image_2/000000.png',
'image_shape': array([ 370, 1224], dtype=int32)
},
'point_cloud': {
'num_features': 4,
'velodyne_path': 'training/velodyne/000000.bin'
},
'calib': {
'P0': (4, 4),
'P1': (4, 4),
'P2': (4, 4),
'P3': (4, 4),
'R0_rect':4x4 np.array,
'Tr_velo_to_cam': 4x4 np.array,
'Tr_imu_to_velo': 4x4 np.array
},
'annos': {
'name': (n),
'truncated': (n),
'occluded': (n),
'alpha': (n),
'bbox': (n, 4),
'dimensions': (n, 3),
'location': (n, 3),
'rotation_y': (n),
'score': (n),
'index': array([0], dtype=int32),
'group_ids': array([0], dtype=int32),
'difficulty': array([0], dtype=int32),
'num_points_in_gt': (n),
}
}
]
"""
def load_annotations(self, ann_file):
"""Load annotations from ann_file.
Args:
ann_file (str): Path of the annotation file.
Returns:
list[dict]: List of annotations.
"""
self.data_infos = mmcv.load(ann_file)
self.cat2label = {
cat_name: i
for i, cat_name in enumerate(self.CLASSES)
}
return self.data_infos
def _filter_imgs(self, min_size=32):
"""Filter images without ground truths."""
valid_inds = []
for i, img_info in enumerate(self.data_infos):
if len(img_info['annos']['name']) > 0:
valid_inds.append(i)
return valid_inds
def get_ann_info(self, index):
"""Get annotation info according to the given index.
Args:
index (int): Index of the annotation data to get.
Returns:
dict: Annotation information consists of the following keys:
- bboxes (np.ndarray): Ground truth bboxes.
- labels (np.ndarray): Labels of ground truths.
"""
# Use index to get the annos, thus the evalhook could also use this api
info = self.data_infos[index]
annos = info['annos']
gt_names = annos['name']
gt_bboxes = annos['bbox']
difficulty = annos['difficulty']
# remove classes that are not needed
selected = self.keep_arrays_by_name(gt_names, self.CLASSES)
gt_bboxes = gt_bboxes[selected]
gt_names = gt_names[selected]
difficulty = difficulty[selected]
gt_labels = np.array([self.cat2label[n] for n in gt_names])
anns_results = dict(
bboxes=gt_bboxes.astype(np.float32),
labels=gt_labels,
)
return anns_results
def prepare_train_img(self, idx):
"""Training image preparation.
Args:
index (int): Index for accessing the target image data.
Returns:
dict: Training image data dict after preprocessing
corresponding to the index.
"""
img_raw_info = self.data_infos[idx]['image']
img_info = dict(filename=img_raw_info['image_path'])
ann_info = self.get_ann_info(idx)
if len(ann_info['bboxes']) == 0:
return None
results = dict(img_info=img_info, ann_info=ann_info)
if self.proposals is not None:
results['proposals'] = self.proposals[idx]
self.pre_pipeline(results)
return self.pipeline(results)
def prepare_test_img(self, idx):
"""Prepare data for testing.
Args:
index (int): Index for accessing the target image data.
Returns:
dict: Testing image data dict after preprocessing
corresponding to the index.
"""
img_raw_info = self.data_infos[idx]['image']
img_info = dict(filename=img_raw_info['image_path'])
results = dict(img_info=img_info)
if self.proposals is not None:
results['proposals'] = self.proposals[idx]
self.pre_pipeline(results)
return self.pipeline(results)
def drop_arrays_by_name(self, gt_names, used_classes):
"""Drop irrelevant ground truths by name.
Args:
gt_names (list[str]): Names of ground truths.
used_classes (list[str]): Classes of interest.
Returns:
np.ndarray: Indices of ground truths that will be dropped.
"""
inds = [i for i, x in enumerate(gt_names) if x not in used_classes]
inds = np.array(inds, dtype=np.int64)
return inds
def keep_arrays_by_name(self, gt_names, used_classes):
"""Keep useful ground truths by name.
Args:
gt_names (list[str]): Names of ground truths.
used_classes (list[str]): Classes of interest.
Returns:
np.ndarray: Indices of ground truths that will be kept.
"""
inds = [i for i, x in enumerate(gt_names) if x in used_classes]
inds = np.array(inds, dtype=np.int64)
return inds
def reformat_bbox(self, outputs, out=None):
"""Reformat bounding boxes to KITTI 2D styles.
Args:
outputs (list[np.ndarray]): List of arrays storing the inferenced
bounding boxes and scores.
out (str | None): The prefix of output file. Default: None.
Returns:
list[dict]: A list of dictionaries with the kitti 2D format.
"""
from mmdet3d.core.bbox.transforms import bbox2result_kitti2d
sample_idx = [info['image']['image_idx'] for info in self.data_infos]
result_files = bbox2result_kitti2d(outputs, self.CLASSES, sample_idx,
out)
return result_files
def evaluate(self, result_files, eval_types=None):
"""Evaluation in KITTI protocol.
Args:
result_files (str): Path of result files.
eval_types (str): Types of evaluation. Default: None.
KITTI dataset only support 'bbox' evaluation type.
Returns:
tuple (str, dict): Average precision results in str format
and average precision results in dict format.
"""
from mmdet3d.core.evaluation import kitti_eval
eval_types = ['bbox'] if not eval_types else eval_types
assert eval_types in ('bbox', ['bbox']), \
'KITTI dataset only supports bbox evaluation'
gt_annos = [info['annos'] for info in self.data_infos]
ap_result_str, ap_dict = kitti_eval(
gt_annos, result_files, self.CLASSES, eval_types=['bbox'])
return ap_result_str, ap_dict
================================================
FILE: mmdet3d/datasets/kitti_dataset.py
================================================
import copy
import mmcv
import numpy as np
import os
import tempfile
import torch
from mmcv.utils import print_log
from os import path as osp
from mmdet.datasets import DATASETS
from ..core import show_result
from ..core.bbox import (Box3DMode, CameraInstance3DBoxes, Coord3DMode,
points_cam2img)
from .custom_3d import Custom3DDataset
@DATASETS.register_module()
class KittiDataset(Custom3DDataset):
r"""KITTI Dataset.
This class serves as the API for experiments on the `KITTI Dataset
`_.
Args:
data_root (str): Path of dataset root.
ann_file (str): Path of annotation file.
split (str): Split of input data.
pts_prefix (str, optional): Prefix of points files.
Defaults to 'velodyne'.
pipeline (list[dict], optional): Pipeline used for data processing.
Defaults to None.
classes (tuple[str], optional): Classes used in the dataset.
Defaults to None.
modality (dict, optional): Modality to specify the sensor data used
as input. Defaults to None.
box_type_3d (str, optional): Type of 3D box of this dataset.
Based on the `box_type_3d`, the dataset will encapsulate the box
in its original format and then convert it to `box_type_3d`.
Defaults to 'LiDAR' in this dataset. Available options include
- 'LiDAR': Box in LiDAR coordinates.
- 'Depth': Box in depth coordinates, usually for indoor dataset.
- 'Camera': Box in camera coordinates.
filter_empty_gt (bool, optional): Whether to filter empty GT.
Defaults to True.
test_mode (bool, optional): Whether the dataset is in test mode.
Defaults to False.
pcd_limit_range (list): The range of point cloud used to filter
invalid predicted boxes. Default: [0, -40, -3, 70.4, 40, 0.0].
"""
CLASSES = ('car', 'pedestrian', 'cyclist')
def __init__(self,
data_root,
ann_file,
split,
pts_prefix='velodyne',
pipeline=None,
classes=None,
modality=None,
box_type_3d='LiDAR',
filter_empty_gt=True,
test_mode=False,
pcd_limit_range=[0, -40, -3, 70.4, 40, 0.0]):
super().__init__(
data_root=data_root,
ann_file=ann_file,
pipeline=pipeline,
classes=classes,
modality=modality,
box_type_3d=box_type_3d,
filter_empty_gt=filter_empty_gt,
test_mode=test_mode)
self.split = split
self.root_split = os.path.join(self.data_root, split)
assert self.modality is not None
self.pcd_limit_range = pcd_limit_range
self.pts_prefix = pts_prefix
def _get_pts_filename(self, idx):
"""Get point cloud filename according to the given index.
Args:
index (int): Index of the point cloud file to get.
Returns:
str: Name of the point cloud file.
"""
pts_filename = osp.join(self.root_split, self.pts_prefix,
f'{idx:06d}.bin')
return pts_filename
def get_data_info(self, index):
"""Get data info according to the given index.
Args:
index (int): Index of the sample data to get.
Returns:
dict: Data information that will be passed to the data \
preprocessing pipelines. It includes the following keys:
- sample_idx (str): Sample index.
- pts_filename (str): Filename of point clouds.
- img_prefix (str | None): Prefix of image files.
- img_info (dict): Image info.
- lidar2img (list[np.ndarray], optional): Transformations \
from lidar to different cameras.
- ann_info (dict): Annotation info.
"""
info = self.data_infos[index]
sample_idx = info['image']['image_idx']
img_filename = os.path.join(self.data_root,
info['image']['image_path'])
# TODO: consider use torch.Tensor only
rect = info['calib']['R0_rect'].astype(np.float32)
Trv2c = info['calib']['Tr_velo_to_cam'].astype(np.float32)
P2 = info['calib']['P2'].astype(np.float32)
lidar2img = P2 @ rect @ Trv2c
pts_filename = self._get_pts_filename(sample_idx)
input_dict = dict(
sample_idx=sample_idx,
pts_filename=pts_filename,
img_prefix=None,
img_info=dict(filename=img_filename),
lidar2img=lidar2img)
if not self.test_mode:
annos = self.get_ann_info(index)
input_dict['ann_info'] = annos
return input_dict
def get_ann_info(self, index):
"""Get annotation info according to the given index.
Args:
index (int): Index of the annotation data to get.
Returns:
dict: annotation information consists of the following keys:
- gt_bboxes_3d (:obj:`LiDARInstance3DBoxes`): \
3D ground truth bboxes.
- gt_labels_3d (np.ndarray): Labels of ground truths.
- gt_bboxes (np.ndarray): 2D ground truth bboxes.
- gt_labels (np.ndarray): Labels of ground truths.
- gt_names (list[str]): Class names of ground truths.
"""
# Use index to get the annos, thus the evalhook could also use this api
info = self.data_infos[index]
rect = info['calib']['R0_rect'].astype(np.float32)
Trv2c = info['calib']['Tr_velo_to_cam'].astype(np.float32)
annos = info['annos']
# we need other objects to avoid collisions when sampling
annos = self.remove_dontcare(annos)
loc = annos['location']
dims = annos['dimensions']
rots = annos['rotation_y']
gt_names = annos['name']
gt_bboxes_3d = np.concatenate([loc, dims, rots[..., np.newaxis]],
axis=1).astype(np.float32)
# convert gt_bboxes_3d to velodyne coordinates
gt_bboxes_3d = CameraInstance3DBoxes(gt_bboxes_3d).convert_to(
self.box_mode_3d, np.linalg.inv(rect @ Trv2c))
gt_bboxes = annos['bbox']
selected = self.drop_arrays_by_name(gt_names, ['DontCare'])
gt_bboxes = gt_bboxes[selected].astype('float32')
gt_names = gt_names[selected]
gt_labels = []
for cat in gt_names:
if cat in self.CLASSES:
gt_labels.append(self.CLASSES.index(cat))
else:
gt_labels.append(-1)
gt_labels = np.array(gt_labels).astype(np.int64)
gt_labels_3d = copy.deepcopy(gt_labels)
anns_results = dict(
gt_bboxes_3d=gt_bboxes_3d,
gt_labels_3d=gt_labels_3d,
bboxes=gt_bboxes,
labels=gt_labels,
gt_names=gt_names)
return anns_results
def drop_arrays_by_name(self, gt_names, used_classes):
"""Drop irrelevant ground truths by name.
Args:
gt_names (list[str]): Names of ground truths.
used_classes (list[str]): Classes of interest.
Returns:
np.ndarray: Indices of ground truths that will be dropped.
"""
inds = [i for i, x in enumerate(gt_names) if x not in used_classes]
inds = np.array(inds, dtype=np.int64)
return inds
def keep_arrays_by_name(self, gt_names, used_classes):
"""Keep useful ground truths by name.
Args:
gt_names (list[str]): Names of ground truths.
used_classes (list[str]): Classes of interest.
Returns:
np.ndarray: Indices of ground truths that will be kept.
"""
inds = [i for i, x in enumerate(gt_names) if x in used_classes]
inds = np.array(inds, dtype=np.int64)
return inds
def remove_dontcare(self, ann_info):
"""Remove annotations that do not need to be cared.
Args:
ann_info (dict): Dict of annotation infos. The ``'DontCare'``
annotations will be removed according to ann_file['name'].
Returns:
dict: Annotations after filtering.
"""
img_filtered_annotations = {}
relevant_annotation_indices = [
i for i, x in enumerate(ann_info['name']) if x != 'DontCare'
]
for key in ann_info.keys():
img_filtered_annotations[key] = (
ann_info[key][relevant_annotation_indices])
return img_filtered_annotations
def format_results(self,
outputs,
pklfile_prefix=None,
submission_prefix=None):
"""Format the results to pkl file.
Args:
outputs (list[dict]): Testing results of the dataset.
pklfile_prefix (str | None): The prefix of pkl files. It includes
the file path and the prefix of filename, e.g., "a/b/prefix".
If not specified, a temp file will be created. Default: None.
submission_prefix (str | None): The prefix of submitted files. It
includes the file path and the prefix of filename, e.g.,
"a/b/prefix". If not specified, a temp file will be created.
Default: None.
Returns:
tuple: (result_files, tmp_dir), result_files is a dict containing \
the result filepaths, tmp_dir is the temporary directory created \
for saving result files when pklfile_prefix is not specified.
"""
if pklfile_prefix is None:
tmp_dir = tempfile.TemporaryDirectory()
pklfile_prefix = osp.join(tmp_dir.name, 'results')
else:
tmp_dir = None
if not isinstance(outputs[0], dict):
result_files = self.bbox2result_kitti2d(outputs, self.CLASSES,
pklfile_prefix,
submission_prefix)
elif 'pts_bbox' in outputs[0] or 'img_bbox' in outputs[0]:
result_files = dict()
for name in outputs[0]:
results_ = [out[name] for out in outputs]
pklfile_prefix_ = pklfile_prefix + name
if submission_prefix is not None:
submission_prefix_ = submission_prefix + name
else:
submission_prefix_ = None
if 'img' in name:
result_files = self.bbox2result_kitti2d(
results_, self.CLASSES, pklfile_prefix_,
submission_prefix_)
else:
result_files_ = self.bbox2result_kitti(
results_, self.CLASSES, pklfile_prefix_,
submission_prefix_)
result_files[name] = result_files_
else:
result_files = self.bbox2result_kitti(outputs, self.CLASSES,
pklfile_prefix,
submission_prefix)
return result_files, tmp_dir
def evaluate(self,
results,
metric=None,
logger=None,
pklfile_prefix=None,
submission_prefix=None,
show=False,
out_dir=None):
"""Evaluation in KITTI protocol.
Args:
results (list[dict]): Testing results of the dataset.
metric (str | list[str]): Metrics to be evaluated.
logger (logging.Logger | str | None): Logger used for printing
related information during evaluation. Default: None.
pklfile_prefix (str | None): The prefix of pkl files. It includes
the file path and the prefix of filename, e.g., "a/b/prefix".
If not specified, a temp file will be created. Default: None.
submission_prefix (str | None): The prefix of submission datas.
If not specified, the submission data will not be generated.
show (bool): Whether to visualize.
Default: False.
out_dir (str): Path to save the visualization results.
Default: None.
Returns:
dict[str, float]: Results of each evaluation metric.
"""
result_files, tmp_dir = self.format_results(results, pklfile_prefix)
from mmdet3d.core.evaluation import kitti_eval
gt_annos = [info['annos'] for info in self.data_infos]
if isinstance(result_files, dict):
ap_dict = dict()
for name, result_files_ in result_files.items():
eval_types = ['bbox', 'bev', '3d']
if 'img' in name:
eval_types = ['bbox']
ap_result_str, ap_dict_ = kitti_eval(
gt_annos,
result_files_,
self.CLASSES,
eval_types=eval_types)
for ap_type, ap in ap_dict_.items():
ap_dict[f'{name}/{ap_type}'] = float('{:.4f}'.format(ap))
print_log(
f'Results of {name}:\n' + ap_result_str, logger=logger)
else:
if metric == 'img_bbox':
ap_result_str, ap_dict = kitti_eval(
gt_annos, result_files, self.CLASSES, eval_types=['bbox'])
else:
ap_result_str, ap_dict = kitti_eval(gt_annos, result_files,
self.CLASSES)
print_log('\n' + ap_result_str, logger=logger)
if tmp_dir is not None:
tmp_dir.cleanup()
if show:
self.show(results, out_dir)
return ap_dict
def bbox2result_kitti(self,
net_outputs,
class_names,
pklfile_prefix=None,
submission_prefix=None):
"""Convert 3D detection results to kitti format for evaluation and test
submission.
Args:
net_outputs (list[np.ndarray]): List of array storing the \
inferenced bounding boxes and scores.
class_names (list[str]): A list of class names.
pklfile_prefix (str | None): The prefix of pkl file.
submission_prefix (str | None): The prefix of submission file.
Returns:
list[dict]: A list of dictionaries with the kitti format.
"""
assert len(net_outputs) == len(self.data_infos), \
'invalid list length of network outputs'
if submission_prefix is not None:
mmcv.mkdir_or_exist(submission_prefix)
det_annos = []
print('\nConverting prediction to KITTI format')
for idx, pred_dicts in enumerate(
mmcv.track_iter_progress(net_outputs)):
annos = []
info = self.data_infos[idx]
sample_idx = info['image']['image_idx']
image_shape = info['image']['image_shape'][:2]
box_dict = self.convert_valid_bboxes(pred_dicts, info)
anno = {
'name': [],
'truncated': [],
'occluded': [],
'alpha': [],
'bbox': [],
'dimensions': [],
'location': [],
'rotation_y': [],
'score': []
}
if len(box_dict['bbox']) > 0:
box_2d_preds = box_dict['bbox']
box_preds = box_dict['box3d_camera']
scores = box_dict['scores']
box_preds_lidar = box_dict['box3d_lidar']
label_preds = box_dict['label_preds']
for box, box_lidar, bbox, score, label in zip(
box_preds, box_preds_lidar, box_2d_preds, scores,
label_preds):
bbox[2:] = np.minimum(bbox[2:], image_shape[::-1])
bbox[:2] = np.maximum(bbox[:2], [0, 0])
anno['name'].append(class_names[int(label)])
anno['truncated'].append(0.0)
anno['occluded'].append(0)
anno['alpha'].append(
-np.arctan2(-box_lidar[1], box_lidar[0]) + box[6])
anno['bbox'].append(bbox)
anno['dimensions'].append(box[3:6])
anno['location'].append(box[:3])
anno['rotation_y'].append(box[6])
anno['score'].append(score)
anno = {k: np.stack(v) for k, v in anno.items()}
annos.append(anno)
else:
anno = {
'name': np.array([]),
'truncated': np.array([]),
'occluded': np.array([]),
'alpha': np.array([]),
'bbox': np.zeros([0, 4]),
'dimensions': np.zeros([0, 3]),
'location': np.zeros([0, 3]),
'rotation_y': np.array([]),
'score': np.array([]),
}
annos.append(anno)
if submission_prefix is not None:
curr_file = f'{submission_prefix}/{sample_idx:06d}.txt'
with open(curr_file, 'w') as f:
bbox = anno['bbox']
loc = anno['location']
dims = anno['dimensions'] # lhw -> hwl
for idx in range(len(bbox)):
print(
'{} -1 -1 {:.4f} {:.4f} {:.4f} {:.4f} '
'{:.4f} {:.4f} {:.4f} '
'{:.4f} {:.4f} {:.4f} {:.4f} {:.4f} {:.4f}'.format(
anno['name'][idx], anno['alpha'][idx],
bbox[idx][0], bbox[idx][1], bbox[idx][2],
bbox[idx][3], dims[idx][1], dims[idx][2],
dims[idx][0], loc[idx][0], loc[idx][1],
loc[idx][2], anno['rotation_y'][idx],
anno['score'][idx]),
file=f)
annos[-1]['sample_idx'] = np.array(
[sample_idx] * len(annos[-1]['score']), dtype=np.int64)
det_annos += annos
if pklfile_prefix is not None:
if not pklfile_prefix.endswith(('.pkl', '.pickle')):
out = f'{pklfile_prefix}.pkl'
mmcv.dump(det_annos, out)
print(f'Result is saved to {out}.')
return det_annos
def bbox2result_kitti2d(self,
net_outputs,
class_names,
pklfile_prefix=None,
submission_prefix=None):
"""Convert 2D detection results to kitti format for evaluation and test
submission.
Args:
net_outputs (list[np.ndarray]): List of array storing the \
inferenced bounding boxes and scores.
class_names (list[str]): A list of class names.
pklfile_prefix (str | None): The prefix of pkl file.
submission_prefix (str | None): The prefix of submission file.
Returns:
list[dict]: A list of dictionaries with the kitti format.
"""
assert len(net_outputs) == len(self.data_infos), \
'invalid list length of network outputs'
det_annos = []
print('\nConverting prediction to KITTI format')
for i, bboxes_per_sample in enumerate(
mmcv.track_iter_progress(net_outputs)):
annos = []
anno = dict(
name=[],
truncated=[],
occluded=[],
alpha=[],
bbox=[],
dimensions=[],
location=[],
rotation_y=[],
score=[])
sample_idx = self.data_infos[i]['image']['image_idx']
num_example = 0
for label in range(len(bboxes_per_sample)):
bbox = bboxes_per_sample[label]
for i in range(bbox.shape[0]):
anno['name'].append(class_names[int(label)])
anno['truncated'].append(0.0)
anno['occluded'].append(0)
anno['alpha'].append(0.0)
anno['bbox'].append(bbox[i, :4])
# set dimensions (height, width, length) to zero
anno['dimensions'].append(
np.zeros(shape=[3], dtype=np.float32))
# set the 3D translation to (-1000, -1000, -1000)
anno['location'].append(
np.ones(shape=[3], dtype=np.float32) * (-1000.0))
anno['rotation_y'].append(0.0)
anno['score'].append(bbox[i, 4])
num_example += 1
if num_example == 0:
annos.append(
dict(
name=np.array([]),
truncated=np.array([]),
occluded=np.array([]),
alpha=np.array([]),
bbox=np.zeros([0, 4]),
dimensions=np.zeros([0, 3]),
location=np.zeros([0, 3]),
rotation_y=np.array([]),
score=np.array([]),
))
else:
anno = {k: np.stack(v) for k, v in anno.items()}
annos.append(anno)
annos[-1]['sample_idx'] = np.array(
[sample_idx] * num_example, dtype=np.int64)
det_annos += annos
if pklfile_prefix is not None:
# save file in pkl format
pklfile_path = (
pklfile_prefix[:-4] if pklfile_prefix.endswith(
('.pkl', '.pickle')) else pklfile_prefix)
mmcv.dump(det_annos, pklfile_path)
if submission_prefix is not None:
# save file in submission format
mmcv.mkdir_or_exist(submission_prefix)
print(f'Saving KITTI submission to {submission_prefix}')
for i, anno in enumerate(det_annos):
sample_idx = self.data_infos[i]['image']['image_idx']
cur_det_file = f'{submission_prefix}/{sample_idx:06d}.txt'
with open(cur_det_file, 'w') as f:
bbox = anno['bbox']
loc = anno['location']
dims = anno['dimensions'][::-1] # lhw -> hwl
for idx in range(len(bbox)):
print(
'{} -1 -1 {:.4f} {:.4f} {:.4f} {:.4f} {:.4f} {:.4f} '
'{:.4f} {:.4f} {:.4f} {:.4f} {:.4f} {:.4f} {:.4f}'.format(
anno['name'][idx],
anno['alpha'][idx],
*bbox[idx], # 4 float
*dims[idx], # 3 float
*loc[idx], # 3 float
anno['rotation_y'][idx],
anno['score'][idx]),
file=f,
)
print('Result is saved to {}'.format(submission_prefix))
return det_annos
def convert_valid_bboxes(self, box_dict, info):
"""Convert the predicted boxes into valid ones.
Args:
box_dict (dict): Box dictionaries to be converted.
- boxes_3d (:obj:`LiDARInstance3DBoxes`): 3D bounding boxes.
- scores_3d (torch.Tensor): Scores of boxes.
- labels_3d (torch.Tensor): Class labels of boxes.
info (dict): Data info.
Returns:
dict: Valid predicted boxes.
- bbox (np.ndarray): 2D bounding boxes.
- box3d_camera (np.ndarray): 3D bounding boxes in \
camera coordinate.
- box3d_lidar (np.ndarray): 3D bounding boxes in \
LiDAR coordinate.
- scores (np.ndarray): Scores of boxes.
- label_preds (np.ndarray): Class label predictions.
- sample_idx (int): Sample index.
"""
# TODO: refactor this function
box_preds = box_dict['boxes_3d']
scores = box_dict['scores_3d']
labels = box_dict['labels_3d']
sample_idx = info['image']['image_idx']
# TODO: remove the hack of yaw
box_preds.tensor[:, -1] = box_preds.tensor[:, -1] - np.pi
box_preds.limit_yaw(offset=0.5, period=np.pi * 2)
if len(box_preds) == 0:
return dict(
bbox=np.zeros([0, 4]),
box3d_camera=np.zeros([0, 7]),
box3d_lidar=np.zeros([0, 7]),
scores=np.zeros([0]),
label_preds=np.zeros([0, 4]),
sample_idx=sample_idx)
rect = info['calib']['R0_rect'].astype(np.float32)
Trv2c = info['calib']['Tr_velo_to_cam'].astype(np.float32)
P2 = info['calib']['P2'].astype(np.float32)
img_shape = info['image']['image_shape']
P2 = box_preds.tensor.new_tensor(P2)
box_preds_camera = box_preds.convert_to(Box3DMode.CAM, rect @ Trv2c)
box_corners = box_preds_camera.corners
box_corners_in_image = points_cam2img(box_corners, P2)
# box_corners_in_image: [N, 8, 2]
minxy = torch.min(box_corners_in_image, dim=1)[0]
maxxy = torch.max(box_corners_in_image, dim=1)[0]
box_2d_preds = torch.cat([minxy, maxxy], dim=1)
# Post-processing
# check box_preds_camera
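# a box is kept only if its projected 2D bbox overlaps the image plane:
# the top-left corner must lie before the image bounds and the
# bottom-right corner past the origin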
image_shape = box_preds.tensor.new_tensor(img_shape)
valid_cam_inds = ((box_2d_preds[:, 0] < image_shape[1]) &
(box_2d_preds[:, 1] < image_shape[0]) &
(box_2d_preds[:, 2] > 0) & (box_2d_preds[:, 3] > 0))
# check box_preds
limit_range = box_preds.tensor.new_tensor(self.pcd_limit_range)
valid_pcd_inds = ((box_preds.center > limit_range[:3]) &
(box_preds.center < limit_range[3:]))
valid_inds = valid_cam_inds & valid_pcd_inds.all(-1)
if valid_inds.sum() > 0:
return dict(
bbox=box_2d_preds[valid_inds, :].numpy(),
box3d_camera=box_preds_camera[valid_inds].tensor.numpy(),
box3d_lidar=box_preds[valid_inds].tensor.numpy(),
scores=scores[valid_inds].numpy(),
label_preds=labels[valid_inds].numpy(),
sample_idx=sample_idx,
)
else:
return dict(
bbox=np.zeros([0, 4]),
box3d_camera=np.zeros([0, 7]),
box3d_lidar=np.zeros([0, 7]),
scores=np.zeros([0]),
label_preds=np.zeros([0, 4]),
sample_idx=sample_idx,
)
def show(self, results, out_dir, show=True):
"""Results visualization.
Args:
results (list[dict]): List of bounding boxes results.
out_dir (str): Output directory of visualization result.
show (bool): Visualize the results online.
"""
assert out_dir is not None, 'Expect out_dir, got none.'
for i, result in enumerate(results):
example = self.prepare_test_data(i)
data_info = self.data_infos[i]
pts_path = data_info['point_cloud']['velodyne_path']
file_name = osp.split(pts_path)[-1].split('.')[0]
# for now we convert points into depth mode
points = example['points'][0]._data.numpy()
points = Coord3DMode.convert_point(points, Coord3DMode.LIDAR,
Coord3DMode.DEPTH)
gt_bboxes = self.get_ann_info(i)['gt_bboxes_3d'].tensor
gt_bboxes = Box3DMode.convert(gt_bboxes, Box3DMode.LIDAR,
Box3DMode.DEPTH)
pred_bboxes = result['boxes_3d'].tensor.numpy()
pred_bboxes = Box3DMode.convert(pred_bboxes, Box3DMode.LIDAR,
Box3DMode.DEPTH)
show_result(points, gt_bboxes, pred_bboxes, out_dir, file_name,
show)
================================================
FILE: mmdet3d/datasets/lyft_dataset.py
================================================
import mmcv
import numpy as np
import pandas as pd
import tempfile
from lyft_dataset_sdk.lyftdataset import LyftDataset as Lyft
from lyft_dataset_sdk.utils.data_classes import Box as LyftBox
from os import path as osp
from pyquaternion import Quaternion
from mmdet3d.core.evaluation.lyft_eval import lyft_eval
from mmdet.datasets import DATASETS
from ..core import show_result
from ..core.bbox import Box3DMode, Coord3DMode, LiDARInstance3DBoxes
from .custom_3d import Custom3DDataset
@DATASETS.register_module()
class LyftDataset(Custom3DDataset):
r"""Lyft Dataset.
This class serves as the API for experiments on the Lyft Dataset.
Please refer to the official Lyft dataset website for data downloading.
Args:
ann_file (str): Path of annotation file.
pipeline (list[dict], optional): Pipeline used for data processing.
Defaults to None.
data_root (str): Path of dataset root.
classes (tuple[str], optional): Classes used in the dataset.
Defaults to None.
load_interval (int, optional): Interval of loading the dataset. It is
used to uniformly sample the dataset. Defaults to 1.
modality (dict, optional): Modality to specify the sensor data used
as input. Defaults to None.
box_type_3d (str, optional): Type of 3D box of this dataset.
Based on the `box_type_3d`, the dataset will encapsulate the box
in its original format and then convert it to `box_type_3d`.
Defaults to 'LiDAR' in this dataset. Available options include:
- 'LiDAR': Box in LiDAR coordinates.
- 'Depth': Box in depth coordinates, usually for indoor dataset.
- 'Camera': Box in camera coordinates.
filter_empty_gt (bool, optional): Whether to filter empty GT.
Defaults to True.
test_mode (bool, optional): Whether the dataset is in test mode.
Defaults to False.
"""
NameMapping = {
'bicycle': 'bicycle',
'bus': 'bus',
'car': 'car',
'emergency_vehicle': 'emergency_vehicle',
'motorcycle': 'motorcycle',
'other_vehicle': 'other_vehicle',
'pedestrian': 'pedestrian',
'truck': 'truck',
'animal': 'animal'
}
DefaultAttribute = {
'car': 'is_stationary',
'truck': 'is_stationary',
'bus': 'is_stationary',
'emergency_vehicle': 'is_stationary',
'other_vehicle': 'is_stationary',
'motorcycle': 'is_stationary',
'bicycle': 'is_stationary',
'pedestrian': 'is_stationary',
'animal': 'is_stationary'
}
CLASSES = ('car', 'truck', 'bus', 'emergency_vehicle', 'other_vehicle',
'motorcycle', 'bicycle', 'pedestrian', 'animal')
def __init__(self,
ann_file,
pipeline=None,
data_root=None,
classes=None,
load_interval=1,
modality=None,
box_type_3d='LiDAR',
filter_empty_gt=True,
test_mode=False):
self.load_interval = load_interval
super().__init__(
data_root=data_root,
ann_file=ann_file,
pipeline=pipeline,
classes=classes,
modality=modality,
box_type_3d=box_type_3d,
filter_empty_gt=filter_empty_gt,
test_mode=test_mode)
if self.modality is None:
self.modality = dict(
use_camera=False,
use_lidar=True,
use_radar=False,
use_map=False,
use_external=False,
)
def load_annotations(self, ann_file):
"""Load annotations from ann_file.
Args:
ann_file (str): Path of the annotation file.
Returns:
list[dict]: List of annotations sorted by timestamps.
"""
data = mmcv.load(ann_file)
data_infos = list(sorted(data['infos'], key=lambda e: e['timestamp']))
data_infos = data_infos[::self.load_interval]
self.metadata = data['metadata']
self.version = self.metadata['version']
return data_infos
def get_data_info(self, index):
"""Get data info according to the given index.
Args:
index (int): Index of the sample data to get.
Returns:
dict: Data information that will be passed to the data \
preprocessing pipelines. It includes the following keys:
- sample_idx (str): sample index
- pts_filename (str): filename of point clouds
- sweeps (list[dict]): infos of sweeps
- timestamp (float): sample timestamp
- img_filename (str, optional): image filename
- lidar2img (list[np.ndarray], optional): transformations \
from lidar to different cameras
- ann_info (dict): annotation info
"""
info = self.data_infos[index]
# standard protocol modified from SECOND.Pytorch
input_dict = dict(
sample_idx=info['token'],
pts_filename=info['lidar_path'],
sweeps=info['sweeps'],
timestamp=info['timestamp'] / 1e6,
)
if self.modality['use_camera']:
image_paths = []
lidar2img_rts = []
for cam_type, cam_info in info['cams'].items():
image_paths.append(cam_info['data_path'])
# obtain lidar to image transformation matrix
lidar2cam_r = np.linalg.inv(cam_info['sensor2lidar_rotation'])
lidar2cam_t = cam_info[
'sensor2lidar_translation'] @ lidar2cam_r.T
lidar2cam_rt = np.eye(4)
lidar2cam_rt[:3, :3] = lidar2cam_r.T
lidar2cam_rt[3, :3] = -lidar2cam_t
intrinsic = cam_info['cam_intrinsic']
viewpad = np.eye(4)
viewpad[:intrinsic.shape[0], :intrinsic.shape[1]] = intrinsic
lidar2img_rt = (viewpad @ lidar2cam_rt.T)
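# `lidar2cam_rt` is assembled in the row-vector convention
# (p_cam = [p_lidar, 1] @ lidar2cam_rt), so its transpose is the usual
# column-vector extrinsic; left-multiplying by the padded intrinsic gives
# a projection matrix with x_img ~ lidar2img_rt @ [x_lidar, 1]^T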
lidar2img_rts.append(lidar2img_rt)
input_dict.update(
dict(
img_filename=image_paths,
lidar2img=lidar2img_rts,
))
if not self.test_mode:
annos = self.get_ann_info(index)
input_dict['ann_info'] = annos
return input_dict
def get_ann_info(self, index):
"""Get annotation info according to the given index.
Args:
index (int): Index of the annotation data to get.
Returns:
dict: Annotation information consists of the following keys:
- gt_bboxes_3d (:obj:`LiDARInstance3DBoxes`): \
3D ground truth bboxes.
- gt_labels_3d (np.ndarray): Labels of ground truths.
- gt_names (list[str]): Class names of ground truths.
"""
info = self.data_infos[index]
gt_bboxes_3d = info['gt_boxes']
gt_names_3d = info['gt_names']
gt_labels_3d = []
for cat in gt_names_3d:
if cat in self.CLASSES:
gt_labels_3d.append(self.CLASSES.index(cat))
else:
gt_labels_3d.append(-1)
gt_labels_3d = np.array(gt_labels_3d)
if 'gt_shape' in info:
gt_shape = info['gt_shape']
gt_bboxes_3d = np.concatenate([gt_bboxes_3d, gt_shape], axis=-1)
# the lyft box center is [0.5, 0.5, 0.5], we change it to be
# the same as KITTI (0.5, 0.5, 0)
gt_bboxes_3d = LiDARInstance3DBoxes(
gt_bboxes_3d,
box_dim=gt_bboxes_3d.shape[-1],
origin=(0.5, 0.5, 0.5)).convert_to(self.box_mode_3d)
anns_results = dict(
gt_bboxes_3d=gt_bboxes_3d,
gt_labels_3d=gt_labels_3d,
)
return anns_results
def _format_bbox(self, results, jsonfile_prefix=None):
"""Convert the results to the standard format.
Args:
results (list[dict]): Testing results of the dataset.
jsonfile_prefix (str): The prefix of the output jsonfile.
You can specify the output directory/filename by
modifying the jsonfile_prefix. Default: None.
Returns:
str: Path of the output json file.
"""
lyft_annos = {}
mapped_class_names = self.CLASSES
print('Start to convert detection format...')
for sample_id, det in enumerate(mmcv.track_iter_progress(results)):
annos = []
boxes = output_to_lyft_box(det)
sample_token = self.data_infos[sample_id]['token']
boxes = lidar_lyft_box_to_global(self.data_infos[sample_id], boxes)
for i, box in enumerate(boxes):
name = mapped_class_names[box.label]
lyft_anno = dict(
sample_token=sample_token,
translation=box.center.tolist(),
size=box.wlh.tolist(),
rotation=box.orientation.elements.tolist(),
name=name,
score=box.score)
annos.append(lyft_anno)
lyft_annos[sample_token] = annos
lyft_submissions = {
'meta': self.modality,
'results': lyft_annos,
}
mmcv.mkdir_or_exist(jsonfile_prefix)
res_path = osp.join(jsonfile_prefix, 'results_lyft.json')
print('Results written to', res_path)
mmcv.dump(lyft_submissions, res_path)
return res_path
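# each entry of the dumped json takes the form (values illustrative only):
#   results_lyft.json['results'][sample_token] = [
#       {'sample_token': '...', 'translation': [x, y, z],
#        'size': [w, l, h], 'rotation': [qw, qx, qy, qz],
#        'name': 'car', 'score': 0.9}, ...]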
def _evaluate_single(self,
result_path,
logger=None,
metric='bbox',
result_name='pts_bbox'):
"""Evaluation for a single model in Lyft protocol.
Args:
result_path (str): Path of the result file.
logger (logging.Logger | str | None): Logger used for printing
related information during evaluation. Default: None.
metric (str): Metric name used for evaluation. Default: 'bbox'.
result_name (str): Result name in the metric prefix.
Default: 'pts_bbox'.
Returns:
dict: Dictionary of evaluation details.
"""
output_dir = osp.join(*osp.split(result_path)[:-1])
lyft = Lyft(
data_path=osp.join(self.data_root, self.version),
json_path=osp.join(self.data_root, self.version, self.version),
verbose=True)
eval_set_map = {
'v1.01-train': 'val',
}
metrics = lyft_eval(lyft, self.data_root, result_path,
eval_set_map[self.version], output_dir, logger)
# record metrics
detail = dict()
metric_prefix = f'{result_name}_Lyft'
for i, name in enumerate(metrics['class_names']):
AP = float(metrics['mAPs_cate'][i])
detail[f'{metric_prefix}/{name}_AP'] = AP
detail[f'{metric_prefix}/mAP'] = metrics['Final mAP']
return detail
def format_results(self, results, jsonfile_prefix=None, csv_savepath=None):
"""Format the results to json (standard format for COCO evaluation).
Args:
results (list[dict]): Testing results of the dataset.
jsonfile_prefix (str | None): The prefix of json files. It includes
the file path and the prefix of filename, e.g., "a/b/prefix".
If not specified, a temp file will be created. Default: None.
csv_savepath (str | None): The path for saving csv files.
It includes the file path and the csv filename,
e.g., "a/b/filename.csv". If not specified,
the result will not be converted to csv file.
Returns:
tuple: Returns (result_files, tmp_dir), where `result_files` is a \
dict containing the json filepaths, `tmp_dir` is the temporary \
directory created for saving json files when \
`jsonfile_prefix` is not specified.
"""
assert isinstance(results, list), 'results must be a list'
assert len(results) == len(self), (
'The length of results is not equal to the dataset len: {} != {}'.
format(len(results), len(self)))
if jsonfile_prefix is None:
tmp_dir = tempfile.TemporaryDirectory()
jsonfile_prefix = osp.join(tmp_dir.name, 'results')
else:
tmp_dir = None
if not isinstance(results[0], dict):
result_files = self._format_bbox(results, jsonfile_prefix)
else:
result_files = dict()
for name in results[0]:
print(f'\nFormatting bboxes of {name}')
results_ = [out[name] for out in results]
tmp_file_ = osp.join(jsonfile_prefix, name)
result_files.update(
{name: self._format_bbox(results_, tmp_file_)})
if csv_savepath is not None:
self.json2csv(result_files['pts_bbox'], csv_savepath)
return result_files, tmp_dir
def evaluate(self,
results,
metric='bbox',
logger=None,
jsonfile_prefix=None,
csv_savepath=None,
result_names=['pts_bbox'],
show=False,
out_dir=None):
"""Evaluation in Lyft protocol.
Args:
results (list[dict]): Testing results of the dataset.
metric (str | list[str]): Metrics to be evaluated.
logger (logging.Logger | str | None): Logger used for printing
related information during evaluation. Default: None.
jsonfile_prefix (str | None): The prefix of json files. It includes
the file path and the prefix of filename, e.g., "a/b/prefix".
If not specified, a temp file will be created. Default: None.
csv_savepath (str | None): The path for saving csv files.
It includes the file path and the csv filename,
e.g., "a/b/filename.csv". If not specified,
the result will not be converted to csv file.
show (bool): Whether to visualize.
Default: False.
out_dir (str): Path to save the visualization results.
Default: None.
Returns:
dict[str, float]: Evaluation results.
"""
result_files, tmp_dir = self.format_results(results, jsonfile_prefix,
csv_savepath)
if isinstance(result_files, dict):
results_dict = dict()
for name in result_names:
print(f'Evaluating bboxes of {name}')
ret_dict = self._evaluate_single(result_files[name])
results_dict.update(ret_dict)
elif isinstance(result_files, str):
results_dict = self._evaluate_single(result_files)
if tmp_dir is not None:
tmp_dir.cleanup()
if show:
self.show(results, out_dir)
return results_dict
def show(self, results, out_dir):
"""Results visualization.
Args:
results (list[dict]): List of bounding boxes results.
out_dir (str): Output directory of visualization result.
"""
for i, result in enumerate(results):
example = self.prepare_test_data(i)
points = example['points'][0]._data.numpy()
data_info = self.data_infos[i]
pts_path = data_info['lidar_path']
file_name = osp.split(pts_path)[-1].split('.')[0]
# for now we convert points into depth mode
points = Coord3DMode.convert_point(points, Coord3DMode.LIDAR,
Coord3DMode.DEPTH)
inds = result['pts_bbox']['scores_3d'] > 0.1
gt_bboxes = self.get_ann_info(i)['gt_bboxes_3d'].tensor
gt_bboxes = Box3DMode.convert(gt_bboxes, Box3DMode.LIDAR,
Box3DMode.DEPTH)
pred_bboxes = result['pts_bbox']['boxes_3d'][inds].tensor.numpy()
pred_bboxes = Box3DMode.convert(pred_bboxes, Box3DMode.LIDAR,
Box3DMode.DEPTH)
show_result(points, gt_bboxes, pred_bboxes, out_dir, file_name)
def json2csv(self, json_path, csv_savepath):
"""Convert the json file to csv format for submission.
Args:
json_path (str): Path of the result json file.
csv_savepath (str): Path to save the csv file.
"""
results = mmcv.load(json_path)['results']
sample_list_path = osp.join(self.data_root, 'sample_submission.csv')
data = pd.read_csv(sample_list_path)
Id_list = list(data['Id'])
pred_list = list(data['PredictionString'])
cnt = 0
print('Converting the json to csv...')
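# each predicted box is serialized as 'score x y z w l h yaw name';
# all boxes of a sample are space-joined into a single PredictionString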
for token in results.keys():
cnt += 1
predictions = results[token]
prediction_str = ''
for i in range(len(predictions)):
prediction_str += \
str(predictions[i]['score']) + ' ' + \
str(predictions[i]['translation'][0]) + ' ' + \
str(predictions[i]['translation'][1]) + ' ' + \
str(predictions[i]['translation'][2]) + ' ' + \
str(predictions[i]['size'][0]) + ' ' + \
str(predictions[i]['size'][1]) + ' ' + \
str(predictions[i]['size'][2]) + ' ' + \
str(Quaternion(list(predictions[i]['rotation']))
.yaw_pitch_roll[0]) + ' ' + \
predictions[i]['name'] + ' '
prediction_str = prediction_str[:-1]
idx = Id_list.index(token)
pred_list[idx] = prediction_str
df = pd.DataFrame({'Id': Id_list, 'PredictionString': pred_list})
df.to_csv(csv_savepath, index=False)
def output_to_lyft_box(detection):
"""Convert the output to the box class in the Lyft.
Args:
detection (dict): Detection results.
Returns:
list[:obj:`LyftBox`]: List of standard LyftBoxes.
"""
box3d = detection['boxes_3d']
scores = detection['scores_3d'].numpy()
labels = detection['labels_3d'].numpy()
box_gravity_center = box3d.gravity_center.numpy()
box_dims = box3d.dims.numpy()
box_yaw = box3d.yaw.numpy()
# TODO: check whether this is necessary
# with dir_offset & dir_limit in the head
box_yaw = -box_yaw - np.pi / 2
box_list = []
for i in range(len(box3d)):
quat = Quaternion(axis=[0, 0, 1], radians=box_yaw[i])
box = LyftBox(
box_gravity_center[i],
box_dims[i],
quat,
label=labels[i],
score=scores[i])
box_list.append(box)
return box_list
def lidar_lyft_box_to_global(info, boxes):
"""Convert the box from ego to global coordinate.
Args:
info (dict): Info for a specific sample data, including the
calibration information.
boxes (list[:obj:`LyftBox`]): List of predicted LyftBoxes.
Returns:
list: List of standard LyftBoxes in the global
coordinate.
"""
box_list = []
for box in boxes:
# Move box to ego vehicle coord system
box.rotate(Quaternion(info['lidar2ego_rotation']))
box.translate(np.array(info['lidar2ego_translation']))
# Move box to global coord system
box.rotate(Quaternion(info['ego2global_rotation']))
box.translate(np.array(info['ego2global_translation']))
box_list.append(box)
return box_list
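# typical conversion chain used by `_format_bbox` above (illustrative):
#   boxes = output_to_lyft_box(det)                  # boxes in LiDAR frame
#   boxes = lidar_lyft_box_to_global(info, boxes)    # -> global frame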
================================================
FILE: mmdet3d/datasets/nuscenes_dataset.py
================================================
import mmcv
import numpy as np
import pyquaternion
import tempfile
from nuscenes.utils.data_classes import Box as NuScenesBox
from os import path as osp
from mmdet.datasets import DATASETS
from ..core import show_result
from ..core.bbox import Box3DMode, Coord3DMode, LiDARInstance3DBoxes
from .custom_3d import Custom3DDataset
@DATASETS.register_module()
class NuScenesDataset(Custom3DDataset):
r"""NuScenes Dataset.
This class serves as the API for experiments on the NuScenes Dataset.
Please refer to the NuScenes Dataset website
for data downloading.
Args:
ann_file (str): Path of annotation file.
pipeline (list[dict], optional): Pipeline used for data processing.
Defaults to None.
data_root (str): Path of dataset root.
classes (tuple[str], optional): Classes used in the dataset.
Defaults to None.
load_interval (int, optional): Interval of loading the dataset. It is
used to uniformly sample the dataset. Defaults to 1.
with_velocity (bool, optional): Whether include velocity prediction
into the experiments. Defaults to True.
modality (dict, optional): Modality to specify the sensor data used
as input. Defaults to None.
box_type_3d (str, optional): Type of 3D box of this dataset.
Based on the `box_type_3d`, the dataset will encapsulate the box
in its original format and then convert it to `box_type_3d`.
Defaults to 'LiDAR' in this dataset. Available options include:
- 'LiDAR': Box in LiDAR coordinates.
- 'Depth': Box in depth coordinates, usually for indoor dataset.
- 'Camera': Box in camera coordinates.
filter_empty_gt (bool, optional): Whether to filter empty GT.
Defaults to True.
test_mode (bool, optional): Whether the dataset is in test mode.
Defaults to False.
eval_version (str, optional): Configuration version of evaluation.
Defaults to 'detection_cvpr_2019'.
use_valid_flag (bool): Whether to use `use_valid_flag` key in the info
file as mask to filter gt_boxes and gt_names. Defaults to False.
"""
NameMapping = {
'movable_object.barrier': 'barrier',
'vehicle.bicycle': 'bicycle',
'vehicle.bus.bendy': 'bus',
'vehicle.bus.rigid': 'bus',
'vehicle.car': 'car',
'vehicle.construction': 'construction_vehicle',
'vehicle.motorcycle': 'motorcycle',
'human.pedestrian.adult': 'pedestrian',
'human.pedestrian.child': 'pedestrian',
'human.pedestrian.construction_worker': 'pedestrian',
'human.pedestrian.police_officer': 'pedestrian',
'movable_object.trafficcone': 'traffic_cone',
'vehicle.trailer': 'trailer',
'vehicle.truck': 'truck'
}
DefaultAttribute = {
'car': 'vehicle.parked',
'pedestrian': 'pedestrian.moving',
'trailer': 'vehicle.parked',
'truck': 'vehicle.parked',
'bus': 'vehicle.moving',
'motorcycle': 'cycle.without_rider',
'construction_vehicle': 'vehicle.parked',
'bicycle': 'cycle.without_rider',
'barrier': '',
'traffic_cone': '',
}
AttrMapping = {
'cycle.with_rider': 0,
'cycle.without_rider': 1,
'pedestrian.moving': 2,
'pedestrian.standing': 3,
'pedestrian.sitting_lying_down': 4,
'vehicle.moving': 5,
'vehicle.parked': 6,
'vehicle.stopped': 7,
}
AttrMapping_rev = [
'cycle.with_rider',
'cycle.without_rider',
'pedestrian.moving',
'pedestrian.standing',
'pedestrian.sitting_lying_down',
'vehicle.moving',
'vehicle.parked',
'vehicle.stopped',
]
CLASSES = ('car', 'truck', 'trailer', 'bus', 'construction_vehicle',
'bicycle', 'motorcycle', 'pedestrian', 'traffic_cone',
'barrier')
def __init__(self,
ann_file,
num_views=6,
pipeline=None,
data_root=None,
classes=None,
load_interval=1,
with_velocity=True,
modality=None,
box_type_3d='LiDAR',
filter_empty_gt=True,
test_mode=False,
eval_version='detection_cvpr_2019',
use_valid_flag=False):
self.load_interval = load_interval
self.use_valid_flag = use_valid_flag
super().__init__(
data_root=data_root,
ann_file=ann_file,
pipeline=pipeline,
classes=classes,
modality=modality,
box_type_3d=box_type_3d,
filter_empty_gt=filter_empty_gt,
test_mode=test_mode)
self.num_views = num_views
assert self.num_views <= 6
self.with_velocity = with_velocity
self.eval_version = eval_version
from nuscenes.eval.detection.config import config_factory
self.eval_detection_configs = config_factory(self.eval_version)
if self.modality is None:
self.modality = dict(
use_camera=False,
use_lidar=True,
use_radar=False,
use_map=False,
use_external=False,
)
def get_cat_ids(self, idx):
"""Get category distribution of single scene.
Args:
idx (int): Index of the data_info.
Returns:
list[int]: IDs of the categories present in the ground truth of
the current sample.
"""
info = self.data_infos[idx]
if self.use_valid_flag:
mask = info['valid_flag']
gt_names = set(info['gt_names'][mask])
else:
gt_names = set(info['gt_names'])
cat_ids = []
for name in gt_names:
if name in self.CLASSES:
cat_ids.append(self.cat2id[name])
return cat_ids
def load_annotations(self, ann_file):
"""Load annotations from ann_file.
Args:
ann_file (str): Path of the annotation file.
Returns:
list[dict]: List of annotations sorted by timestamps.
"""
data = mmcv.load(ann_file)
data_infos = list(sorted(data['infos'], key=lambda e: e['timestamp']))
data_infos = data_infos[::self.load_interval]
self.metadata = data['metadata']
self.version = self.metadata['version']
return data_infos
def get_data_info(self, index):
"""Get data info according to the given index.
Args:
index (int): Index of the sample data to get.
Returns:
dict: Data information that will be passed to the data \
preprocessing pipelines. It includes the following keys:
- sample_idx (str): Sample index.
- pts_filename (str): Filename of point clouds.
- sweeps (list[dict]): Infos of sweeps.
- timestamp (float): Sample timestamp.
- img_filename (str, optional): Image filename.
- lidar2img (list[np.ndarray], optional): Transformations \
from lidar to different cameras.
- ann_info (dict): Annotation info.
"""
info = self.data_infos[index]
# standard protocol modified from SECOND.Pytorch
input_dict = dict(
sample_idx=info['token'],
pts_filename=info['lidar_path'],
sweeps=info['sweeps'],
timestamp=info['timestamp'] / 1e6,
)
cam_orders = ['CAM_FRONT_LEFT', 'CAM_FRONT', 'CAM_FRONT_RIGHT', 'CAM_BACK_RIGHT', 'CAM_BACK', 'CAM_BACK_LEFT']
if self.modality['use_camera']:
image_paths = []
lidar2img_rts = []
# for cam_type, cam_info in info['cams'].items():
intrinsics = []
lidar2cam_rs = []
lidar2cam_ts = []
for cam_type in cam_orders:
cam_info = info['cams'][cam_type]
image_paths.append(cam_info['data_path'])
# obtain lidar to image transformation matrix
lidar2cam_r = np.linalg.inv(cam_info['sensor2lidar_rotation'])
lidar2cam_t = cam_info[
'sensor2lidar_translation'] @ lidar2cam_r.T
lidar2cam_rt = np.eye(4)
lidar2cam_rt[:3, :3] = lidar2cam_r.T
lidar2cam_rt[3, :3] = -lidar2cam_t
intrinsic = cam_info['cam_intrinsic']
viewpad = np.eye(4)
viewpad[:intrinsic.shape[0], :intrinsic.shape[1]] = intrinsic
lidar2img_rt = (viewpad @ lidar2cam_rt.T)
lidar2img_rts.append(lidar2img_rt)
intrinsics.append(intrinsic)
lidar2cam_rs.append(lidar2cam_r)
lidar2cam_ts.append(lidar2cam_t)
input_dict.update(
dict(
img_filename=image_paths,
lidar2img=lidar2img_rts,
cam_intrinsic=intrinsics,
lidar2cam_r=lidar2cam_rs,
lidar2cam_t=lidar2cam_ts,
))
if not self.test_mode:
annos = self.get_ann_info(index)
input_dict['ann_info'] = annos
return input_dict
def get_ann_info(self, index):
"""Get annotation info according to the given index.
Args:
index (int): Index of the annotation data to get.
Returns:
dict: Annotation information consists of the following keys:
- gt_bboxes_3d (:obj:`LiDARInstance3DBoxes`): \
3D ground truth bboxes
- gt_labels_3d (np.ndarray): Labels of ground truths.
- gt_names (list[str]): Class names of ground truths.
"""
info = self.data_infos[index]
# filter out bbox containing no points
if self.use_valid_flag:
mask = info['valid_flag']
else:
mask = info['num_lidar_pts'] > 0
gt_bboxes_3d = info['gt_boxes'][mask]
gt_names_3d = info['gt_names'][mask]
gt_labels_3d = []
for cat in gt_names_3d:
if cat in self.CLASSES:
gt_labels_3d.append(self.CLASSES.index(cat))
else:
gt_labels_3d.append(-1)
gt_labels_3d = np.array(gt_labels_3d)
if self.with_velocity:
gt_velocity = info['gt_velocity'][mask]
nan_mask = np.isnan(gt_velocity[:, 0])
gt_velocity[nan_mask] = [0.0, 0.0]
gt_bboxes_3d = np.concatenate([gt_bboxes_3d, gt_velocity], axis=-1)
# the nuscenes box center is [0.5, 0.5, 0.5], we change it to be
# the same as KITTI (0.5, 0.5, 0)
gt_bboxes_3d = LiDARInstance3DBoxes(
gt_bboxes_3d,
box_dim=gt_bboxes_3d.shape[-1],
origin=(0.5, 0.5, 0.5)).convert_to(self.box_mode_3d)
anns_results = dict(
gt_bboxes_3d=gt_bboxes_3d,
gt_labels_3d=gt_labels_3d,
gt_names=gt_names_3d)
return anns_results
def _format_bbox(self, results, jsonfile_prefix=None):
"""Convert the results to the standard format.
Args:
results (list[dict]): Testing results of the dataset.
jsonfile_prefix (str): The prefix of the output jsonfile.
You can specify the output directory/filename by
modifying the jsonfile_prefix. Default: None.
Returns:
str: Path of the output json file.
"""
nusc_annos = {}
mapped_class_names = self.CLASSES
print('Start to convert detection format...')
for sample_id, det in enumerate(mmcv.track_iter_progress(results)):
annos = []
boxes = output_to_nusc_box(det)
sample_token = self.data_infos[sample_id]['token']
boxes = lidar_nusc_box_to_global(self.data_infos[sample_id], boxes,
mapped_class_names,
self.eval_detection_configs,
self.eval_version)
for i, box in enumerate(boxes):
name = mapped_class_names[box.label]
if np.sqrt(box.velocity[0]**2 + box.velocity[1]**2) > 0.2:
if name in [
'car',
'construction_vehicle',
'bus',
'truck',
'trailer',
]:
attr = 'vehicle.moving'
elif name in ['bicycle', 'motorcycle']:
attr = 'cycle.with_rider'
else:
attr = NuScenesDataset.DefaultAttribute[name]
else:
if name in ['pedestrian']:
attr = 'pedestrian.standing'
elif name in ['bus']:
attr = 'vehicle.stopped'
else:
attr = NuScenesDataset.DefaultAttribute[name]
nusc_anno = dict(
sample_token=sample_token,
translation=box.center.tolist(),
size=box.wlh.tolist(),
rotation=box.orientation.elements.tolist(),
velocity=box.velocity[:2].tolist(),
detection_name=name,
detection_score=box.score,
attribute_name=attr)
annos.append(nusc_anno)
nusc_annos[sample_token] = annos
nusc_submissions = {
'meta': self.modality,
'results': nusc_annos,
}
mmcv.mkdir_or_exist(jsonfile_prefix)
res_path = osp.join(jsonfile_prefix, 'results_nusc.json')
print('Results written to', res_path)
mmcv.dump(nusc_submissions, res_path)
return res_path
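# each dumped annotation takes the form (values illustrative only):
#   {'sample_token': '...', 'translation': [x, y, z], 'size': [w, l, h],
#    'rotation': [qw, qx, qy, qz], 'velocity': [vx, vy],
#    'detection_name': 'car', 'detection_score': 0.9,
#    'attribute_name': 'vehicle.moving'}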
def _evaluate_single(self,
result_path,
logger=None,
metric='bbox',
result_name='pts_bbox'):
"""Evaluation for a single model in nuScenes protocol.
Args:
result_path (str): Path of the result file.
logger (logging.Logger | str | None): Logger used for printing
related information during evaluation. Default: None.
metric (str): Metric name used for evaluation. Default: 'bbox'.
result_name (str): Result name in the metric prefix.
Default: 'pts_bbox'.
Returns:
dict: Dictionary of evaluation details.
"""
from nuscenes import NuScenes
from nuscenes.eval.detection.evaluate import NuScenesEval
output_dir = osp.join(*osp.split(result_path)[:-1])
nusc = NuScenes(
version=self.version, dataroot=self.data_root, verbose=False)
eval_set_map = {
'v1.0-mini': 'mini_val',
'v1.0-trainval': 'val',
}
nusc_eval = NuScenesEval(
nusc,
config=self.eval_detection_configs,
result_path=result_path,
eval_set=eval_set_map[self.version],
output_dir=output_dir,
verbose=False)
nusc_eval.main(render_curves=False)
# record metrics
metrics = mmcv.load(osp.join(output_dir, 'metrics_summary.json'))
detail = dict()
metric_prefix = f'{result_name}_NuScenes'
for name in self.CLASSES:
for k, v in metrics['label_aps'][name].items():
val = float('{:.4f}'.format(v))
detail['{}/{}_AP_dist_{}'.format(metric_prefix, name, k)] = val
for k, v in metrics['label_tp_errors'][name].items():
val = float('{:.4f}'.format(v))
detail['{}/{}_{}'.format(metric_prefix, name, k)] = val
detail['{}/NDS'.format(metric_prefix)] = metrics['nd_score']
detail['{}/mAP'.format(metric_prefix)] = metrics['mean_ap']
return detail
def format_results(self, results, jsonfile_prefix=None):
"""Format the results to json (standard format for COCO evaluation).
Args:
results (list[dict]): Testing results of the dataset.
jsonfile_prefix (str | None): The prefix of json files. It includes
the file path and the prefix of filename, e.g., "a/b/prefix".
If not specified, a temp file will be created. Default: None.
Returns:
tuple: Returns (result_files, tmp_dir), where `result_files` is a \
dict containing the json filepaths, `tmp_dir` is the temporary \
directory created for saving json files when \
`jsonfile_prefix` is not specified.
"""
assert isinstance(results, list), 'results must be a list'
assert len(results) == len(self), (
'The length of results is not equal to the dataset len: {} != {}'.
format(len(results), len(self)))
if jsonfile_prefix is None:
tmp_dir = tempfile.TemporaryDirectory()
jsonfile_prefix = osp.join(tmp_dir.name, 'results')
else:
tmp_dir = None
if not isinstance(results[0], dict):
result_files = self._format_bbox(results, jsonfile_prefix)
else:
result_files = dict()
for name in results[0]:
print(f'\nFormatting bboxes of {name}')
results_ = [out[name] for out in results]
tmp_file_ = osp.join(jsonfile_prefix, name)
result_files.update(
{name: self._format_bbox(results_, tmp_file_)})
return result_files, tmp_dir
def evaluate(self,
results,
metric='bbox',
logger=None,
jsonfile_prefix=None,
result_names=['pts_bbox'],
show=False,
out_dir=None):
"""Evaluation in nuScenes protocol.
Args:
results (list[dict]): Testing results of the dataset.
metric (str | list[str]): Metrics to be evaluated.
logger (logging.Logger | str | None): Logger used for printing
related information during evaluation. Default: None.
jsonfile_prefix (str | None): The prefix of json files. It includes
the file path and the prefix of filename, e.g., "a/b/prefix".
If not specified, a temp file will be created. Default: None.
show (bool): Whether to visualize.
Default: False.
out_dir (str): Path to save the visualization results.
Default: None.
Returns:
dict[str, float]: Results of each evaluation metric.
"""
result_files, tmp_dir = self.format_results(results, jsonfile_prefix)
if isinstance(result_files, dict):
results_dict = dict()
for name in result_names:
print('Evaluating bboxes of {}'.format(name))
ret_dict = self._evaluate_single(result_files[name])
results_dict.update(ret_dict)
elif isinstance(result_files, str):
results_dict = self._evaluate_single(result_files)
if tmp_dir is not None:
tmp_dir.cleanup()
if show:
self.show(results, out_dir)
return results_dict
def show(self, results, out_dir):
"""Results visualization.
Args:
results (list[dict]): List of bounding boxes results.
out_dir (str): Output directory of visualization result.
"""
for i, result in enumerate(results):
example = self.prepare_test_data(i)
points = example['points'][0]._data.numpy()
data_info = self.data_infos[i]
pts_path = data_info['lidar_path']
file_name = osp.split(pts_path)[-1].split('.')[0]
# for now we convert points into depth mode
points = Coord3DMode.convert_point(points, Coord3DMode.LIDAR,
Coord3DMode.DEPTH)
inds = result['pts_bbox']['scores_3d'] > 0.1
gt_bboxes = self.get_ann_info(i)['gt_bboxes_3d'].tensor
gt_bboxes = Box3DMode.convert(gt_bboxes, Box3DMode.LIDAR,
Box3DMode.DEPTH)
pred_bboxes = result['pts_bbox']['boxes_3d'][inds].tensor.numpy()
pred_bboxes = Box3DMode.convert(pred_bboxes, Box3DMode.LIDAR,
Box3DMode.DEPTH)
show_result(points, gt_bboxes, pred_bboxes, out_dir, file_name)
def output_to_nusc_box(detection):
"""Convert the output to the box class in the nuScenes.
Args:
detection (dict): Detection results.
- boxes_3d (:obj:`BaseInstance3DBoxes`): Detection bbox.
- scores_3d (torch.Tensor): Detection scores.
- labels_3d (torch.Tensor): Predicted box labels.
Returns:
list[:obj:`NuScenesBox`]: List of standard NuScenesBoxes.
"""
box3d = detection['boxes_3d']
scores = detection['scores_3d'].numpy()
labels = detection['labels_3d'].numpy()
box_gravity_center = box3d.gravity_center.numpy()
box_dims = box3d.dims.numpy()
box_yaw = box3d.yaw.numpy()
# TODO: check whether this is necessary
# with dir_offset & dir_limit in the head
box_yaw = -box_yaw - np.pi / 2
box_list = []
for i in range(len(box3d)):
quat = pyquaternion.Quaternion(axis=[0, 0, 1], radians=box_yaw[i])
velocity = (*box3d.tensor[i, 7:9], 0.0)
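# assumes boxes with velocity, i.e. a 9-dim box tensor whose last two
# entries are (vx, vy); the vertical velocity is set to zero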
# velo_val = np.linalg.norm(box3d[i, 7:9])
# velo_ori = box3d[i, 6]
# velocity = (
# velo_val * np.cos(velo_ori), velo_val * np.sin(velo_ori), 0.0)
box = NuScenesBox(
box_gravity_center[i],
box_dims[i],
quat,
label=labels[i],
score=scores[i],
velocity=velocity)
box_list.append(box)
return box_list
def lidar_nusc_box_to_global(info,
boxes,
classes,
eval_configs,
eval_version='detection_cvpr_2019'):
"""Convert the box from ego to global coordinate.
Args:
info (dict): Info for a specific sample data, including the
calibration information.
boxes (list[:obj:`NuScenesBox`]): List of predicted NuScenesBoxes.
classes (list[str]): Mapped classes in the evaluation.
eval_configs (object): Evaluation configuration object.
eval_version (str): Evaluation version.
Default: 'detection_cvpr_2019'
Returns:
list: List of standard NuScenesBoxes in the global
coordinate.
"""
box_list = []
for box in boxes:
# Move box to ego vehicle coord system
box.rotate(pyquaternion.Quaternion(info['lidar2ego_rotation']))
box.translate(np.array(info['lidar2ego_translation']))
# filter det in ego.
cls_range_map = eval_configs.class_range
radius = np.linalg.norm(box.center[:2], 2)
det_range = cls_range_map[classes[box.label]]
if radius > det_range:
continue
# Move box to global coord system
box.rotate(pyquaternion.Quaternion(info['ego2global_rotation']))
box.translate(np.array(info['ego2global_translation']))
box_list.append(box)
return box_list
================================================
FILE: mmdet3d/datasets/nuscenes_dataset_viewInfo.py
================================================
import mmcv
import numpy as np
import pyquaternion
import tempfile
from nuscenes.utils.data_classes import Box as NuScenesBox
from os import path as osp
from mmdet.datasets import DATASETS
from ..core import show_result
from ..core.bbox import Box3DMode, Coord3DMode, LiDARInstance3DBoxes, CameraInstance3DBoxes
from .nuscenes_dataset import NuScenesDataset
@DATASETS.register_module()
class NuScenesDataset_ViewInfo(NuScenesDataset):
"""
Compared with NuScenesDataset, this dataset additionally loads per-view 2D annotations.
"""
def get_ann_info(self, index):
"""Get annotation info according to the given index.
Args:
index (int): Index of the annotation data to get.
Returns:
dict: Annotation information consists of the following keys:
"""
info = self.data_infos[index]
# filter out bbox containing no points
if self.use_valid_flag:
mask = info['valid_flag']
else:
mask = info['num_lidar_pts'] > 0
gt_bboxes_3d = info['gt_boxes'][mask]
gt_names_3d = info['gt_names'][mask]
gt_visible_3d = info['gt_visible'][mask]
# .copy() is required: these arrays come from the cached info dict and
# would otherwise be modified in place below
gt_bboxes2d_view = info['gt_bboxes2d_view'].copy()
gt_bboxes2d_view[..., :2] = gt_bboxes2d_view[..., :2] + gt_bboxes2d_view[..., 2:4] / 2
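# the stored 2D view boxes are (x_top_left, y_top_left, w, h); the line
# above shifts the first two entries to obtain center format (cx, cy, w, h)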
gt_bboxes_lidar_view = info['gt_bboxes_lidar_view'].copy()
gt_names2d_view = info['gt_names2d_view']
gt_viewsIDs = info['gt_viewsIDs']
gt_labels_3d = []
for cat in gt_names_3d:
if cat in self.CLASSES:
gt_labels_3d.append(self.CLASSES.index(cat))
else:
gt_labels_3d.append(-1)
gt_labels_3d = np.array(gt_labels_3d)
gt_labels2d_view = []
for cat in gt_names2d_view:
if cat in self.CLASSES:
gt_labels2d_view.append(self.CLASSES.index(cat))
else:
gt_labels2d_view.append(-1)
gt_labels2d_view = np.array(gt_labels2d_view)
gt_labels2d_view = np.stack([gt_labels2d_view, gt_viewsIDs], axis=-1)
gt_bboxes_cam_view = info['gt_bboxes_cam_view'].copy()
if self.with_velocity:
gt_velocity = info['gt_velocity'][mask].copy()
nan_mask = np.isnan(gt_velocity[:, 0])
gt_velocity[nan_mask] = [0.0, 0.0]
gt_bboxes_3d = np.concatenate([gt_bboxes_3d, gt_velocity], axis=-1)
gt_cam_vel = info['gt_velocity_cam_view'].copy()
nan_mask_cam = np.isnan(gt_cam_vel[:, 0])
gt_cam_vel[nan_mask_cam] = [0.0, 0.0]
gt_bboxes_cam_view = np.concatenate([gt_bboxes_cam_view, gt_cam_vel], axis=-1)
gt_lidar_vel = info['gt_velocity_lidar_view'].copy()
nan_mask_lidar = np.isnan(gt_lidar_vel[:, 0])
gt_lidar_vel[nan_mask_lidar] = [0.0, 0.0]
gt_bboxes_lidar_view = np.concatenate([gt_bboxes_lidar_view, gt_lidar_vel], axis=-1)
gt_bboxes_cam_view = CameraInstance3DBoxes(
gt_bboxes_cam_view,
box_dim=gt_bboxes_cam_view.shape[-1],
origin=(0.5, 0.5, 0.5)
)
gt_bboxes_lidar_view = LiDARInstance3DBoxes(
gt_bboxes_lidar_view,
box_dim=gt_bboxes_lidar_view.shape[-1],
origin=(0.5, 0.5, 0.5)
).convert_to(self.box_mode_3d)
# the nuscenes box center is [0.5, 0.5, 0.5], we change it to be
# the same as KITTI (0.5, 0.5, 0)
gt_bboxes_3d = LiDARInstance3DBoxes(
gt_bboxes_3d,
box_dim=gt_bboxes_3d.shape[-1],
origin=(0.5, 0.5, 0.5)).convert_to(self.box_mode_3d)
if "gt_pts_centers_view" in info:
gt_pts_centers_view = info['gt_pts_centers_view'].copy()
gt_img_centers_view = info['gt_img_centers_view'].copy()
anns_results = dict(
gt_bboxes_3d=gt_bboxes_3d,
gt_labels_3d=gt_labels_3d,
gt_visible_3d=gt_visible_3d,
gt_names=gt_names_3d,
bboxes=gt_bboxes2d_view,
labels=gt_labels2d_view,
pts_centers_view=gt_pts_centers_view,
img_centers_view=gt_img_centers_view,
bboxes_cam_view=gt_bboxes_cam_view,
bboxes_lidar_view=gt_bboxes_lidar_view,
)
else:
anns_results = dict(
gt_bboxes_3d=gt_bboxes_3d,
gt_labels_3d=gt_labels_3d,
gt_visible_3d=gt_visible_3d,
gt_names=gt_names_3d,
bboxes=gt_bboxes2d_view,
labels=gt_labels2d_view,
)
return anns_results
def get_data_info(self, index):
"""Get data info according to the given index.
Args:
index (int): Index of the sample data to get.
Returns:
dict: Data information that will be passed to the data \
preprocessing pipelines. It includes the following keys:
- sample_idx (str): Sample index.
- pts_filename (str): Filename of point clouds.
- sweeps (list[dict]): Infos of sweeps.
- timestamp (float): Sample timestamp.
- img_filename (str, optional): Image filename.
- lidar2img (list[np.ndarray], optional): Transformations \
from lidar to different cameras.
- ann_info (dict): Annotation info.
"""
info = self.data_infos[index]
# standard protocol modified from SECOND.Pytorch
input_dict = dict(
sample_idx=info['token'],
pts_filename=info['lidar_path'],
sweeps=info['sweeps'],
timestamp=info['timestamp'] / 1e6,
)
cam_orders = ['CAM_FRONT_LEFT', 'CAM_FRONT', 'CAM_FRONT_RIGHT', 'CAM_BACK_RIGHT', 'CAM_BACK', 'CAM_BACK_LEFT']
if self.modality['use_camera']:
image_paths = []
lidar2img_rts = []
# for cam_type, cam_info in info['cams'].items():
intrinsics = []
lidar2cam_rs = []
lidar2cam_ts = []
for cam_type in cam_orders:
cam_info = info['cams'][cam_type]
image_paths.append(cam_info['data_path'])
# obtain lidar to image transformation matrix
lidar2cam_r = np.linalg.inv(cam_info['sensor2lidar_rotation'])
lidar2cam_t = cam_info[
'sensor2lidar_translation'] @ lidar2cam_r.T
lidar2cam_rt = np.eye(4)
lidar2cam_rt[:3, :3] = lidar2cam_r.T
lidar2cam_rt[3, :3] = -lidar2cam_t
intrinsic = cam_info['cam_intrinsic']
viewpad = np.eye(4)
viewpad[:intrinsic.shape[0], :intrinsic.shape[1]] = intrinsic
lidar2img_rt = (viewpad @ lidar2cam_rt.T)
lidar2img_rts.append(lidar2img_rt.copy())
intrinsics.append(intrinsic.copy())
lidar2cam_rs.append(lidar2cam_r.copy())
lidar2cam_ts.append(-lidar2cam_t.copy())
input_dict.update(
dict(
img_filename=image_paths,
lidar2img=lidar2img_rts,
cam_intrinsic=intrinsics,
lidar2cam_r=lidar2cam_rs,
lidar2cam_t=lidar2cam_ts,
))
if not self.test_mode:
annos = self.get_ann_info(index)
input_dict['ann_info'] = annos
return input_dict
================================================
FILE: mmdet3d/datasets/pipelines/__init__.py
================================================
from mmdet.datasets.pipelines import Compose
from .dbsampler import DataBaseSampler
from .formating import Collect3D, DefaultFormatBundle, DefaultFormatBundle3D
from .loading import (LoadAnnotations3D, LoadMultiViewImageFromFiles,
LoadPointsFromFile, LoadPointsFromMultiSweeps,
NormalizePointsColor, PointSegClassMapping,
MyLoadAnnotations3D)
from .test_time_aug import MultiScaleFlipAug3D
from .transforms_3d import (BackgroundPointsFilter, GlobalRotScaleTrans,
IndoorPointSample, ObjectNoise, ObjectRangeFilter,
ObjectSample, PointShuffle, PointsRangeFilter,
RandomFlip3D, VoxelBasedPointSampler, OurRandomFlip3D,
OurGlobalRotScaleTrans, OurObjectRangeFilter)
from .transforms_2d import OurRandomAffine, PhotoMetricDistortionMultiViewImage
__all__ = [
'ObjectSample', 'RandomFlip3D', 'ObjectNoise', 'GlobalRotScaleTrans',
'PointShuffle', 'ObjectRangeFilter', 'PointsRangeFilter', 'Collect3D',
'Compose', 'LoadMultiViewImageFromFiles', 'LoadPointsFromFile',
'DefaultFormatBundle', 'DefaultFormatBundle3D', 'DataBaseSampler',
'NormalizePointsColor', 'LoadAnnotations3D', 'IndoorPointSample',
'PointSegClassMapping', 'MultiScaleFlipAug3D', 'LoadPointsFromMultiSweeps',
'BackgroundPointsFilter', 'VoxelBasedPointSampler', 'MyLoadAnnotations3D',
'OurRandomFlip3D', 'OurGlobalRotScaleTrans', 'OurRandomAffine',
'PhotoMetricDistortionMultiViewImage', 'OurObjectRangeFilter'
]
================================================
FILE: mmdet3d/datasets/pipelines/data_augment_utils.py
================================================
import numba
import numpy as np
import warnings
from numba.errors import NumbaPerformanceWarning
from mmdet3d.core.bbox import box_np_ops
warnings.filterwarnings('ignore', category=NumbaPerformanceWarning)
@numba.njit
def _rotation_box2d_jit_(corners, angle, rot_mat_T):
"""Rotate 2D boxes.
Args:
corners (np.ndarray): Corners of boxes.
angle (float): Rotation angle.
rot_mat_T (np.ndarray): Transposed rotation matrix.
"""
rot_sin = np.sin(angle)
rot_cos = np.cos(angle)
rot_mat_T[0, 0] = rot_cos
rot_mat_T[0, 1] = -rot_sin
rot_mat_T[1, 0] = rot_sin
rot_mat_T[1, 1] = rot_cos
corners[:] = corners @ rot_mat_T
@numba.jit(nopython=True)
def box_collision_test(boxes, qboxes, clockwise=True):
"""Box collision test.
Args:
boxes (np.ndarray): Corners of current boxes.
qboxes (np.ndarray): Corners of the query boxes to avoid colliding with.
clockwise (bool): Whether the corners are in clockwise order.
Default: True.
Returns:
np.ndarray: Boolean array with shape (N, K); entry (i, j) is True if
box i collides with query box j.
"""
N = boxes.shape[0]
K = qboxes.shape[0]
ret = np.zeros((N, K), dtype=np.bool_)
slices = np.array([1, 2, 3, 0])
lines_boxes = np.stack((boxes, boxes[:, slices, :]),
axis=2) # [N, 4, 2(line), 2(xy)]
lines_qboxes = np.stack((qboxes, qboxes[:, slices, :]), axis=2)
# vec = np.zeros((2,), dtype=boxes.dtype)
boxes_standup = box_np_ops.corner_to_standup_nd_jit(boxes)
qboxes_standup = box_np_ops.corner_to_standup_nd_jit(qboxes)
for i in range(N):
for j in range(K):
# calculate standup first
iw = (
min(boxes_standup[i, 2], qboxes_standup[j, 2]) -
max(boxes_standup[i, 0], qboxes_standup[j, 0]))
if iw > 0:
ih = (
min(boxes_standup[i, 3], qboxes_standup[j, 3]) -
max(boxes_standup[i, 1], qboxes_standup[j, 1]))
if ih > 0:
for k in range(4):
for box_l in range(4):
A = lines_boxes[i, k, 0]
B = lines_boxes[i, k, 1]
C = lines_qboxes[j, box_l, 0]
D = lines_qboxes[j, box_l, 1]
acd = (D[1] - A[1]) * (C[0] -
A[0]) > (C[1] - A[1]) * (
D[0] - A[0])
bcd = (D[1] - B[1]) * (C[0] -
B[0]) > (C[1] - B[1]) * (
D[0] - B[0])
if acd != bcd:
abc = (C[1] - A[1]) * (B[0] - A[0]) > (
B[1] - A[1]) * (
C[0] - A[0])
abd = (D[1] - A[1]) * (B[0] - A[0]) > (
B[1] - A[1]) * (
D[0] - A[0])
if abc != abd:
ret[i, j] = True # collision.
break
if ret[i, j] is True:
break
if ret[i, j] is False:
# now check complete overlap.
# box overlap qbox:
box_overlap_qbox = True
for box_l in range(4): # point l in qboxes
for k in range(4): # corner k in boxes
vec = boxes[i, k] - boxes[i, (k + 1) % 4]
if clockwise:
vec = -vec
cross = vec[1] * (
boxes[i, k, 0] - qboxes[j, box_l, 0])
cross -= vec[0] * (
boxes[i, k, 1] - qboxes[j, box_l, 1])
if cross >= 0:
box_overlap_qbox = False
break
if box_overlap_qbox is False:
break
if box_overlap_qbox is False:
qbox_overlap_box = True
for box_l in range(4): # point box_l in boxes
for k in range(4): # corner k in qboxes
vec = qboxes[j, k] - qboxes[j, (k + 1) % 4]
if clockwise:
vec = -vec
cross = vec[1] * (
qboxes[j, k, 0] - boxes[i, box_l, 0])
cross -= vec[0] * (
qboxes[j, k, 1] - boxes[i, box_l, 1])
if cross >= 0: #
qbox_overlap_box = False
break
if qbox_overlap_box is False:
break
if qbox_overlap_box:
ret[i, j] = True # collision.
else:
ret[i, j] = True # collision.
return ret
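# Example (illustrative, not part of the original file): two clockwise unit
# squares tested against one query square; only the first square collides.
#   boxes = np.array([[[0., 0.], [0., 1.], [1., 1.], [1., 0.]],
#                     [[5., 5.], [5., 6.], [6., 6.], [6., 5.]]])
#   qboxes = np.array([[[0.5, 0.5], [0.5, 1.5], [1.5, 1.5], [1.5, 0.5]]])
#   box_collision_test(boxes, qboxes)  # -> [[ True], [False]]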
@numba.njit
def noise_per_box(boxes, valid_mask, loc_noises, rot_noises):
"""Add noise to every box (only on the horizontal plane).
Args:
boxes (np.ndarray): Input boxes with shape (N, 5).
valid_mask (np.ndarray): Mask to indicate which boxes are valid
with shape (N).
loc_noises (np.ndarray): Location noises with shape (N, M, 3).
rot_noises (np.ndarray): Rotation noises with shape (N, M).
Returns:
np.ndarray: For each box, the index of the first noise candidate that
passes the collision test, or -1 if all candidates collide.
"""
num_boxes = boxes.shape[0]
num_tests = loc_noises.shape[1]
box_corners = box_np_ops.box2d_to_corner_jit(boxes)
current_corners = np.zeros((4, 2), dtype=boxes.dtype)
rot_mat_T = np.zeros((2, 2), dtype=boxes.dtype)
success_mask = -np.ones((num_boxes, ), dtype=np.int64)
# print(valid_mask)
for i in range(num_boxes):
if valid_mask[i]:
for j in range(num_tests):
current_corners[:] = box_corners[i]
current_corners -= boxes[i, :2]
_rotation_box2d_jit_(current_corners, rot_noises[i, j],
rot_mat_T)
current_corners += boxes[i, :2] + loc_noises[i, j, :2]
coll_mat = box_collision_test(
current_corners.reshape(1, 4, 2), box_corners)
coll_mat[0, i] = False
# print(coll_mat)
if not coll_mat.any():
success_mask[i] = j
box_corners[i] = current_corners
break
return success_mask
@numba.njit
def noise_per_box_v2_(boxes, valid_mask, loc_noises, rot_noises,
global_rot_noises):
"""Add noise to every box (only on the horizontal plane). Version 2 used
when enable global rotations.
Args:
boxes (np.ndarray): Input boxes with shape (N, 5).
valid_mask (np.ndarray): Mask to indicate which boxes are valid
with shape (N).
loc_noises (np.ndarray): Location noises with shape (N, M, 3).
rot_noises (np.ndarray): Rotation noises with shape (N, M).
global_rot_noises (np.ndarray): Global rotation noises with shape (N, M).
Returns:
np.ndarray: For each box, the index of the first noise candidate that
passes the collision test, or -1 if all candidates collide.
"""
num_boxes = boxes.shape[0]
num_tests = loc_noises.shape[1]
box_corners = box_np_ops.box2d_to_corner_jit(boxes)
current_corners = np.zeros((4, 2), dtype=boxes.dtype)
current_box = np.zeros((1, 5), dtype=boxes.dtype)
rot_mat_T = np.zeros((2, 2), dtype=boxes.dtype)
dst_pos = np.zeros((2, ), dtype=boxes.dtype)
success_mask = -np.ones((num_boxes, ), dtype=np.int64)
corners_norm = np.zeros((4, 2), dtype=boxes.dtype)
corners_norm[1, 1] = 1.0
corners_norm[2] = 1.0
corners_norm[3, 0] = 1.0
corners_norm -= np.array([0.5, 0.5], dtype=boxes.dtype)
corners_norm = corners_norm.reshape(4, 2)
for i in range(num_boxes):
if valid_mask[i]:
for j in range(num_tests):
current_box[0, :] = boxes[i]
current_radius = np.sqrt(boxes[i, 0]**2 + boxes[i, 1]**2)
current_grot = np.arctan2(boxes[i, 0], boxes[i, 1])
dst_grot = current_grot + global_rot_noises[i, j]
dst_pos[0] = current_radius * np.sin(dst_grot)
dst_pos[1] = current_radius * np.cos(dst_grot)
current_box[0, :2] = dst_pos
current_box[0, -1] += (dst_grot - current_grot)
rot_sin = np.sin(current_box[0, -1])
rot_cos = np.cos(current_box[0, -1])
rot_mat_T[0, 0] = rot_cos
rot_mat_T[0, 1] = -rot_sin
rot_mat_T[1, 0] = rot_sin
rot_mat_T[1, 1] = rot_cos
current_corners[:] = current_box[
0, 2:4] * corners_norm @ rot_mat_T + current_box[0, :2]
current_corners -= current_box[0, :2]
_rotation_box2d_jit_(current_corners, rot_noises[i, j],
rot_mat_T)
current_corners += current_box[0, :2] + loc_noises[i, j, :2]
coll_mat = box_collision_test(
current_corners.reshape(1, 4, 2), box_corners)
coll_mat[0, i] = False
if not coll_mat.any():
success_mask[i] = j
box_corners[i] = current_corners
loc_noises[i, j, :2] += (dst_pos - boxes[i, :2])
rot_noises[i, j] += (dst_grot - current_grot)
break
return success_mask
def _select_transform(transform, indices):
"""Select transform.
Args:
transform (np.ndarray): Transforms to select from.
indices (np.ndarray): Mask to indicate which transform to select.
Returns:
np.ndarray: Selected transforms.
"""
result = np.zeros((transform.shape[0], *transform.shape[2:]),
dtype=transform.dtype)
for i in range(transform.shape[0]):
if indices[i] != -1:
result[i] = transform[i, indices[i]]
return result
@numba.njit
def _rotation_matrix_3d_(rot_mat_T, angle, axis):
"""Get the 3D rotation matrix.
Args:
rot_mat_T (np.ndarray): Transposed rotation matrix.
angle (float): Rotation angle.
axis (int): Rotation axis.
"""
rot_sin = np.sin(angle)
rot_cos = np.cos(angle)
rot_mat_T[:] = np.eye(3)
if axis == 1:
rot_mat_T[0, 0] = rot_cos
rot_mat_T[0, 2] = -rot_sin
rot_mat_T[2, 0] = rot_sin
rot_mat_T[2, 2] = rot_cos
elif axis == 2 or axis == -1:
rot_mat_T[0, 0] = rot_cos
rot_mat_T[0, 1] = -rot_sin
rot_mat_T[1, 0] = rot_sin
rot_mat_T[1, 1] = rot_cos
elif axis == 0:
rot_mat_T[1, 1] = rot_cos
rot_mat_T[1, 2] = -rot_sin
rot_mat_T[2, 1] = rot_sin
rot_mat_T[2, 2] = rot_cos
@numba.njit
def points_transform_(points, centers, point_masks, loc_transform,
rot_transform, valid_mask):
"""Apply transforms to points and box centers.
Args:
points (np.ndarray): Input points.
centers (np.ndarray): Input box centers.
point_masks (np.ndarray): Mask to indicate which points need
to be transformed.
loc_transform (np.ndarray): Location transform to be applied.
rot_transform (np.ndarray): Rotation transform to be applied.
valid_mask (np.ndarray): Mask to indicate which boxes are valid.
"""
num_box = centers.shape[0]
num_points = points.shape[0]
rot_mat_T = np.zeros((num_box, 3, 3), dtype=points.dtype)
for i in range(num_box):
_rotation_matrix_3d_(rot_mat_T[i], rot_transform[i], 2)
for i in range(num_points):
for j in range(num_box):
if valid_mask[j]:
if point_masks[i, j] == 1:
points[i, :3] -= centers[j, :3]
points[i:i + 1, :3] = points[i:i + 1, :3] @ rot_mat_T[j]
points[i, :3] += centers[j, :3]
points[i, :3] += loc_transform[j]
break # only apply first box's transform
@numba.njit
def box3d_transform_(boxes, loc_transform, rot_transform, valid_mask):
"""Transform 3D boxes.
Args:
boxes (np.ndarray): 3D boxes to be transformed.
loc_transform (np.ndarray): Location transform to be applied.
rot_transform (np.ndarray): Rotation transform to be applied.
valid_mask (np.ndarray | None): Mask to indicate which boxes are valid.
"""
num_box = boxes.shape[0]
for i in range(num_box):
if valid_mask[i]:
boxes[i, :3] += loc_transform[i]
boxes[i, 6] += rot_transform[i]
def noise_per_object_v3_(gt_boxes,
points=None,
valid_mask=None,
rotation_perturb=np.pi / 4,
center_noise_std=1.0,
global_random_rot_range=np.pi / 4,
num_try=100):
"""Random rotate or remove each groundtruth independently. use kitti viewer
to test this function points_transform_
Args:
gt_boxes (np.ndarray): Ground truth boxes with shape (N, 7).
points (np.ndarray | None): Input point cloud with shape (M, 4).
Default: None.
valid_mask (np.ndarray | None): Mask to indicate which boxes are valid.
Default: None.
rotation_perturb (float): Rotation perturbation. Default: pi / 4.
center_noise_std (float): Center noise standard deviation.
Default: 1.0.
global_random_rot_range (float): Global random rotation range.
Default: pi/4.
num_try (int): Number of noise candidates tried per box. Default: 100.
"""
num_boxes = gt_boxes.shape[0]
if not isinstance(rotation_perturb, (list, tuple, np.ndarray)):
rotation_perturb = [-rotation_perturb, rotation_perturb]
if not isinstance(global_random_rot_range, (list, tuple, np.ndarray)):
global_random_rot_range = [
-global_random_rot_range, global_random_rot_range
]
enable_grot = np.abs(global_random_rot_range[0] -
global_random_rot_range[1]) >= 1e-3
if not isinstance(center_noise_std, (list, tuple, np.ndarray)):
center_noise_std = [
center_noise_std, center_noise_std, center_noise_std
]
if valid_mask is None:
valid_mask = np.ones((num_boxes, ), dtype=np.bool_)
center_noise_std = np.array(center_noise_std, dtype=gt_boxes.dtype)
loc_noises = np.random.normal(
scale=center_noise_std, size=[num_boxes, num_try, 3])
rot_noises = np.random.uniform(
rotation_perturb[0], rotation_perturb[1], size=[num_boxes, num_try])
gt_grots = np.arctan2(gt_boxes[:, 0], gt_boxes[:, 1])
grot_lowers = global_random_rot_range[0] - gt_grots
grot_uppers = global_random_rot_range[1] - gt_grots
global_rot_noises = np.random.uniform(
grot_lowers[..., np.newaxis],
grot_uppers[..., np.newaxis],
size=[num_boxes, num_try])
origin = (0.5, 0.5, 0)
gt_box_corners = box_np_ops.center_to_corner_box3d(
gt_boxes[:, :3],
gt_boxes[:, 3:6],
gt_boxes[:, 6],
origin=origin,
axis=2)
# TODO: rewrite this noise box function?
if not enable_grot:
selected_noise = noise_per_box(gt_boxes[:, [0, 1, 3, 4, 6]],
valid_mask, loc_noises, rot_noises)
else:
selected_noise = noise_per_box_v2_(gt_boxes[:, [0, 1, 3, 4, 6]],
valid_mask, loc_noises, rot_noises,
global_rot_noises)
loc_transforms = _select_transform(loc_noises, selected_noise)
rot_transforms = _select_transform(rot_noises, selected_noise)
surfaces = box_np_ops.corner_to_surfaces_3d_jit(gt_box_corners)
if points is not None:
# TODO: replace this points_in_convex function by my tools?
point_masks = box_np_ops.points_in_convex_polygon_3d_jit(
points[:, :3], surfaces)
points_transform_(points, gt_boxes[:, :3], point_masks, loc_transforms,
rot_transforms, valid_mask)
box3d_transform_(gt_boxes, loc_transforms, rot_transforms, valid_mask)
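# Example (illustrative, not part of the original file): perturb two ground
# truth boxes and the points inside them in place.
#   gt_boxes = np.array([[0., 0., -1., 1.6, 3.9, 1.5, 0.],
#                        [10., 5., -1., 0.6, 0.8, 1.7, 0.]])
#   points = np.random.rand(200, 4) * 20.0
#   noise_per_object_v3_(gt_boxes, points, num_try=50)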
================================================
FILE: mmdet3d/datasets/pipelines/dbsampler.py
================================================
import copy
import mmcv
import numpy as np
import os
from mmdet3d.core.bbox import box_np_ops
from mmdet3d.datasets.pipelines import data_augment_utils
from mmdet.datasets import PIPELINES
from ..registry import OBJECTSAMPLERS
class BatchSampler:
"""Class for sampling specific category of ground truths.
Args:
sampled_list (list[dict]): List of samples.
name (str | None): The category of samples. Default: None.
epoch (int | None): Sampling epoch. Default: None.
shuffle (bool): Whether to shuffle indices. Default: True.
drop_reminder (bool): Whether to drop the remaining samples that
cannot form a complete batch. Default: False.
"""
def __init__(self,
sampled_list,
name=None,
epoch=None,
shuffle=True,
drop_reminder=False):
self._sampled_list = sampled_list
self._indices = np.arange(len(sampled_list))
if shuffle:
np.random.shuffle(self._indices)
self._idx = 0
self._example_num = len(sampled_list)
self._name = name
self._shuffle = shuffle
self._epoch = epoch
self._epoch_counter = 0
self._drop_reminder = drop_reminder
def _sample(self, num):
"""Sample specific number of ground truths and return indices.
Args:
num (int): Sampled number.
Returns:
list[int]: Indices of sampled ground truths.
"""
if self._idx + num >= self._example_num:
ret = self._indices[self._idx:].copy()
self._reset()
else:
ret = self._indices[self._idx:self._idx + num]
self._idx += num
return ret
def _reset(self):
"""Reset the index of batchsampler to zero."""
assert self._name is not None
# print("reset", self._name)
if self._shuffle:
np.random.shuffle(self._indices)
self._idx = 0
def sample(self, num):
"""Sample specific number of ground truths.
Args:
num (int): Sampled number.
Returns:
list[dict]: Sampled ground truths.
"""
indices = self._sample(num)
return [self._sampled_list[i] for i in indices]
@OBJECTSAMPLERS.register_module()
class DataBaseSampler(object):
"""Class for sampling data from the ground truth database.
Args:
info_path (str): Path of groundtruth database info.
data_root (str): Path of groundtruth database.
rate (float): Rate of actual sampled over maximum sampled number.
prepare (dict): Name of preparation functions and the input value.
sample_groups (dict): Sampled classes and numbers.
classes (list[str]): List of classes. Default: None.
points_loader (dict): Config of points loader. Default: dict(
type='LoadPointsFromFile', coord_type='LIDAR', load_dim=4,
use_dim=[0, 1, 2, 3])
"""
def __init__(self,
info_path,
data_root,
rate,
prepare,
sample_groups,
classes=None,
points_loader=dict(
type='LoadPointsFromFile',
coord_type='LIDAR',
load_dim=4,
use_dim=[0, 1, 2, 3])):
super().__init__()
self.data_root = data_root
self.info_path = info_path
self.rate = rate
self.prepare = prepare
self.classes = classes
self.cat2label = {name: i for i, name in enumerate(classes)}
self.label2cat = {i: name for i, name in enumerate(classes)}
self.points_loader = mmcv.build_from_cfg(points_loader, PIPELINES)
db_infos = mmcv.load(info_path)
# filter database infos
from mmdet3d.utils import get_root_logger
logger = get_root_logger()
for k, v in db_infos.items():
logger.info(f'load {len(v)} {k} database infos')
for prep_func, val in prepare.items():
db_infos = getattr(self, prep_func)(db_infos, val)
logger.info('After filter database:')
for k, v in db_infos.items():
logger.info(f'load {len(v)} {k} database infos')
self.db_infos = db_infos
# load sample groups
# TODO: more elegant way to load sample groups
self.sample_groups = []
for name, num in sample_groups.items():
self.sample_groups.append({name: int(num)})
self.group_db_infos = self.db_infos # just use db_infos
self.sample_classes = []
self.sample_max_nums = []
for group_info in self.sample_groups:
self.sample_classes += list(group_info.keys())
self.sample_max_nums += list(group_info.values())
self.sampler_dict = {}
for k, v in self.group_db_infos.items():
self.sampler_dict[k] = BatchSampler(v, k, shuffle=True)
# TODO: No group_sampling currently
@staticmethod
def filter_by_difficulty(db_infos, removed_difficulty):
"""Filter ground truths by difficulties.
Args:
db_infos (dict): Info of groundtruth database.
removed_difficulty (list): Difficulties that are not qualified.
Returns:
dict: Info of database after filtering.
"""
new_db_infos = {}
for key, dinfos in db_infos.items():
new_db_infos[key] = [
info for info in dinfos
if info['difficulty'] not in removed_difficulty
]
return new_db_infos
@staticmethod
def filter_by_min_points(db_infos, min_gt_points_dict):
"""Filter ground truths by number of points in the bbox.
Args:
db_infos (dict): Info of groundtruth database.
min_gt_points_dict (dict): Different number of minimum points
needed for different categories of ground truths.
Returns:
dict: Info of database after filtering.
"""
for name, min_num in min_gt_points_dict.items():
min_num = int(min_num)
if min_num > 0:
filtered_infos = []
for info in db_infos[name]:
if info['num_points_in_gt'] >= min_num:
filtered_infos.append(info)
db_infos[name] = filtered_infos
return db_infos
def sample_all(self, gt_bboxes, gt_labels, img=None):
"""Sampling all categories of bboxes.
Args:
gt_bboxes (np.ndarray): Ground truth bounding boxes.
gt_labels (np.ndarray): Ground truth labels of boxes.
img (np.ndarray, optional): Image array. Currently unused here.
Defaults to None.
Returns:
dict: Dict of sampled 'pseudo ground truths'.
- gt_labels_3d (np.ndarray): ground truths labels \
of sampled objects.
- gt_bboxes_3d (:obj:`BaseInstance3DBoxes`): \
sampled ground truth 3D bounding boxes
- points (np.ndarray): sampled points
- group_ids (np.ndarray): ids of sampled ground truths
"""
sampled_num_dict = {}
sample_num_per_class = []
for class_name, max_sample_num in zip(self.sample_classes,
self.sample_max_nums):
class_label = self.cat2label[class_name]
# sampled_num = int(max_sample_num -
# np.sum([n == class_name for n in gt_names]))
sampled_num = int(max_sample_num -
np.sum([n == class_label for n in gt_labels]))
sampled_num = np.round(self.rate * sampled_num).astype(np.int64)
sampled_num_dict[class_name] = sampled_num
sample_num_per_class.append(sampled_num)
sampled = []
sampled_gt_bboxes = []
avoid_coll_boxes = gt_bboxes
for class_name, sampled_num in zip(self.sample_classes,
sample_num_per_class):
if sampled_num > 0:
sampled_cls = self.sample_class_v2(class_name, sampled_num,
avoid_coll_boxes)
sampled += sampled_cls
if len(sampled_cls) > 0:
if len(sampled_cls) == 1:
sampled_gt_box = sampled_cls[0]['box3d_lidar'][
np.newaxis, ...]
else:
sampled_gt_box = np.stack(
[s['box3d_lidar'] for s in sampled_cls], axis=0)
sampled_gt_bboxes += [sampled_gt_box]
avoid_coll_boxes = np.concatenate(
[avoid_coll_boxes, sampled_gt_box], axis=0)
ret = None
if len(sampled) > 0:
sampled_gt_bboxes = np.concatenate(sampled_gt_bboxes, axis=0)
# center = sampled_gt_bboxes[:, 0:3]
# num_sampled = len(sampled)
s_points_list = []
count = 0
for info in sampled:
file_path = os.path.join(
self.data_root,
info['path']) if self.data_root else info['path']
results = dict(pts_filename=file_path)
s_points = self.points_loader(results)['points']
s_points.translate(info['box3d_lidar'][:3])
count += 1
s_points_list.append(s_points)
gt_labels = np.array([self.cat2label[s['name']] for s in sampled],
dtype=np.long)
ret = {
'gt_labels_3d':
gt_labels,
'gt_bboxes_3d':
sampled_gt_bboxes,
'points':
s_points_list[0].cat(s_points_list),
'group_ids':
np.arange(gt_bboxes.shape[0],
gt_bboxes.shape[0] + len(sampled))
}
return ret
def sample_class_v2(self, name, num, gt_bboxes):
"""Sampling specific categories of bounding boxes.
Args:
name (str): Class of objects to be sampled.
num (int): Number of sampled bboxes.
gt_bboxes (np.ndarray): Ground truth boxes.
Returns:
list[dict]: Valid samples after collision test.
"""
sampled = self.sampler_dict[name].sample(num)
sampled = copy.deepcopy(sampled)
num_gt = gt_bboxes.shape[0]
num_sampled = len(sampled)
gt_bboxes_bv = box_np_ops.center_to_corner_box2d(
gt_bboxes[:, 0:2], gt_bboxes[:, 3:5], gt_bboxes[:, 6])
sp_boxes = np.stack([i['box3d_lidar'] for i in sampled], axis=0)
boxes = np.concatenate([gt_bboxes, sp_boxes], axis=0).copy()
sp_boxes_new = boxes[gt_bboxes.shape[0]:]
sp_boxes_bv = box_np_ops.center_to_corner_box2d(
sp_boxes_new[:, 0:2], sp_boxes_new[:, 3:5], sp_boxes_new[:, 6])
total_bv = np.concatenate([gt_bboxes_bv, sp_boxes_bv], axis=0)
coll_mat = data_augment_utils.box_collision_test(total_bv, total_bv)
diag = np.arange(total_bv.shape[0])
coll_mat[diag, diag] = False
valid_samples = []
for i in range(num_gt, num_gt + num_sampled):
if coll_mat[i].any():
coll_mat[i] = False
coll_mat[:, i] = False
else:
valid_samples.append(sampled[i - num_gt])
return valid_samples
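# Illustrative config sketch (not part of the original file): how this sampler
# is typically built through the OBJECTSAMPLERS registry. Paths, class names
# and per-class numbers below are placeholders, not values shipped with this
# repository.
_example_db_sampler_cfg = dict(
    type='DataBaseSampler',
    data_root='data/nuscenes/',
    info_path='data/nuscenes/nuscenes_dbinfos_train.pkl',
    rate=1.0,
    prepare=dict(
        filter_by_difficulty=[-1],
        filter_by_min_points=dict(car=5)),
    classes=['car'],
    sample_groups=dict(car=2),
    points_loader=dict(
        type='LoadPointsFromFile',
        coord_type='LIDAR',
        load_dim=5,
        use_dim=[0, 1, 2, 3, 4]))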
================================================
FILE: mmdet3d/datasets/pipelines/formating.py
================================================
import numpy as np
from mmcv.parallel import DataContainer as DC
from mmdet3d.core.bbox import BaseInstance3DBoxes
from mmdet3d.core.points import BasePoints
from mmdet.datasets.builder import PIPELINES
from mmdet.datasets.pipelines import to_tensor
PIPELINES._module_dict.pop('DefaultFormatBundle')
@PIPELINES.register_module()
class DefaultFormatBundle(object):
"""Default formatting bundle.
It simplifies the pipeline of formatting common fields, including "img",
"proposals", "gt_bboxes", "gt_labels", "gt_masks" and "gt_semantic_seg".
These fields are formatted as follows.
- img: (1)transpose, (2)to tensor, (3)to DataContainer (stack=True)
- proposals: (1)to tensor, (2)to DataContainer
- gt_bboxes: (1)to tensor, (2)to DataContainer
- gt_bboxes_ignore: (1)to tensor, (2)to DataContainer
- gt_labels: (1)to tensor, (2)to DataContainer
- gt_masks: (1)to tensor, (2)to DataContainer (cpu_only=True)
- gt_semantic_seg: (1)unsqueeze dim-0 (2)to tensor, \
(3)to DataContainer (stack=True)
"""
def __init__(self, ):
return
def __call__(self, results):
"""Call function to transform and format common fields in results.
Args:
results (dict): Result dict contains the data to convert.
Returns:
dict: The result dict contains the data that is formatted with
default bundle.
"""
if 'img' in results:
if isinstance(results['img'], list):
# process multiple imgs in single frame
imgs = [img.transpose(2, 0, 1) for img in results['img']]
imgs = np.ascontiguousarray(np.stack(imgs, axis=0))
results['img'] = DC(to_tensor(imgs), stack=True)
else:
img = np.ascontiguousarray(results['img'].transpose(2, 0, 1))
results['img'] = DC(to_tensor(img), stack=True)
for key in [
'proposals', 'gt_bboxes', 'gt_bboxes_ignore', 'gt_labels',
'gt_labels_3d', 'pts_instance_mask', 'pts_semantic_mask',
'gt_pts_centers_view', 'gt_img_centers_view', 'gt_visible_3d'
]:
if key not in results:
continue
if isinstance(results[key], list):
results[key] = DC([to_tensor(res) for res in results[key]])
else:
results[key] = DC(to_tensor(results[key]))
if 'gt_bboxes_3d' in results:
if isinstance(results['gt_bboxes_3d'], BaseInstance3DBoxes):
results['gt_bboxes_3d'] = DC(
results['gt_bboxes_3d'], cpu_only=True)
else:
results['gt_bboxes_3d'] = DC(
to_tensor(results['gt_bboxes_3d']))
if 'gt_bboxes_cam_view' in results:
if isinstance(results['gt_bboxes_cam_view'], BaseInstance3DBoxes):
results['gt_bboxes_cam_view'] = DC(
results['gt_bboxes_cam_view'], cpu_only=True)
else:
results['gt_bboxes_cam_view'] = DC(
to_tensor(results['gt_bboxes_cam_view']))
if 'gt_bboxes_lidar_view' in results:
if isinstance(results['gt_bboxes_lidar_view'], BaseInstance3DBoxes):
results['gt_bboxes_lidar_view'] = DC(
results['gt_bboxes_lidar_view'], cpu_only=True)
else:
results['gt_bboxes_lidar_view'] = DC(
to_tensor(results['gt_bboxes_lidar_view']))
if 'gt_masks' in results:
results['gt_masks'] = DC(results['gt_masks'], cpu_only=True)
if 'gt_semantic_seg' in results:
results['gt_semantic_seg'] = DC(
to_tensor(results['gt_semantic_seg'][None, ...]), stack=True)
return results
def __repr__(self):
return self.__class__.__name__
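# Illustrative sketch (not part of the original file): the bundle wraps common
# fields into DataContainer objects, e.g. a single HWC image becomes a CHW
# tensor inside a stacked DataContainer and labels become a LongTensor.
def _demo_default_format_bundle():
    results = dict(
        img=np.zeros((4, 4, 3), dtype=np.float32),
        gt_labels=np.array([1, 2], dtype=np.int64))
    results = DefaultFormatBundle()(results)
    # results['img'] now holds a (3, 4, 4) tensor and results['gt_labels'] a
    # LongTensor, both wrapped in DataContainer.
    return results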
@PIPELINES.register_module()
class Collect3D(object):
"""Collect data from the loader relevant to the specific task.
This is usually the last stage of the data loader pipeline. Typically keys
is set to some subset of "img", "proposals", "gt_bboxes",
"gt_bboxes_ignore", "gt_labels", and/or "gt_masks".
The "img_meta" item is always populated. The contents of the "img_meta"
dictionary depends on "meta_keys". By default this includes:
- 'img_shape': shape of the image input to the network as a tuple \
(h, w, c). Note that images may be zero padded on the \
bottom/right if the batch tensor is larger than this shape.
- 'scale_factor': a float indicating the preprocessing scale
- 'flip': a boolean indicating if image flip transform was used
- 'filename': path to the image file
- 'ori_shape': original shape of the image as a tuple (h, w, c)
- 'pad_shape': image shape after padding
- 'lidar2img': transform from lidar to image
- 'pcd_horizontal_flip': a boolean indicating if point cloud is \
flipped horizontally
- 'pcd_vertical_flip': a boolean indicating if point cloud is \
flipped vertically
- 'box_mode_3d': 3D box mode
- 'box_type_3d': 3D box type
- 'img_norm_cfg': a dict of normalization information:
- mean: per channel mean subtraction
- std: per channel std divisor
- to_rgb: bool indicating if bgr was converted to rgb
- 'rect': rectification matrix
- 'Trv2c': transformation from velodyne to camera coordinate
- 'P2': transformation between cameras
- 'pcd_trans': point cloud transformations
- 'sample_idx': sample index
- 'pcd_scale_factor': point cloud scale factor
- 'pcd_rotation': rotation applied to point cloud
- 'pts_filename': path to point cloud file.
Args:
keys (Sequence[str]): Keys of results to be collected in ``data``.
meta_keys (Sequence[str], optional): Meta keys to be converted to
``mmcv.DataContainer`` and collected in ``data[img_metas]``.
Default: ('filename', 'ori_shape', 'img_shape', 'lidar2img', \
'pad_shape', 'scale_factor', 'flip', 'pcd_horizontal_flip', \
'pcd_vertical_flip', 'box_mode_3d', 'box_type_3d', \
'img_norm_cfg', 'rect', 'Trv2c', 'P2', 'pcd_trans', \
'sample_idx', 'pcd_scale_factor', 'pcd_rotation', 'pts_filename')
"""
def __init__(self,
keys,
meta_keys=('filename', 'ori_shape', 'img_shape', 'lidar2img',
'pad_shape', 'scale_factor', 'flip', 'image_flip',
'pcd_horizontal_flip', 'pcd_vertical_flip',
'box_mode_3d', 'box_type_3d', 'img_norm_cfg',
'rect', 'Trv2c', 'P2', 'pcd_trans', 'sample_idx',
'pcd_scale_factor', 'pcd_rotation', 'pts_filename',
'transformation_3d_flow', 'cam_intrinsic', 'lidar2cam_r',
'lidar2cam_t', 'valid_shape', 'img_scale_ratios', 'pcd_rotation_angle')):
self.keys = keys
self.meta_keys = meta_keys
def __call__(self, results):
"""Call function to collect keys in results. The keys in ``meta_keys``
will be converted to :obj:`mmcv.DataContainer`.
Args:
results (dict): Result dict contains the data to collect.
Returns:
dict: The result dict contains the following keys
- keys in ``self.keys``
- ``img_metas``
"""
data = {}
img_metas = {}
for key in self.meta_keys:
if key in results:
img_metas[key] = results[key]
data['img_metas'] = DC(img_metas, cpu_only=True)
for key in self.keys:
data[key] = results[key]
return data
def __repr__(self):
"""str: Return a string that describes the module."""
return self.__class__.__name__ + '(keys={}, meta_keys={})'.format(
self.keys, self.meta_keys)
@PIPELINES.register_module()
class DefaultFormatBundle3D(DefaultFormatBundle):
"""Default formatting bundle.
It simplifies the pipeline of formatting common fields for voxels,
including "proposals", "gt_bboxes", "gt_labels", "gt_masks" and
"gt_semantic_seg".
These fields are formatted as follows.
- img: (1)transpose, (2)to tensor, (3)to DataContainer (stack=True)
- proposals: (1)to tensor, (2)to DataContainer
- gt_bboxes: (1)to tensor, (2)to DataContainer
- gt_bboxes_ignore: (1)to tensor, (2)to DataContainer
- gt_labels: (1)to tensor, (2)to DataContainer
"""
def __init__(self, class_names, with_gt=True, with_label=True):
super(DefaultFormatBundle3D, self).__init__()
self.class_names = class_names
self.with_gt = with_gt
self.with_label = with_label
def __call__(self, results):
"""Call function to transform and format common fields in results.
Args:
results (dict): Result dict contains the data to convert.
Returns:
dict: The result dict contains the data that is formatted with
default bundle.
"""
# Format 3D data
if 'points' in results:
assert isinstance(results['points'], BasePoints)
results['points'] = DC(results['points'].tensor)
for key in ['voxels', 'coors', 'voxel_centers', 'num_points']:
if key not in results:
continue
results[key] = DC(to_tensor(results[key]), stack=False)
if self.with_gt:
# Clean GT bboxes in the final
if 'gt_bboxes_3d_mask' in results:
gt_bboxes_3d_mask = results['gt_bboxes_3d_mask']
results['gt_bboxes_3d'] = results['gt_bboxes_3d'][
gt_bboxes_3d_mask]
if 'gt_names_3d' in results:
results['gt_names_3d'] = results['gt_names_3d'][
gt_bboxes_3d_mask]
if 'gt_bboxes_mask' in results:
gt_bboxes_mask = results['gt_bboxes_mask']
if 'gt_bboxes' in results:
results['gt_bboxes'] = results['gt_bboxes'][gt_bboxes_mask]
results['gt_names'] = results['gt_names'][gt_bboxes_mask]
if self.with_label:
if 'gt_names' in results and len(results['gt_names']) == 0:
results['gt_labels'] = np.array([], dtype=np.int64)
elif 'gt_names' in results and isinstance(
results['gt_names'][0], list):
# gt_labels might be a list of list in multi-view setting
results['gt_labels'] = [
np.array([self.class_names.index(n) for n in res],
dtype=np.int64) for res in results['gt_names']
]
elif 'gt_names' in results:
results['gt_labels'] = np.array([
self.class_names.index(n) for n in results['gt_names']
],
dtype=np.int64)
# we still assume one pipeline for one frame LiDAR
# thus, the 3D name is list[string]
if 'gt_names_3d' in results:
results['gt_labels_3d'] = np.array([
self.class_names.index(n)
for n in results['gt_names_3d']
],
dtype=np.int64)
results = super(DefaultFormatBundle3D, self).__call__(results)
return results
def __repr__(self):
"""str: Return a string that describes the module."""
repr_str = self.__class__.__name__
repr_str += '(class_names={}, '.format(self.class_names)
repr_str += 'with_gt={}, with_label={})'.format(
self.with_gt, self.with_label)
return repr_str
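# Illustrative sketch (not part of the original file): the two formatting steps
# above usually form the tail of a training pipeline, right before the data is
# fed to the model. The class names below are placeholders.
_example_pipeline_tail = [
    dict(type='DefaultFormatBundle3D',
         class_names=['car', 'pedestrian', 'bicycle']),
    dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']),
]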
================================================
FILE: mmdet3d/datasets/pipelines/loading.py
================================================
import mmcv
import numpy as np
import torch
import cv2
import copy
from mmdet3d.core.points import BasePoints, get_points_type
from mmdet.datasets.builder import PIPELINES
from mmdet.datasets.pipelines import LoadAnnotations
@PIPELINES.register_module()
class MyResize(object):
"""Resize images & bbox & mask.
This transform resizes the input image to some scale. Bboxes and masks are
then resized with the same scale factor. If the input dict contains the key
"scale", then the scale in the input dict is used, otherwise the specified
scale in the init method is used. If the input dict contains the key
"scale_factor" (if MultiScaleFlipAug does not give img_scale but
scale_factor), the actual scale will be computed by image shape and
scale_factor.
`img_scale` can either be a tuple (single-scale) or a list of tuple
(multi-scale). There are 3 multiscale modes:
- ``ratio_range is not None``: randomly sample a ratio from the ratio \
range and multiply it with the image scale.
- ``ratio_range is None`` and ``multiscale_mode == "range"``: randomly \
sample a scale from the multiscale range.
- ``ratio_range is None`` and ``multiscale_mode == "value"``: randomly \
sample a scale from multiple scales.
Args:
img_scale (tuple or list[tuple]): Image scales for resizing.
multiscale_mode (str): Either "range" or "value".
ratio_range (tuple[float]): (min_ratio, max_ratio)
keep_ratio (bool): Whether to keep the aspect ratio when resizing the
image.
bbox_clip_border (bool, optional): Whether clip the objects outside
the border of the image. Defaults to True.
backend (str): Image resize backend, choices are 'cv2' and 'pillow'.
These two backends generate slightly different results. Defaults
to 'cv2'.
override (bool, optional): Whether to override `scale` and
`scale_factor` so as to call resize twice. Default False. If True,
after the first resizing, the existing `scale` and `scale_factor`
will be ignored so that a second resizing can be performed.
This option is a work-around for multiple times of resize in DETR.
Defaults to False.
"""
def __init__(self,
img_scale=None,
multiscale_mode='range',
ratio_range=None,
keep_ratio=True,
bbox_clip_border=True,
backend='cv2',
override=False):
if img_scale is None:
self.img_scale = None
else:
if isinstance(img_scale, list):
self.img_scale = img_scale
else:
self.img_scale = [img_scale]
assert mmcv.is_list_of(self.img_scale, tuple)
if ratio_range is not None:
# mode 1: given a scale and a range of image ratio
assert len(self.img_scale) == 1
else:
# mode 2: given multiple scales or a range of scales
assert multiscale_mode in ['value', 'range']
self.backend = backend
self.multiscale_mode = multiscale_mode
self.ratio_range = ratio_range
self.keep_ratio = keep_ratio
# TODO: refactor the override option in Resize
self.override = override
self.bbox_clip_border = bbox_clip_border
@staticmethod
def random_select(img_scales):
"""Randomly select an img_scale from given candidates.
Args:
img_scales (list[tuple]): Image scales for selection.
Returns:
(tuple, int): Returns a tuple ``(img_scale, scale_idx)``, \
where ``img_scale`` is the selected image scale and \
``scale_idx`` is the selected index in the given candidates.
"""
assert mmcv.is_list_of(img_scales, tuple)
scale_idx = np.random.randint(len(img_scales))
img_scale = img_scales[scale_idx]
return img_scale, scale_idx
@staticmethod
def random_sample(img_scales):
"""Randomly sample an img_scale when ``multiscale_mode=='range'``.
Args:
img_scales (list[tuple]): Image scale range for sampling.
There must be two tuples in img_scales, which specify the lower
and upper bound of image scales.
Returns:
(tuple, None): Returns a tuple ``(img_scale, None)``, where \
``img_scale`` is sampled scale and None is just a placeholder \
to be consistent with :func:`random_select`.
"""
assert mmcv.is_list_of(img_scales, tuple) and len(img_scales) == 2
img_scale_long = [max(s) for s in img_scales]
img_scale_short = [min(s) for s in img_scales]
long_edge = np.random.randint(
min(img_scale_long),
max(img_scale_long) + 1)
short_edge = np.random.randint(
min(img_scale_short),
max(img_scale_short) + 1)
img_scale = (long_edge, short_edge)
return img_scale, None
@staticmethod
def random_sample_ratio(img_scale, ratio_range):
"""Randomly sample an img_scale when ``ratio_range`` is specified.
A ratio will be randomly sampled from the range specified by
``ratio_range``. Then it would be multiplied with ``img_scale`` to
generate sampled scale.
Args:
img_scale (tuple): Image scale base to multiply with the ratio.
ratio_range (tuple[float]): The minimum and maximum ratio to scale
the ``img_scale``.
Returns:
(tuple, None): Returns a tuple ``(scale, None)``, where \
``scale`` is sampled ratio multiplied with ``img_scale`` and \
None is just a placeholder to be consistent with \
:func:`random_select`.
"""
assert isinstance(img_scale, tuple) and len(img_scale) == 2
min_ratio, max_ratio = ratio_range
assert min_ratio <= max_ratio
ratio = np.random.random_sample() * (max_ratio - min_ratio) + min_ratio
scale = int(img_scale[0] * ratio), int(img_scale[1] * ratio)
return scale, None
def _random_scale(self, results):
"""Randomly sample an img_scale according to ``ratio_range`` and
``multiscale_mode``.
If ``ratio_range`` is specified, a ratio will be sampled and be
multiplied with ``img_scale``.
If multiple scales are specified by ``img_scale``, a scale will be
sampled according to ``multiscale_mode``.
Otherwise, single scale will be used.
Args:
results (dict): Result dict from :obj:`dataset`.
Returns:
dict: Two new keys ``scale`` and ``scale_idx`` are added into \
``results``, which would be used by subsequent pipelines.
"""
if self.ratio_range is not None:
scale, scale_idx = self.random_sample_ratio(
self.img_scale[0], self.ratio_range)
elif len(self.img_scale) == 1:
scale, scale_idx = self.img_scale[0], 0
elif self.multiscale_mode == 'range':
scale, scale_idx = self.random_sample(self.img_scale)
elif self.multiscale_mode == 'value':
scale, scale_idx = self.random_select(self.img_scale)
else:
raise NotImplementedError
results['scale'] = scale
results['scale_idx'] = scale_idx
def _resize_img(self, results):
"""Resize images with ``results['scale']``."""
imgs = results['img']
results['img'] = [imgs[i] for i in range(len(imgs))]
for key in results.get('img_fields', ['img']):
for idx in range(len(results['img'])):
if self.keep_ratio:
img, scale_factor = mmcv.imrescale(
results[key][idx],
results['scale'],
return_scale=True,
backend=self.backend)
# the w_scale and h_scale have a minor difference
# a real fix should be done in mmcv.imrescale in the future
new_h, new_w = img.shape[:2]
h, w = results[key][idx].shape[:2]
w_scale = new_w / w
h_scale = new_h / h
else:
img, w_scale, h_scale = mmcv.imresize(
results[key][idx],
results['scale'],
return_scale=True,
backend=self.backend)
results[key][idx] = img
scale_factor = np.array([w_scale, h_scale, w_scale, h_scale],
dtype=np.float32)
results['img_shape'] = img.shape
# in case that there is no padding
results['pad_shape'] = img.shape
results['scale_factor'] = scale_factor
results['keep_ratio'] = self.keep_ratio
if 'valid_shape' in results:
scaling = np.array([[w_scale, h_scale]])
results['valid_shape'] = results['valid_shape'] * scaling
def _resize_bboxes(self, results):
"""Resize bounding boxes with ``results['scale_factor']``."""
for key in results.get('bbox_fields', []):
bboxes = results[key] * results['scale_factor']
if self.bbox_clip_border:
img_shape = results['img_shape']
bboxes[:, 0::2] = np.clip(bboxes[:, 0::2], 0, img_shape[1])
bboxes[:, 1::2] = np.clip(bboxes[:, 1::2], 0, img_shape[0])
results[key] = bboxes
def _resize_centers(self, results):
centers = results['gt_img_centers_view']
centers[:, :2] = centers[:, :2] * results['scale_factor'][:2]
img_shape = results['img_shape']
centers[:, 0] = np.clip(centers[:, 0], 0, img_shape[1])
centers[:, 1] = np.clip(centers[:, 1], 0, img_shape[0])
results['gt_img_centers_view'] = centers
def _resize_masks(self, results):
"""Resize masks with ``results['scale']``"""
for key in results.get('mask_fields', []):
if results[key] is None:
continue
if self.keep_ratio:
results[key] = results[key].rescale(results['scale'])
else:
results[key] = results[key].resize(results['img_shape'][:2])
def _resize_seg(self, results):
"""Resize semantic segmentation map with ``results['scale']``."""
for key in results.get('seg_fields', []):
if self.keep_ratio:
gt_seg = mmcv.imrescale(
results[key],
results['scale'],
interpolation='nearest',
backend=self.backend)
else:
gt_seg = mmcv.imresize(
results[key],
results['scale'],
interpolation='nearest',
backend=self.backend)
results['gt_semantic_seg'] = gt_seg
def _resize_camera(self, results):
scale_factor = results['scale_factor']
w_scale = scale_factor[0]
h_scale = scale_factor[1]
scaling_matrix = np.array([
[w_scale, 0, 0],
[0, h_scale, 0],
[0, 0, 1]
])
for i in range(len(results['cam_intrinsic'])):
results['cam_intrinsic'][i] = scaling_matrix @ results['cam_intrinsic'][i]
def __call__(self, results):
"""Call function to resize images, bounding boxes, masks, semantic
segmentation map.
Args:
results (dict): Result dict from loading pipeline.
Returns:
dict: Resized results, 'img_shape', 'pad_shape', 'scale_factor', \
'keep_ratio' keys are added into result dict.
"""
if 'scale' not in results:
if 'scale_factor' in results:
img_shape = results['img'][0].shape[:2]
scale_factor = results['scale_factor']
assert isinstance(scale_factor, float)
results['scale'] = tuple(
[int(x * scale_factor) for x in img_shape][::-1])
else:
self._random_scale(results)
else:
if not self.override:
assert 'scale_factor' not in results, (
'scale and scale_factor cannot be both set.')
else:
results.pop('scale')
if 'scale_factor' in results:
results.pop('scale_factor')
self._random_scale(results)
self._resize_img(results)
self._resize_bboxes(results)
self._resize_masks(results)
self._resize_seg(results)
if 'gt_img_centers_view' in results:
self._resize_centers(results)
if 'cam_intrinsic' in results:
self._resize_camera(results)
return results
def __repr__(self):
repr_str = self.__class__.__name__
repr_str += f'(img_scale={self.img_scale}, '
repr_str += f'multiscale_mode={self.multiscale_mode}, '
repr_str += f'ratio_range={self.ratio_range}, '
repr_str += f'keep_ratio={self.keep_ratio}, '
repr_str += f'bbox_clip_border={self.bbox_clip_border})'
return repr_str
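# Illustrative sketch (not part of the original file): the three multiscale
# modes described in the class docstring map onto configs like these. The
# scales and ratios below are placeholders.
_example_resize_ratio_range = dict(
    type='MyResize', img_scale=(1600, 900), ratio_range=(0.8, 1.2),
    keep_ratio=True)
_example_resize_range = dict(
    type='MyResize', img_scale=[(1600, 900), (1920, 1080)],
    multiscale_mode='range', keep_ratio=True)
_example_resize_value = dict(
    type='MyResize', img_scale=[(1600, 900), (1280, 720)],
    multiscale_mode='value', keep_ratio=True)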
@PIPELINES.register_module()
class MyNormalize(object):
"""Normalize the image.
Added key is "img_norm_cfg".
Args:
mean (sequence): Mean values of 3 channels.
std (sequence): Std values of 3 channels.
to_rgb (bool): Whether to convert the image from BGR to RGB,
default is true.
"""
def __init__(self, mean, std, to_rgb=True):
self.mean = np.array(mean, dtype=np.float32)
self.std = np.array(std, dtype=np.float32)
self.to_rgb = to_rgb
def __call__(self, results):
"""Call function to normalize images.
Args:
results (dict): Result dict from loading pipeline.
Returns:
dict: Normalized results, 'img_norm_cfg' key is added into
result dict.
"""
for key in results.get('img_fields', ['img']):
for idx in range(len(results['img'])):
results[key][idx] = mmcv.imnormalize(results[key][idx], self.mean, self.std,
self.to_rgb)
results['img_norm_cfg'] = dict(
mean=self.mean, std=self.std, to_rgb=self.to_rgb)
return results
def __repr__(self):
repr_str = self.__class__.__name__
repr_str += f'(mean={self.mean}, std={self.std}, to_rgb={self.to_rgb})'
return repr_str
@PIPELINES.register_module()
class MyPad(object):
"""Pad the image & mask.
There are two padding modes: (1) pad to a fixed size and (2) pad to the
minimum size that is divisible by some number.
Added keys are "pad_shape", "pad_fixed_size", "pad_size_divisor",
Args:
size (tuple, optional): Fixed padding size.
size_divisor (int, optional): The divisor of padded size.
pad_val (float, optional): Padding value, 0 by default.
"""
def __init__(self, size=None, size_divisor=None, pad_val=0):
self.size = size
self.size_divisor = size_divisor
self.pad_val = pad_val
# only one of size and size_divisor should be valid
assert size is not None or size_divisor is not None
assert size is None or size_divisor is None
def _pad_img(self, results):
"""Pad images according to ``self.size``."""
for key in results.get('img_fields', ['img']):
if self.size is not None:
padded_img = mmcv.impad(
results[key], shape=self.size, pad_val=self.pad_val)
elif self.size_divisor is not None:
for idx in range(len(results[key])):
padded_img = mmcv.impad_to_multiple(
results[key][idx], self.size_divisor, pad_val=self.pad_val)
results[key][idx] = padded_img
results['pad_shape'] = padded_img.shape
results['pad_fixed_size'] = self.size
results['pad_size_divisor'] = self.size_divisor
def _pad_masks(self, results):
"""Pad masks according to ``results['pad_shape']``."""
pad_shape = results['pad_shape'][:2]
for key in results.get('mask_fields', []):
results[key] = results[key].pad(pad_shape, pad_val=self.pad_val)
def _pad_seg(self, results):
"""Pad semantic segmentation map according to
``results['pad_shape']``."""
for key in results.get('seg_fields', []):
results[key] = mmcv.impad(
results[key], shape=results['pad_shape'][:2])
def __call__(self, results):
"""Call function to pad images, masks, semantic segmentation maps.
Args:
results (dict): Result dict from loading pipeline.
Returns:
dict: Updated result dict.
"""
self._pad_img(results)
self._pad_masks(results)
self._pad_seg(results)
return results
def __repr__(self):
repr_str = self.__class__.__name__
repr_str += f'(size={self.size}, '
repr_str += f'size_divisor={self.size_divisor}, '
repr_str += f'pad_val={self.pad_val})'
return repr_str
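# Illustrative sketch (not part of the original file): MyNormalize and MyPad
# are usually chained after MyResize in the image branch. The mean/std below
# are the common ImageNet statistics used as placeholders; size_divisor=32
# keeps the padded shape compatible with typical FPN strides.
_example_img_branch_tail = [
    dict(type='MyNormalize',
         mean=[123.675, 116.28, 103.53],
         std=[58.395, 57.12, 57.375],
         to_rgb=True),
    dict(type='MyPad', size_divisor=32),
]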
@PIPELINES.register_module()
class LoadMultiViewImageFromFiles(object):
"""Load multi channel images from a list of separate channel files.
Expects results['img_filename'] to be a list of filenames.
Args:
to_float32 (bool): Whether to convert the img to float32.
Defaults to False.
img_scale (tuple[int], optional): Expected (padded) image shape; when
set, views whose height differs from ``img_scale[0]`` are zero-padded
at the bottom (used for Waymo). Defaults to None.
color_type (str): Color type of the file. Defaults to 'unchanged'.
"""
def __init__(self, to_float32=False, img_scale=None, color_type='unchanged'):
self.to_float32 = to_float32
self.img_scale = img_scale
self.color_type = color_type
def pad(self, img):
# pad the 5 input images to the same size (for Waymo, whose side cameras produce shorter images than the front cameras)
if img.shape[0] != self.img_scale[0]:
img = np.concatenate([img, np.zeros_like(img[0:1280-886,:])], axis=0)
return img
def __call__(self, results):
"""Call function to load multi-view image from files.
Args:
results (dict): Result dict containing multi-view image filenames.
Returns:
dict: The result dict containing the multi-view image data. \
Added keys and values are described below.
- filename (str): Multi-view image filenames.
- img (np.ndarray): Multi-view image arrays.
- img_shape (tuple[int]): Shape of multi-view image arrays.
- ori_shape (tuple[int]): Shape of original image arrays.
- pad_shape (tuple[int]): Shape of padded image arrays.
- scale_factor (float): Scale factor.
- img_norm_cfg (dict): Normalization configuration of images.
"""
filename = results['img_filename']
if self.img_scale is None:
img = np.stack(
[mmcv.imread(name, self.color_type) for name in filename], axis=-1)
else:
img = np.stack(
[self.pad(mmcv.imread(name, self.color_type)) for name in filename], axis=-1)
if self.to_float32:
img = img.astype(np.float32)
results['filename'] = filename
# unravel to list, see `DefaultFormatBundle` in formating.py
# which will transpose each image separately and then stack into array
results['img'] = [img[..., i] for i in range(img.shape[-1])]
results['img_shape'] = img.shape
results['ori_shape'] = img.shape
# Set initial values for default meta_keys
results['pad_shape'] = img.shape
# results['scale_factor'] = [1.0, 1.0]
num_channels = 1 if len(img.shape) < 3 else img.shape[2]
results['img_norm_cfg'] = dict(
mean=np.zeros(num_channels, dtype=np.float32),
std=np.ones(num_channels, dtype=np.float32),
to_rgb=False)
results['img_fields'] = ['img']
return results
def __repr__(self):
"""str: Return a string that describes the module."""
return "{} (to_float32={}, color_type='{}')".format(
self.__class__.__name__, self.to_float32, self.color_type)
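# Illustrative sketch (not part of the original file): a minimal multi-view
# image loading step. img_scale is only needed when the views have different
# heights (e.g. Waymo) and should then match the target (padded) image shape.
_example_multiview_loader_cfg = dict(
    type='LoadMultiViewImageFromFiles', to_float32=True)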
@PIPELINES.register_module()
class LoadPointsFromMultiSweeps(object):
"""Load points from multiple sweeps.
This is usually used for nuScenes dataset to utilize previous sweeps.
Args:
sweeps_num (int): Number of sweeps. Defaults to 10.
load_dim (int): Dimension number of the loaded points. Defaults to 5.
use_dim (list[int]): Which dimension to use. Defaults to [0, 1, 2, 4].
file_client_args (dict): Config dict of file clients, refer to
https://github.com/open-mmlab/mmcv/blob/master/mmcv/fileio/file_client.py
for more details. Defaults to dict(backend='disk').
pad_empty_sweeps (bool): Whether to repeat keyframe when
sweeps is empty. Defaults to False.
remove_close (bool): Whether to remove close points.
Defaults to False.
test_mode (bool): If test_mode=True, sweeps are not randomly sampled;
the nearest N frames are selected instead.
Defaults to False.
"""
def __init__(self,
sweeps_num=10,
load_dim=5,
use_dim=[0, 1, 2, 4],
file_client_args=dict(backend='disk'),
pad_empty_sweeps=False,
remove_close=False,
test_mode=False):
self.load_dim = load_dim
self.sweeps_num = sweeps_num
self.use_dim = use_dim
self.file_client_args = file_client_args.copy()
self.file_client = None
self.pad_empty_sweeps = pad_empty_sweeps
self.remove_close = remove_close
self.test_mode = test_mode
def _load_points(self, pts_filename):
"""Private function to load point clouds data.
Args:
pts_filename (str): Filename of point clouds data.
Returns:
np.ndarray: An array containing point clouds data.
"""
if self.file_client is None:
self.file_client = mmcv.FileClient(**self.file_client_args)
try:
pts_bytes = self.file_client.get(pts_filename)
points = np.frombuffer(pts_bytes, dtype=np.float32)
except ConnectionError:
mmcv.check_file_exist(pts_filename)
if pts_filename.endswith('.npy'):
points = np.load(pts_filename)
else:
points = np.fromfile(pts_filename, dtype=np.float32)
return points
def _remove_close(self, points, radius=1.0):
"""Removes point too close within a certain radius from origin.
Args:
points (np.ndarray): Sweep points.
radius (float): Radius below which points are removed.
Defaults to 1.0.
Returns:
np.ndarray: Points after removing.
"""
if isinstance(points, np.ndarray):
points_numpy = points
elif isinstance(points, BasePoints):
points_numpy = points.tensor.numpy()
else:
raise NotImplementedError
x_filt = np.abs(points_numpy[:, 0]) < radius
y_filt = np.abs(points_numpy[:, 1]) < radius
not_close = np.logical_not(np.logical_and(x_filt, y_filt))
return points[not_close]
def __call__(self, results):
"""Call function to load multi-sweep point clouds from files.
Args:
results (dict): Result dict containing multi-sweep point cloud \
filenames.
Returns:
dict: The result dict containing the multi-sweep points data. \
Added key and value are described below.
- points (np.ndarray): Multi-sweep point cloud arrays.
"""
points = results['points']
points.tensor[:, 4] = 0
sweep_points_list = [points]
ts = results['timestamp']
if self.pad_empty_sweeps and len(results['sweeps']) == 0:
for i in range(self.sweeps_num):
if self.remove_close:
sweep_points_list.append(self._remove_close(points))
else:
sweep_points_list.append(points)
else:
if len(results['sweeps']) <= self.sweeps_num:
choices = np.arange(len(results['sweeps']))
elif self.test_mode:
choices = np.arange(self.sweeps_num)
else:
choices = np.random.choice(
len(results['sweeps']), self.sweeps_num, replace=False)
for idx in choices:
sweep = results['sweeps'][idx]
points_sweep = self._load_points(sweep['data_path'])
points_sweep = np.copy(points_sweep).reshape(-1, self.load_dim)
if self.remove_close:
points_sweep = self._remove_close(points_sweep)
sweep_ts = sweep['timestamp'] / 1e6
points_sweep[:, :3] = points_sweep[:, :3] @ sweep[
'sensor2lidar_rotation'].T
points_sweep[:, :3] += sweep['sensor2lidar_translation']
points_sweep[:, 4] = ts - sweep_ts
points_sweep = points.new_point(points_sweep)
sweep_points_list.append(points_sweep)
points = points.cat(sweep_points_list)
points = points[:, self.use_dim]
results['points'] = points
return results
def __repr__(self):
"""str: Return a string that describes the module."""
return f'{self.__class__.__name__}(sweeps_num={self.sweeps_num})'
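# Illustrative config sketch (not part of the original file): typical nuScenes
# settings for aggregating previous sweeps; the exact numbers are placeholders.
_example_sweeps_loader_cfg = dict(
    type='LoadPointsFromMultiSweeps',
    sweeps_num=10,
    load_dim=5,
    use_dim=[0, 1, 2, 3, 4],
    pad_empty_sweeps=True,
    remove_close=True)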
@PIPELINES.register_module()
class PointSegClassMapping(object):
"""Map original semantic class to valid category ids.
Map valid classes as 0~len(valid_cat_ids)-1 and
others as len(valid_cat_ids).
Args:
valid_cat_ids (tuple[int]): A tuple of valid category.
"""
def __init__(self, valid_cat_ids):
self.valid_cat_ids = valid_cat_ids
def __call__(self, results):
"""Call function to map original semantic class to valid category ids.
Args:
results (dict): Result dict containing point semantic masks.
Returns:
dict: The result dict containing the mapped category ids. \
Updated key and value are described below.
- pts_semantic_mask (np.ndarray): Mapped semantic masks.
"""
assert 'pts_semantic_mask' in results
pts_semantic_mask = results['pts_semantic_mask']
neg_cls = len(self.valid_cat_ids)
for i in range(pts_semantic_mask.shape[0]):
if pts_semantic_mask[i] in self.valid_cat_ids:
converted_id = self.valid_cat_ids.index(pts_semantic_mask[i])
pts_semantic_mask[i] = converted_id
else:
pts_semantic_mask[i] = neg_cls
results['pts_semantic_mask'] = pts_semantic_mask
return results
def __repr__(self):
"""str: Return a string that describes the module."""
repr_str = self.__class__.__name__
repr_str += '(valid_cat_ids={})'.format(self.valid_cat_ids)
return repr_str
@PIPELINES.register_module()
class NormalizePointsColor(object):
"""Normalize color of points.
Args:
color_mean (list[float]): Mean color of the point cloud.
"""
def __init__(self, color_mean):
self.color_mean = color_mean
def __call__(self, results):
"""Call function to normalize color of points.
Args:
results (dict): Result dict containing point clouds data.
Returns:
dict: The result dict containing the normalized points. \
Updated key and value are described below.
- points (np.ndarray): Points after color normalization.
"""
points = results['points']
assert points.shape[1] >= 6, \
f'Expect points have channel >=6, got {points.shape[1]}'
points[:, 3:6] = points[:, 3:6] - np.array(self.color_mean) / 256.0
results['points'] = points
return results
def __repr__(self):
"""str: Return a string that describes the module."""
repr_str = self.__class__.__name__
repr_str += '(color_mean={})'.format(self.color_mean)
return repr_str
@PIPELINES.register_module()
class LoadPointsFromFile(object):
"""Load Points From File.
Load sunrgbd and scannet points from file.
Args:
load_dim (int): The dimension of the loaded points.
Defaults to 6.
coord_type (str): The type of coordinates of points cloud.
Available options includes:
- 'LIDAR': Points in LiDAR coordinates.
- 'DEPTH': Points in depth coordinates, usually for indoor dataset.
- 'CAMERA': Points in camera coordinates.
use_dim (list[int]): Which dimensions of the points to be used.
Defaults to [0, 1, 2]. For KITTI dataset, set use_dim=4
or use_dim=[0, 1, 2, 3] to use the intensity dimension.
shift_height (bool): Whether to use shifted height. Defaults to False.
file_client_args (dict): Config dict of file clients, refer to
https://github.com/open-mmlab/mmcv/blob/master/mmcv/fileio/file_client.py
for more details. Defaults to dict(backend='disk').
"""
def __init__(self,
coord_type,
load_dim=6,
use_dim=[0, 1, 2],
shift_height=False,
file_client_args=dict(backend='disk')):
self.shift_height = shift_height
if isinstance(use_dim, int):
use_dim = list(range(use_dim))
assert max(use_dim) < load_dim, \
f'Expect all used dimensions < {load_dim}, got {use_dim}'
assert coord_type in ['CAMERA', 'LIDAR', 'DEPTH']
self.coord_type = coord_type
self.load_dim = load_dim
self.use_dim = use_dim
self.file_client_args = file_client_args.copy()
self.file_client = None
def _load_points(self, pts_filename):
"""Private function to load point clouds data.
Args:
pts_filename (str): Filename of point clouds data.
Returns:
np.ndarray: An array containing point clouds data.
"""
if self.file_client is None:
self.file_client = mmcv.FileClient(**self.file_client_args)
try:
pts_bytes = self.file_client.get(pts_filename)
points = np.frombuffer(pts_bytes, dtype=np.float32)
except ConnectionError:
mmcv.check_file_exist(pts_filename)
if pts_filename.endswith('.npy'):
points = np.load(pts_filename)
else:
points = np.fromfile(pts_filename, dtype=np.float32)
return points
def __call__(self, results):
"""Call function to load points data from file.
Args:
results (dict): Result dict containing point clouds data.
Returns:
dict: The result dict containing the point clouds data. \
Added key and value are described below.
- points (np.ndarray): Point clouds data.
"""
pts_filename = results['pts_filename']
points = self._load_points(pts_filename)
points = points.reshape(-1, self.load_dim)
points = points[:, self.use_dim]
attribute_dims = None
if self.shift_height:
floor_height = np.percentile(points[:, 2], 0.99)
height = points[:, 2] - floor_height
points = np.concatenate([points, np.expand_dims(height, 1)], 1)
attribute_dims = dict(height=3)
points_class = get_points_type(self.coord_type)
points = points_class(
points, points_dim=points.shape[-1], attribute_dims=attribute_dims)
results['points'] = points
return results
def __repr__(self):
"""str: Return a string that describes the module."""
repr_str = self.__class__.__name__ + '('
repr_str += 'shift_height={}, '.format(self.shift_height)
repr_str += 'file_client_args={}), '.format(self.file_client_args)
repr_str += 'load_dim={}, '.format(self.load_dim)
repr_str += 'use_dim={})'.format(self.use_dim)
return repr_str
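# Illustrative sketch (not part of the original file): when `use_dim` is given
# as an int it is expanded to the leading dimensions, so the two configs below
# are equivalent (x, y, z, intensity from a 5-dim nuScenes-style point file).
_example_points_loader_a = dict(
    type='LoadPointsFromFile', coord_type='LIDAR', load_dim=5, use_dim=4)
_example_points_loader_b = dict(
    type='LoadPointsFromFile', coord_type='LIDAR', load_dim=5,
    use_dim=[0, 1, 2, 3])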
@PIPELINES.register_module()
class LoadAnnotations3D(LoadAnnotations):
"""Load Annotations3D.
Load instance mask and semantic mask of points and
encapsulate the items into related fields.
Args:
with_bbox_3d (bool, optional): Whether to load 3D boxes.
Defaults to True.
with_label_3d (bool, optional): Whether to load 3D labels.
Defaults to True.
with_mask_3d (bool, optional): Whether to load 3D instance masks
for points. Defaults to False.
with_seg_3d (bool, optional): Whether to load 3D semantic masks
for points. Defaults to False.
with_bbox (bool, optional): Whether to load 2D boxes.
Defaults to False.
with_label (bool, optional): Whether to load 2D labels.
Defaults to False.
with_mask (bool, optional): Whether to load 2D instance masks.
Defaults to False.
with_seg (bool, optional): Whether to load 2D semantic masks.
Defaults to False.
poly2mask (bool, optional): Whether to convert polygon annotations
to bitmasks. Defaults to True.
seg_3d_dtype (dtype, optional): Dtype of 3D semantic masks.
Defaults to 'int'.
file_client_args (dict): Config dict of file clients, refer to
https://github.com/open-mmlab/mmcv/blob/master/mmcv/fileio/file_client.py
for more details.
"""
def __init__(self,
with_bbox_3d=True,
with_label_3d=True,
with_mask_3d=False,
with_seg_3d=False,
with_bbox=False,
with_label=False,
with_mask=False,
with_seg=False,
poly2mask=True,
seg_3d_dtype='int',
file_client_args=dict(backend='disk')):
super().__init__(
with_bbox,
with_label,
with_mask,
with_seg,
poly2mask,
file_client_args=file_client_args)
self.with_bbox_3d = with_bbox_3d
self.with_label_3d = with_label_3d
self.with_mask_3d = with_mask_3d
self.with_seg_3d = with_seg_3d
self.seg_3d_dtype = seg_3d_dtype
def _load_bboxes_3d(self, results):
"""Private function to load 3D bounding box annotations.
Args:
results (dict): Result dict from :obj:`mmdet3d.CustomDataset`.
Returns:
dict: The dict containing loaded 3D bounding box annotations.
"""
results['gt_bboxes_3d'] = results['ann_info']['gt_bboxes_3d']
results['bbox3d_fields'].append('gt_bboxes_3d')
return results
def _load_labels_3d(self, results):
"""Private function to load label annotations.
Args:
results (dict): Result dict from :obj:`mmdet3d.CustomDataset`.
Returns:
dict: The dict containing loaded label annotations.
"""
results['gt_labels_3d'] = results['ann_info']['gt_labels_3d']
return results
def _load_masks_3d(self, results):
"""Private function to load 3D mask annotations.
Args:
results (dict): Result dict from :obj:`mmdet3d.CustomDataset`.
Returns:
dict: The dict containing loaded 3D mask annotations.
"""
pts_instance_mask_path = results['ann_info']['pts_instance_mask_path']
if self.file_client is None:
self.file_client = mmcv.FileClient(**self.file_client_args)
try:
mask_bytes = self.file_client.get(pts_instance_mask_path)
pts_instance_mask = np.frombuffer(mask_bytes, dtype=np.int)
except ConnectionError:
mmcv.check_file_exist(pts_instance_mask_path)
pts_instance_mask = np.fromfile(
pts_instance_mask_path, dtype=np.long)
results['pts_instance_mask'] = pts_instance_mask
results['pts_mask_fields'].append('pts_instance_mask')
return results
def _load_semantic_seg_3d(self, results):
"""Private function to load 3D semantic segmentation annotations.
Args:
results (dict): Result dict from :obj:`mmdet3d.CustomDataset`.
Returns:
dict: The dict containing the semantic segmentation annotations.
"""
pts_semantic_mask_path = results['ann_info']['pts_semantic_mask_path']
if self.file_client is None:
self.file_client = mmcv.FileClient(**self.file_client_args)
try:
mask_bytes = self.file_client.get(pts_semantic_mask_path)
# add .copy() to fix read-only bug
pts_semantic_mask = np.frombuffer(
mask_bytes, dtype=self.seg_3d_dtype).copy()
except ConnectionError:
mmcv.check_file_exist(pts_semantic_mask_path)
pts_semantic_mask = np.fromfile(
pts_semantic_mask_path, dtype=np.long)
results['pts_semantic_mask'] = pts_semantic_mask
results['pts_seg_fields'].append('pts_semantic_mask')
return results
def __call__(self, results):
"""Call function to load multiple types annotations.
Args:
results (dict): Result dict from :obj:`mmdet3d.CustomDataset`.
Returns:
dict: The dict containing loaded 3D bounding box, label, mask and
semantic segmentation annotations.
"""
results = super().__call__(results)
if self.with_bbox_3d:
results = self._load_bboxes_3d(results)
if results is None:
return None
if self.with_label_3d:
results = self._load_labels_3d(results)
if self.with_mask_3d:
results = self._load_masks_3d(results)
if self.with_seg_3d:
results = self._load_semantic_seg_3d(results)
return results
def __repr__(self):
"""str: Return a string that describes the module."""
indent_str = ' '
repr_str = self.__class__.__name__ + '(\n'
repr_str += f'{indent_str}with_bbox_3d={self.with_bbox_3d}, '
repr_str += f'{indent_str}with_label_3d={self.with_label_3d}, '
repr_str += f'{indent_str}with_mask_3d={self.with_mask_3d}, '
repr_str += f'{indent_str}with_seg_3d={self.with_seg_3d}, '
repr_str += f'{indent_str}with_bbox={self.with_bbox}, '
repr_str += f'{indent_str}with_label={self.with_label}, '
repr_str += f'{indent_str}with_mask={self.with_mask}, '
repr_str += f'{indent_str}with_seg={self.with_seg}, '
repr_str += f'{indent_str}poly2mask={self.poly2mask})'
return repr_str
@PIPELINES.register_module()
class MyLoadAnnotations3D(LoadAnnotations3D):
def __init__(self, with_bbox_3d=True, with_label_3d=True, with_mask_3d=False, with_seg_3d=False, with_bbox=False,
with_label=False, with_mask=False, with_seg=False, poly2mask=True, with_centers=False, with_cam_bbox=False,
with_visible=False, seg_3d_dtype='int', file_client_args=dict(backend='disk')):
super().__init__(
with_bbox_3d=with_bbox_3d,
with_label_3d=with_label_3d,
with_mask_3d=with_mask_3d,
with_seg_3d=with_seg_3d,
with_bbox=with_bbox,
with_label=with_label,
with_mask=with_mask,
with_seg=with_seg,
poly2mask=poly2mask,
seg_3d_dtype=seg_3d_dtype,
file_client_args=file_client_args)
self.with_centers = with_centers
self.with_cam_bbox = with_cam_bbox
self.with_visible = with_visible
def __call__(self, results):
"""Call function to load multiple types annotations.
Args:
results (dict): Result dict from :obj:`mmdet3d.CustomDataset`.
Returns:
dict: The dict containing loaded 3D bounding box, label, mask and
semantic segmentation annotations.
"""
results = super().__call__(results)
if self.with_centers:
results = self._load_centers_2d(results)
if self.with_cam_bbox:
results = self._load_cam_box(results)
if self.with_visible:
results = self._load_visible(results)
return results
def _load_centers_2d(self, results):
"""Private function to load label annotations.
Args:
results (dict): Result dict from :obj:`mmdet3d.CustomDataset`.
Returns:
dict: The dict containing loaded label annotations.
"""
results['gt_pts_centers_view'] = results['ann_info']['pts_centers_view']
results['gt_img_centers_view'] = results['ann_info']['img_centers_view']
return results
def _load_cam_box(self, results):
"""Private function to load label annotations.
Args:
results (dict): Result dict from :obj:`mmdet3d.CustomDataset`.
Returns:
dict: The dict containing loaded label annotations.
"""
results['gt_bboxes_cam_view'] = results['ann_info']['bboxes_cam_view']
results['gt_bboxes_lidar_view'] = results['ann_info']['bboxes_lidar_view']
return results
def _load_visible(self, results):
results['gt_visible_3d'] = results['ann_info']['gt_visible_3d']
return results
@PIPELINES.register_module()
class SparseDepth(object):
"""
Generate sparse depth maps from the point cloud; each depth map has the same spatial size as the corresponding image feature map
"""
def __init__(self, scale_factors, depth_mean=14.41, depth_var=156.89, exp_time=0):
self.scale_factors = scale_factors
self.depth_mean = depth_mean
self.depth_var = depth_var
self.exp_time = exp_time
def __call__(self, results):
all_points = results['points'].tensor
curr_mask = all_points[:, 4] == 0
points = all_points[curr_mask]
points = points[:, :3]
points_4d = torch.cat([points, torch.ones_like(points[:, :1])], dim=1)
lidar2cam_rs = results['lidar2cam_r']
lidar2cam_ts = results['lidar2cam_t']
cam_intrinsic = results['cam_intrinsic']
depth_features = []
for view_id in range(len(lidar2cam_rs)):
if 'valid_shape' in results:
h_shape = int(results['valid_shape'][view_id, 1])
w_shape = int(results['valid_shape'][view_id, 0])
else:
h_shape = results['pad_shape'][0]
w_shape = results['pad_shape'][1]
cam_ext = np.eye(4)
cam_int = np.eye(4)
cam_ext[:3, :3] = lidar2cam_rs[view_id]
cam_ext[:3, 3] = lidar2cam_ts[view_id]
cam_int[:3, :3] = cam_intrinsic[view_id]
cam_ext = torch.from_numpy(cam_ext).type_as(points_4d)
cam_int = torch.from_numpy(cam_int).type_as(points_4d)
points_4d_view = points_4d @ cam_ext.T
points_4d_view = points_4d_view @ cam_int.T
points_2d_view = points_4d_view[:, :2]
depth = points_4d_view[:, 2]
depth = torch.clamp(depth, min=1e-4)
points_2d_view[:, 0] = points_2d_view[:, 0] / depth
points_2d_view[:, 1] = points_2d_view[:, 1] / depth
valid_mask = (points_2d_view[:, 0] > 0) & (points_2d_view[:, 0] < w_shape-1) & \
(points_2d_view[:, 1] > 0) & (points_2d_view[:, 1] < h_shape-1)
points_2d_view = points_2d_view[valid_mask]
depth = depth[valid_mask]
sort_id = np.argsort(-depth)
points_2d_view = points_2d_view[sort_id]
depth = depth[sort_id]
depth_features_view = []
w_scale_shape = results['pad_shape'][1] // self.scale_factors[0]
h_scale_shape = results['pad_shape'][0] // self.scale_factors[0]
for scale in self.scale_factors:
w_scale_factor = 1.0 / scale
h_scale_factor = 1.0 / scale
scale_factor = torch.Tensor([[w_scale_factor, h_scale_factor]])
depth_feature = torch.zeros((2, h_scale_shape, w_scale_shape))
points_2d_view_scale = points_2d_view * scale_factor
cx = points_2d_view_scale[:, 0].long()
cy = points_2d_view_scale[:, 1].long()
depth_feature[0, cy, cx] = depth
depth_feature[1, cy, cx] = 1
if self.exp_time > 0:
zero_inds = depth_feature[1] == 0
depth_map = depth_feature[0]
depth_map[zero_inds] = 9999
for i in range(self.exp_time):
depth_feature_new = torch.zeros_like(depth_map) + 9999
depth_feature_new[1:] = torch.minimum(depth_feature_new[1:], depth_map[:-1])
depth_feature_new[:-1] = torch.minimum(depth_feature_new[:-1], depth_map[1:])
depth_feature_new[:, 1:] = torch.minimum(depth_feature_new[:, 1:], depth_map[:, :-1])
depth_feature_new[:, :-1] = torch.minimum(depth_feature_new[:, :-1], depth_map[:, 1:])
depth_map = torch.where(zero_inds, depth_feature_new, depth_map)
zero_inds = depth_map == 9999
depth_map[zero_inds] = 0
depth_feature[0] = depth_map
depth_feature[1, torch.logical_not(zero_inds)] = 1
depth_features_view.append(depth_feature)
depth_features_view = torch.stack(depth_features_view, dim=0) # [num_scale, 2, h_scale_shape, w_scale_shape]
depth_features.append(depth_features_view)
depth_features = torch.stack(depth_features, dim=0) # [num_view, num_scale, 2, h_scale_shape, w_scale_shape]
depth_features[:, :, 0] = (depth_features[:, :, 0] - self.depth_mean) / np.sqrt(self.depth_var)
depth_features[:, :, 0] = depth_features[:, :, 0] * depth_features[:, :, 1]
results['sparse_depth'] = depth_features
return results
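# Illustrative sketch (not part of the original file): the per-view projection
# above is a standard pinhole projection. With a hypothetical intrinsic matrix
# and a point already expressed in camera coordinates, the pixel location is
# (fx * x / z + cx, fy * y / z + cy); SparseDepth then scatters the depth z
# into the feature-map cell containing that pixel.
def _demo_pinhole_projection():
    cam_int = np.array([[1000.0, 0.0, 800.0],
                        [0.0, 1000.0, 450.0],
                        [0.0, 0.0, 1.0]])
    point_cam = np.array([2.0, -1.0, 10.0])  # (x, y, z) in camera coordinates
    uvz = cam_int @ point_cam                # homogeneous pixel coordinates
    u, v = uvz[0] / uvz[2], uvz[1] / uvz[2]  # -> (1000.0, 350.0)
    return u, v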
================================================
FILE: mmdet3d/datasets/pipelines/test_time_aug.py
================================================
import mmcv
import warnings
from copy import deepcopy
from mmdet.datasets.builder import PIPELINES
from mmdet.datasets.pipelines import Compose
@PIPELINES.register_module()
class MultiScaleFlipAug3D(object):
"""Test-time augmentation with multiple scales and flipping.
Args:
transforms (list[dict]): Transforms to apply in each augmentation.
img_scale (tuple | list[tuple]): Image scales for resizing.
pts_scale_ratio (float | list[float]): Points scale ratios for
resizing.
pts_rotation (float | list[float]): Rotation angle(s) applied to the
point cloud for test-time augmentation. Defaults to 0.
flip (bool): Whether to apply flip augmentation. Defaults to False.
flip_direction (str | list[str]): Flip augmentation directions
for images, options are "horizontal" and "vertical".
If flip_direction is a list, multiple flip augmentations will
be applied. It has no effect when ``flip == False``.
Defaults to "horizontal".
pcd_horizontal_flip (bool): Whether to apply horizontal flip augmentation
to the point cloud. Defaults to False. Note that it works only when
'flip' is turned on.
pcd_vertical_flip (bool): Whether to apply vertical flip augmentation
to the point cloud. Defaults to False. Note that it works only when
'flip' is turned on.
"""
def __init__(self,
transforms,
img_scale,
pts_scale_ratio,
pts_rotation=0,
flip=False,
flip_direction='horizontal',
pcd_horizontal_flip=False,
pcd_vertical_flip=False):
self.transforms = Compose(transforms)
self.img_scale = img_scale if isinstance(img_scale,
list) else [img_scale]
self.pts_scale_ratio = pts_scale_ratio \
if isinstance(pts_scale_ratio, list) else[float(pts_scale_ratio)]
self.pts_rotation = pts_rotation if isinstance(pts_rotation, list) else[float(pts_rotation)]
assert mmcv.is_list_of(self.img_scale, tuple)
assert mmcv.is_list_of(self.pts_scale_ratio, float)
assert mmcv.is_list_of(self.pts_rotation, float)
self.flip = flip
self.pcd_horizontal_flip = pcd_horizontal_flip
self.pcd_vertical_flip = pcd_vertical_flip
self.flip_direction = flip_direction if isinstance(
flip_direction, list) else [flip_direction]
assert mmcv.is_list_of(self.flip_direction, str)
if not self.flip and self.flip_direction != ['horizontal']:
warnings.warn(
'flip_direction has no effect when flip is set to False')
if (self.flip and not any([(t['type'] == 'RandomFlip3D'
or t['type'] == 'RandomFlip')
for t in transforms])):
warnings.warn(
'flip has no effect when RandomFlip is not in transforms')
def __call__(self, results):
"""Call function to augment common fields in results.
Args:
results (dict): Result dict contains the data to augment.
Returns:
dict: The result dict contains the data that is augmented with \
different scales and flips.
"""
aug_data = []
# modified from `flip_aug = [False, True] if self.flip else [False]`
# to reduce unnecessary scenes when using double flip augmentation
# during test time
flip_aug = [True] if self.flip else [False]
pcd_horizontal_flip_aug = [False, True] \
if self.flip and self.pcd_horizontal_flip else [False]
pcd_vertical_flip_aug = [False, True] \
if self.flip and self.pcd_vertical_flip else [False]
for scale in self.img_scale:
for pts_scale_ratio in self.pts_scale_ratio:
for pts_rotation in self.pts_rotation:
for flip in flip_aug:
for pcd_horizontal_flip in pcd_horizontal_flip_aug:
for pcd_vertical_flip in pcd_vertical_flip_aug:
for direction in self.flip_direction:
# results.copy will cause bug
# since it is shallow copy
_results = deepcopy(results)
_results['scale'] = scale
_results['flip'] = flip
_results['pcd_scale_factor'] = \
pts_scale_ratio
_results['flip_direction'] = direction
_results['pcd_horizontal_flip'] = \
pcd_horizontal_flip
_results['pcd_vertical_flip'] = \
pcd_vertical_flip
_results['pcd_rotation_angle'] = pts_rotation
data = self.transforms(_results)
aug_data.append(data)
# list of dict to dict of list
aug_data_dict = {key: [] for key in aug_data[0]}
for data in aug_data:
for key, val in data.items():
aug_data_dict[key].append(val)
return aug_data_dict
def __repr__(self):
"""str: Return a string that describes the module."""
repr_str = self.__class__.__name__
repr_str += f'(transforms={self.transforms}, '
repr_str += f'img_scale={self.img_scale}, flip={self.flip}, '
repr_str += f'pts_scale_ratio={self.pts_scale_ratio}, '
repr_str += f'flip_direction={self.flip_direction})'
return repr_str
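# Editor's note (illustrative, not part of the original file): with the nested
# loops in ``__call__`` above, the number of augmented copies produced per
# sample is
#     len(img_scale) * len(pts_scale_ratio) * len(pts_rotation)
#     * len(flip_aug) * len(pcd_horizontal_flip_aug)
#     * len(pcd_vertical_flip_aug) * len(flip_direction).
# For example, double-flip TTA (flip=True with both pcd flips enabled) using a
# single image scale, scale ratio and rotation gives
# 1 * 1 * 1 * 1 * 2 * 2 * 1 = 4 augmented copies per sample.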
================================================
FILE: mmdet3d/datasets/pipelines/transforms_2d.py
================================================
import copy
import inspect
import math
import warnings
import cv2
import mmcv
import numpy as np
from numpy import random
from mmdet.datasets.builder import PIPELINES
@PIPELINES.register_module()
class OurRandomAffine:
"""Random affine transform data augmentation.
This operation randomly generates affine transform matrix which including
rotation, translation, shear and scaling transforms.
Args:
max_rotate_degree (float): Maximum degrees of rotation transform.
Default: 10.
max_translate_ratio (float): Maximum ratio of translation.
Default: 0.1.
scaling_ratio_range (tuple[float]): Min and max ratio of
scaling transform. Default: (0.5, 1.5).
max_shear_degree (float): Maximum degrees of shear
transform. Default: 2.
border (tuple[int]): Distance from height and width sides of input
image to adjust output shape. Only used in mosaic dataset.
Default: (0, 0).
border_val (tuple[int]): Border padding values of 3 channels.
Default: (114, 114, 114).
min_bbox_size (float): Width and height threshold to filter bboxes.
If the height or width of a box is smaller than this value, it
will be removed. Default: 2.
min_area_ratio (float): Threshold of area ratio between
original bboxes and wrapped bboxes. If smaller than this value,
the box will be removed. Default: 0.2.
max_aspect_ratio (float): Aspect ratio of width and height
threshold to filter bboxes. If max(h/w, w/h) larger than this
value, the box will be removed.
bbox_clip_border (bool, optional): Whether to clip the objects outside
the border of the image. In some dataset like MOT17, the gt bboxes
are allowed to cross the border of images. Therefore, we don't
need to clip the gt bboxes in these cases. Defaults to True.
skip_filter (bool): Whether to skip filtering rules. If it
is True, the filter rule will not be applied, and the
`min_bbox_size` and `min_area_ratio` and `max_aspect_ratio`
is invalid. Default to True.
"""
def __init__(self,
# max_translate_ratio=0.1,
scaling_ratio_range=(0.5, 1.5),
flip_ratio=0.5,
border=(0, 0),
border_val=(103.53, 116.28, 123.675),
bbox_clip_border=True,
flip_sync_3d=False,
scaling_sync_view=False,
trans_when_scaling=True,
):
# assert 0 <= max_translate_ratio <= 1
assert scaling_ratio_range[0] <= scaling_ratio_range[1]
assert scaling_ratio_range[0] > 0
# self.max_translate_ratio = max_translate_ratio
self.scaling_ratio_range = scaling_ratio_range
self.flip_ratio = flip_ratio
self.border = border
self.border_val = border_val
self.bbox_clip_border = bbox_clip_border
self.flip_sync = flip_sync_3d
self.scaling_sync_view = scaling_sync_view
self.trans_when_scaling = trans_when_scaling
def _transform_bbox(self, results, warp_mats, flips, width, height):
valid_mask = np.ones(results['gt_labels'].shape[0]) > 0
if 'gt_bboxes_cam_view' in results:
bboxes_cam = results['gt_bboxes_cam_view']
else:
bboxes_cam = None
for view_id in range(len(warp_mats)):
warp_matrix = warp_mats[view_id]
bbox_mask = results['gt_labels'][:, 1] == view_id
if np.sum(bbox_mask) == 0:
continue
flip = flips[view_id]
flip_matrix = self._get_flip_matrix(flip, width)
if bboxes_cam is not None:
if flip:
bboxes_cam.tensor[bbox_mask, 0::7] = -bboxes_cam.tensor[bbox_mask, 0::7]
bboxes_cam.tensor[bbox_mask, 6] = -bboxes_cam.tensor[bbox_mask, 6] + np.pi
bbox_view = results['gt_bboxes'][bbox_mask]
centers_view = results['gt_img_centers_view'][bbox_mask, :2]
num_bboxes = bbox_view.shape[0]
xtl = bbox_view[:, 0] - bbox_view[:, 2] / 2
ytl = bbox_view[:, 1] - bbox_view[:, 3] / 2
xtr = bbox_view[:, 0] + bbox_view[:, 2] / 2
ytr = bbox_view[:, 1] - bbox_view[:, 3] / 2
xbl = bbox_view[:, 0] - bbox_view[:, 2] / 2
ybl = bbox_view[:, 1] + bbox_view[:, 3] / 2
xbr = bbox_view[:, 0] + bbox_view[:, 2] / 2
ybr = bbox_view[:, 1] + bbox_view[:, 3] / 2
xs = np.vstack([xtl, xtr, xbl, xbr]).T # [N, 4]
ys = np.vstack([ytl, ytr, ybl, ybr]).T # [N, 4]
xs = xs.reshape(-1) # [N*4,]
ys = ys.reshape(-1) # [N*4,]
ones = np.ones_like(ys)
points = np.vstack([xs, ys, ones]) # [3, N*4]
warp_points = warp_matrix @ flip_matrix @ points # [3, N*4]
warp_points = warp_points[:2] / warp_points[2]
xs = warp_points[0].reshape(num_bboxes, 4) # [N, 4]
ys = warp_points[1].reshape(num_bboxes, 4) # [N, 4]
xs_min = xs.min(1) # [N, ]
ys_min = ys.min(1) # [N, ]
xs_max = xs.max(1) # [N, ]
ys_max = ys.max(1) # [N, ]
if self.bbox_clip_border:
xs_min = xs_min.clip(0, width)
xs_max = xs_max.clip(0, width)
ys_min = ys_min.clip(0, height)
ys_max = ys_max.clip(0, height)
cxs = (xs_min + xs_max) / 2
cys = (ys_min + ys_max) / 2
ws = xs_max - xs_min
hs = ys_max - ys_min
warp_bboxes = np.vstack((cxs, cys, ws, hs)).T # [N, 4]
ones = np.ones_like(centers_view[:, :1]) # [N, 1]
center_points = np.concatenate([centers_view, ones], axis=1).T # [3, N]
warp_points = warp_matrix @ flip_matrix @ center_points # [3, N]
warp_points = warp_points[:2] / warp_points[2]
new_center_points = warp_points.T # [N, 2]
valid_mask_view = (new_center_points[:, 0] > 0) & (new_center_points[:, 0] < width-1) & (new_center_points[:, 1] > 0) & (new_center_points[:, 1] < height-1)
valid_mask[bbox_mask] = valid_mask_view
results['gt_bboxes'][bbox_mask] = warp_bboxes
results['gt_img_centers_view'][bbox_mask, :2] = new_center_points
if 'gt_bboxes_cam_view' in results:
results['gt_bboxes_cam_view'] = bboxes_cam[valid_mask]
results['gt_bboxes_lidar_view'] = results['gt_bboxes_lidar_view'][valid_mask]
results['gt_bboxes'] = results['gt_bboxes'][valid_mask]
results['gt_img_centers_view'] = results['gt_img_centers_view'][valid_mask]
results['gt_pts_centers_view'] = results['gt_pts_centers_view'][valid_mask]
results['gt_labels'] = results['gt_labels'][valid_mask]
return results
def _transform_camera(self, results, warp_mats, flips, width):
for id in range(len(warp_mats)):
flip = flips[id]
flip_matrix = self._get_flip_matrix(flip, width)
intrinsic = results['cam_intrinsic'][id]
warp_matrix = warp_mats[id] @ flip_matrix
# intrinsic = warp_matrix @ intrinsic
# results['cam_intrinsic'][id] = intrinsic
viewpad = np.eye(4)
viewpad[:intrinsic.shape[0], :intrinsic.shape[1]] = warp_matrix
results['lidar2img'][id] = viewpad @ results['lidar2img'][id]
if flip:
flip_matrix = np.eye(3)
flip_matrix[0, 0] = -1
results['lidar2cam_r'][id] = flip_matrix @ results['lidar2cam_r'][id]
results['lidar2cam_t'][id] = flip_matrix @ results['lidar2cam_t'][id]
results['cam_intrinsic'][id][0, 2] = width - results['cam_intrinsic'][id][0, 2]
intrinsic = warp_mats[id] @ intrinsic
results['cam_intrinsic'][id] = intrinsic
return results
def __call__(self, results):
translate_mats = []
scale_mats = []
warp_mats = []
flips = []
scaling_ratios = []
valid_shapes = []
results['image_flip'] = []
flip_3d = False
if 'pcd_horizontal_flip' in results and results['pcd_horizontal_flip'] == True:
flip_3d = not flip_3d
if 'pcd_vertical_flip' in results and results['pcd_vertical_flip'] == True:
flip_3d = not flip_3d
if self.scaling_sync_view:
scaling_ratio = random.uniform(self.scaling_ratio_range[0], self.scaling_ratio_range[1])
for view_id in range(len(results['img'])):
img = results['img'][view_id]
height = img.shape[0] + self.border[0] * 2
width = img.shape[1] + self.border[1] * 2
if self.flip_sync:
flip = flip_3d
else:
flip = True if np.random.random() < self.flip_ratio else False
flips.append(flip)
if flip:
results['image_flip'].append(True)
img = cv2.flip(img, 1)
else:
results['image_flip'].append(False)
# Scaling
if not self.scaling_sync_view:
scaling_ratio = random.uniform(self.scaling_ratio_range[0], self.scaling_ratio_range[1])
scaling_matrix = self._get_scaling_matrix(scaling_ratio)
scaling_ratios.append(scaling_ratio)
reduction_ratio = min(1.0, scaling_ratio)
valid_shapes.append([reduction_ratio*width, reduction_ratio*height])
# Translation
if self.trans_when_scaling:
if scaling_ratio <= 1:
trans_x = 0
trans_y = 0
else:
trans_x = random.uniform((1 - scaling_ratio) * width, 0)
trans_y = random.uniform((1 - scaling_ratio) * height, 0)
else:
trans_x = 0
trans_y = 0
# trans_x = random.uniform(-self.max_translate_ratio, self.max_translate_ratio) * width
# trans_y = random.uniform(-self.max_translate_ratio, self.max_translate_ratio) * height
translate_matrix = self._get_translation_matrix(trans_x, trans_y)
warp_matrix = translate_matrix @ scaling_matrix
img = cv2.warpPerspective(
img,
warp_matrix,
dsize=(width, height),
borderValue=self.border_val
)
results['img'][view_id] = img
translate_mats.append(translate_matrix)
scale_mats.append(scaling_matrix)
warp_mats.append(warp_matrix)
# results['img_shape'] = img.shape
results['valid_shape'] = np.array(valid_shapes)
results['img_scale_ratios'] = np.array(scaling_ratios)
if 'gt_bboxes' in results:
results = self._transform_bbox(results, warp_mats, flips, width, height)
results = self._transform_camera(results, warp_mats, flips, width)
return results
def __repr__(self):
repr_str = self.__class__.__name__
# repr_str += f'max_translate_ratio={self.max_translate_ratio}, '
repr_str += f'scaling_ratio={self.scaling_ratio_range}, '
repr_str += f'flip_ratio={self.flip_ratio}, '
repr_str += f'border={self.border}, '
repr_str += f'border_val={self.border_val}, '
return repr_str
@staticmethod
def _get_scaling_matrix(scale_ratio):
scaling_matrix = np.array(
[[scale_ratio, 0., 0.], [0., scale_ratio, 0.], [0., 0., 1.]],
dtype=np.float32)
return scaling_matrix
@staticmethod
def _get_translation_matrix(x, y):
translation_matrix = np.array([[1, 0., x], [0., 1, y], [0., 0., 1.]],
dtype=np.float32)
return translation_matrix
@staticmethod
def _get_flip_matrix(flip, width):
if flip:
flip_matrix = np.array([
[-1, 0, width],
[0, 1, 0],
[0, 0, 1]
])
else:
flip_matrix = np.eye(3)
return flip_matrix
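# Editor's illustrative sketch (not part of the original SparseFusion code):
# a minimal numpy-only helper that mirrors how ``_transform_bbox`` above maps
# a pixel through the composed flip -> scale -> translate warp in homogeneous
# coordinates. The function name and arguments are hypothetical.
def _example_warp_pixel(x, y, scale_ratio, trans_x, trans_y, flip=False,
                        width=1600):
    scaling = np.array([[scale_ratio, 0., 0.],
                        [0., scale_ratio, 0.],
                        [0., 0., 1.]], dtype=np.float32)
    translation = np.array([[1., 0., trans_x],
                            [0., 1., trans_y],
                            [0., 0., 1.]], dtype=np.float32)
    flip_matrix = np.array([[-1., 0., width],
                            [0., 1., 0.],
                            [0., 0., 1.]]) if flip else np.eye(3)
    warped = translation @ scaling @ flip_matrix @ np.array([x, y, 1.])
    return warped[:2] / warped[2]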
@PIPELINES.register_module()
class PhotoMetricDistortionMultiViewImage:
"""Apply photometric distortion to image sequentially, every transformation
is applied with a probability of 0.5. The position of random contrast is in
second or second to last.
1. random brightness
2. random contrast (mode 0)
3. convert color from BGR to HSV
4. random saturation
5. random hue
6. convert color from HSV to BGR
7. random contrast (mode 1)
8. randomly swap channels
Args:
brightness_delta (int): delta of brightness.
contrast_range (tuple): range of contrast.
saturation_range (tuple): range of saturation.
        hue_delta (int): delta of hue.
        swap_channel (bool): Whether to randomly swap image channels.
            Default: True.
"""
def __init__(self,
brightness_delta=32,
contrast_range=(0.5, 1.5),
saturation_range=(0.5, 1.5),
hue_delta=18,
swap_channel=True):
self.brightness_delta = brightness_delta
self.contrast_lower, self.contrast_upper = contrast_range
self.saturation_lower, self.saturation_upper = saturation_range
self.hue_delta = hue_delta
self.swap_channel = swap_channel
def __call__(self, results):
"""Call function to perform photometric distortion on images.
Args:
results (dict): Result dict from loading pipeline.
Returns:
dict: Result dict with images distorted.
"""
imgs = results['img']
new_imgs = []
for img_ in imgs:
img = img_.astype(np.float32)
assert img.dtype == np.float32, \
'PhotoMetricDistortion needs the input image of dtype np.float32,'\
' please set "to_float32=True" in "LoadImageFromFile" pipeline'
# random brightness
if random.randint(2):
delta = random.uniform(-self.brightness_delta,
self.brightness_delta)
img += delta
img = np.clip(img, a_max=255, a_min=0)
# mode == 0 --> do random contrast first
# mode == 1 --> do random contrast last
mode = random.randint(2)
if mode == 1:
if random.randint(2):
alpha = random.uniform(self.contrast_lower,
self.contrast_upper)
img *= alpha
img = np.clip(img, a_max=255, a_min=0)
# convert color from BGR to HSV
img = mmcv.bgr2hsv(img)
# random saturation
if random.randint(2):
img[..., 1] *= random.uniform(self.saturation_lower,
self.saturation_upper)
img[..., 1] = np.clip(img[..., 1], a_max=1, a_min=0)
# random hue
if random.randint(2):
img[..., 0] += random.uniform(-self.hue_delta, self.hue_delta)
img[..., 0][img[..., 0] > 360] -= 360
img[..., 0][img[..., 0] < 0] += 360
# convert color from HSV to BGR
img = mmcv.hsv2bgr(img)
# random contrast
if mode == 0:
if random.randint(2):
# import pdb
# pdb.set_trace()
alpha = random.uniform(self.contrast_lower, self.contrast_upper)
img *= alpha
# import pdb
# pdb.set_trace()
img = np.clip(img, a_max=255, a_min=0)
# randomly swap channels
if self.swap_channel:
if random.randint(2):
img = img[..., random.permutation(3)]
new_imgs.append(img.astype(np.uint8))
results['img'] = new_imgs
return results
def __repr__(self):
repr_str = self.__class__.__name__
repr_str += f'(\nbrightness_delta={self.brightness_delta},\n'
repr_str += 'contrast_range='
repr_str += f'{(self.contrast_lower, self.contrast_upper)},\n'
repr_str += 'saturation_range='
repr_str += f'{(self.saturation_lower, self.saturation_upper)},\n'
repr_str += f'hue_delta={self.hue_delta})'
return repr_str
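# Editor's illustrative sketch (not part of the original file): how the
# transform above might be configured inside a data pipeline; the values
# simply restate the constructor defaults.
_EXAMPLE_PHOTOMETRIC_CFG = dict(
    type='PhotoMetricDistortionMultiViewImage',
    brightness_delta=32,
    contrast_range=(0.5, 1.5),
    saturation_range=(0.5, 1.5),
    hue_delta=18,
    swap_channel=True)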
================================================
FILE: mmdet3d/datasets/pipelines/transforms_3d.py
================================================
import numpy as np
from mmcv import is_tuple_of
from mmcv.utils import build_from_cfg
from mmdet3d.core import VoxelGenerator
from mmdet3d.core.bbox import box_np_ops
from mmdet.datasets.builder import PIPELINES
from mmdet.datasets.pipelines import RandomFlip
from ..registry import OBJECTSAMPLERS
from .data_augment_utils import noise_per_object_v3_
@PIPELINES.register_module()
class RandomFlip3D(RandomFlip):
"""Flip the points & bbox.
If the input dict contains the key "flip", then the flag will be used,
otherwise it will be randomly decided by a ratio specified in the init
method.
Args:
sync_2d (bool, optional): Whether to apply flip according to the 2D
images. If True, it will apply the same flip as that to 2D images.
If False, it will decide whether to flip randomly and independently
to that of 2D images. Defaults to True.
flip_ratio_bev_horizontal (float, optional): The flipping probability
in horizontal direction. Defaults to 0.0.
flip_ratio_bev_vertical (float, optional): The flipping probability
in vertical direction. Defaults to 0.0.
"""
def __init__(self,
sync_2d=True,
flip_ratio_bev_horizontal=0.0,
flip_ratio_bev_vertical=0.0,
**kwargs):
super(RandomFlip3D, self).__init__(
flip_ratio=flip_ratio_bev_horizontal, **kwargs)
self.sync_2d = sync_2d
self.flip_ratio_bev_vertical = flip_ratio_bev_vertical
if flip_ratio_bev_horizontal is not None:
assert isinstance(
flip_ratio_bev_horizontal,
(int, float)) and 0 <= flip_ratio_bev_horizontal <= 1
if flip_ratio_bev_vertical is not None:
assert isinstance(
flip_ratio_bev_vertical,
(int, float)) and 0 <= flip_ratio_bev_vertical <= 1
def random_flip_data_3d(self, input_dict, direction='horizontal'):
"""Flip 3D data randomly.
Args:
input_dict (dict): Result dict from loading pipeline.
direction (str): Flip direction. Default: horizontal.
Returns:
dict: Flipped results, 'points', 'bbox3d_fields' keys are \
updated in the result dict.
"""
assert direction in ['horizontal', 'vertical']
if len(input_dict['bbox3d_fields']) == 0: # test mode
input_dict['bbox3d_fields'].append('empty_box3d')
input_dict['empty_box3d'] = input_dict['box_type_3d'](
np.array([], dtype=np.float32))
assert len(input_dict['bbox3d_fields']) == 1
for key in input_dict['bbox3d_fields']:
input_dict['points'] = input_dict[key].flip(
direction, points=input_dict['points'])
def __call__(self, input_dict):
"""Call function to flip points, values in the ``bbox3d_fields`` and \
also flip 2D image and its annotations.
Args:
input_dict (dict): Result dict from loading pipeline.
Returns:
dict: Flipped results, 'flip', 'flip_direction', \
'pcd_horizontal_flip' and 'pcd_vertical_flip' keys are added \
into result dict.
"""
        # flip 2D image and its annotations
super(RandomFlip3D, self).__call__(input_dict)
if self.sync_2d:
input_dict['pcd_horizontal_flip'] = input_dict['flip']
input_dict['pcd_vertical_flip'] = False
else:
if 'pcd_horizontal_flip' not in input_dict:
flip_horizontal = True if np.random.rand(
) < self.flip_ratio else False
input_dict['pcd_horizontal_flip'] = flip_horizontal
if 'pcd_vertical_flip' not in input_dict:
flip_vertical = True if np.random.rand(
) < self.flip_ratio_bev_vertical else False
input_dict['pcd_vertical_flip'] = flip_vertical
if 'transformation_3d_flow' not in input_dict:
input_dict['transformation_3d_flow'] = []
if input_dict['pcd_horizontal_flip']:
self.random_flip_data_3d(input_dict, 'horizontal')
input_dict['transformation_3d_flow'].extend(['HF'])
if input_dict['pcd_vertical_flip']:
self.random_flip_data_3d(input_dict, 'vertical')
input_dict['transformation_3d_flow'].extend(['VF'])
return input_dict
def __repr__(self):
"""str: Return a string that describes the module."""
repr_str = self.__class__.__name__
repr_str += '(sync_2d={},'.format(self.sync_2d)
repr_str += 'flip_ratio_bev_vertical={})'.format(
self.flip_ratio_bev_vertical)
return repr_str
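# Editor's illustrative sketch (not part of the original file): a typical
# configuration of the transform above; the flip probabilities are
# placeholders, not values taken from this repository's configs.
_EXAMPLE_RANDOM_FLIP_3D_CFG = dict(
    type='RandomFlip3D',
    sync_2d=True,
    flip_ratio_bev_horizontal=0.5,
    flip_ratio_bev_vertical=0.5)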
@PIPELINES.register_module()
class OurRandomFlip3D(object):
"""Flip the points & bbox.
If the input dict contains the key "flip", then the flag will be used,
otherwise it will be randomly decided by a ratio specified in the init
method.
Args:
sync_2d (bool, optional): Whether to apply flip according to the 2D
images. If True, it will apply the same flip as that to 2D images.
If False, it will decide whether to flip randomly and independently
to that of 2D images. Defaults to True.
flip_ratio_bev_horizontal (float, optional): The flipping probability
in horizontal direction. Defaults to 0.0.
flip_ratio_bev_vertical (float, optional): The flipping probability
in vertical direction. Defaults to 0.0.
"""
def __init__(self,
sync_2d=True,
flip_ratio_bev_horizontal=0.0,
flip_ratio_bev_vertical=0.0,
**kwargs):
# super(OurRandomFlip3D, self).__init__(
# flip_ratio=flip_ratio_bev_horizontal, **kwargs)
self.sync_2d = sync_2d
self.flip_ratio = flip_ratio_bev_horizontal
self.flip_ratio_bev_vertical = flip_ratio_bev_vertical
if flip_ratio_bev_horizontal is not None:
assert isinstance(
flip_ratio_bev_horizontal,
(int, float)) and 0 <= flip_ratio_bev_horizontal <= 1
if flip_ratio_bev_vertical is not None:
assert isinstance(
flip_ratio_bev_vertical,
(int, float)) and 0 <= flip_ratio_bev_vertical <= 1
def random_flip_data_3d(self, input_dict, direction='horizontal'):
"""Flip 3D data randomly.
Args:
input_dict (dict): Result dict from loading pipeline.
direction (str): Flip direction. Default: horizontal.
Returns:
dict: Flipped results, 'points', 'bbox3d_fields' keys are \
updated in the result dict.
"""
assert direction in ['horizontal', 'vertical']
if len(input_dict['bbox3d_fields']) == 0: # test mode
input_dict['bbox3d_fields'].append('empty_box3d')
input_dict['empty_box3d'] = input_dict['box_type_3d'](
np.array([], dtype=np.float32))
assert len(input_dict['bbox3d_fields']) == 1
for key in input_dict['bbox3d_fields']:
input_dict['points'] = input_dict[key].flip(
direction, points=input_dict['points'])
if direction == 'horizontal':
diag = np.ones(3)
diag[1] = -1
elif direction == 'vertical':
diag = np.ones(3)
diag[0] = -1
matrix = np.diag(diag)
for id in range(len(input_dict['lidar2cam_r'])):
input_dict['lidar2cam_r'][id] = input_dict['lidar2cam_r'][id] @ matrix
if 'gt_pts_centers_view' in input_dict and input_dict['gt_pts_centers_view'].shape[0] > 0:
input_dict['gt_pts_centers_view'] = input_dict['gt_pts_centers_view'] @ matrix
if 'gt_bboxes_lidar_view' in input_dict:
input_dict['gt_bboxes_lidar_view'].flip(direction)
def __call__(self, input_dict):
"""Call function to flip points, values in the ``bbox3d_fields`` and \
also flip 2D image and its annotations.
Args:
input_dict (dict): Result dict from loading pipeline.
Returns:
dict: Flipped results, 'flip', 'flip_direction', \
'pcd_horizontal_flip' and 'pcd_vertical_flip' keys are added \
into result dict.
"""
        # flip 2D image and its annotations
# super(OurRandomFlip3D, self).__call__(input_dict)
if self.sync_2d:
input_dict['pcd_horizontal_flip'] = input_dict['flip']
input_dict['pcd_vertical_flip'] = False
else:
if 'pcd_horizontal_flip' not in input_dict:
flip_horizontal = True if np.random.rand(
) < self.flip_ratio else False
input_dict['pcd_horizontal_flip'] = flip_horizontal
if 'pcd_vertical_flip' not in input_dict:
flip_vertical = True if np.random.rand(
) < self.flip_ratio_bev_vertical else False
input_dict['pcd_vertical_flip'] = flip_vertical
if 'transformation_3d_flow' not in input_dict:
input_dict['transformation_3d_flow'] = []
if input_dict['pcd_horizontal_flip']:
self.random_flip_data_3d(input_dict, 'horizontal')
input_dict['transformation_3d_flow'].extend(['HF'])
if input_dict['pcd_vertical_flip']:
self.random_flip_data_3d(input_dict, 'vertical')
input_dict['transformation_3d_flow'].extend(['VF'])
return input_dict
def __repr__(self):
"""str: Return a string that describes the module."""
repr_str = self.__class__.__name__
repr_str += '(sync_2d={},'.format(self.sync_2d)
repr_str += 'flip_ratio_bev_vertical={})'.format(
self.flip_ratio_bev_vertical)
return repr_str
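# Editor's illustrative sketch (not part of the original code): why the
# ``lidar2cam_r @ matrix`` update in ``random_flip_data_3d`` above keeps the
# camera projection consistent. Flipping a LiDAR point p to p' = M @ p
# (M = diag(1, -1, 1) for a horizontal flip) while replacing R with R @ M
# gives (R @ M) @ p' = R @ M @ M @ p = R @ p, since M @ M is the identity.
# The helper name is hypothetical.
def _example_check_flip_extrinsics(lidar2cam_r, point):
    flip_mat = np.diag([1., -1., 1.])
    flipped_point = flip_mat @ point
    new_r = lidar2cam_r @ flip_mat
    return np.allclose(new_r @ flipped_point, lidar2cam_r @ point)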
@PIPELINES.register_module()
class ObjectSample(object):
"""Sample GT objects to the data.
Args:
db_sampler (dict): Config dict of the database sampler.
        sample_2d (bool): Whether to also paste 2D image patches onto the
            images. This should be True when applying multi-modality
            cut-and-paste. Defaults to False.
"""
def __init__(self, db_sampler, sample_2d=False):
self.sampler_cfg = db_sampler
self.sample_2d = sample_2d
if 'type' not in db_sampler.keys():
db_sampler['type'] = 'DataBaseSampler'
self.db_sampler = build_from_cfg(db_sampler, OBJECTSAMPLERS)
@staticmethod
def remove_points_in_boxes(points, boxes):
"""Remove the points in the sampled bounding boxes.
Args:
points (np.ndarray): Input point cloud array.
boxes (np.ndarray): Sampled ground truth boxes.
Returns:
np.ndarray: Points with those in the boxes removed.
"""
masks = box_np_ops.points_in_rbbox(points.coord.numpy(), boxes)
points = points[np.logical_not(masks.any(-1))]
return points
def __call__(self, input_dict):
"""Call function to sample ground truth objects to the data.
Args:
input_dict (dict): Result dict from loading pipeline.
Returns:
dict: Results after object sampling augmentation, \
'points', 'gt_bboxes_3d', 'gt_labels_3d' keys are updated \
in the result dict.
"""
gt_bboxes_3d = input_dict['gt_bboxes_3d']
gt_labels_3d = input_dict['gt_labels_3d']
# change to float for blending operation
points = input_dict['points']
if self.sample_2d:
img = input_dict['img']
gt_bboxes_2d = input_dict['gt_bboxes']
# Assume for now 3D & 2D bboxes are the same
sampled_dict = self.db_sampler.sample_all(
gt_bboxes_3d.tensor.numpy(),
gt_labels_3d,
gt_bboxes_2d=gt_bboxes_2d,
img=img)
else:
sampled_dict = self.db_sampler.sample_all(
gt_bboxes_3d.tensor.numpy(), gt_labels_3d, img=None)
if sampled_dict is not None:
sampled_gt_bboxes_3d = sampled_dict['gt_bboxes_3d']
sampled_points = sampled_dict['points']
sampled_gt_labels = sampled_dict['gt_labels_3d']
gt_labels_3d = np.concatenate([gt_labels_3d, sampled_gt_labels],
axis=0)
gt_bboxes_3d = gt_bboxes_3d.new_box(
np.concatenate(
[gt_bboxes_3d.tensor.numpy(), sampled_gt_bboxes_3d]))
points = self.remove_points_in_boxes(points, sampled_gt_bboxes_3d)
# check the points dimension
points = points.cat([sampled_points, points])
if self.sample_2d:
sampled_gt_bboxes_2d = sampled_dict['gt_bboxes_2d']
gt_bboxes_2d = np.concatenate(
[gt_bboxes_2d, sampled_gt_bboxes_2d]).astype(np.float32)
input_dict['gt_bboxes'] = gt_bboxes_2d
input_dict['img'] = sampled_dict['img']
input_dict['gt_bboxes_3d'] = gt_bboxes_3d
input_dict['gt_labels_3d'] = gt_labels_3d.astype(np.long)
input_dict['points'] = points
return input_dict
def __repr__(self):
"""str: Return a string that describes the module."""
repr_str = self.__class__.__name__
repr_str += f' sample_2d={self.sample_2d},'
repr_str += f' data_root={self.sampler_cfg.data_root},'
repr_str += f' info_path={self.sampler_cfg.info_path},'
repr_str += f' rate={self.sampler_cfg.rate},'
repr_str += f' prepare={self.sampler_cfg.prepare},'
repr_str += f' classes={self.sampler_cfg.classes},'
repr_str += f' sample_groups={self.sampler_cfg.sample_groups}'
return repr_str
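# Editor's illustrative sketch (not part of the original file): the rough shape
# of a ``db_sampler`` config consumed by ObjectSample above. Paths, rates and
# class names are placeholders; the keys mirror the fields read in __repr__.
_EXAMPLE_OBJECT_SAMPLE_CFG = dict(
    type='ObjectSample',
    sample_2d=False,
    db_sampler=dict(
        type='DataBaseSampler',
        data_root='data/nuscenes/',
        info_path='data/nuscenes/nuscenes_dbinfos_train.pkl',
        rate=1.0,
        prepare=dict(filter_by_min_points=dict(car=5)),
        classes=['car'],
        sample_groups=dict(car=2)))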
@PIPELINES.register_module()
class ObjectNoise(object):
"""Apply noise to each GT objects in the scene.
Args:
translation_std (list[float], optional): Standard deviation of the
distribution where translation noise are sampled from.
Defaults to [0.25, 0.25, 0.25].
global_rot_range (list[float], optional): Global rotation to the scene.
Defaults to [0.0, 0.0].
rot_range (list[float], optional): Object rotation range.
Defaults to [-0.15707963267, 0.15707963267].
num_try (int, optional): Number of times to try if the noise applied is
invalid. Defaults to 100.
"""
def __init__(self,
translation_std=[0.25, 0.25, 0.25],
global_rot_range=[0.0, 0.0],
rot_range=[-0.15707963267, 0.15707963267],
num_try=100):
self.translation_std = translation_std
self.global_rot_range = global_rot_range
self.rot_range = rot_range
self.num_try = num_try
def __call__(self, input_dict):
"""Call function to apply noise to each ground truth in the scene.
Args:
input_dict (dict): Result dict from loading pipeline.
Returns:
dict: Results after adding noise to each object, \
'points', 'gt_bboxes_3d' keys are updated in the result dict.
"""
gt_bboxes_3d = input_dict['gt_bboxes_3d']
points = input_dict['points']
# TODO: check this inplace function
numpy_box = gt_bboxes_3d.tensor.numpy()
numpy_points = points.tensor.numpy()
noise_per_object_v3_(
numpy_box,
numpy_points,
rotation_perturb=self.rot_range,
center_noise_std=self.translation_std,
global_random_rot_range=self.global_rot_range,
num_try=self.num_try)
input_dict['gt_bboxes_3d'] = gt_bboxes_3d.new_box(numpy_box)
input_dict['points'] = points.new_point(numpy_points)
return input_dict
def __repr__(self):
"""str: Return a string that describes the module."""
repr_str = self.__class__.__name__
repr_str += '(num_try={},'.format(self.num_try)
repr_str += ' translation_std={},'.format(self.translation_std)
repr_str += ' global_rot_range={},'.format(self.global_rot_range)
repr_str += ' rot_range={})'.format(self.rot_range)
return repr_str
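# Editor's illustrative sketch (not part of the original file): ObjectNoise
# configured with its defaults; the rot_range of +/-0.157 rad is roughly
# +/-9 degrees per object.
_EXAMPLE_OBJECT_NOISE_CFG = dict(
    type='ObjectNoise',
    translation_std=[0.25, 0.25, 0.25],
    global_rot_range=[0.0, 0.0],
    rot_range=[-0.15707963267, 0.15707963267],
    num_try=100)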
@PIPELINES.register_module()
class GlobalRotScaleTrans(object):
"""Apply global rotation, scaling and translation to a 3D scene.
Args:
rot_range (list[float]): Range of rotation angle.
Defaults to [-0.78539816, 0.78539816] (close to [-pi/4, pi/4]).
scale_ratio_range (list[float]): Range of scale ratio.
Defaults to [0.95, 1.05].
        translation_std (list[float]): The standard deviation of the
            translation noise applied to the scene, which is sampled from a
            Gaussian distribution whose standard deviation is set by
            ``translation_std``. Defaults to [0, 0, 0].
        shift_height (bool): Whether to shift the height
            (the fourth dimension of indoor points) when scaling.
            Defaults to False.
"""
def __init__(self,
rot_range=[-0.78539816, 0.78539816],
scale_ratio_range=[0.95, 1.05],
translation_std=[0, 0, 0],
shift_height=False):
self.rot_range = rot_range
self.scale_ratio_range = scale_ratio_range
self.translation_std = translation_std
self.shift_height = shift_height
def _trans_bbox_points(self, input_dict):
"""Private function to translate bounding boxes and points.
Args:
input_dict (dict): Result dict from loading pipeline.
Returns:
dict: Results after translation, 'points', 'pcd_trans' \
and keys in input_dict['bbox3d_fields'] are updated \
in the result dict.
"""
if not isinstance(self.translation_std, (list, tuple, np.ndarray)):
translation_std = [
self.translation_std, self.translation_std,
self.translation_std
]
else:
translation_std = self.translation_std
translation_std = np.array(translation_std, dtype=np.float32)
trans_factor = np.random.normal(scale=translation_std, size=3).T
input_dict['points'].translate(trans_factor)
input_dict['pcd_trans'] = trans_factor
for key in input_dict['bbox3d_fields']:
input_dict[key].translate(trans_factor)
def _rot_bbox_points(self, input_dict):
"""Private function to rotate bounding boxes and points.
Args:
input_dict (dict): Result dict from loading pipeline.
Returns:
dict: Results after rotation, 'points', 'pcd_rotation' \
and keys in input_dict['bbox3d_fields'] are updated \
in the result dict.
"""
rotation = self.rot_range
if not isinstance(rotation, list):
rotation = [-rotation, rotation]
noise_rotation = np.random.uniform(rotation[0], rotation[1])
for key in input_dict['bbox3d_fields']:
if len(input_dict[key].tensor) != 0:
points, rot_mat_T = input_dict[key].rotate(
noise_rotation, input_dict['points'])
input_dict['points'] = points
input_dict['pcd_rotation'] = rot_mat_T
# input_dict['points_instance'].rotate(noise_rotation)
def _scale_bbox_points(self, input_dict):
"""Private function to scale bounding boxes and points.
Args:
input_dict (dict): Result dict from loading pipeline.
Returns:
            dict: Results after scaling, 'points' and keys in \
input_dict['bbox3d_fields'] are updated in the result dict.
"""
scale = input_dict['pcd_scale_factor']
points = input_dict['points']
points.scale(scale)
if self.shift_height:
assert 'height' in points.attribute_dims.keys()
points.tensor[:, points.attribute_dims['height']] *= scale
input_dict['points'] = points
for key in input_dict['bbox3d_fields']:
input_dict[key].scale(scale)
def _random_scale(self, input_dict):
"""Private function to randomly set the scale factor.
Args:
input_dict (dict): Result dict from loading pipeline.
Returns:
dict: Results after scaling, 'pcd_scale_factor' are updated \
in the result dict.
"""
scale_factor = np.random.uniform(self.scale_ratio_range[0],
self.scale_ratio_range[1])
input_dict['pcd_scale_factor'] = scale_factor
def __call__(self, input_dict):
"""Private function to rotate, scale and translate bounding boxes and \
points.
Args:
input_dict (dict): Result dict from loading pipeline.
Returns:
dict: Results after scaling, 'points', 'pcd_rotation',
'pcd_scale_factor', 'pcd_trans' and keys in \
input_dict['bbox3d_fields'] are updated in the result dict.
"""
if 'transformation_3d_flow' not in input_dict:
input_dict['transformation_3d_flow'] = []
self._rot_bbox_points(input_dict)
if 'pcd_scale_factor' not in input_dict:
self._random_scale(input_dict)
self._scale_bbox_points(input_dict)
self._trans_bbox_points(input_dict)
input_dict['transformation_3d_flow'].extend(['R', 'S', 'T'])
return input_dict
def __repr__(self):
"""str: Return a string that describes the module."""
repr_str = self.__class__.__name__
repr_str += '(rot_range={},'.format(self.rot_range)
repr_str += ' scale_ratio_range={},'.format(self.scale_ratio_range)
        repr_str += ' translation_std={},'.format(self.translation_std)
repr_str += ' shift_height={})'.format(self.shift_height)
return repr_str
@PIPELINES.register_module()
class OurGlobalRotScaleTrans(object):
"""Apply global rotation, scaling and translation to a 3D scene.
Args:
rot_range (list[float]): Range of rotation angle.
Defaults to [-0.78539816, 0.78539816] (close to [-pi/4, pi/4]).
scale_ratio_range (list[float]): Range of scale ratio.
Defaults to [0.95, 1.05].
        translation_std (list[float]): The standard deviation of the
            translation noise applied to the scene, which is sampled from a
            Gaussian distribution whose standard deviation is set by
            ``translation_std``. Defaults to [0, 0, 0].
        shift_height (bool): Whether to shift the height
            (the fourth dimension of indoor points) when scaling.
            Defaults to False.
"""
def __init__(self,
rot_range=[-0.78539816, 0.78539816],
scale_ratio_range=[0.95, 1.05],
translation_std=[0, 0, 0],
shift_height=False):
self.rot_range = rot_range
self.scale_ratio_range = scale_ratio_range
self.translation_std = translation_std
self.shift_height = shift_height
def _trans_bbox_points(self, input_dict):
"""Private function to translate bounding boxes and points.
Args:
input_dict (dict): Result dict from loading pipeline.
Returns:
dict: Results after translation, 'points', 'pcd_trans' \
and keys in input_dict['bbox3d_fields'] are updated \
in the result dict.
"""
if not isinstance(self.translation_std, (list, tuple, np.ndarray)):
translation_std = [
self.translation_std, self.translation_std,
self.translation_std
]
else:
translation_std = self.translation_std
translation_std = np.array(translation_std, dtype=np.float32)
trans_factor = np.random.normal(scale=translation_std, size=3).T
input_dict['points'].translate(trans_factor)
input_dict['pcd_trans'] = trans_factor
for key in input_dict['bbox3d_fields']:
input_dict[key].translate(trans_factor)
for id in range(len(input_dict['lidar2cam_t'])):
input_dict['lidar2cam_t'][id] = input_dict['lidar2cam_t'][id] - input_dict['lidar2cam_r'][id] @ trans_factor
if 'gt_pts_centers_view' in input_dict:
input_dict['gt_pts_centers_view'] = input_dict['gt_pts_centers_view'] + trans_factor
if 'gt_bboxes_lidar_view' in input_dict:
input_dict['gt_bboxes_lidar_view'].translate(trans_factor)
def _rot_bbox_points(self, input_dict):
"""Private function to rotate bounding boxes and points.
Args:
input_dict (dict): Result dict from loading pipeline.
Returns:
dict: Results after rotation, 'points', 'pcd_rotation' \
and keys in input_dict['bbox3d_fields'] are updated \
in the result dict.
"""
noise_rotation = input_dict['pcd_rotation_angle']
rot_mat_T = None
for key in input_dict['bbox3d_fields']:
if len(input_dict[key].tensor) != 0:
points, rot_mat_T = input_dict[key].rotate(
noise_rotation, input_dict['points'])
input_dict['points'] = points
input_dict['pcd_rotation'] = rot_mat_T
if rot_mat_T is not None:
rot_mat_T_np = rot_mat_T.numpy()
for id in range(len(input_dict['lidar2cam_r'])):
input_dict['lidar2cam_r'][id] = input_dict['lidar2cam_r'][id] @ rot_mat_T_np
if input_dict['gt_pts_centers_view'].shape[0] > 0:
input_dict['gt_pts_centers_view'] = input_dict['gt_pts_centers_view'] @ rot_mat_T_np
if 'gt_bboxes_lidar_view' in input_dict:
input_dict['gt_bboxes_lidar_view'].rotate(noise_rotation)
def _scale_bbox_points(self, input_dict):
"""Private function to scale bounding boxes and points.
Args:
input_dict (dict): Result dict from loading pipeline.
Returns:
            dict: Results after scaling, 'points' and keys in \
input_dict['bbox3d_fields'] are updated in the result dict.
"""
scale = input_dict['pcd_scale_factor']
points = input_dict['points']
points.scale(scale)
if self.shift_height:
assert 'height' in points.attribute_dims.keys()
points.tensor[:, points.attribute_dims['height']] *= scale
input_dict['points'] = points
for key in input_dict['bbox3d_fields']:
input_dict[key].scale(scale)
if 'gt_img_centers_view' in input_dict and input_dict['gt_img_centers_view'].shape[0] > 0:
input_dict['gt_img_centers_view'][:, 2] *= scale
for id in range(len(input_dict['lidar2cam_t'])):
input_dict['lidar2cam_t'][id] = input_dict['lidar2cam_t'][id] * scale
if 'gt_pts_centers_view' in input_dict and input_dict['gt_pts_centers_view'].shape[0] > 0:
input_dict['gt_pts_centers_view'] = input_dict['gt_pts_centers_view'] * scale
if 'gt_bboxes_cam_view' in input_dict:
input_dict['gt_bboxes_cam_view'].scale(scale)
if 'gt_bboxes_lidar_view' in input_dict:
input_dict['gt_bboxes_lidar_view'].scale(scale)
def _random_scale(self, input_dict):
"""Private function to randomly set the scale factor.
Args:
input_dict (dict): Result dict from loading pipeline.
Returns:
dict: Results after scaling, 'pcd_scale_factor' are updated \
in the result dict.
"""
scale_factor = np.random.uniform(self.scale_ratio_range[0],
self.scale_ratio_range[1])
input_dict['pcd_scale_factor'] = scale_factor
def _random_rotation(self, input_dict):
rotation = self.rot_range
if not isinstance(rotation, list):
rotation = [-rotation, rotation]
noise_rotation = np.random.uniform(rotation[0], rotation[1])
input_dict['pcd_rotation_angle'] = noise_rotation
def __call__(self, input_dict):
"""Private function to rotate, scale and translate bounding boxes and \
points.
Args:
input_dict (dict): Result dict from loading pipeline.
Returns:
dict: Results after scaling, 'points', 'pcd_rotation',
'pcd_scale_factor', 'pcd_trans' and keys in \
input_dict['bbox3d_fields'] are updated in the result dict.
"""
if 'transformation_3d_flow' not in input_dict:
input_dict['transformation_3d_flow'] = []
if 'pcd_rotation_angle' not in input_dict:
self._random_rotation(input_dict)
self._rot_bbox_points(input_dict)
if 'pcd_scale_factor' not in input_dict:
self._random_scale(input_dict)
self._scale_bbox_points(input_dict)
self._trans_bbox_points(input_dict)
input_dict['transformation_3d_flow'].extend(['R', 'S', 'T'])
return input_dict
def __repr__(self):
"""str: Return a string that describes the module."""
repr_str = self.__class__.__name__
repr_str += '(rot_range={},'.format(self.rot_range)
repr_str += ' scale_ratio_range={},'.format(self.scale_ratio_range)
        repr_str += ' translation_std={},'.format(self.translation_std)
repr_str += ' shift_height={})'.format(self.shift_height)
return repr_str
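# Editor's illustrative sketch (not part of the original code): a numpy-only
# check of the extrinsic update in ``_trans_bbox_points`` above. Translating
# the points by ``trans`` while setting t' = t - R @ trans keeps
# R @ p' + t' equal to R @ p + t, so camera coordinates are unchanged
# (scaling both the points and t by the same factor scales them consistently).
# The helper name and arguments are hypothetical.
def _example_check_global_trans(lidar2cam_r, lidar2cam_t, point, trans):
    new_point = point + trans
    new_t = lidar2cam_t - lidar2cam_r @ trans
    return np.allclose(lidar2cam_r @ new_point + new_t,
                       lidar2cam_r @ point + lidar2cam_t)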
@PIPELINES.register_module()
class PointShuffle(object):
"""Shuffle input points."""
def __call__(self, input_dict):
"""Call function to shuffle points.
Args:
input_dict (dict): Result dict from loading pipeline.
Returns:
dict: Results after filtering, 'points' keys are updated \
in the result dict.
"""
input_dict['points'].shuffle()
return input_dict
def __repr__(self):
return self.__class__.__name__
@PIPELINES.register_module()
class ObjectRangeFilter(object):
"""Filter objects by the range.
Args:
point_cloud_range (list[float]): Point cloud range.
"""
def __init__(self, point_cloud_range):
self.pcd_range = np.array(point_cloud_range, dtype=np.float32)
self.bev_range = self.pcd_range[[0, 1, 3, 4]]
def __call__(self, input_dict):
"""Call function to filter objects by the range.
Args:
input_dict (dict): Result dict from loading pipeline.
Returns:
dict: Results after filtering, 'gt_bboxes_3d', 'gt_labels_3d' \
keys are updated in the result dict.
"""
gt_bboxes_3d = input_dict['gt_bboxes_3d']
gt_labels_3d = input_dict['gt_labels_3d']
mask = gt_bboxes_3d.in_range_bev(self.bev_range)
gt_bboxes_3d = gt_bboxes_3d[mask]
# mask is a torch tensor but gt_labels_3d is still numpy array
# using mask to index gt_labels_3d will cause bug when
# len(gt_labels_3d) == 1, where mask=1 will be interpreted
# as gt_labels_3d[1] and cause out of index error
        gt_labels_3d = gt_labels_3d[mask.numpy().astype(np.bool_)]
# limit rad to [-pi, pi]
gt_bboxes_3d.limit_yaw(offset=0.5, period=2 * np.pi)
input_dict['gt_bboxes_3d'] = gt_bboxes_3d
input_dict['gt_labels_3d'] = gt_labels_3d
return input_dict
def __repr__(self):
"""str: Return a string that describes the module."""
repr_str = self.__class__.__name__
repr_str += '(point_cloud_range={})'.format(self.pcd_range.tolist())
return repr_str
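# Editor's note (illustrative, not part of the original file): the index
# ``[0, 1, 3, 4]`` above picks the BEV rectangle (x_min, y_min, x_max, y_max)
# out of a 6-value point-cloud range, e.g.
#     point_cloud_range = [-54.0, -54.0, -5.0, 54.0, 54.0, 3.0]
#     bev_range         = [-54.0, -54.0, 54.0, 54.0]
# The numbers are placeholders, not a setting taken from this repository.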
@PIPELINES.register_module()
class OurObjectRangeFilter(object):
"""Filter objects by the range.
Args:
point_cloud_range (list[float]): Point cloud range.
"""
def __init__(self, point_cloud_range):
self.pcd_range = np.array(point_cloud_range, dtype=np.float32)
self.bev_range = self.pcd_range[[0, 1, 3, 4]]
def __call__(self, input_dict):
"""Call function to filter objects by the range.
Args:
input_dict (dict): Result dict from loading pipeline.
Returns:
dict: Results after filtering, 'gt_bboxes_3d', 'gt_labels_3d' \
keys are updated in the result dict.
"""
gt_bboxes_3d = input_dict['gt_bboxes_3d']
gt_labels_3d = input_dict['gt_labels_3d']
mask = gt_bboxes_3d.in_range_bev(self.bev_range)
gt_bboxes_3d = gt_bboxes_3d[mask]
# mask is a torch tensor but gt_labels_3d is still numpy array
# using mask to index gt_labels_3d will cause bug when
# len(gt_labels_3d) == 1, where mask=1 will be interpreted
# as gt_labels_3d[1] and cause out of index error
        gt_labels_3d = gt_labels_3d[mask.numpy().astype(np.bool_)]
# limit rad to [-pi, pi]
gt_bboxes_3d.limit_yaw(offset=0.5, period=2 * np.pi)
input_dict['gt_bboxes_3d'] = gt_bboxes_3d
input_dict['gt_labels_3d'] = gt_labels_3d
if 'gt_visible_3d' in input_dict:
gt_visible_3d = input_dict['gt_visible_3d']
            gt_visible_3d = gt_visible_3d[mask.numpy().astype(np.bool_)]
input_dict['gt_visible_3d'] = gt_visible_3d
pts_2d = input_dict['gt_pts_centers_view']
mask_2d = (pts_2d[:, 0] > self.bev_range[0]) & (pts_2d[:, 0] < self.bev_range[2]) & (pts_2d[:, 1] > self.bev_range[1]) & (pts_2d[:, 1] < self.bev_range[3])
input_dict['gt_bboxes'] = input_dict['gt_bboxes'][mask_2d]
input_dict['gt_labels'] = input_dict['gt_labels'][mask_2d]
input_dict['gt_img_centers_view'] = input_dict['gt_img_centers_view'][mask_2d]
input_dict['gt_pts_centers_view'] = input_dict['gt_pts_centers_view'][mask_2d]
if 'gt_bboxes_cam_view' in input_dict:
input_dict['gt_bboxes_cam_view'] = input_dict['gt_bboxes_cam_view'][mask_2d]
input_dict['gt_bboxes_lidar_view'] = input_dict['gt_bboxes_lidar_view'][mask_2d]
return input_dict
def __repr__(self):
"""str: Return a string that describes the module."""
repr_str = self.__class__.__name__
repr_str += '(point_cloud_range={})'.format(self.pcd_range.tolist())
return repr_str
@PIPELINES.register_module()
class PointsRangeFilter(object):
"""Filter points by the range.
Args:
point_cloud_range (list[float]): Point cloud range.
"""
def __init__(self, point_cloud_range):
self.pcd_range = np.array(point_cloud_range, dtype=np.float32)
def __call__(self, input_dict):
"""Call function to filter points by the range.
Args:
input_dict (dict): Result dict from loading pipeline.
Returns:
dict: Results after filtering, 'points' keys are updated \
in the result dict.
"""
points = input_dict['points']
points_mask = points.in_range_3d(self.pcd_range)
clean_points = points[points_mask]
input_dict['points'] = clean_points
return input_dict
def __repr__(self):
"""str: Return a string that describes the module."""
repr_str = self.__class__.__name__
repr_str += '(point_cloud_range={})'.format(self.pcd_range.tolist())
return repr_str
@PIPELINES.register_module()
class ObjectNameFilter(object):
"""Filter GT objects by their names.
Args:
classes (list[str]): List of class names to be kept for training.
"""
def __init__(self, classes):
self.classes = classes
self.labels = list(range(len(self.classes)))
def __call__(self, input_dict):
"""Call function to filter objects by their names.
Args:
input_dict (dict): Result dict from loading pipeline.
Returns:
dict: Results after filtering, 'gt_bboxes_3d', 'gt_labels_3d' \
keys are updated in the result dict.
"""
gt_labels_3d = input_dict['gt_labels_3d']
gt_bboxes_mask = np.array([n in self.labels for n in gt_labels_3d],
dtype=np.bool_)
input_dict['gt_bboxes_3d'] = input_dict['gt_bboxes_3d'][gt_bboxes_mask]
input_dict['gt_labels_3d'] = input_dict['gt_labels_3d'][gt_bboxes_mask]
if 'gt_visible_3d' in input_dict:
input_dict['gt_visible_3d'] = input_dict['gt_visible_3d'][gt_bboxes_mask]
if 'gt_labels' in input_dict:
gt_labels = input_dict['gt_labels']
if gt_labels.shape[0] > 0:
gt_bboxes_mask = np.array([n[0] in self.labels for n in gt_labels],
dtype=np.bool_)
input_dict['gt_bboxes'] = input_dict['gt_bboxes'][gt_bboxes_mask]
input_dict['gt_labels'] = input_dict['gt_labels'][gt_bboxes_mask]
if 'gt_img_centers_view' in input_dict:
input_dict['gt_img_centers_view'] = input_dict['gt_img_centers_view'][gt_bboxes_mask]
input_dict['gt_pts_centers_view'] = input_dict['gt_pts_centers_view'][gt_bboxes_mask]
if 'gt_bboxes_cam_view' in input_dict:
input_dict['gt_bboxes_cam_view'] = input_dict['gt_bboxes_cam_view'][gt_bboxes_mask]
if 'gt_bboxes_lidar_view' in input_dict:
input_dict['gt_bboxes_lidar_view'] = input_dict['gt_bboxes_lidar_view'][gt_bboxes_mask]
return input_dict
def __repr__(self):
"""str: Return a string that describes the module."""
repr_str = self.__class__.__name__
repr_str += f'(classes={self.classes})'
return repr_str
@PIPELINES.register_module()
class IndoorPointSample(object):
"""Indoor point sample.
Sampling data to a certain number.
Args:
num_points (int): Number of points to be sampled.
"""
def __init__(self, num_points):
self.num_points = num_points
def points_random_sampling(self,
points,
num_samples,
replace=None,
return_choices=False):
"""Points random sampling.
Sample points to a certain number.
Args:
points (np.ndarray): 3D Points.
num_samples (int): Number of samples to be sampled.
replace (bool): Whether the sample is with or without replacement.
Defaults to None.
return_choices (bool): Whether return choice. Defaults to False.
Returns:
tuple[np.ndarray] | np.ndarray:
- points (np.ndarray): 3D Points.
- choices (np.ndarray, optional): The generated random samples.
"""
if replace is None:
replace = (points.shape[0] < num_samples)
choices = np.random.choice(
points.shape[0], num_samples, replace=replace)
if return_choices:
return points[choices], choices
else:
return points[choices]
def __call__(self, results):
"""Call function to sample points to in indoor scenes.
Args:
input_dict (dict): Result dict from loading pipeline.
Returns:
dict: Results after sampling, 'points', 'pts_instance_mask' \
and 'pts_semantic_mask' keys are updated in the result dict.
"""
points = results['points']
points, choices = self.points_random_sampling(
points, self.num_points, return_choices=True)
pts_instance_mask = results.get('pts_instance_mask', None)
pts_semantic_mask = results.get('pts_semantic_mask', None)
results['points'] = points
if pts_instance_mask is not None and pts_semantic_mask is not None:
pts_instance_mask = pts_instance_mask[choices]
pts_semantic_mask = pts_semantic_mask[choices]
results['pts_instance_mask'] = pts_instance_mask
results['pts_semantic_mask'] = pts_semantic_mask
return results
def __repr__(self):
"""str: Return a string that describes the module."""
repr_str = self.__class__.__name__
repr_str += '(num_points={})'.format(self.num_points)
return repr_str
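# Editor's illustrative sketch (not part of the original file): the sampling
# above falls back to sampling with replacement whenever there are fewer
# points than ``num_points``, so the output always has exactly ``num_points``
# rows. The helper name is hypothetical.
def _example_random_sampling(points, num_samples):
    replace = points.shape[0] < num_samples
    choices = np.random.choice(points.shape[0], num_samples, replace=replace)
    return points[choices]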
@PIPELINES.register_module()
class BackgroundPointsFilter(object):
"""Filter background points near the bounding box.
Args:
bbox_enlarge_range (tuple[float], float): Bbox enlarge range.
"""
def __init__(self, bbox_enlarge_range):
assert (is_tuple_of(bbox_enlarge_range, float)
and len(bbox_enlarge_range) == 3) \
or isinstance(bbox_enlarge_range, float), \
f'Invalid arguments bbox_enlarge_range {bbox_enlarge_range}'
if isinstance(bbox_enlarge_range, float):
bbox_enlarge_range = [bbox_enlarge_range] * 3
self.bbox_enlarge_range = np.array(
bbox_enlarge_range, dtype=np.float32)[np.newaxis, :]
def __call__(self, input_dict):
"""Call function to filter points by the range.
Args:
input_dict (dict): Result dict from loading pipeline.
Returns:
dict: Results after filtering, 'points' keys are updated \
in the result dict.
"""
points = input_dict['points']
gt_bboxes_3d = input_dict['gt_bboxes_3d']
gt_bboxes_3d_np = gt_bboxes_3d.tensor.numpy()
gt_bboxes_3d_np[:, :3] = gt_bboxes_3d.gravity_center.numpy()
enlarged_gt_bboxes_3d = gt_bboxes_3d_np.copy()
enlarged_gt_bboxes_3d[:, 3:6] += self.bbox_enlarge_range
points_numpy = points.tensor.numpy()
foreground_masks = box_np_ops.points_in_rbbox(points_numpy,
gt_bboxes_3d_np)
enlarge_foreground_masks = box_np_ops.points_in_rbbox(
points_numpy, enlarged_gt_bboxes_3d)
foreground_masks = foreground_masks.max(1)
enlarge_foreground_masks = enlarge_foreground_masks.max(1)
valid_masks = ~np.logical_and(~foreground_masks,
enlarge_foreground_masks)
input_dict['points'] = points[valid_masks]
pts_instance_mask = input_dict.get('pts_instance_mask', None)
if pts_instance_mask is not None:
input_dict['pts_instance_mask'] = pts_instance_mask[valid_masks]
pts_semantic_mask = input_dict.get('pts_semantic_mask', None)
if pts_semantic_mask is not None:
input_dict['pts_semantic_mask'] = pts_semantic_mask[valid_masks]
return input_dict
def __repr__(self):
"""str: Return a string that describes the module."""
repr_str = self.__class__.__name__
repr_str += '(bbox_enlarge_range={})'.format(
self.bbox_enlarge_range.tolist())
return repr_str
@PIPELINES.register_module()
class VoxelBasedPointSampler(object):
"""Voxel based point sampler.
Apply voxel sampling to multiple sweep points.
Args:
cur_sweep_cfg (dict): Config for sampling current points.
prev_sweep_cfg (dict): Config for sampling previous points.
        time_dim (int): Index that indicates the time dimension
            of the input points.
"""
def __init__(self, cur_sweep_cfg, prev_sweep_cfg=None, time_dim=3):
self.cur_voxel_generator = VoxelGenerator(**cur_sweep_cfg)
self.cur_voxel_num = self.cur_voxel_generator._max_voxels
self.time_dim = time_dim
if prev_sweep_cfg is not None:
assert prev_sweep_cfg['max_num_points'] == \
cur_sweep_cfg['max_num_points']
self.prev_voxel_generator = VoxelGenerator(**prev_sweep_cfg)
self.prev_voxel_num = self.prev_voxel_generator._max_voxels
else:
self.prev_voxel_generator = None
self.prev_voxel_num = 0
def _sample_points(self, points, sampler, point_dim):
"""Sample points for each points subset.
Args:
points (np.ndarray): Points subset to be sampled.
sampler (VoxelGenerator): Voxel based sampler for
each points subset.
            point_dim (int): The dimension of each point.
Returns:
np.ndarray: Sampled points.
"""
voxels, coors, num_points_per_voxel = sampler.generate(points)
if voxels.shape[0] < sampler._max_voxels:
padding_points = np.zeros([
sampler._max_voxels - voxels.shape[0], sampler._max_num_points,
point_dim
],
dtype=points.dtype)
padding_points[:] = voxels[0]
sample_points = np.concatenate([voxels, padding_points], axis=0)
else:
sample_points = voxels
return sample_points
def __call__(self, results):
"""Call function to sample points from multiple sweeps.
Args:
input_dict (dict): Result dict from loading pipeline.
Returns:
dict: Results after sampling, 'points', 'pts_instance_mask' \
and 'pts_semantic_mask' keys are updated in the result dict.
"""
points = results['points']
original_dim = points.shape[1]
# TODO: process instance and semantic mask while _max_num_points
# is larger than 1
# Extend points with seg and mask fields
map_fields2dim = []
start_dim = original_dim
points_numpy = points.tensor.numpy()
extra_channel = [points_numpy]
for idx, key in enumerate(results['pts_mask_fields']):
map_fields2dim.append((key, idx + start_dim))
extra_channel.append(results[key][..., None])
start_dim += len(results['pts_mask_fields'])
for idx, key in enumerate(results['pts_seg_fields']):
map_fields2dim.append((key, idx + start_dim))
extra_channel.append(results[key][..., None])
points_numpy = np.concatenate(extra_channel, axis=-1)
# Split points into two part, current sweep points and
# previous sweeps points.
# TODO: support different sampling methods for next sweeps points
# and previous sweeps points.
cur_points_flag = (points_numpy[:, self.time_dim] == 0)
cur_sweep_points = points_numpy[cur_points_flag]
prev_sweeps_points = points_numpy[~cur_points_flag]
if prev_sweeps_points.shape[0] == 0:
prev_sweeps_points = cur_sweep_points
# Shuffle points before sampling
np.random.shuffle(cur_sweep_points)
np.random.shuffle(prev_sweeps_points)
cur_sweep_points = self._sample_points(cur_sweep_points,
self.cur_voxel_generator,
points_numpy.shape[1])
if self.prev_voxel_generator is not None:
prev_sweeps_points = self._sample_points(prev_sweeps_points,
self.prev_voxel_generator,
points_numpy.shape[1])
points_numpy = np.concatenate(
[cur_sweep_points, prev_sweeps_points], 0)
else:
points_numpy = cur_sweep_points
if self.cur_voxel_generator._max_num_points == 1:
points_numpy = points_numpy.squeeze(1)
results['points'] = points.new_point(points_numpy[..., :original_dim])
        # Restore the corresponding seg and mask fields
for key, dim_index in map_fields2dim:
results[key] = points_numpy[..., dim_index]
return results
def __repr__(self):
"""str: Return a string that describes the module."""
def _auto_indent(repr_str, indent):
repr_str = repr_str.split('\n')
repr_str = [' ' * indent + t + '\n' for t in repr_str]
repr_str = ''.join(repr_str)[:-1]
return repr_str
repr_str = self.__class__.__name__
indent = 4
repr_str += '(\n'
repr_str += ' ' * indent + f'num_cur_sweep={self.cur_voxel_num},\n'
repr_str += ' ' * indent + f'num_prev_sweep={self.prev_voxel_num},\n'
repr_str += ' ' * indent + f'time_dim={self.time_dim},\n'
repr_str += ' ' * indent + 'cur_voxel_generator=\n'
repr_str += f'{_auto_indent(repr(self.cur_voxel_generator), 8)},\n'
repr_str += ' ' * indent + 'prev_voxel_generator=\n'
repr_str += f'{_auto_indent(repr(self.prev_voxel_generator), 8)})'
return repr_str
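# Editor's illustrative sketch (not part of the original file): the padding
# step in ``_sample_points`` above pads the output to exactly ``max_voxels``
# voxels by repeating the first voxel when the generator produces fewer.
# The helper name is hypothetical.
def _example_pad_voxels(voxels, max_voxels):
    if voxels.shape[0] >= max_voxels:
        return voxels
    padding = np.zeros((max_voxels - voxels.shape[0],) + voxels.shape[1:],
                       dtype=voxels.dtype)
    padding[:] = voxels[0]
    return np.concatenate([voxels, padding], axis=0)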
================================================
FILE: mmdet3d/datasets/registry.py
================================================
from mmcv.utils import Registry
OBJECTSAMPLERS = Registry('Object sampler')
================================================
FILE: mmdet3d/datasets/scannet_dataset.py
================================================
import numpy as np
from os import path as osp
from mmdet3d.core import show_result
from mmdet3d.core.bbox import DepthInstance3DBoxes
from mmdet.datasets import DATASETS
from .custom_3d import Custom3DDataset
@DATASETS.register_module()
class ScanNetDataset(Custom3DDataset):
r"""ScanNet Dataset.
This class serves as the API for experiments on the ScanNet Dataset.
Please refer to the `github repo `_
for data downloading.
Args:
data_root (str): Path of dataset root.
ann_file (str): Path of annotation file.
pipeline (list[dict], optional): Pipeline used for data processing.
Defaults to None.
classes (tuple[str], optional): Classes used in the dataset.
Defaults to None.
modality (dict, optional): Modality to specify the sensor data used
as input. Defaults to None.
box_type_3d (str, optional): Type of 3D box of this dataset.
            Based on the `box_type_3d`, the dataset will encapsulate the box
            to its original format and then convert it to `box_type_3d`.
            Defaults to 'Depth' in this dataset. Available options include
- 'LiDAR': Box in LiDAR coordinates.
- 'Depth': Box in depth coordinates, usually for indoor dataset.
- 'Camera': Box in camera coordinates.
filter_empty_gt (bool, optional): Whether to filter empty GT.
Defaults to True.
test_mode (bool, optional): Whether the dataset is in test mode.
Defaults to False.
"""
CLASSES = ('cabinet', 'bed', 'chair', 'sofa', 'table', 'door', 'window',
'bookshelf', 'picture', 'counter', 'desk', 'curtain',
'refrigerator', 'showercurtrain', 'toilet', 'sink', 'bathtub',
'garbagebin')
def __init__(self,
data_root,
ann_file,
pipeline=None,
classes=None,
modality=None,
box_type_3d='Depth',
filter_empty_gt=True,
test_mode=False):
super().__init__(
data_root=data_root,
ann_file=ann_file,
pipeline=pipeline,
classes=classes,
modality=modality,
box_type_3d=box_type_3d,
filter_empty_gt=filter_empty_gt,
test_mode=test_mode)
def get_ann_info(self, index):
"""Get annotation info according to the given index.
Args:
index (int): Index of the annotation data to get.
Returns:
dict: annotation information consists of the following keys:
- gt_bboxes_3d (:obj:`DepthInstance3DBoxes`): \
3D ground truth bboxes
- gt_labels_3d (np.ndarray): Labels of ground truths.
- pts_instance_mask_path (str): Path of instance masks.
- pts_semantic_mask_path (str): Path of semantic masks.
"""
# Use index to get the annos, thus the evalhook could also use this api
info = self.data_infos[index]
if info['annos']['gt_num'] != 0:
gt_bboxes_3d = info['annos']['gt_boxes_upright_depth'].astype(
np.float32) # k, 6
gt_labels_3d = info['annos']['class'].astype(np.long)
else:
gt_bboxes_3d = np.zeros((0, 6), dtype=np.float32)
gt_labels_3d = np.zeros((0, ), dtype=np.long)
# to target box structure
gt_bboxes_3d = DepthInstance3DBoxes(
gt_bboxes_3d,
box_dim=gt_bboxes_3d.shape[-1],
with_yaw=False,
origin=(0.5, 0.5, 0.5)).convert_to(self.box_mode_3d)
pts_instance_mask_path = osp.join(self.data_root,
info['pts_instance_mask_path'])
pts_semantic_mask_path = osp.join(self.data_root,
info['pts_semantic_mask_path'])
anns_results = dict(
gt_bboxes_3d=gt_bboxes_3d,
gt_labels_3d=gt_labels_3d,
pts_instance_mask_path=pts_instance_mask_path,
pts_semantic_mask_path=pts_semantic_mask_path)
return anns_results
def show(self, results, out_dir, show=True):
"""Results visualization.
Args:
results (list[dict]): List of bounding boxes results.
out_dir (str): Output directory of visualization result.
show (bool): Visualize the results online.
"""
assert out_dir is not None, 'Expect out_dir, got none.'
for i, result in enumerate(results):
data_info = self.data_infos[i]
pts_path = data_info['pts_path']
file_name = osp.split(pts_path)[-1].split('.')[0]
points = np.fromfile(
osp.join(self.data_root, pts_path),
dtype=np.float32).reshape(-1, 6)
gt_bboxes = self.get_ann_info(i)['gt_bboxes_3d'].tensor
pred_bboxes = result['boxes_3d'].tensor.numpy()
show_result(points, gt_bboxes, pred_bboxes, out_dir, file_name,
show)
================================================
FILE: mmdet3d/datasets/semantickitti_dataset.py
================================================
from os import path as osp
from mmdet.datasets import DATASETS
from .custom_3d import Custom3DDataset
@DATASETS.register_module()
class SemanticKITTIDataset(Custom3DDataset):
r"""SemanticKITTI Dataset.
    This class serves as the API for experiments on the SemanticKITTI
    Dataset. Please refer to the SemanticKITTI website for data downloading.
Args:
data_root (str): Path of dataset root.
ann_file (str): Path of annotation file.
pipeline (list[dict], optional): Pipeline used for data processing.
Defaults to None.
classes (tuple[str], optional): Classes used in the dataset.
Defaults to None.
modality (dict, optional): Modality to specify the sensor data used
as input. Defaults to None.
        box_type_3d (str, optional): Type of 3D box of this dataset.
            This dataset has no 3D boxes, so any type can be chosen.
            Based on the `box_type_3d`, the dataset will encapsulate the box
            in its original format and then convert it to `box_type_3d`.
            Defaults to 'LiDAR' in this dataset. Available options include:
- 'LiDAR': Box in LiDAR coordinates.
- 'Depth': Box in depth coordinates, usually for indoor dataset.
- 'Camera': Box in camera coordinates.
filter_empty_gt (bool, optional): Whether to filter empty GT.
Defaults to True.
test_mode (bool, optional): Whether the dataset is in test mode.
Defaults to False.
"""
CLASSES = ('unlabeled', 'car', 'bicycle', 'motorcycle', 'truck', 'bus',
'person', 'bicyclist', 'motorcyclist', 'road', 'parking',
'sidewalk', 'other-ground', 'building', 'fence', 'vegetation',
'trunck', 'terrian', 'pole', 'traffic-sign')
def __init__(self,
data_root,
ann_file,
pipeline=None,
classes=None,
modality=None,
box_type_3d='Lidar',
filter_empty_gt=False,
test_mode=False):
super().__init__(
data_root=data_root,
ann_file=ann_file,
pipeline=pipeline,
classes=classes,
modality=modality,
box_type_3d=box_type_3d,
filter_empty_gt=filter_empty_gt,
test_mode=test_mode)
def get_ann_info(self, index):
"""Get annotation info according to the given index.
Args:
index (int): Index of the annotation data to get.
Returns:
dict: annotation information consists of the following keys:
- pts_semantic_mask_path (str): Path of semantic masks.
"""
# Use index to get the annos, thus the evalhook could also use this api
info = self.data_infos[index]
pts_semantic_mask_path = osp.join(self.data_root,
info['pts_semantic_mask_path'])
anns_results = dict(pts_semantic_mask_path=pts_semantic_mask_path)
return anns_results
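# A minimal usage sketch (not part of the original file), assuming a
# SemanticKITTI info pkl generated at the hypothetical path below. The
# dataset carries no 3D boxes; `get_ann_info` only exposes the per-point
# semantic mask path, so the loading pipeline is expected to read the labels.
#
#   dataset = SemanticKITTIDataset(
#       data_root='data/semantickitti/',
#       ann_file='data/semantickitti/semantickitti_infos_train.pkl')
#   ann = dataset.get_ann_info(0)
#   # ann == {'pts_semantic_mask_path': 'data/semantickitti/...'}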
================================================
FILE: mmdet3d/datasets/sunrgbd_dataset.py
================================================
import numpy as np
from collections import OrderedDict
from os import path as osp
from mmdet3d.core import show_result
from mmdet3d.core.bbox import DepthInstance3DBoxes
from mmdet.core import eval_map
from mmdet.datasets import DATASETS
from .custom_3d import Custom3DDataset
@DATASETS.register_module()
class SUNRGBDDataset(Custom3DDataset):
r"""SUNRGBD Dataset.
This class serves as the API for experiments on the SUNRGBD Dataset.
    Please refer to the official download page for data downloading.
Args:
data_root (str): Path of dataset root.
ann_file (str): Path of annotation file.
pipeline (list[dict], optional): Pipeline used for data processing.
Defaults to None.
classes (tuple[str], optional): Classes used in the dataset.
Defaults to None.
modality (dict, optional): Modality to specify the sensor data used
as input. Defaults to None.
box_type_3d (str, optional): Type of 3D box of this dataset.
            Based on the `box_type_3d`, the dataset will encapsulate the box
            in its original format and then convert it to `box_type_3d`.
            Defaults to 'Depth' in this dataset. Available options include:
- 'LiDAR': Box in LiDAR coordinates.
- 'Depth': Box in depth coordinates, usually for indoor dataset.
- 'Camera': Box in camera coordinates.
filter_empty_gt (bool, optional): Whether to filter empty GT.
Defaults to True.
test_mode (bool, optional): Whether the dataset is in test mode.
Defaults to False.
"""
CLASSES = ('bed', 'table', 'sofa', 'chair', 'toilet', 'desk', 'dresser',
'night_stand', 'bookshelf', 'bathtub')
def __init__(self,
data_root,
ann_file,
pipeline=None,
classes=None,
modality=dict(use_camera=True, use_lidar=True),
box_type_3d='Depth',
filter_empty_gt=True,
test_mode=False):
super().__init__(
data_root=data_root,
ann_file=ann_file,
pipeline=pipeline,
classes=classes,
modality=modality,
box_type_3d=box_type_3d,
filter_empty_gt=filter_empty_gt,
test_mode=test_mode)
assert 'use_camera' in self.modality and \
'use_lidar' in self.modality
assert self.modality['use_camera'] or self.modality['use_lidar']
def get_data_info(self, index):
"""Get data info according to the given index.
Args:
index (int): Index of the sample data to get.
Returns:
dict: Data information that will be passed to the data \
preprocessing pipelines. It includes the following keys:
- sample_idx (str): Sample index.
- pts_filename (str, optional): Filename of point clouds.
- file_name (str, optional): Filename of point clouds.
- img_prefix (str | None, optional): Prefix of image files.
- img_info (dict, optional): Image info.
- calib (dict, optional): Camera calibration info.
- ann_info (dict): Annotation info.
"""
info = self.data_infos[index]
sample_idx = info['point_cloud']['lidar_idx']
assert info['point_cloud']['lidar_idx'] == info['image']['image_idx']
input_dict = dict(sample_idx=sample_idx)
if self.modality['use_lidar']:
pts_filename = osp.join(self.data_root, info['pts_path'])
input_dict['pts_filename'] = pts_filename
input_dict['file_name'] = pts_filename
if self.modality['use_camera']:
img_filename = osp.join(
osp.join(self.data_root, 'sunrgbd_trainval'),
info['image']['image_path'])
input_dict['img_prefix'] = None
input_dict['img_info'] = dict(filename=img_filename)
calib = info['calib']
input_dict['calib'] = calib
if not self.test_mode:
annos = self.get_ann_info(index)
input_dict['ann_info'] = annos
if self.filter_empty_gt and len(annos['gt_bboxes_3d']) == 0:
return None
return input_dict
def get_ann_info(self, index):
"""Get annotation info according to the given index.
Args:
index (int): Index of the annotation data to get.
Returns:
dict: annotation information consists of the following keys:
- gt_bboxes_3d (:obj:`DepthInstance3DBoxes`): \
3D ground truth bboxes
- gt_labels_3d (np.ndarray): Labels of ground truths.
- pts_instance_mask_path (str): Path of instance masks.
- pts_semantic_mask_path (str): Path of semantic masks.
"""
# Use index to get the annos, thus the evalhook could also use this api
info = self.data_infos[index]
if info['annos']['gt_num'] != 0:
gt_bboxes_3d = info['annos']['gt_boxes_upright_depth'].astype(
np.float32) # k, 6
            gt_labels_3d = info['annos']['class'].astype(np.int64)
else:
gt_bboxes_3d = np.zeros((0, 7), dtype=np.float32)
            gt_labels_3d = np.zeros((0, ), dtype=np.int64)
# to target box structure
gt_bboxes_3d = DepthInstance3DBoxes(
gt_bboxes_3d, origin=(0.5, 0.5, 0.5)).convert_to(self.box_mode_3d)
anns_results = dict(
gt_bboxes_3d=gt_bboxes_3d, gt_labels_3d=gt_labels_3d)
if self.modality['use_camera']:
if info['annos']['gt_num'] != 0:
gt_bboxes_2d = info['annos']['bbox'].astype(np.float32)
else:
gt_bboxes_2d = np.zeros((0, 4), dtype=np.float32)
anns_results['bboxes'] = gt_bboxes_2d
anns_results['labels'] = gt_labels_3d
return anns_results
def show(self, results, out_dir, show=True):
"""Results visualization.
Args:
results (list[dict]): List of bounding boxes results.
out_dir (str): Output directory of visualization result.
show (bool): Visualize the results online.
"""
assert out_dir is not None, 'Expect out_dir, got none.'
for i, result in enumerate(results):
data_info = self.data_infos[i]
pts_path = data_info['pts_path']
file_name = osp.split(pts_path)[-1].split('.')[0]
points = np.fromfile(
osp.join(self.data_root, pts_path),
dtype=np.float32).reshape(-1, 6)
points[:, 3:] *= 255
gt_bboxes = self.get_ann_info(i)['gt_bboxes_3d'].tensor
pred_bboxes = result['boxes_3d'].tensor.numpy()
show_result(points, gt_bboxes, pred_bboxes, out_dir, file_name,
show)
def evaluate(self,
results,
metric=None,
iou_thr=(0.25, 0.5),
iou_thr_2d=(0.5, ),
logger=None,
show=False,
out_dir=None):
# evaluate 3D detection performance
if isinstance(results[0], dict):
return super().evaluate(results, metric, iou_thr, logger, show,
out_dir)
# evaluate 2D detection performance
else:
eval_results = OrderedDict()
annotations = [self.get_ann_info(i) for i in range(len(self))]
iou_thr_2d = (iou_thr_2d) if isinstance(iou_thr_2d,
float) else iou_thr_2d
for iou_thr_2d_single in iou_thr_2d:
mean_ap, _ = eval_map(
results,
annotations,
scale_ranges=None,
iou_thr=iou_thr_2d_single,
dataset=self.CLASSES,
logger=logger)
eval_results['mAP_' + str(iou_thr_2d_single)] = mean_ap
return eval_results
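# A minimal usage sketch (not part of the original file); paths are
# illustrative. With both modalities enabled, `get_data_info` returns the
# point cloud path, the image path under `sunrgbd_trainval`, and the calib,
# while `evaluate` dispatches on the result type: dict results are scored as
# 3D detections (indoor mAP at `iou_thr`), plain per-class box lists are
# scored as 2D detections with mmdet's `eval_map` at `iou_thr_2d`.
#
#   dataset = SUNRGBDDataset(
#       data_root='data/sunrgbd/',
#       ann_file='data/sunrgbd/sunrgbd_infos_val.pkl',
#       modality=dict(use_camera=True, use_lidar=True))
#   info = dataset.get_data_info(0)
#   # info['pts_filename'], info['img_info']['filename'], info['calib'], ...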
================================================
FILE: mmdet3d/datasets/waymo_dataset.py
================================================
import mmcv
import numpy as np
import os
import tempfile
import torch
from mmcv.utils import print_log
from os import path as osp
from mmdet.datasets import DATASETS
from ..core.bbox import Box3DMode, points_cam2img
from .kitti_dataset import KittiDataset
@DATASETS.register_module()
class WaymoDataset(KittiDataset):
"""Waymo Dataset.
This class serves as the API for experiments on the Waymo Dataset.
    Please refer to the Waymo Open Dataset website for data downloading.
It is recommended to symlink the dataset root to $MMDETECTION3D/data and
organize them as the doc shows.
Args:
data_root (str): Path of dataset root.
ann_file (str): Path of annotation file.
split (str): Split of input data.
pts_prefix (str, optional): Prefix of points files.
Defaults to 'velodyne'.
pipeline (list[dict], optional): Pipeline used for data processing.
Defaults to None.
classes (tuple[str], optional): Classes used in the dataset.
Defaults to None.
modality (dict, optional): Modality to specify the sensor data used
as input. Defaults to None.
box_type_3d (str, optional): Type of 3D box of this dataset.
            Based on the `box_type_3d`, the dataset will encapsulate the box
            in its original format and then convert it to `box_type_3d`.
            Defaults to 'LiDAR' in this dataset. Available options include:
- 'LiDAR': box in LiDAR coordinates
- 'Depth': box in depth coordinates, usually for indoor dataset
- 'Camera': box in camera coordinates
filter_empty_gt (bool, optional): Whether to filter empty GT.
Defaults to True.
test_mode (bool, optional): Whether the dataset is in test mode.
Defaults to False.
pcd_limit_range (list): The range of point cloud used to filter
invalid predicted boxes. Default: [-85, -85, -5, 85, 85, 5].
"""
CLASSES = ('Car', 'Cyclist', 'Pedestrian')
def __init__(self,
data_root,
ann_file,
split,
num_views=5,
pts_prefix='velodyne',
pipeline=None,
classes=None,
modality=None,
box_type_3d='LiDAR',
filter_empty_gt=True,
test_mode=False,
load_interval=1,
pcd_limit_range=[-85, -85, -5, 85, 85, 5]):
super().__init__(
data_root=data_root,
ann_file=ann_file,
split=split,
pts_prefix=pts_prefix,
pipeline=pipeline,
classes=classes,
modality=modality,
box_type_3d=box_type_3d,
filter_empty_gt=filter_empty_gt,
test_mode=test_mode,
pcd_limit_range=pcd_limit_range)
self.num_views = num_views
assert self.num_views <= 5
# to load a subset, just set the load_interval in the dataset config
self.data_infos = self.data_infos[::load_interval]
if hasattr(self, 'flag'):
self.flag = self.flag[::load_interval]
def _get_pts_filename(self, idx):
pts_filename = osp.join(self.root_split, self.pts_prefix,
f'{idx:07d}.bin')
return pts_filename
def get_data_info(self, index):
"""Get data info according to the given index.
Args:
index (int): Index of the sample data to get.
Returns:
dict: Standard input_dict consists of the
data information.
- sample_idx (str): sample index
- pts_filename (str): filename of point clouds
- img_prefix (str | None): prefix of image files
- img_info (dict): image info
- lidar2img (list[np.ndarray], optional): transformations from
lidar to different cameras
- ann_info (dict): annotation info
"""
info = self.data_infos[index]
sample_idx = info['image']['image_idx']
img_filename = os.path.join(self.data_root,
info['image']['image_path'])
# TODO: consider use torch.Tensor only
rect = info['calib']['R0_rect'].astype(np.float32)
Trv2c = info['calib']['Tr_velo_to_cam'].astype(np.float32)
P0 = info['calib']['P0'].astype(np.float32)
lidar2img = P0 @ rect @ Trv2c
        # Tr_velo_to_cam is computed for all images, but the .info file only
        # stores it for img0 (not for img1-4).
        # image sizes: img0-2 are 1280x1920; img3-4 are 886x1920.
if self.modality['use_camera']:
image_paths = []
lidar2img_rts = []
# load calibration for all 5 images.
calib_path = img_filename.replace('image_0', 'calib').replace('.png', '.txt')
Tr_velo_to_cam_list = []
with open(calib_path, 'r') as f:
lines = f.readlines()
for line_num in range(6, 6 + self.num_views):
trans = np.array([float(info) for info in lines[line_num].split(' ')[1:13]]).reshape(3, 4)
trans = np.concatenate([trans, np.array([[0., 0., 0., 1.]])], axis=0).astype(np.float32)
Tr_velo_to_cam_list.append(trans)
assert np.allclose(Tr_velo_to_cam_list[0], info['calib']['Tr_velo_to_cam'].astype(np.float32))
for idx_img in range(self.num_views):
rect = info['calib']['R0_rect'].astype(np.float32)
# Trv2c = info['calib']['Tr_velo_to_cam'].astype(np.float32)
Trv2c = Tr_velo_to_cam_list[idx_img]
P0 = info['calib'][f'P{idx_img}'].astype(np.float32)
lidar2img = P0 @ rect @ Trv2c
image_paths.append(img_filename.replace('image_0', f'image_{idx_img}'))
lidar2img_rts.append(lidar2img)
pts_filename = self._get_pts_filename(sample_idx)
input_dict = dict(
sample_idx=sample_idx,
pts_filename=pts_filename,
img_prefix=None,
)
if self.modality['use_camera']:
input_dict['img_filename'] = image_paths
input_dict['lidar2img'] = lidar2img_rts
if not self.test_mode:
annos = self.get_ann_info(index)
input_dict['ann_info'] = annos
return input_dict
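    # How the per-view `lidar2img` matrices built above are meant to be used
    # (explanatory sketch, not executed here): with the calibration matrices
    # stored in homogeneous form in the info files, a LiDAR point
    # x = [x, y, z, 1] projects into camera i as
    #
    #     uvw = P_i @ R0_rect @ Tr_velo_to_cam_i @ x
    #     u, v = uvw[0] / uvw[2], uvw[1] / uvw[2]
    #
    # i.e. exactly the product stored per view, so downstream transforms only
    # need one matrix multiply followed by a perspective divide.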
def format_results(self,
outputs,
pklfile_prefix=None,
submission_prefix=None,
data_format='waymo'):
"""Format the results to pkl file.
Args:
outputs (list[dict]): Testing results of the dataset.
pklfile_prefix (str | None): The prefix of pkl files. It includes
the file path and the prefix of filename, e.g., "a/b/prefix".
If not specified, a temp file will be created. Default: None.
submission_prefix (str | None): The prefix of submitted files. It
includes the file path and the prefix of filename, e.g.,
"a/b/prefix". If not specified, a temp file will be created.
Default: None.
data_format (str | None): Output data format. Default: 'waymo'.
Another supported choice is 'kitti'.
Returns:
            tuple: (result_files, tmp_dir), where result_files is a dict
                containing the result file paths and tmp_dir is the temporary
                directory created for saving files when pklfile_prefix is not
                specified.
"""
if pklfile_prefix is None:
tmp_dir = tempfile.TemporaryDirectory()
pklfile_prefix = osp.join(tmp_dir.name, 'results')
else:
tmp_dir = None
assert ('waymo' in data_format or 'kitti' in data_format), \
f'invalid data_format {data_format}'
if (not isinstance(outputs[0], dict)) or 'img_bbox' in outputs[0]:
raise TypeError('Not supported type for reformat results.')
elif 'pts_bbox' in outputs[0]:
result_files = dict()
for name in outputs[0]:
results_ = [out[name] for out in outputs]
pklfile_prefix_ = pklfile_prefix + name
if submission_prefix is not None:
submission_prefix_ = f'{submission_prefix}_{name}'
else:
submission_prefix_ = None
result_files_ = self.bbox2result_kitti(results_, self.CLASSES,
pklfile_prefix_,
submission_prefix_)
result_files[name] = result_files_
else:
result_files = self.bbox2result_kitti(outputs, self.CLASSES,
pklfile_prefix,
submission_prefix)
if 'waymo' in data_format:
from ..core.evaluation.waymo_utils.prediction_kitti_to_waymo import \
KITTI2Waymo # noqa
waymo_root = osp.join(
self.data_root.split('kitti_format')[0], 'waymo_format')
if self.split == 'training':
waymo_tfrecords_dir = osp.join(waymo_root, 'validation')
prefix = '1'
elif self.split == 'testing':
waymo_tfrecords_dir = osp.join(waymo_root, 'testing')
prefix = '2'
else:
raise ValueError('Not supported split value.')
save_tmp_dir = tempfile.TemporaryDirectory()
waymo_results_save_dir = save_tmp_dir.name
waymo_results_final_path = f'{pklfile_prefix}.bin'
if 'pts_bbox' in result_files:
converter = KITTI2Waymo(result_files['pts_bbox'],
waymo_tfrecords_dir,
waymo_results_save_dir,
waymo_results_final_path, prefix)
else:
converter = KITTI2Waymo(result_files, waymo_tfrecords_dir,
waymo_results_save_dir,
waymo_results_final_path, prefix)
converter.convert()
save_tmp_dir.cleanup()
return result_files, tmp_dir
def evaluate(self,
results,
metric='waymo',
logger=None,
pklfile_prefix=None,
submission_prefix=None,
show=False,
out_dir=None):
"""Evaluation in KITTI protocol.
Args:
results (list[dict]): Testing results of the dataset.
metric (str | list[str]): Metrics to be evaluated.
Default: 'waymo'. Another supported metric is 'kitti'.
logger (logging.Logger | str | None): Logger used for printing
related information during evaluation. Default: None.
pklfile_prefix (str | None): The prefix of pkl files. It includes
the file path and the prefix of filename, e.g., "a/b/prefix".
If not specified, a temp file will be created. Default: None.
            submission_prefix (str | None): The prefix of submission files.
                If not specified, the submission data will not be generated.
show (bool): Whether to visualize.
Default: False.
out_dir (str): Path to save the visualization results.
Default: None.
Returns:
dict[str: float]: results of each evaluation metric
"""
assert ('waymo' in metric or 'kitti' in metric), \
f'invalid metric {metric}'
if 'kitti' in metric:
result_files, tmp_dir = self.format_results(
results,
pklfile_prefix,
submission_prefix,
data_format='kitti')
from mmdet3d.core.evaluation import kitti_eval
gt_annos = [info['annos'] for info in self.data_infos]
if isinstance(result_files, dict):
ap_dict = dict()
for name, result_files_ in result_files.items():
eval_types = ['bev', '3d']
ap_result_str, ap_dict_ = kitti_eval(
gt_annos,
result_files_,
self.CLASSES,
eval_types=eval_types)
for ap_type, ap in ap_dict_.items():
ap_dict[f'{name}/{ap_type}'] = float(
'{:.4f}'.format(ap))
print_log(
f'Results of {name}:\n' + ap_result_str, logger=logger)
else:
ap_result_str, ap_dict = kitti_eval(
gt_annos,
result_files,
self.CLASSES,
eval_types=['bev', '3d'])
print_log('\n' + ap_result_str, logger=logger)
if 'waymo' in metric:
waymo_root = osp.join(
self.data_root.split('kitti_format')[0], 'waymo_format')
if pklfile_prefix is None:
eval_tmp_dir = tempfile.TemporaryDirectory()
pklfile_prefix = osp.join(eval_tmp_dir.name, 'results')
else:
eval_tmp_dir = None
result_files, tmp_dir = self.format_results(
results,
pklfile_prefix,
submission_prefix,
data_format='waymo')
import subprocess
ret_bytes = subprocess.check_output(
'mmdet3d/core/evaluation/waymo_utils/' +
f'compute_detection_metrics_main {pklfile_prefix}.bin ' +
f'{waymo_root}/gt.bin',
shell=True)
ret_texts = ret_bytes.decode('utf-8')
print_log(ret_texts)
# parse the text to get ap_dict
ap_dict = {
'Vehicle/L1 mAP': 0,
'Vehicle/L1 mAPH': 0,
'Vehicle/L2 mAP': 0,
'Vehicle/L2 mAPH': 0,
'Pedestrian/L1 mAP': 0,
'Pedestrian/L1 mAPH': 0,
'Pedestrian/L2 mAP': 0,
'Pedestrian/L2 mAPH': 0,
'Sign/L1 mAP': 0,
'Sign/L1 mAPH': 0,
'Sign/L2 mAP': 0,
'Sign/L2 mAPH': 0,
'Cyclist/L1 mAP': 0,
'Cyclist/L1 mAPH': 0,
'Cyclist/L2 mAP': 0,
'Cyclist/L2 mAPH': 0,
'Overall/L1 mAP': 0,
'Overall/L1 mAPH': 0,
'Overall/L2 mAP': 0,
'Overall/L2 mAPH': 0
}
mAP_splits = ret_texts.split('mAP ')
mAPH_splits = ret_texts.split('mAPH ')
for idx, key in enumerate(ap_dict.keys()):
split_idx = int(idx / 2) + 1
if idx % 2 == 0: # mAP
ap_dict[key] = float(mAP_splits[split_idx].split(']')[0])
else: # mAPH
ap_dict[key] = float(mAPH_splits[split_idx].split(']')[0])
ap_dict['Overall/L1 mAP'] = \
(ap_dict['Vehicle/L1 mAP'] + ap_dict['Pedestrian/L1 mAP'] +
ap_dict['Cyclist/L1 mAP']) / 3
ap_dict['Overall/L1 mAPH'] = \
(ap_dict['Vehicle/L1 mAPH'] + ap_dict['Pedestrian/L1 mAPH'] +
ap_dict['Cyclist/L1 mAPH']) / 3
ap_dict['Overall/L2 mAP'] = \
(ap_dict['Vehicle/L2 mAP'] + ap_dict['Pedestrian/L2 mAP'] +
ap_dict['Cyclist/L2 mAP']) / 3
ap_dict['Overall/L2 mAPH'] = \
(ap_dict['Vehicle/L2 mAPH'] + ap_dict['Pedestrian/L2 mAPH'] +
ap_dict['Cyclist/L2 mAPH']) / 3
if eval_tmp_dir is not None:
eval_tmp_dir.cleanup()
if tmp_dir is not None:
tmp_dir.cleanup()
if show:
self.show(results, out_dir)
return ap_dict
def bbox2result_kitti(self,
net_outputs,
class_names,
pklfile_prefix=None,
submission_prefix=None):
"""Convert results to kitti format for evaluation and test submission.
Args:
            net_outputs (list[np.ndarray]): List of arrays storing the
                bboxes and scores.
            class_names (list[str]): A list of class names.
            pklfile_prefix (str | None): The prefix of the pkl file.
            submission_prefix (str | None): The prefix of the submission file.
        Returns:
            list[dict]: A list of dicts in the KITTI 3D detection format.
"""
assert len(net_outputs) == len(self.data_infos), \
'invalid list length of network outputs'
if submission_prefix is not None:
mmcv.mkdir_or_exist(submission_prefix)
det_annos = []
print('\nConverting prediction to KITTI format')
for idx, pred_dicts in enumerate(
mmcv.track_iter_progress(net_outputs)):
annos = []
info = self.data_infos[idx]
sample_idx = info['image']['image_idx']
image_shape = info['image']['image_shape'][:2]
box_dict = self.convert_valid_bboxes(pred_dicts, info)
if len(box_dict['bbox']) > 0:
box_2d_preds = box_dict['bbox']
box_preds = box_dict['box3d_camera']
scores = box_dict['scores']
box_preds_lidar = box_dict['box3d_lidar']
label_preds = box_dict['label_preds']
anno = {
'name': [],
'truncated': [],
'occluded': [],
'alpha': [],
'bbox': [],
'dimensions': [],
'location': [],
'rotation_y': [],
'score': []
}
for box, box_lidar, bbox, score, label in zip(
box_preds, box_preds_lidar, box_2d_preds, scores,
label_preds):
bbox[2:] = np.minimum(bbox[2:], image_shape[::-1])
bbox[:2] = np.maximum(bbox[:2], [0, 0])
anno['name'].append(class_names[int(label)])
anno['truncated'].append(0.0)
anno['occluded'].append(0)
anno['alpha'].append(
-np.arctan2(-box_lidar[1], box_lidar[0]) + box[6])
anno['bbox'].append(bbox)
anno['dimensions'].append(box[3:6])
anno['location'].append(box[:3])
anno['rotation_y'].append(box[6])
anno['score'].append(score)
anno = {k: np.stack(v) for k, v in anno.items()}
annos.append(anno)
if submission_prefix is not None:
curr_file = f'{submission_prefix}/{sample_idx:07d}.txt'
with open(curr_file, 'w') as f:
bbox = anno['bbox']
loc = anno['location']
dims = anno['dimensions'] # lhw -> hwl
for idx in range(len(bbox)):
print(
'{} -1 -1 {:.4f} {:.4f} {:.4f} {:.4f} '
'{:.4f} {:.4f} {:.4f} '
'{:.4f} {:.4f} {:.4f} {:.4f} {:.4f} {:.4f}'.
format(anno['name'][idx], anno['alpha'][idx],
bbox[idx][0], bbox[idx][1],
bbox[idx][2], bbox[idx][3],
dims[idx][1], dims[idx][2],
dims[idx][0], loc[idx][0], loc[idx][1],
loc[idx][2], anno['rotation_y'][idx],
anno['score'][idx]),
file=f)
else:
annos.append({
'name': np.array([]),
'truncated': np.array([]),
'occluded': np.array([]),
'alpha': np.array([]),
'bbox': np.zeros([0, 4]),
'dimensions': np.zeros([0, 3]),
'location': np.zeros([0, 3]),
'rotation_y': np.array([]),
'score': np.array([]),
})
annos[-1]['sample_idx'] = np.array(
[sample_idx] * len(annos[-1]['score']), dtype=np.int64)
det_annos += annos
if pklfile_prefix is not None:
if not pklfile_prefix.endswith(('.pkl', '.pickle')):
out = f'{pklfile_prefix}.pkl'
mmcv.dump(det_annos, out)
print(f'Result is saved to {out}.')
return det_annos
def convert_valid_bboxes(self, box_dict, info):
"""Convert the boxes into valid format.
Args:
box_dict (dict): Bounding boxes to be converted.
- boxes_3d (:obj:``LiDARInstance3DBoxes``): 3D bounding boxes.
- scores_3d (np.ndarray): Scores of predicted boxes.
- labels_3d (np.ndarray): Class labels of predicted boxes.
info (dict): Dataset information dictionary.
Returns:
dict: Valid boxes after conversion.
- bbox (np.ndarray): 2D bounding boxes (in camera 0).
- box3d_camera (np.ndarray): 3D boxes in camera coordinates.
- box3d_lidar (np.ndarray): 3D boxes in lidar coordinates.
- scores (np.ndarray): Scores of predicted boxes.
- label_preds (np.ndarray): Class labels of predicted boxes.
- sample_idx (np.ndarray): Sample index.
"""
# TODO: refactor this function
box_preds = box_dict['boxes_3d']
scores = box_dict['scores_3d']
labels = box_dict['labels_3d']
sample_idx = info['image']['image_idx']
# TODO: remove the hack of yaw
box_preds.limit_yaw(offset=0.5, period=np.pi * 2)
if len(box_preds) == 0:
return dict(
bbox=np.zeros([0, 4]),
box3d_camera=np.zeros([0, 7]),
box3d_lidar=np.zeros([0, 7]),
scores=np.zeros([0]),
label_preds=np.zeros([0, 4]),
sample_idx=sample_idx)
rect = info['calib']['R0_rect'].astype(np.float32)
Trv2c = info['calib']['Tr_velo_to_cam'].astype(np.float32)
P0 = info['calib']['P0'].astype(np.float32)
P0 = box_preds.tensor.new_tensor(P0)
box_preds_camera = box_preds.convert_to(Box3DMode.CAM, rect @ Trv2c)
box_corners = box_preds_camera.corners
box_corners_in_image = points_cam2img(box_corners, P0)
# box_corners_in_image: [N, 8, 2]
minxy = torch.min(box_corners_in_image, dim=1)[0]
maxxy = torch.max(box_corners_in_image, dim=1)[0]
box_2d_preds = torch.cat([minxy, maxxy], dim=1)
# Post-processing
# check box_preds
limit_range = box_preds.tensor.new_tensor(self.pcd_limit_range)
valid_pcd_inds = ((box_preds.center > limit_range[:3]) &
(box_preds.center < limit_range[3:]))
valid_inds = valid_pcd_inds.all(-1)
if valid_inds.sum() > 0:
return dict(
bbox=box_2d_preds[valid_inds, :].numpy(),
box3d_camera=box_preds_camera[valid_inds].tensor.numpy(),
box3d_lidar=box_preds[valid_inds].tensor.numpy(),
scores=scores[valid_inds].numpy(),
label_preds=labels[valid_inds].numpy(),
sample_idx=sample_idx,
)
else:
return dict(
bbox=np.zeros([0, 4]),
box3d_camera=np.zeros([0, 7]),
box3d_lidar=np.zeros([0, 7]),
scores=np.zeros([0]),
label_preds=np.zeros([0, 4]),
sample_idx=sample_idx,
)
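# A minimal usage sketch (not part of the original file); paths are
# illustrative. Evaluation with the 'waymo' metric shells out to the compiled
# `compute_detection_metrics_main` binary, so both the converted prediction
# file (`<pklfile_prefix>.bin`) and `waymo_format/gt.bin` must exist.
#
#   dataset = WaymoDataset(
#       data_root='data/waymo/kitti_format/',
#       ann_file='data/waymo/kitti_format/waymo_infos_val.pkl',
#       split='training',
#       modality=dict(use_lidar=True, use_camera=False))
#   # results = [dict(boxes_3d=..., scores_3d=..., labels_3d=...), ...]
#   # metrics = dataset.evaluate(results, metric='waymo')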
================================================
FILE: mmdet3d/models/__init__.py
================================================
from .backbones import * # noqa: F401,F403
from .builder import (build_backbone, build_detector, build_fusion_layer,
build_head, build_loss, build_middle_encoder, build_neck,
build_roi_extractor, build_shared_head,
build_voxel_encoder)
from .dense_heads import * # noqa: F401,F403
from .detectors import * # noqa: F401,F403
from .fusion_layers import * # noqa: F401,F403
from .losses import * # noqa: F401,F403
from .middle_encoders import * # noqa: F401,F403
from .model_utils import * # noqa: F401,F403
from .necks import * # noqa: F401,F403
from .registry import FUSION_LAYERS, MIDDLE_ENCODERS, VOXEL_ENCODERS
from .roi_heads import * # noqa: F401,F403
from .voxel_encoders import * # noqa: F401,F403
__all__ = [
'VOXEL_ENCODERS', 'MIDDLE_ENCODERS', 'FUSION_LAYERS', 'build_backbone',
'build_neck', 'build_roi_extractor', 'build_shared_head', 'build_head',
'build_loss', 'build_detector', 'build_fusion_layer',
'build_middle_encoder', 'build_voxel_encoder'
]
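# A minimal usage sketch (not part of the original file): each component is
# built from a config dict whose `type` names a registered class. The arch
# values below are copied from the NoStemRegNet docstring example and are
# illustrative only.
#
#   from mmdet3d.models import build_backbone
#   backbone = build_backbone(
#       dict(type='NoStemRegNet',
#            arch=dict(w0=88, wa=26.31, wm=2.25, group_w=48, depth=25,
#                      bot_mul=1.0)))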
================================================
FILE: mmdet3d/models/backbones/DLA.py
================================================
import torch
import torch.nn as nn
import torch.utils.checkpoint as cp
from mmcv.cnn import (build_conv_layer, build_norm_layer, build_plugin_layer,
constant_init, kaiming_init)
from mmcv.runner import load_checkpoint
from torch.nn.modules.batchnorm import _BatchNorm
from mmdet.utils import get_root_logger
from ..builder import BACKBONES
try:
    from dcn_v2 import DCN
    # from .DCNv2.dcn_v2 import DCN
except ImportError:
    print('import DCN failed')
    DCN = None
import math
import numpy as np
# imports used by get_model_url, load_pretrained_model and Interpolate below
import torch.nn.functional as F
import torch.utils.model_zoo as model_zoo
from os.path import join
BN_MOMENTUM = 0.1
def get_model_url(data='imagenet', name='dla34', hash='ba72cf86'):
return join('http://dl.yf.io/dla/models', data, '{}-{}.pth'.format(name, hash))
def conv3x3(in_planes, out_planes, stride=1):
"3x3 convolution with padding"
return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
padding=1, bias=False)
class BasicBlock(nn.Module):
def __init__(self, inplanes, planes, stride=1, dilation=1):
super(BasicBlock, self).__init__()
self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=3,
stride=stride, padding=dilation,
bias=False, dilation=dilation)
self.bn1 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM)
self.relu = nn.ReLU(inplace=True)
self.conv2 = nn.Conv2d(planes, planes, kernel_size=3,
stride=1, padding=dilation,
bias=False, dilation=dilation)
self.bn2 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM)
self.stride = stride
def forward(self, x, residual=None):
if residual is None:
residual = x
out = self.conv1(x)
out = self.bn1(out)
out = self.relu(out)
out = self.conv2(out)
out = self.bn2(out)
out += residual
out = self.relu(out)
return out
class Bottleneck(nn.Module):
expansion = 2
def __init__(self, inplanes, planes, stride=1, dilation=1):
super(Bottleneck, self).__init__()
expansion = Bottleneck.expansion
bottle_planes = planes // expansion
self.conv1 = nn.Conv2d(inplanes, bottle_planes,
kernel_size=1, bias=False)
self.bn1 = nn.BatchNorm2d(bottle_planes, momentum=BN_MOMENTUM)
self.conv2 = nn.Conv2d(bottle_planes, bottle_planes, kernel_size=3,
stride=stride, padding=dilation,
bias=False, dilation=dilation)
self.bn2 = nn.BatchNorm2d(bottle_planes, momentum=BN_MOMENTUM)
self.conv3 = nn.Conv2d(bottle_planes, planes,
kernel_size=1, bias=False)
self.bn3 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM)
self.relu = nn.ReLU(inplace=True)
self.stride = stride
def forward(self, x, residual=None):
if residual is None:
residual = x
out = self.conv1(x)
out = self.bn1(out)
out = self.relu(out)
out = self.conv2(out)
out = self.bn2(out)
out = self.relu(out)
out = self.conv3(out)
out = self.bn3(out)
out += residual
out = self.relu(out)
return out
class BottleneckX(nn.Module):
expansion = 2
cardinality = 32
def __init__(self, inplanes, planes, stride=1, dilation=1):
super(BottleneckX, self).__init__()
cardinality = BottleneckX.cardinality
# dim = int(math.floor(planes * (BottleneckV5.expansion / 64.0)))
# bottle_planes = dim * cardinality
bottle_planes = planes * cardinality // 32
self.conv1 = nn.Conv2d(inplanes, bottle_planes,
kernel_size=1, bias=False)
self.bn1 = nn.BatchNorm2d(bottle_planes, momentum=BN_MOMENTUM)
self.conv2 = nn.Conv2d(bottle_planes, bottle_planes, kernel_size=3,
stride=stride, padding=dilation, bias=False,
dilation=dilation, groups=cardinality)
self.bn2 = nn.BatchNorm2d(bottle_planes, momentum=BN_MOMENTUM)
self.conv3 = nn.Conv2d(bottle_planes, planes,
kernel_size=1, bias=False)
self.bn3 = nn.BatchNorm2d(planes, momentum=BN_MOMENTUM)
self.relu = nn.ReLU(inplace=True)
self.stride = stride
def forward(self, x, residual=None):
if residual is None:
residual = x
out = self.conv1(x)
out = self.bn1(out)
out = self.relu(out)
out = self.conv2(out)
out = self.bn2(out)
out = self.relu(out)
out = self.conv3(out)
out = self.bn3(out)
out += residual
out = self.relu(out)
return out
class Root(nn.Module):
def __init__(self, in_channels, out_channels, kernel_size, residual):
super(Root, self).__init__()
self.conv = nn.Conv2d(
in_channels, out_channels, 1,
stride=1, bias=False, padding=(kernel_size - 1) // 2)
self.bn = nn.BatchNorm2d(out_channels, momentum=BN_MOMENTUM)
self.relu = nn.ReLU(inplace=True)
self.residual = residual
def forward(self, *x):
children = x
x = self.conv(torch.cat(x, 1))
x = self.bn(x)
if self.residual:
x += children[0]
x = self.relu(x)
return x
class Tree(nn.Module):
def __init__(self, levels, block, in_channels, out_channels, stride=1,
level_root=False, root_dim=0, root_kernel_size=1,
dilation=1, root_residual=False):
super(Tree, self).__init__()
if root_dim == 0:
root_dim = 2 * out_channels
if level_root:
root_dim += in_channels
if levels == 1:
self.tree1 = block(in_channels, out_channels, stride,
dilation=dilation)
self.tree2 = block(out_channels, out_channels, 1,
dilation=dilation)
else:
self.tree1 = Tree(levels - 1, block, in_channels, out_channels,
stride, root_dim=0,
root_kernel_size=root_kernel_size,
dilation=dilation, root_residual=root_residual)
self.tree2 = Tree(levels - 1, block, out_channels, out_channels,
root_dim=root_dim + out_channels,
root_kernel_size=root_kernel_size,
dilation=dilation, root_residual=root_residual)
if levels == 1:
self.root = Root(root_dim, out_channels, root_kernel_size,
root_residual)
self.level_root = level_root
self.root_dim = root_dim
self.downsample = None
self.project = None
self.levels = levels
if stride > 1:
self.downsample = nn.MaxPool2d(stride, stride=stride)
if in_channels != out_channels:
self.project = nn.Sequential(
nn.Conv2d(in_channels, out_channels,
kernel_size=1, stride=1, bias=False),
nn.BatchNorm2d(out_channels, momentum=BN_MOMENTUM)
)
def forward(self, x, residual=None, children=None):
children = [] if children is None else children
bottom = self.downsample(x) if self.downsample else x
residual = self.project(bottom) if self.project else bottom
if self.level_root:
children.append(bottom)
x1 = self.tree1(x, residual)
if self.levels == 1:
x2 = self.tree2(x1)
x = self.root(x2, x1, *children)
else:
children.append(x1)
x = self.tree2(x1, children=children)
return x
class DLA(nn.Module):
def __init__(self, levels, channels, num_classes=1000,
block=BasicBlock, residual_root=False, linear_root=False,
opt=None):
super(DLA, self).__init__()
self.channels = channels
self.num_classes = num_classes
self.base_layer = nn.Sequential(
nn.Conv2d(3, channels[0], kernel_size=7, stride=1,
padding=3, bias=False),
nn.BatchNorm2d(channels[0], momentum=BN_MOMENTUM),
nn.ReLU(inplace=True))
self.level0 = self._make_conv_level(
channels[0], channels[0], levels[0])
self.level1 = self._make_conv_level(
channels[0], channels[1], levels[1], stride=2)
self.level2 = Tree(levels[2], block, channels[1], channels[2], 2,
level_root=False,
root_residual=residual_root)
self.level3 = Tree(levels[3], block, channels[2], channels[3], 2,
level_root=True, root_residual=residual_root)
self.level4 = Tree(levels[4], block, channels[3], channels[4], 2,
level_root=True, root_residual=residual_root)
self.level5 = Tree(levels[5], block, channels[4], channels[5], 2,
level_root=True, root_residual=residual_root)
if opt.pre_img:
self.pre_img_layer = nn.Sequential(
nn.Conv2d(3, channels[0], kernel_size=7, stride=1,
padding=3, bias=False),
nn.BatchNorm2d(channels[0], momentum=BN_MOMENTUM),
nn.ReLU(inplace=True))
if opt.pre_hm:
self.pre_hm_layer = nn.Sequential(
nn.Conv2d(1, channels[0], kernel_size=7, stride=1,
padding=3, bias=False),
nn.BatchNorm2d(channels[0], momentum=BN_MOMENTUM),
nn.ReLU(inplace=True))
# for m in self.modules():
# if isinstance(m, nn.Conv2d):
# n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
# m.weight.data.normal_(0, math.sqrt(2. / n))
# elif isinstance(m, nn.BatchNorm2d):
# m.weight.data.fill_(1)
# m.bias.data.zero_()
def _make_level(self, block, inplanes, planes, blocks, stride=1):
downsample = None
if stride != 1 or inplanes != planes:
downsample = nn.Sequential(
nn.MaxPool2d(stride, stride=stride),
nn.Conv2d(inplanes, planes,
kernel_size=1, stride=1, bias=False),
nn.BatchNorm2d(planes, momentum=BN_MOMENTUM),
)
layers = []
layers.append(block(inplanes, planes, stride, downsample=downsample))
for i in range(1, blocks):
layers.append(block(inplanes, planes))
return nn.Sequential(*layers)
def _make_conv_level(self, inplanes, planes, convs, stride=1, dilation=1):
modules = []
for i in range(convs):
modules.extend([
nn.Conv2d(inplanes, planes, kernel_size=3,
stride=stride if i == 0 else 1,
padding=dilation, bias=False, dilation=dilation),
nn.BatchNorm2d(planes, momentum=BN_MOMENTUM),
nn.ReLU(inplace=True)])
inplanes = planes
return nn.Sequential(*modules)
def forward(self, x, pre_img=None, pre_hm=None):
y = []
x = self.base_layer(x)
if pre_img is not None:
x = x + self.pre_img_layer(pre_img)
if pre_hm is not None:
x = x + self.pre_hm_layer(pre_hm)
for i in range(6):
x = getattr(self, 'level{}'.format(i))(x)
y.append(x)
return y
def load_pretrained_model(self, data='imagenet', name='dla34', hash='ba72cf86'):
# fc = self.fc
if name.endswith('.pth'):
model_weights = torch.load(data + name)
else:
model_url = get_model_url(data, name, hash)
model_weights = model_zoo.load_url(model_url)
num_classes = len(model_weights[list(model_weights.keys())[-1]])
self.fc = nn.Conv2d(
self.channels[-1], num_classes,
kernel_size=1, stride=1, padding=0, bias=True)
self.load_state_dict(model_weights, strict=False)
# self.fc = fc
def dla34(pretrained=True, **kwargs): # DLA-34
model = DLA([1, 1, 1, 2, 2, 1],
[16, 32, 64, 128, 256, 512],
block=BasicBlock, **kwargs)
if pretrained:
model.load_pretrained_model(
data='imagenet', name='dla34', hash='ba72cf86')
else:
print('Warning: No ImageNet pretrain!!')
return model
def dla102(pretrained=None, **kwargs): # DLA-102
Bottleneck.expansion = 2
model = DLA([1, 1, 1, 3, 4, 1], [16, 32, 128, 256, 512, 1024],
block=Bottleneck, residual_root=True, **kwargs)
if pretrained:
model.load_pretrained_model(
data='imagenet', name='dla102', hash='d94d9790')
return model
def dla46_c(pretrained=None, **kwargs): # DLA-46-C
Bottleneck.expansion = 2
model = DLA([1, 1, 1, 2, 2, 1],
[16, 32, 64, 64, 128, 256],
block=Bottleneck, **kwargs)
if pretrained is not None:
model.load_pretrained_model(
data='imagenet', name='dla46_c', hash='2bfd52c3')
return model
def dla46x_c(pretrained=None, **kwargs): # DLA-X-46-C
BottleneckX.expansion = 2
model = DLA([1, 1, 1, 2, 2, 1],
[16, 32, 64, 64, 128, 256],
block=BottleneckX, **kwargs)
if pretrained is not None:
model.load_pretrained_model(
data='imagenet', name='dla46x_c', hash='d761bae7')
return model
def dla60x_c(pretrained=None, **kwargs): # DLA-X-60-C
BottleneckX.expansion = 2
model = DLA([1, 1, 1, 2, 3, 1],
[16, 32, 64, 64, 128, 256],
block=BottleneckX, **kwargs)
if pretrained is not None:
model.load_pretrained_model(
data='imagenet', name='dla60x_c', hash='b870c45c')
return model
def dla60(pretrained=None, **kwargs): # DLA-60
Bottleneck.expansion = 2
model = DLA([1, 1, 1, 2, 3, 1],
[16, 32, 128, 256, 512, 1024],
block=Bottleneck, **kwargs)
if pretrained is not None:
model.load_pretrained_model(
data='imagenet', name='dla60', hash='24839fc4')
return model
def dla60x(pretrained=None, **kwargs): # DLA-X-60
BottleneckX.expansion = 2
model = DLA([1, 1, 1, 2, 3, 1],
[16, 32, 128, 256, 512, 1024],
block=BottleneckX, **kwargs)
if pretrained is not None:
model.load_pretrained_model(
data='imagenet', name='dla60x', hash='d15cacda')
return model
def dla102x(pretrained=None, **kwargs): # DLA-X-102
BottleneckX.expansion = 2
model = DLA([1, 1, 1, 3, 4, 1], [16, 32, 128, 256, 512, 1024],
block=BottleneckX, residual_root=True, **kwargs)
if pretrained is not None:
model.load_pretrained_model(
data='imagenet', name='dla102x', hash='ad62be81')
return model
def dla102x2(pretrained=None, **kwargs): # DLA-X-102 64
BottleneckX.cardinality = 64
model = DLA([1, 1, 1, 3, 4, 1], [16, 32, 128, 256, 512, 1024],
block=BottleneckX, residual_root=True, **kwargs)
if pretrained is not None:
model.load_pretrained_model(
data='imagenet', name='dla102x2', hash='262837b6')
return model
def dla169(pretrained=None, **kwargs): # DLA-169
Bottleneck.expansion = 2
model = DLA([1, 1, 2, 3, 5, 1], [16, 32, 128, 256, 512, 1024],
block=Bottleneck, residual_root=True, **kwargs)
if pretrained is not None:
model.load_pretrained_model(
data='imagenet', name='dla169', hash='0914e092')
return model
class Identity(nn.Module):
def __init__(self):
super(Identity, self).__init__()
def forward(self, x):
return x
def fill_fc_weights(layers):
for m in layers.modules():
if isinstance(m, nn.Conv2d):
if m.bias is not None:
nn.init.constant_(m.bias, 0)
def fill_up_weights(up):
w = up.weight.data
f = math.ceil(w.size(2) / 2)
c = (2 * f - 1 - f % 2) / (2. * f)
for i in range(w.size(2)):
for j in range(w.size(3)):
w[0, 0, i, j] = \
(1 - math.fabs(i / f - c)) * (1 - math.fabs(j / f - c))
for c in range(1, w.size(0)):
w[c, 0, :, :] = w[0, 0, :, :]
class Conv(nn.Module):
def __init__(self, chi, cho):
super(Conv, self).__init__()
self.conv = nn.Sequential(
nn.Conv2d(chi, cho, kernel_size=1, stride=1, bias=False),
nn.BatchNorm2d(cho, momentum=BN_MOMENTUM),
nn.ReLU(inplace=True))
def forward(self, x):
return self.conv(x)
class GlobalConv(nn.Module):
def __init__(self, chi, cho, k=7, d=1):
super(GlobalConv, self).__init__()
gcl = nn.Sequential(
nn.Conv2d(chi, cho, kernel_size=(k, 1), stride=1, bias=False,
dilation=d, padding=(d * (k // 2), 0)),
nn.Conv2d(cho, cho, kernel_size=(1, k), stride=1, bias=False,
dilation=d, padding=(0, d * (k // 2))))
gcr = nn.Sequential(
nn.Conv2d(chi, cho, kernel_size=(1, k), stride=1, bias=False,
dilation=d, padding=(0, d * (k // 2))),
nn.Conv2d(cho, cho, kernel_size=(k, 1), stride=1, bias=False,
dilation=d, padding=(d * (k // 2), 0)))
fill_fc_weights(gcl)
fill_fc_weights(gcr)
self.gcl = gcl
self.gcr = gcr
self.act = nn.Sequential(
nn.BatchNorm2d(cho, momentum=BN_MOMENTUM),
nn.ReLU(inplace=True)
)
def forward(self, x):
x = self.gcl(x) + self.gcr(x)
x = self.act(x)
return x
class DeformConv(nn.Module):
def __init__(self, chi, cho):
super(DeformConv, self).__init__()
self.actf = nn.Sequential(
nn.BatchNorm2d(cho, momentum=BN_MOMENTUM),
nn.ReLU(inplace=True)
)
self.conv = DCN(chi, cho, kernel_size=(3, 3), stride=1, padding=1, dilation=1, deformable_groups=1)
def forward(self, x):
x = self.conv(x)
x = self.actf(x)
return x
class IDAUp(nn.Module):
def __init__(self, o, channels, up_f, node_type=(DeformConv, DeformConv)):
super(IDAUp, self).__init__()
for i in range(1, len(channels)):
c = channels[i]
f = int(up_f[i])
proj = node_type[0](c, o)
node = node_type[1](o, o)
up = nn.ConvTranspose2d(o, o, f * 2, stride=f,
padding=f // 2, output_padding=0,
groups=o, bias=False)
fill_up_weights(up)
setattr(self, 'proj_' + str(i), proj)
setattr(self, 'up_' + str(i), up)
setattr(self, 'node_' + str(i), node)
def forward(self, layers, startp, endp):
for i in range(startp + 1, endp):
upsample = getattr(self, 'up_' + str(i - startp))
project = getattr(self, 'proj_' + str(i - startp))
layers[i] = upsample(project(layers[i]))
node = getattr(self, 'node_' + str(i - startp))
layers[i] = node(layers[i] + layers[i - 1])
class DLAUp(nn.Module):
def __init__(self, startp, channels, scales, in_channels=None,
node_type=DeformConv):
super(DLAUp, self).__init__()
self.startp = startp
if in_channels is None:
in_channels = channels
self.channels = channels
channels = list(channels)
scales = np.array(scales, dtype=int)
for i in range(len(channels) - 1):
j = -i - 2
setattr(self, 'ida_{}'.format(i),
IDAUp(channels[j], in_channels[j:],
scales[j:] // scales[j],
node_type=node_type))
scales[j + 1:] = scales[j]
in_channels[j + 1:] = [channels[j] for _ in channels[j + 1:]]
def forward(self, layers):
out = [layers[-1]] # start with 32
for i in range(len(layers) - self.startp - 1):
ida = getattr(self, 'ida_{}'.format(i))
ida(layers, len(layers) - i - 2, len(layers))
out.insert(0, layers[-1])
return out
class Interpolate(nn.Module):
def __init__(self, scale, mode):
super(Interpolate, self).__init__()
self.scale = scale
self.mode = mode
def forward(self, x):
x = F.interpolate(x, scale_factor=self.scale, mode=self.mode, align_corners=False)
return x
DLA_NODE = {
'dcn': (DeformConv, DeformConv),
'gcn': (Conv, GlobalConv),
'conv': (Conv, Conv),
}
class BaseModel(nn.Module):
def __init__(self, heads, head_convs, num_stacks, last_channel, opt=None):
super(BaseModel, self).__init__()
if opt is not None and opt.head_kernel != 3:
print('Using head kernel:', opt.head_kernel)
head_kernel = opt.head_kernel
else:
head_kernel = 3
self.num_stacks = num_stacks
self.heads = heads
for head in self.heads:
classes = self.heads[head]
head_conv = head_convs[head]
if len(head_conv) > 0:
out = nn.Conv2d(head_conv[-1], classes,
kernel_size=1, stride=1, padding=0, bias=True)
conv = nn.Conv2d(last_channel, head_conv[0],
kernel_size=head_kernel,
padding=head_kernel // 2, bias=True)
convs = [conv]
for k in range(1, len(head_conv)):
convs.append(nn.Conv2d(head_conv[k - 1], head_conv[k],
kernel_size=1, bias=True))
if len(convs) == 1:
fc = nn.Sequential(conv, nn.ReLU(inplace=True), out)
elif len(convs) == 2:
fc = nn.Sequential(
convs[0], nn.ReLU(inplace=True),
convs[1], nn.ReLU(inplace=True), out)
elif len(convs) == 3:
fc = nn.Sequential(
convs[0], nn.ReLU(inplace=True),
convs[1], nn.ReLU(inplace=True),
convs[2], nn.ReLU(inplace=True), out)
elif len(convs) == 4:
fc = nn.Sequential(
convs[0], nn.ReLU(inplace=True),
convs[1], nn.ReLU(inplace=True),
convs[2], nn.ReLU(inplace=True),
convs[3], nn.ReLU(inplace=True), out)
if 'hm' in head:
fc[-1].bias.data.fill_(opt.prior_bias)
else:
fill_fc_weights(fc)
else:
fc = nn.Conv2d(last_channel, classes,
kernel_size=1, stride=1, padding=0, bias=True)
if 'hm' in head:
fc.bias.data.fill_(opt.prior_bias)
else:
fill_fc_weights(fc)
self.__setattr__(head, fc)
def img2feats(self, x):
raise NotImplementedError
def imgpre2feats(self, x, pre_img=None, pre_hm=None):
raise NotImplementedError
def forward(self, x, pre_img=None, pre_hm=None):
if (pre_hm is not None) or (pre_img is not None):
feats = self.imgpre2feats(x, pre_img, pre_hm)
else:
feats = self.img2feats(x)
return feats
# out = []
# if self.opt.model_output_list:
# for s in range(self.num_stacks):
# z = []
# for head in sorted(self.heads):
# z.append(self.__getattr__(head)(feats[s]))
# out.append(z)
# else:
# for s in range(self.num_stacks):
# z = {}
# for head in self.heads:
# z[head] = self.__getattr__(head)(feats[s])
# out.append(z)
# return out
@BACKBONES.register_module()
class DLASeg(BaseModel):
def __init__(self, num_layers, heads, head_convs):
opt = Opt()
super(DLASeg, self).__init__(
heads, head_convs, 1, 64 if num_layers == 34 else 128, opt=opt)
down_ratio = 4
self.opt = opt
self.node_type = DLA_NODE[opt.dla_node]
print('Using node type:', self.node_type)
self.first_level = int(np.log2(down_ratio))
self.last_level = 5
self.base = globals()['dla{}'.format(num_layers)](pretrained=False, opt=opt)
channels = self.base.channels
scales = [2 ** i for i in range(len(channels[self.first_level:]))]
self.dla_up = DLAUp(
self.first_level, channels[self.first_level:], scales,
node_type=self.node_type)
out_channel = channels[self.first_level]
self.ida_up = IDAUp(
out_channel, channels[self.first_level:self.last_level],
[2 ** i for i in range(self.last_level - self.first_level)],
node_type=self.node_type)
def init_weights(self, pretrained=None):
if isinstance(pretrained, str):
logger = get_root_logger()
load_checkpoint(self, pretrained, strict=False, logger=logger)
else:
pass
def img2feats(self, x):
x = self.base(x)
x = self.dla_up(x)
y = []
for i in range(self.last_level - self.first_level):
y.append(x[i].clone())
self.ida_up(y, 0, len(y))
return [y[-1]]
def imgpre2feats(self, x, pre_img=None, pre_hm=None):
x = self.base(x, pre_img, pre_hm)
x = self.dla_up(x)
y = []
for i in range(self.last_level - self.first_level):
y.append(x[i].clone())
self.ida_up(y, 0, len(y))
return [y[-1]]
class Opt:
head_kernel = 3
levels = [1, 1, 1, 2, 2, 1]
channels = [16, 32, 64, 128, 256, 512]
pre_img = False
pre_hm = False
dla_node = 'dcn'
model_output_list = False
# if __name__ == '__main__':
# from mmdet.models import DLASeg
# opt = Opt()
# model = DLASeg(34, {}, -1, Opt)
# checkpoints = torch.load('checkpoints/nuScenes_3Ddetection_e140.pth')
# model.load_state_dict(checkpoints['state_dict'], strict=False)
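# A minimal usage sketch (not part of the original file); note that the
# default `Opt.dla_node = 'dcn'` means building DLASeg requires the compiled
# `dcn_v2` op to be importable.
#
#   model = DLASeg(34, heads={}, head_convs={})
#   feats = model.img2feats(torch.rand(1, 3, 512, 512))
#   # feats is a single-element list; feats[0] has 64 channels at 1/4 of the
#   # input resolution (down_ratio = 4).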
================================================
FILE: mmdet3d/models/backbones/__init__.py
================================================
from mmdet.models.backbones import SSDVGG, HRNet, ResNet, ResNetV1d, ResNeXt
from .multi_backbone import MultiBackbone
from .nostem_regnet import NoStemRegNet
from .pointnet2_sa_msg import PointNet2SAMSG
from .pointnet2_sa_ssg import PointNet2SASSG
from .second import SECOND
from .DLA import DLASeg
from .swin import SwinTransformer
__all__ = [
'ResNet', 'ResNetV1d', 'ResNeXt', 'SSDVGG', 'HRNet', 'NoStemRegNet',
'SECOND', 'PointNet2SASSG', 'PointNet2SAMSG', 'MultiBackbone', 'DLASeg',
'SwinTransformer'
]
================================================
FILE: mmdet3d/models/backbones/base_pointnet.py
================================================
from abc import ABCMeta
from mmcv.runner import load_checkpoint
from torch import nn as nn
class BasePointNet(nn.Module, metaclass=ABCMeta):
"""Base class for PointNet."""
def __init__(self):
super(BasePointNet, self).__init__()
self.fp16_enabled = False
def init_weights(self, pretrained=None):
"""Initialize the weights of PointNet backbone."""
# Do not initialize the conv layers
# to follow the original implementation
if isinstance(pretrained, str):
from mmdet3d.utils import get_root_logger
logger = get_root_logger()
load_checkpoint(self, pretrained, strict=False, logger=logger)
@staticmethod
def _split_point_feats(points):
"""Split coordinates and features of input points.
Args:
points (torch.Tensor): Point coordinates with features,
with shape (B, N, 3 + input_feature_dim).
Returns:
torch.Tensor: Coordinates of input points.
torch.Tensor: Features of input points.
"""
xyz = points[..., 0:3].contiguous()
if points.size(-1) > 3:
features = points[..., 3:].transpose(1, 2).contiguous()
else:
features = None
return xyz, features
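# Quick self-contained check (not part of the original file) showing how
# `_split_point_feats` separates coordinates from the extra per-point
# features; shapes are illustrative.
if __name__ == '__main__':
    import torch
    points = torch.rand(2, 1024, 7)  # (B, N, 3 + C) with C = 4 extra channels
    xyz, features = BasePointNet._split_point_feats(points)
    print(xyz.shape)       # torch.Size([2, 1024, 3])
    print(features.shape)  # torch.Size([2, 4, 1024])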
================================================
FILE: mmdet3d/models/backbones/multi_backbone.py
================================================
import copy
import torch
from mmcv.cnn import ConvModule
from mmcv.runner import auto_fp16, load_checkpoint
from torch import nn as nn
from mmdet.models import BACKBONES, build_backbone
@BACKBONES.register_module()
class MultiBackbone(nn.Module):
"""MultiBackbone with different configs.
Args:
num_streams (int): The number of backbones.
backbones (list or dict): A list of backbone configs.
aggregation_mlp_channels (list[int]): Specify the mlp layers
for feature aggregation.
conv_cfg (dict): Config dict of convolutional layers.
norm_cfg (dict): Config dict of normalization layers.
act_cfg (dict): Config dict of activation layers.
suffixes (list): A list of suffixes to rename the return dict
for each backbone.
"""
def __init__(self,
num_streams,
backbones,
aggregation_mlp_channels=None,
conv_cfg=dict(type='Conv1d'),
norm_cfg=dict(type='BN1d', eps=1e-5, momentum=0.01),
act_cfg=dict(type='ReLU'),
suffixes=('net0', 'net1'),
**kwargs):
super().__init__()
assert isinstance(backbones, dict) or isinstance(backbones, list)
if isinstance(backbones, dict):
backbones_list = []
for ind in range(num_streams):
backbones_list.append(copy.deepcopy(backbones))
backbones = backbones_list
assert len(backbones) == num_streams
assert len(suffixes) == num_streams
self.backbone_list = nn.ModuleList()
        # Rename the ret_dict with different suffixes.
self.suffixes = suffixes
out_channels = 0
for backbone_cfg in backbones:
out_channels += backbone_cfg['fp_channels'][-1][-1]
self.backbone_list.append(build_backbone(backbone_cfg))
# Feature aggregation layers
if aggregation_mlp_channels is None:
aggregation_mlp_channels = [
out_channels, out_channels // 2,
out_channels // len(self.backbone_list)
]
else:
aggregation_mlp_channels.insert(0, out_channels)
self.aggregation_layers = nn.Sequential()
for i in range(len(aggregation_mlp_channels) - 1):
self.aggregation_layers.add_module(
f'layer{i}',
ConvModule(
aggregation_mlp_channels[i],
aggregation_mlp_channels[i + 1],
1,
padding=0,
conv_cfg=conv_cfg,
norm_cfg=norm_cfg,
act_cfg=act_cfg,
bias=True,
inplace=True))
def init_weights(self, pretrained=None):
"""Initialize the weights of PointNet++ backbone."""
# Do not initialize the conv layers
# to follow the original implementation
if isinstance(pretrained, str):
from mmdet3d.utils import get_root_logger
logger = get_root_logger()
load_checkpoint(self, pretrained, strict=False, logger=logger)
@auto_fp16()
def forward(self, points):
"""Forward pass.
Args:
points (torch.Tensor): point coordinates with features,
with shape (B, N, 3 + input_feature_dim).
Returns:
dict[str, list[torch.Tensor]]: Outputs from multiple backbones.
- fp_xyz[suffix] (list[torch.Tensor]): The coordinates of
each fp features.
- fp_features[suffix] (list[torch.Tensor]): The features
from each Feature Propagate Layers.
- fp_indices[suffix] (list[torch.Tensor]): Indices of the
input points.
- hd_feature (torch.Tensor): The aggregation feature
from multiple backbones.
"""
ret = {}
fp_features = []
for ind in range(len(self.backbone_list)):
cur_ret = self.backbone_list[ind](points)
cur_suffix = self.suffixes[ind]
fp_features.append(cur_ret['fp_features'][-1])
if cur_suffix != '':
                # take a snapshot of the keys since the dict is mutated below
                for k in list(cur_ret.keys()):
                    cur_ret[k + '_' + cur_suffix] = cur_ret.pop(k)
ret.update(cur_ret)
# Combine the features here
hd_feature = torch.cat(fp_features, dim=1)
hd_feature = self.aggregation_layers(hd_feature)
ret['hd_feature'] = hd_feature
return ret
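# Standalone sketch (not part of the original file) of the key renaming that
# `forward` applies per stream: outputs from backbone `ind` get suffix
# `suffixes[ind]` so all streams can coexist in one return dict. Values are
# placeholders.
if __name__ == '__main__':
    cur_ret = {'fp_xyz': 'xyz', 'fp_features': 'feat', 'fp_indices': 'idx'}
    cur_suffix = 'net0'
    for k in list(cur_ret.keys()):
        cur_ret[k + '_' + cur_suffix] = cur_ret.pop(k)
    # {'fp_xyz_net0': 'xyz', 'fp_features_net0': 'feat', 'fp_indices_net0': 'idx'}
    print(cur_ret)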
================================================
FILE: mmdet3d/models/backbones/nostem_regnet.py
================================================
from mmdet.models.backbones import RegNet
from ..builder import BACKBONES
@BACKBONES.register_module()
class NoStemRegNet(RegNet):
"""RegNet backbone without Stem for 3D detection.
    More details can be found in the RegNet paper (Designing Network Design
    Spaces).
Args:
arch (dict): The parameter of RegNets.
- w0 (int): Initial width.
- wa (float): Slope of width.
- wm (float): Quantization parameter to quantize the width.
- depth (int): Depth of the backbone.
- group_w (int): Width of group.
            - bot_mul (float): Bottleneck ratio, i.e. expansion of bottleneck.
strides (Sequence[int]): Strides of the first block of each stage.
base_channels (int): Base channels after stem layer.
in_channels (int): Number of input image channels. Normally 3.
dilations (Sequence[int]): Dilation of each stage.
out_indices (Sequence[int]): Output from which stages.
style (str): `pytorch` or `caffe`. If set to "pytorch", the stride-two
layer is the 3x3 conv layer, otherwise the stride-two layer is
the first 1x1 conv layer.
frozen_stages (int): Stages to be frozen (all param fixed). -1 means
not freezing any parameters.
norm_cfg (dict): Dictionary to construct and config norm layer.
norm_eval (bool): Whether to set norm layers to eval mode, namely,
freeze running stats (mean and var). Note: Effect on Batch Norm
and its variants only.
with_cp (bool): Use checkpoint or not. Using checkpoint will save some
memory while slowing down the training speed.
zero_init_residual (bool): Whether to use zero init for last norm layer
in resblocks to let them behave as identity.
Example:
>>> from mmdet3d.models import NoStemRegNet
>>> import torch
>>> self = NoStemRegNet(
arch=dict(
w0=88,
wa=26.31,
wm=2.25,
group_w=48,
depth=25,
bot_mul=1.0))
>>> self.eval()
>>> inputs = torch.rand(1, 64, 16, 16)
>>> level_outputs = self.forward(inputs)
>>> for level_out in level_outputs:
... print(tuple(level_out.shape))
(1, 96, 8, 8)
(1, 192, 4, 4)
(1, 432, 2, 2)
(1, 1008, 1, 1)
"""
def __init__(self, arch, **kwargs):
super(NoStemRegNet, self).__init__(arch, **kwargs)
def _make_stem_layer(self, in_channels, base_channels):
"""Override the original function that do not initialize a stem layer
since 3D detector's voxel encoder works like a stem layer."""
return
def forward(self, x):
"""Forward function of backbone.
Args:
x (torch.Tensor): Features in shape (N, C, H, W).
Returns:
tuple[torch.Tensor]: Multi-scale features.
"""
outs = []
for i, layer_name in enumerate(self.res_layers):
res_layer = getattr(self, layer_name)
x = res_layer(x)
if i in self.out_indices:
outs.append(x)
return tuple(outs)
================================================
FILE: mmdet3d/models/backbones/pointnet2_sa_msg.py
================================================
import torch
from mmcv.cnn import ConvModule
from mmcv.runner import auto_fp16
from torch import nn as nn
from mmdet3d.ops import build_sa_module
from mmdet.models import BACKBONES
from .base_pointnet import BasePointNet
@BACKBONES.register_module()
class PointNet2SAMSG(BasePointNet):
"""PointNet2 with Multi-scale grouping.
Args:
in_channels (int): Input channels of point cloud.
num_points (tuple[int]): The number of points which each SA
module samples.
radii (tuple[float]): Sampling radii of each SA module.
num_samples (tuple[int]): The number of samples for ball
query in each SA module.
sa_channels (tuple[tuple[int]]): Out channels of each mlp in SA module.
aggregation_channels (tuple[int]): Out channels of aggregation
multi-scale grouping features.
        fps_mods (tuple[int]): Mode of FPS for each SA module.
        fps_sample_range_lists (tuple[tuple[int]]): The number of sampling
            points which each SA module samples.
        dilated_group (tuple[bool]): Whether to use dilated ball query for
            each SA module.
        out_indices (Sequence[int]): Output from which stages.
norm_cfg (dict): Config of normalization layer.
sa_cfg (dict): Config of set abstraction module, which may contain
the following keys and values:
- pool_mod (str): Pool method ('max' or 'avg') for SA modules.
- use_xyz (bool): Whether to use xyz as a part of features.
- normalize_xyz (bool): Whether to normalize xyz with radii in
each SA module.
"""
def __init__(self,
in_channels,
num_points=(2048, 1024, 512, 256),
radii=((0.2, 0.4, 0.8), (0.4, 0.8, 1.6), (1.6, 3.2, 4.8)),
num_samples=((32, 32, 64), (32, 32, 64), (32, 32, 32)),
sa_channels=(((16, 16, 32), (16, 16, 32), (32, 32, 64)),
((64, 64, 128), (64, 64, 128), (64, 96, 128)),
((128, 128, 256), (128, 192, 256), (128, 256,
256))),
aggregation_channels=(64, 128, 256),
fps_mods=(('D-FPS'), ('FS'), ('F-FPS', 'D-FPS')),
fps_sample_range_lists=((-1), (-1), (512, -1)),
dilated_group=(True, True, True),
out_indices=(2, ),
norm_cfg=dict(type='BN2d'),
sa_cfg=dict(
type='PointSAModuleMSG',
pool_mod='max',
use_xyz=True,
normalize_xyz=False)):
super().__init__()
self.num_sa = len(sa_channels)
self.out_indices = out_indices
assert max(out_indices) < self.num_sa
assert len(num_points) == len(radii) == len(num_samples) == len(
sa_channels) == len(aggregation_channels)
self.SA_modules = nn.ModuleList()
self.aggregation_mlps = nn.ModuleList()
sa_in_channel = in_channels - 3 # number of channels without xyz
skip_channel_list = [sa_in_channel]
for sa_index in range(self.num_sa):
cur_sa_mlps = list(sa_channels[sa_index])
sa_out_channel = 0
for radius_index in range(len(radii[sa_index])):
cur_sa_mlps[radius_index] = [sa_in_channel] + list(
cur_sa_mlps[radius_index])
sa_out_channel += cur_sa_mlps[radius_index][-1]
if isinstance(fps_mods[sa_index], tuple):
cur_fps_mod = list(fps_mods[sa_index])
else:
cur_fps_mod = list([fps_mods[sa_index]])
if isinstance(fps_sample_range_lists[sa_index], tuple):
cur_fps_sample_range_list = list(
fps_sample_range_lists[sa_index])
else:
cur_fps_sample_range_list = list(
[fps_sample_range_lists[sa_index]])
self.SA_modules.append(
build_sa_module(
num_point=num_points[sa_index],
radii=radii[sa_index],
sample_nums=num_samples[sa_index],
mlp_channels=cur_sa_mlps,
fps_mod=cur_fps_mod,
fps_sample_range_list=cur_fps_sample_range_list,
dilated_group=dilated_group[sa_index],
norm_cfg=norm_cfg,
cfg=sa_cfg,
bias=True))
skip_channel_list.append(sa_out_channel)
self.aggregation_mlps.append(
ConvModule(
sa_out_channel,
aggregation_channels[sa_index],
conv_cfg=dict(type='Conv1d'),
norm_cfg=dict(type='BN1d'),
kernel_size=1,
bias=True))
sa_in_channel = aggregation_channels[sa_index]
@auto_fp16(apply_to=('points', ))
def forward(self, points):
"""Forward pass.
Args:
points (torch.Tensor): point coordinates with features,
with shape (B, N, 3 + input_feature_dim).
Returns:
dict[str, list[torch.Tensor]]: Outputs of the SA modules selected
by ``out_indices``.
- sa_xyz (list[torch.Tensor]): The coordinates of SA features.
- sa_features (list[torch.Tensor]): The features from the
set abstraction layers.
- sa_indices (list[torch.Tensor]): Indices of the input points.
"""
xyz, features = self._split_point_feats(points)
batch, num_points = xyz.shape[:2]
indices = xyz.new_tensor(range(num_points)).unsqueeze(0).repeat(
batch, 1).long()
sa_xyz = [xyz]
sa_features = [features]
sa_indices = [indices]
out_sa_xyz = []
out_sa_features = []
out_sa_indices = []
for i in range(self.num_sa):
cur_xyz, cur_features, cur_indices = self.SA_modules[i](
sa_xyz[i], sa_features[i])
cur_features = self.aggregation_mlps[i](cur_features)
sa_xyz.append(cur_xyz)
sa_features.append(cur_features)
sa_indices.append(
torch.gather(sa_indices[-1], 1, cur_indices.long()))
if i in self.out_indices:
out_sa_xyz.append(sa_xyz[-1])
out_sa_features.append(sa_features[-1])
out_sa_indices.append(sa_indices[-1])
return dict(
sa_xyz=out_sa_xyz,
sa_features=out_sa_features,
sa_indices=out_sa_indices)
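# Illustrative construction sketch (added for clarity, not part of the
# original file). Running ``forward`` additionally requires mmdet3d's
# compiled CUDA point ops. Note that the default ``num_points`` has four
# entries while the other per-stage defaults have three, so a consistent
# ``num_points`` (and a simplified FPS setup) is passed explicitly here.
if __name__ == '__main__':
    backbone = PointNet2SAMSG(
        in_channels=4,
        num_points=(1024, 512, 256),
        fps_mods=('D-FPS', 'D-FPS', 'D-FPS'),
        fps_sample_range_lists=(-1, -1, -1))
    # Three SA stages (from ``sa_channels``), each followed by a 1x1
    # aggregation ConvModule.
    assert len(backbone.SA_modules) == 3
    assert len(backbone.aggregation_mlps) == 3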
================================================
FILE: mmdet3d/models/backbones/pointnet2_sa_ssg.py
================================================
import torch
from mmcv.runner import auto_fp16
from torch import nn as nn
from mmdet3d.ops import PointFPModule, build_sa_module
from mmdet.models import BACKBONES
from .base_pointnet import BasePointNet
@BACKBONES.register_module()
class PointNet2SASSG(BasePointNet):
"""PointNet2 with Single-scale grouping.
Args:
in_channels (int): Input channels of point cloud.
num_points (tuple[int]): The number of points which each SA
module samples.
radius (tuple[float]): Sampling radii of each SA module.
num_samples (tuple[int]): The number of samples for ball
query in each SA module.
sa_channels (tuple[tuple[int]]): Out channels of each mlp in SA module.
fp_channels (tuple[tuple[int]]): Out channels of each mlp in FP module.
norm_cfg (dict): Config of normalization layer.
sa_cfg (dict): Config of set abstraction module, which may contain
the following keys and values:
- pool_mod (str): Pool method ('max' or 'avg') for SA modules.
- use_xyz (bool): Whether to use xyz as a part of features.
- normalize_xyz (bool): Whether to normalize xyz with radii in
each SA module.
"""
def __init__(self,
in_channels,
num_points=(2048, 1024, 512, 256),
radius=(0.2, 0.4, 0.8, 1.2),
num_samples=(64, 32, 16, 16),
sa_channels=((64, 64, 128), (128, 128, 256), (128, 128, 256),
(128, 128, 256)),
fp_channels=((256, 256), (256, 256)),
norm_cfg=dict(type='BN2d'),
sa_cfg=dict(
type='PointSAModule',
pool_mod='max',
use_xyz=True,
normalize_xyz=True)):
super().__init__()
self.num_sa = len(sa_channels)
self.num_fp = len(fp_channels)
assert len(num_points) == len(radius) == len(num_samples) == len(
sa_channels)
assert len(sa_channels) >= len(fp_channels)
self.SA_modules = nn.ModuleList()
sa_in_channel = in_channels - 3 # number of channels without xyz
skip_channel_list = [sa_in_channel]
for sa_index in range(self.num_sa):
cur_sa_mlps = list(sa_channels[sa_index])
cur_sa_mlps = [sa_in_channel] + cur_sa_mlps
sa_out_channel = cur_sa_mlps[-1]
self.SA_modules.append(
build_sa_module(
num_point=num_points[sa_index],
radius=radius[sa_index],
num_sample=num_samples[sa_index],
mlp_channels=cur_sa_mlps,
norm_cfg=norm_cfg,
cfg=sa_cfg))
skip_channel_list.append(sa_out_channel)
sa_in_channel = sa_out_channel
self.FP_modules = nn.ModuleList()
fp_source_channel = skip_channel_list.pop()
fp_target_channel = skip_channel_list.pop()
for fp_index in range(len(fp_channels)):
cur_fp_mlps = list(fp_channels[fp_index])
cur_fp_mlps = [fp_source_channel + fp_target_channel] + cur_fp_mlps
self.FP_modules.append(PointFPModule(mlp_channels=cur_fp_mlps))
if fp_index != len(fp_channels) - 1:
fp_source_channel = cur_fp_mlps[-1]
fp_target_channel = skip_channel_list.pop()
@auto_fp16(apply_to=('points', ))
def forward(self, points):
"""Forward pass.
Args:
points (torch.Tensor): point coordinates with features,
with shape (B, N, 3 + input_feature_dim).
Returns:
dict[str, list[torch.Tensor]]: Outputs after SA and FP modules.
- fp_xyz (list[torch.Tensor]): The coordinates of \
each fp features.
- fp_features (list[torch.Tensor]): The features \
from each Feature Propagate Layers.
- fp_indices (list[torch.Tensor]): Indices of the \
input points.
"""
xyz, features = self._split_point_feats(points)
batch, num_points = xyz.shape[:2]
indices = xyz.new_tensor(range(num_points)).unsqueeze(0).repeat(
batch, 1).long()
sa_xyz = [xyz]
sa_features = [features]
sa_indices = [indices]
for i in range(self.num_sa):
cur_xyz, cur_features, cur_indices = self.SA_modules[i](
sa_xyz[i], sa_features[i])
sa_xyz.append(cur_xyz)
sa_features.append(cur_features)
sa_indices.append(
torch.gather(sa_indices[-1], 1, cur_indices.long()))
fp_xyz = [sa_xyz[-1]]
fp_features = [sa_features[-1]]
fp_indices = [sa_indices[-1]]
for i in range(self.num_fp):
fp_features.append(self.FP_modules[i](
sa_xyz[self.num_sa - i - 1], sa_xyz[self.num_sa - i],
sa_features[self.num_sa - i - 1], fp_features[-1]))
fp_xyz.append(sa_xyz[self.num_sa - i - 1])
fp_indices.append(sa_indices[self.num_sa - i - 1])
ret = dict(
fp_xyz=fp_xyz, fp_features=fp_features, fp_indices=fp_indices)
return ret
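# Illustrative construction sketch (added for clarity, not part of the
# original file). Running ``forward`` additionally requires mmdet3d's
# compiled CUDA point ops, so only the module layout is checked here.
if __name__ == '__main__':
    backbone = PointNet2SASSG(in_channels=4)
    # Four SA stages (from ``sa_channels``) and two FP stages
    # (from ``fp_channels``) with the default arguments.
    assert len(backbone.SA_modules) == 4
    assert len(backbone.FP_modules) == 2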
================================================
FILE: mmdet3d/models/backbones/second.py
================================================
from mmcv.cnn import build_conv_layer, build_norm_layer
from mmcv.runner import load_checkpoint
from torch import nn as nn
from mmdet.models import BACKBONES
@BACKBONES.register_module()
class SECOND(nn.Module):
"""Backbone network for SECOND/PointPillars/PartA2/MVXNet.
Args:
in_channels (int): Input channels.
out_channels (list[int]): Output channels for multi-scale feature maps.
layer_nums (list[int]): Number of layers in each stage.
layer_strides (list[int]): Strides of each stage.
norm_cfg (dict): Config dict of normalization layers.
conv_cfg (dict): Config dict of convolutional layers.
"""
def __init__(self,
in_channels=128,
out_channels=[128, 128, 256],
layer_nums=[3, 5, 5],
layer_strides=[2, 2, 2],
norm_cfg=dict(type='BN', eps=1e-3, momentum=0.01),
conv_cfg=dict(type='Conv2d', bias=False)):
super(SECOND, self).__init__()
assert len(layer_strides) == len(layer_nums)
assert len(out_channels) == len(layer_nums)
in_filters = [in_channels, *out_channels[:-1]]
# note that when stride > 1, conv2d with same padding isn't
# equal to pad-conv2d. we should use pad-conv2d.
blocks = []
for i, layer_num in enumerate(layer_nums):
block = [
build_conv_layer(
conv_cfg,
in_filters[i],
out_channels[i],
3,
stride=layer_strides[i],
padding=1),
build_norm_layer(norm_cfg, out_channels[i])[1],
nn.ReLU(inplace=True),
]
for j in range(layer_num):
block.append(
build_conv_layer(
conv_cfg,
out_channels[i],
out_channels[i],
3,
padding=1))
block.append(build_norm_layer(norm_cfg, out_channels[i])[1])
block.append(nn.ReLU(inplace=True))
block = nn.Sequential(*block)
blocks.append(block)
self.blocks = nn.ModuleList(blocks)
def init_weights(self, pretrained=None):
"""Initialize weights of the 2D backbone."""
# Do not initialize the conv layers
# to follow the original implementation
if isinstance(pretrained, str):
from mmdet3d.utils import get_root_logger
logger = get_root_logger()
load_checkpoint(self, pretrained, strict=False, logger=logger)
def forward(self, x):
"""Forward function.
Args:
x (torch.Tensor): Input with shape (N, C, H, W).
Returns:
tuple[torch.Tensor]: Multi-scale features.
"""
outs = []
for i in range(len(self.blocks)):
x = self.blocks[i](x)
outs.append(x)
return tuple(outs)
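# Illustrative usage sketch (added for clarity, not part of the original
# file): with the default configuration every stage halves the spatial
# resolution of the input BEV feature map. The input size below is an
# arbitrary example.
if __name__ == '__main__':
    import torch
    backbone = SECOND(in_channels=128)
    x = torch.rand(1, 128, 200, 176)  # e.g. a pillar/voxel pseudo image
    outs = backbone(x)
    assert [tuple(o.shape) for o in outs] == [
        (1, 128, 100, 88), (1, 128, 50, 44), (1, 256, 25, 22)]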
================================================
FILE: mmdet3d/models/backbones/swin.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import warnings
from collections import OrderedDict
from copy import deepcopy
from typing import Sequence, Iterable, Optional
from torch import Tensor
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.checkpoint as cp
from mmcv.cnn import build_norm_layer, constant_init, build_conv_layer, build_activation_layer, xavier_init
from mmcv.runner import BaseModule, _load_checkpoint
from mmcv.utils import get_logger
from mmdet.models.builder import BACKBONES
from mmdet3d.models.utils.drop import build_dropout
from mmdet3d.models.utils.transformer import FFN, to_2tuple, ModuleList
from mmdet.utils import get_root_logger
def _no_grad_trunc_normal_(tensor: Tensor, mean: float, std: float, a: float,
b: float) -> Tensor:
# Method based on
# https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf
# Modified from
# https://github.com/pytorch/pytorch/blob/master/torch/nn/init.py
def norm_cdf(x):
# Computes standard normal cumulative distribution function
return (1. + math.erf(x / math.sqrt(2.))) / 2.
if (mean < a - 2 * std) or (mean > b + 2 * std):
warnings.warn(
'mean is more than 2 std from [a, b] in nn.init.trunc_normal_. '
'The distribution of values may be incorrect.',
stacklevel=2)
with torch.no_grad():
# Values are generated by using a truncated uniform distribution and
# then using the inverse CDF for the normal distribution.
# Get upper and lower cdf values
lower = norm_cdf((a - mean) / std)
upper = norm_cdf((b - mean) / std)
# Uniformly fill tensor with values from [lower, upper], then translate
# to [2lower-1, 2upper-1].
tensor.uniform_(2 * lower - 1, 2 * upper - 1)
# Use inverse cdf transform for normal distribution to get truncated
# standard normal
tensor.erfinv_()
# Transform to proper mean, std
tensor.mul_(std * math.sqrt(2.))
tensor.add_(mean)
# Clamp to ensure it's in the proper range
tensor.clamp_(min=a, max=b)
return tensor
def trunc_normal_(tensor: Tensor,
mean: float = 0.,
std: float = 1.,
a: float = -2.,
b: float = 2.) -> Tensor:
r"""Fills the input Tensor with values drawn from a truncated normal
distribution. The values are effectively drawn from the normal distribution
:math:`\mathcal{N}(\text{mean}, \text{std}^2)` with values outside
:math:`[a, b]` redrawn until they are within the bounds. The method used
for generating the random values works best when :math:`a \leq \text{mean}
\leq b`.
Modified from
https://github.com/pytorch/pytorch/blob/master/torch/nn/init.py
Args:
tensor (``torch.Tensor``): an n-dimensional `torch.Tensor`.
mean (float): the mean of the normal distribution.
std (float): the standard deviation of the normal distribution.
a (float): the minimum cutoff value.
b (float): the maximum cutoff value.
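Example (illustrative check added for clarity; values are clamped to
``[a, b]``, here the default [-2, 2]):
>>> import torch
>>> w = torch.empty(3, 5)
>>> _ = trunc_normal_(w, std=0.02)
>>> bool(((w >= -2) & (w <= 2)).all())
True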
"""
return _no_grad_trunc_normal_(tensor, mean, std, a, b)
def trunc_normal_init(module: nn.Module,
mean: float = 0,
std: float = 1,
a: float = -2,
b: float = 2,
bias: float = 0) -> None:
if hasattr(module, 'weight') and module.weight is not None:
trunc_normal_(module.weight, mean, std, a, b) # type: ignore
if hasattr(module, 'bias') and module.bias is not None:
nn.init.constant_(module.bias, bias) # type: ignore
class AdaptivePadding(nn.Module):
"""Applies padding to input (if needed) so that input can get fully covered
by filter you specified. It support two modes "same" and "corner". The
"same" mode is same with "SAME" padding mode in TensorFlow, pad zero around
input. The "corner" mode would pad zero to bottom right.
Args:
kernel_size (int | tuple): Size of the kernel. Default: 1.
stride (int | tuple): Stride of the filter. Default: 1.
dilation (int | tuple): Spacing between kernel elements.
Default: 1
padding (str): Support "same" and "corner", "corner" mode
would pad zero to bottom right, and "same" mode would
pad zero around input. Default: "corner".
Example:
>>> kernel_size = 16
>>> stride = 16
>>> dilation = 1
>>> input = torch.rand(1, 1, 15, 17)
>>> adap_pad = AdaptivePadding(
>>> kernel_size=kernel_size,
>>> stride=stride,
>>> dilation=dilation,
>>> padding="corner")
>>> out = adap_pad(input)
>>> assert (out.shape[2], out.shape[3]) == (16, 32)
>>> input = torch.rand(1, 1, 16, 17)
>>> out = adap_pad(input)
>>> assert (out.shape[2], out.shape[3]) == (16, 32)
"""
def __init__(self, kernel_size=1, stride=1, dilation=1, padding='corner'):
super(AdaptivePadding, self).__init__()
assert padding in ('same', 'corner')
kernel_size = to_2tuple(kernel_size)
stride = to_2tuple(stride)
padding = to_2tuple(padding)
dilation = to_2tuple(dilation)
self.padding = padding
self.kernel_size = kernel_size
self.stride = stride
self.dilation = dilation
def get_pad_shape(self, input_shape):
input_h, input_w = input_shape
kernel_h, kernel_w = self.kernel_size
stride_h, stride_w = self.stride
output_h = math.ceil(input_h / stride_h)
output_w = math.ceil(input_w / stride_w)
pad_h = max((output_h - 1) * stride_h +
(kernel_h - 1) * self.dilation[0] + 1 - input_h, 0)
pad_w = max((output_w - 1) * stride_w +
(kernel_w - 1) * self.dilation[1] + 1 - input_w, 0)
return pad_h, pad_w
def forward(self, x):
pad_h, pad_w = self.get_pad_shape(x.size()[-2:])
if pad_h > 0 or pad_w > 0:
if self.padding == 'corner':
x = F.pad(x, [0, pad_w, 0, pad_h])
elif self.padding == 'same':
x = F.pad(x, [
pad_w // 2, pad_w - pad_w // 2, pad_h // 2,
pad_h - pad_h // 2
])
return x
class PatchEmbed(BaseModule):
"""Image to Patch Embedding.
We use a conv layer to implement PatchEmbed.
Args:
in_channels (int): The num of input channels. Default: 3
embed_dims (int): The dimensions of embedding. Default: 768
conv_type (str): The type of conv layer used for patch embedding.
Default: "Conv2d".
kernel_size (int): The kernel_size of embedding conv. Default: 16.
stride (int): The slide stride of embedding conv.
Default: None (Would be set as `kernel_size`).
padding (int | tuple | string ): The padding length of
embedding conv. When it is a string, it means the mode
of adaptive padding, support "same" and "corner" now.
Default: "corner".
dilation (int): The dilation rate of embedding conv. Default: 1.
bias (bool): Bias of embed conv. Default: True.
norm_cfg (dict, optional): Config dict for normalization layer.
Default: None.
input_size (int | tuple | None): The size of input, which will be
used to calculate the out size. Only works when `dynamic_size`
is False. Default: None.
init_cfg (`mmcv.ConfigDict`, optional): The Config for initialization.
Default: None.
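Example (an illustrative sketch added for clarity; shapes assume 4x4
non-overlapping patches on a 224x224 input):
>>> import torch
>>> patch_embed = PatchEmbed(
>>>     in_channels=3, embed_dims=96, kernel_size=4, stride=4)
>>> x = torch.rand(1, 3, 224, 224)
>>> out, out_size = patch_embed(x)
>>> out.shape, out_size
(torch.Size([1, 3136, 96]), (56, 56))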
"""
def __init__(
self,
in_channels=3,
embed_dims=768,
conv_type='Conv2d',
kernel_size=16,
stride=16,
padding='corner',
dilation=1,
bias=True,
norm_cfg=None,
input_size=None,
init_cfg=None,
):
super(PatchEmbed, self).__init__(init_cfg=init_cfg)
self.embed_dims = embed_dims
if stride is None:
stride = kernel_size
kernel_size = to_2tuple(kernel_size)
stride = to_2tuple(stride)
dilation = to_2tuple(dilation)
if isinstance(padding, str):
self.adap_padding = AdaptivePadding(
kernel_size=kernel_size,
stride=stride,
dilation=dilation,
padding=padding)
# disable the padding of conv
padding = 0
else:
self.adap_padding = None
padding = to_2tuple(padding)
self.projection = build_conv_layer(
dict(type=conv_type),
in_channels=in_channels,
out_channels=embed_dims,
kernel_size=kernel_size,
stride=stride,
padding=padding,
dilation=dilation,
bias=bias)
if norm_cfg is not None:
self.norm = build_norm_layer(norm_cfg, embed_dims)[1]
else:
self.norm = None
if input_size:
input_size = to_2tuple(input_size)
# `init_out_size` would be used outside to
# calculate the num_patches
# when `use_abs_pos_embed` outside
self.init_input_size = input_size
if self.adap_padding:
pad_h, pad_w = self.adap_padding.get_pad_shape(input_size)
input_h, input_w = input_size
input_h = input_h + pad_h
input_w = input_w + pad_w
input_size = (input_h, input_w)
# https://pytorch.org/docs/stable/generated/torch.nn.Conv2d.html
h_out = (input_size[0] + 2 * padding[0] - dilation[0] *
(kernel_size[0] - 1) - 1) // stride[0] + 1
w_out = (input_size[1] + 2 * padding[1] - dilation[1] *
(kernel_size[1] - 1) - 1) // stride[1] + 1
self.init_out_size = (h_out, w_out)
else:
self.init_input_size = None
self.init_out_size = None
def forward(self, x):
"""
Args:
x (Tensor): Has shape (B, C, H, W). In most case, C is 3.
Returns:
tuple: Contains merged results and its spatial shape.
- x (Tensor): Has shape (B, out_h * out_w, embed_dims)
- out_size (tuple[int]): Spatial shape of x, arrange as
(out_h, out_w).
"""
if self.adap_padding:
x = self.adap_padding(x)
x = self.projection(x)
out_size = (x.shape[2], x.shape[3])
x = x.flatten(2).transpose(1, 2)
if self.norm is not None:
x = self.norm(x)
return x, out_size
class PatchMerging(BaseModule):
"""Merge patch feature map.
This layer groups the feature map by kernel_size, and applies norm and
linear layers to the grouped feature map. Our implementation uses
`nn.Unfold` to merge patches, which is about 25% faster than the original
implementation, but pretrained models need to be converted for
compatibility (see ``swin_converter`` in this file).
Args:
in_channels (int): The num of input channels.
out_channels (int): The num of output channels.
kernel_size (int | tuple, optional): the kernel size in the unfold
layer. Defaults to 2.
stride (int | tuple, optional): the stride of the sliding blocks in the
unfold layer. Default: None. (Would be set as `kernel_size`)
padding (int | tuple | string ): The padding length of
embedding conv. When it is a string, it means the mode
of adaptive padding, support "same" and "corner" now.
Default: "corner".
dilation (int | tuple, optional): dilation parameter in the unfold
layer. Default: 1.
bias (bool, optional): Whether to add bias in linear layer or not.
Defaults: False.
norm_cfg (dict, optional): Config dict for normalization layer.
Default: dict(type='LN').
init_cfg (dict, optional): The extra config for initialization.
Default: None.
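Example (an illustrative sketch added for clarity; merging a 56x56 map
of 96-dim tokens into a 28x28 map of 192-dim tokens):
>>> import torch
>>> merge = PatchMerging(in_channels=96, out_channels=192)
>>> x = torch.rand(1, 56 * 56, 96)
>>> out, out_size = merge(x, (56, 56))
>>> out.shape, out_size
(torch.Size([1, 784, 192]), (28, 28))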
"""
def __init__(self,
in_channels,
out_channels,
kernel_size=2,
stride=None,
padding='corner',
dilation=1,
bias=False,
norm_cfg=dict(type='LN'),
init_cfg=None):
super().__init__(init_cfg=init_cfg)
self.in_channels = in_channels
self.out_channels = out_channels
if stride:
stride = stride
else:
stride = kernel_size
kernel_size = to_2tuple(kernel_size)
stride = to_2tuple(stride)
dilation = to_2tuple(dilation)
if isinstance(padding, str):
self.adap_padding = AdaptivePadding(
kernel_size=kernel_size,
stride=stride,
dilation=dilation,
padding=padding)
# disable the padding of unfold
padding = 0
else:
self.adap_padding = None
padding = to_2tuple(padding)
self.sampler = nn.Unfold(
kernel_size=kernel_size,
dilation=dilation,
padding=padding,
stride=stride)
sample_dim = kernel_size[0] * kernel_size[1] * in_channels
if norm_cfg is not None:
self.norm = build_norm_layer(norm_cfg, sample_dim)[1]
else:
self.norm = None
self.reduction = nn.Linear(sample_dim, out_channels, bias=bias)
def forward(self, x, input_size):
"""
Args:
x (Tensor): Has shape (B, H*W, C_in).
input_size (tuple[int]): The spatial shape of x, arrange as (H, W).
Default: None.
Returns:
tuple: Contains merged results and its spatial shape.
- x (Tensor): Has shape (B, Merged_H * Merged_W, C_out)
- out_size (tuple[int]): Spatial shape of x, arrange as
(Merged_H, Merged_W).
"""
B, L, C = x.shape
assert isinstance(input_size, Sequence), f'Expect ' \
f'input_size is ' \
f'`Sequence` ' \
f'but get {input_size}'
H, W = input_size
assert L == H * W, 'input feature has wrong size'
x = x.view(B, H, W, C).permute([0, 3, 1, 2]) # B, C, H, W
# Use nn.Unfold to merge patch. About 25% faster than original method,
# but need to modify pretrained model for compatibility
if self.adap_padding:
x = self.adap_padding(x)
H, W = x.shape[-2:]
x = self.sampler(x)
# if kernel_size=2 and stride=2, x should has shape (B, 4*C, H/2*W/2)
out_h = (H + 2 * self.sampler.padding[0] - self.sampler.dilation[0] *
(self.sampler.kernel_size[0] - 1) -
1) // self.sampler.stride[0] + 1
out_w = (W + 2 * self.sampler.padding[1] - self.sampler.dilation[1] *
(self.sampler.kernel_size[1] - 1) -
1) // self.sampler.stride[1] + 1
output_size = (out_h, out_w)
x = x.transpose(1, 2) # B, H/2*W/2, 4*C
x = self.norm(x) if self.norm else x
x = self.reduction(x)
return x, output_size
def swin_converter(ckpt):
new_ckpt = OrderedDict()
def correct_unfold_reduction_order(x):
out_channel, in_channel = x.shape
x = x.reshape(out_channel, 4, in_channel // 4)
x = x[:, [0, 2, 1, 3], :].transpose(1,
2).reshape(out_channel, in_channel)
return x
def correct_unfold_norm_order(x):
in_channel = x.shape[0]
x = x.reshape(4, in_channel // 4)
x = x[[0, 2, 1, 3], :].transpose(0, 1).reshape(in_channel)
return x
for k, v in ckpt.items():
if k.startswith('head'):
continue
elif k.startswith('layers'):
new_v = v
if 'attn.' in k:
new_k = k.replace('attn.', 'attn.w_msa.')
elif 'mlp.' in k:
if 'mlp.fc1.' in k:
new_k = k.replace('mlp.fc1.', 'ffn.layers.0.0.')
elif 'mlp.fc2.' in k:
new_k = k.replace('mlp.fc2.', 'ffn.layers.1.')
else:
new_k = k.replace('mlp.', 'ffn.')
elif 'downsample' in k:
new_k = k
if 'reduction.' in k:
new_v = correct_unfold_reduction_order(v)
elif 'norm.' in k:
new_v = correct_unfold_norm_order(v)
else:
new_k = k
new_k = new_k.replace('layers', 'stages', 1)
elif k.startswith('patch_embed'):
new_v = v
if 'proj' in k:
new_k = k.replace('proj', 'projection')
else:
new_k = k
else:
new_v = v
new_k = k
new_ckpt['backbone.' + new_k] = new_v
return new_ckpt
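# Illustrative examples (added for clarity) of the key renaming performed by
# ``swin_converter`` on a checkpoint from the original Swin repository:
#   'layers.0.blocks.0.attn.qkv.weight'
#       -> 'backbone.stages.0.blocks.0.attn.w_msa.qkv.weight'
#   'patch_embed.proj.weight'
#       -> 'backbone.patch_embed.projection.weight'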
class WindowMSA(BaseModule):
"""Window based multi-head self-attention (W-MSA) module with relative
position bias.
Args:
embed_dims (int): Number of input channels.
num_heads (int): Number of attention heads.
window_size (tuple[int]): The height and width of the window.
qkv_bias (bool, optional): If True, add a learnable bias to q, k, v.
Default: True.
qk_scale (float | None, optional): Override default qk scale of
head_dim ** -0.5 if set. Default: None.
attn_drop_rate (float, optional): Dropout ratio of attention weight.
Default: 0.0
proj_drop_rate (float, optional): Dropout ratio of output. Default: 0.
init_cfg (dict | None, optional): The Config for initialization.
Default: None.
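Example (an illustrative sketch added for clarity; a 7x7 window gives
N = 49 tokens per window):
>>> import torch
>>> attn = WindowMSA(embed_dims=96, num_heads=3, window_size=(7, 7))
>>> x = torch.rand(4, 49, 96)  # (num_windows*B, N, C)
>>> attn(x).shape
torch.Size([4, 49, 96])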
"""
def __init__(self,
embed_dims,
num_heads,
window_size,
qkv_bias=True,
qk_scale=None,
attn_drop_rate=0.,
proj_drop_rate=0.,
init_cfg=None):
super().__init__()
self.embed_dims = embed_dims
self.window_size = window_size # Wh, Ww
self.num_heads = num_heads
head_embed_dims = embed_dims // num_heads
self.scale = qk_scale or head_embed_dims**-0.5
self.init_cfg = init_cfg
# define a parameter table of relative position bias
self.relative_position_bias_table = nn.Parameter(
torch.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1),
num_heads)) # 2*Wh-1 * 2*Ww-1, nH
# About 2x faster than original impl
Wh, Ww = self.window_size
rel_index_coords = self.double_step_seq(2 * Ww - 1, Wh, 1, Ww)
rel_position_index = rel_index_coords + rel_index_coords.T
rel_position_index = rel_position_index.flip(1).contiguous()
self.register_buffer('relative_position_index', rel_position_index)
self.qkv = nn.Linear(embed_dims, embed_dims * 3, bias=qkv_bias)
self.attn_drop = nn.Dropout(attn_drop_rate)
self.proj = nn.Linear(embed_dims, embed_dims)
self.proj_drop = nn.Dropout(proj_drop_rate)
self.softmax = nn.Softmax(dim=-1)
def init_weights(self):
trunc_normal_(self.relative_position_bias_table, std=0.02)
def forward(self, x, mask=None):
"""
Args:
x (tensor): input features with shape of (num_windows*B, N, C)
mask (tensor | None, Optional): mask with shape of (num_windows,
Wh*Ww, Wh*Ww), value should be between (-inf, 0].
"""
B, N, C = x.shape
qkv = self.qkv(x).reshape(B, N, 3, self.num_heads,
C // self.num_heads).permute(2, 0, 3, 1, 4)
# make torchscript happy (cannot use tensor as tuple)
q, k, v = qkv[0], qkv[1], qkv[2]
q = q * self.scale
attn = (q @ k.transpose(-2, -1))
relative_position_bias = self.relative_position_bias_table[
self.relative_position_index.view(-1)].view(
self.window_size[0] * self.window_size[1],
self.window_size[0] * self.window_size[1],
-1) # Wh*Ww,Wh*Ww,nH
relative_position_bias = relative_position_bias.permute(
2, 0, 1).contiguous() # nH, Wh*Ww, Wh*Ww
attn = attn + relative_position_bias.unsqueeze(0)
if mask is not None:
nW = mask.shape[0]
attn = attn.view(B // nW, nW, self.num_heads, N,
N) + mask.unsqueeze(1).unsqueeze(0)
attn = attn.view(-1, self.num_heads, N, N)
attn = self.softmax(attn)
attn = self.attn_drop(attn)
x = (attn @ v).transpose(1, 2).reshape(B, N, C)
x = self.proj(x)
x = self.proj_drop(x)
return x
@staticmethod
def double_step_seq(step1, len1, step2, len2):
seq1 = torch.arange(0, step1 * len1, step1)
seq2 = torch.arange(0, step2 * len2, step2)
return (seq1[:, None] + seq2[None, :]).reshape(1, -1)
class ShiftWindowMSA(BaseModule):
"""Shifted Window Multihead Self-Attention Module.
Args:
embed_dims (int): Number of input channels.
num_heads (int): Number of attention heads.
window_size (int): The height and width of the window.
shift_size (int, optional): The shift step of each window towards
right-bottom. If zero, act as regular window-msa. Defaults to 0.
qkv_bias (bool, optional): If True, add a learnable bias to q, k, v.
Default: True
qk_scale (float | None, optional): Override default qk scale of
head_dim ** -0.5 if set. Defaults: None.
attn_drop_rate (float, optional): Dropout ratio of attention weight.
Defaults: 0.
proj_drop_rate (float, optional): Dropout ratio of output.
Defaults: 0.
dropout_layer (dict, optional): The dropout_layer used before output.
Defaults: dict(type='DropPath', drop_prob=0.).
init_cfg (dict, optional): The extra config for initialization.
Default: None.
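Example (an illustrative sketch added for clarity, on a 56x56 feature
map with the window shifted by 3):
>>> import torch
>>> msa = ShiftWindowMSA(
>>>     embed_dims=96, num_heads=3, window_size=7, shift_size=3)
>>> query = torch.rand(1, 56 * 56, 96)
>>> msa(query, hw_shape=(56, 56)).shape
torch.Size([1, 3136, 96])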
"""
def __init__(self,
embed_dims,
num_heads,
window_size,
shift_size=0,
qkv_bias=True,
qk_scale=None,
attn_drop_rate=0,
proj_drop_rate=0,
dropout_layer=dict(type='DropPath', drop_prob=0.),
init_cfg=None):
super().__init__(init_cfg)
self.window_size = window_size
self.shift_size = shift_size
assert 0 <= self.shift_size < self.window_size
self.w_msa = WindowMSA(
embed_dims=embed_dims,
num_heads=num_heads,
window_size=to_2tuple(window_size),
qkv_bias=qkv_bias,
qk_scale=qk_scale,
attn_drop_rate=attn_drop_rate,
proj_drop_rate=proj_drop_rate,
init_cfg=None)
self.drop = build_dropout(dropout_layer)
def forward(self, query, hw_shape):
B, L, C = query.shape
H, W = hw_shape
assert L == H * W, 'input feature has wrong size'
query = query.view(B, H, W, C)
# pad feature maps to multiples of window size
pad_r = (self.window_size - W % self.window_size) % self.window_size
pad_b = (self.window_size - H % self.window_size) % self.window_size
query = F.pad(query, (0, 0, 0, pad_r, 0, pad_b))
H_pad, W_pad = query.shape[1], query.shape[2]
# cyclic shift
if self.shift_size > 0:
shifted_query = torch.roll(
query,
shifts=(-self.shift_size, -self.shift_size),
dims=(1, 2))
# calculate attention mask for SW-MSA
img_mask = torch.zeros((1, H_pad, W_pad, 1), device=query.device)
h_slices = (slice(0, -self.window_size),
slice(-self.window_size,
-self.shift_size), slice(-self.shift_size, None))
w_slices = (slice(0, -self.window_size),
slice(-self.window_size,
-self.shift_size), slice(-self.shift_size, None))
cnt = 0
for h in h_slices:
for w in w_slices:
img_mask[:, h, w, :] = cnt
cnt += 1
# nW, window_size, window_size, 1
mask_windows = self.window_partition(img_mask)
mask_windows = mask_windows.view(
-1, self.window_size * self.window_size)
attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
attn_mask = attn_mask.masked_fill(attn_mask != 0,
float(-100.0)).masked_fill(
attn_mask == 0, float(0.0))
else:
shifted_query = query
attn_mask = None
# nW*B, window_size, window_size, C
query_windows = self.window_partition(shifted_query)
# nW*B, window_size*window_size, C
query_windows = query_windows.view(-1, self.window_size**2, C)
# W-MSA/SW-MSA (nW*B, window_size*window_size, C)
attn_windows = self.w_msa(query_windows, mask=attn_mask)
# merge windows
attn_windows = attn_windows.view(-1, self.window_size,
self.window_size, C)
# B H' W' C
shifted_x = self.window_reverse(attn_windows, H_pad, W_pad)
# reverse cyclic shift
if self.shift_size > 0:
x = torch.roll(
shifted_x,
shifts=(self.shift_size, self.shift_size),
dims=(1, 2))
else:
x = shifted_x
if pad_r > 0 or pad_b:
x = x[:, :H, :W, :].contiguous()
x = x.view(B, H * W, C)
x = self.drop(x)
return x
def window_reverse(self, windows, H, W):
"""
Args:
windows: (num_windows*B, window_size, window_size, C)
H (int): Height of image
W (int): Width of image
Returns:
x: (B, H, W, C)
"""
window_size = self.window_size
B = int(windows.shape[0] / (H * W / window_size / window_size))
x = windows.view(B, H // window_size, W // window_size, window_size,
window_size, -1)
x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1)
return x
def window_partition(self, x):
"""
Args:
x: (B, H, W, C)
Returns:
windows: (num_windows*B, window_size, window_size, C)
"""
B, H, W, C = x.shape
window_size = self.window_size
x = x.view(B, H // window_size, window_size, W // window_size,
window_size, C)
windows = x.permute(0, 1, 3, 2, 4, 5).contiguous()
windows = windows.view(-1, window_size, window_size, C)
return windows
class SwinBlock(BaseModule):
""""
Args:
embed_dims (int): The feature dimension.
num_heads (int): Parallel attention heads.
feedforward_channels (int): The hidden dimension for FFNs.
window_size (int, optional): The local window scale. Default: 7.
shift (bool, optional): whether to shift window or not. Default False.
qkv_bias (bool, optional): enable bias for qkv if True. Default: True.
qk_scale (float | None, optional): Override default qk scale of
head_dim ** -0.5 if set. Default: None.
drop_rate (float, optional): Dropout rate. Default: 0.
attn_drop_rate (float, optional): Attention dropout rate. Default: 0.
drop_path_rate (float, optional): Stochastic depth rate. Default: 0.
act_cfg (dict, optional): The config dict of activation function.
Default: dict(type='GELU').
norm_cfg (dict, optional): The config dict of normalization.
Default: dict(type='LN').
with_cp (bool, optional): Use checkpoint or not. Using checkpoint
will save some memory while slowing down the training speed.
Default: False.
init_cfg (dict | list | None, optional): The init config.
Default: None.
"""
def __init__(self,
embed_dims,
num_heads,
feedforward_channels,
window_size=7,
shift=False,
qkv_bias=True,
qk_scale=None,
drop_rate=0.,
attn_drop_rate=0.,
drop_path_rate=0.,
act_cfg=dict(type='GELU'),
norm_cfg=dict(type='LN'),
with_cp=False,
init_cfg=None):
super(SwinBlock, self).__init__()
self.init_cfg = init_cfg
self.with_cp = with_cp
self.norm1 = build_norm_layer(norm_cfg, embed_dims)[1]
self.attn = ShiftWindowMSA(
embed_dims=embed_dims,
num_heads=num_heads,
window_size=window_size,
shift_size=window_size // 2 if shift else 0,
qkv_bias=qkv_bias,
qk_scale=qk_scale,
attn_drop_rate=attn_drop_rate,
proj_drop_rate=drop_rate,
dropout_layer=dict(type='DropPath', drop_prob=drop_path_rate),
init_cfg=None)
self.norm2 = build_norm_layer(norm_cfg, embed_dims)[1]
self.ffn = FFN(
embed_dims=embed_dims,
feedforward_channels=feedforward_channels,
num_fcs=2,
ffn_drop=drop_rate,
dropout_layer=dict(type='DropPath', drop_prob=drop_path_rate),
act_cfg=act_cfg,
add_identity=True,
init_cfg=None)
def forward(self, x, hw_shape):
def _inner_forward(x):
identity = x
x = self.norm1(x)
x = self.attn(x, hw_shape)
x = x + identity
identity = x
x = self.norm2(x)
x = self.ffn(x, identity=identity)
return x
if self.with_cp and x.requires_grad:
x = cp.checkpoint(_inner_forward, x)
else:
x = _inner_forward(x)
return x
class SwinBlockSequence(BaseModule):
"""Implements one stage in Swin Transformer.
Args:
embed_dims (int): The feature dimension.
num_heads (int): Parallel attention heads.
feedforward_channels (int): The hidden dimension for FFNs.
depth (int): The number of blocks in this stage.
window_size (int, optional): The local window scale. Default: 7.
qkv_bias (bool, optional): enable bias for qkv if True. Default: True.
qk_scale (float | None, optional): Override default qk scale of
head_dim ** -0.5 if set. Default: None.
drop_rate (float, optional): Dropout rate. Default: 0.
attn_drop_rate (float, optional): Attention dropout rate. Default: 0.
drop_path_rate (float | list[float], optional): Stochastic depth
rate. Default: 0.
downsample (BaseModule | None, optional): The downsample operation
module. Default: None.
act_cfg (dict, optional): The config dict of activation function.
Default: dict(type='GELU').
norm_cfg (dict, optional): The config dict of normalization.
Default: dict(type='LN').
with_cp (bool, optional): Use checkpoint or not. Using checkpoint
will save some memory while slowing down the training speed.
Default: False.
init_cfg (dict | list | None, optional): The init config.
Default: None.
"""
def __init__(self,
embed_dims,
num_heads,
feedforward_channels,
depth,
window_size=7,
qkv_bias=True,
qk_scale=None,
drop_rate=0.,
attn_drop_rate=0.,
drop_path_rate=0.,
downsample=None,
act_cfg=dict(type='GELU'),
norm_cfg=dict(type='LN'),
with_cp=False,
init_cfg=None):
super().__init__(init_cfg=init_cfg)
if isinstance(drop_path_rate, list):
drop_path_rates = drop_path_rate
assert len(drop_path_rates) == depth
else:
drop_path_rates = [deepcopy(drop_path_rate) for _ in range(depth)]
self.blocks = ModuleList()
for i in range(depth):
block = SwinBlock(
embed_dims=embed_dims,
num_heads=num_heads,
feedforward_channels=feedforward_channels,
window_size=window_size,
shift=False if i % 2 == 0 else True,
qkv_bias=qkv_bias,
qk_scale=qk_scale,
drop_rate=drop_rate,
attn_drop_rate=attn_drop_rate,
drop_path_rate=drop_path_rates[i],
act_cfg=act_cfg,
norm_cfg=norm_cfg,
with_cp=with_cp,
init_cfg=None)
self.blocks.append(block)
self.downsample = downsample
def forward(self, x, hw_shape):
for block in self.blocks:
x = block(x, hw_shape)
if self.downsample:
x_down, down_hw_shape = self.downsample(x, hw_shape)
return x_down, down_hw_shape, x, hw_shape
else:
return x, hw_shape, x, hw_shape
@BACKBONES.register_module()
class SwinTransformer(BaseModule):
""" Swin Transformer
A PyTorch implement of : `Swin Transformer:
Hierarchical Vision Transformer using Shifted Windows` -
https://arxiv.org/abs/2103.14030
Inspiration from
https://github.com/microsoft/Swin-Transformer
Args:
pretrain_img_size (int | tuple[int]): The size of input image when
pretrain. Defaults: 224.
in_channels (int): The num of input channels.
Defaults: 3.
embed_dims (int): The feature dimension. Default: 96.
patch_size (int | tuple[int]): Patch size. Default: 4.
window_size (int): Window size. Default: 7.
mlp_ratio (int): Ratio of mlp hidden dim to embedding dim.
Default: 4.
depths (tuple[int]): Depths of each Swin Transformer stage.
Default: (2, 2, 6, 2).
num_heads (tuple[int]): Parallel attention heads of each Swin
Transformer stage. Default: (3, 6, 12, 24).
strides (tuple[int]): The patch merging or patch embedding stride of
each Swin Transformer stage. (In swin, we set kernel size equal to
stride.) Default: (4, 2, 2, 2).
out_indices (tuple[int]): Output from which stages.
Default: (0, 1, 2, 3).
qkv_bias (bool, optional): If True, add a learnable bias to query, key,
value. Default: True
qk_scale (float | None, optional): Override default qk scale of
head_dim ** -0.5 if set. Default: None.
patch_norm (bool): If add a norm layer for patch embed and patch
merging. Default: True.
drop_rate (float): Dropout rate. Defaults: 0.
attn_drop_rate (float): Attention dropout rate. Default: 0.
drop_path_rate (float): Stochastic depth rate. Defaults: 0.1.
use_abs_pos_embed (bool): If True, add absolute position embedding to
the patch embedding. Defaults: False.
act_cfg (dict): Config dict for activation layer.
Default: dict(type='GELU').
norm_cfg (dict): Config dict for normalization layer at
output of backone. Defaults: dict(type='LN').
with_cp (bool, optional): Use checkpoint or not. Using checkpoint
will save some memory while slowing down the training speed.
Default: False.
pretrained (str, optional): model pretrained path. Default: None.
convert_weights (bool): The flag indicates whether the
pre-trained model is from the original repo. We may need
to convert some keys to make it compatible.
Default: False.
frozen_stages (int): Stages to be frozen (stop grad and set eval mode).
Default: -1 (-1 means not freezing any parameters).
init_cfg (dict, optional): The Config for initialization.
Defaults to None.
"""
def __init__(self,
pretrain_img_size=224,
in_channels=3,
embed_dims=96,
patch_size=4,
window_size=7,
mlp_ratio=4,
depths=(2, 2, 6, 2),
num_heads=(3, 6, 12, 24),
strides=(4, 2, 2, 2),
out_indices=(0, 1, 2, 3),
qkv_bias=True,
qk_scale=None,
patch_norm=True,
drop_rate=0.,
attn_drop_rate=0.,
drop_path_rate=0.1,
use_abs_pos_embed=False,
act_cfg=dict(type='GELU'),
norm_cfg=dict(type='LN'),
with_cp=False,
pretrained=None,
convert_weights=False,
frozen_stages=-1,
init_cfg=None):
self.convert_weights = convert_weights
self.frozen_stages = frozen_stages
if isinstance(pretrain_img_size, int):
pretrain_img_size = to_2tuple(pretrain_img_size)
elif isinstance(pretrain_img_size, tuple):
if len(pretrain_img_size) == 1:
pretrain_img_size = to_2tuple(pretrain_img_size[0])
assert len(pretrain_img_size) == 2, \
f'The size of image should have length 1 or 2, ' \
f'but got {len(pretrain_img_size)}'
assert not (init_cfg and pretrained), \
'init_cfg and pretrained cannot be specified at the same time'
if isinstance(pretrained, str):
warnings.warn('DeprecationWarning: pretrained is deprecated, '
'please use "init_cfg" instead')
self.init_cfg = dict(type='Pretrained', checkpoint=pretrained)
elif pretrained is None:
self.init_cfg = init_cfg
else:
raise TypeError('pretrained must be a str or None')
super(SwinTransformer, self).__init__(init_cfg=init_cfg)
num_layers = len(depths)
self.out_indices = out_indices
self.use_abs_pos_embed = use_abs_pos_embed
assert strides[0] == patch_size, 'Use non-overlapping patch embed.'
self.patch_embed = PatchEmbed(
in_channels=in_channels,
embed_dims=embed_dims,
conv_type='Conv2d',
kernel_size=patch_size,
stride=strides[0],
norm_cfg=norm_cfg if patch_norm else None,
init_cfg=None)
if self.use_abs_pos_embed:
patch_row = pretrain_img_size[0] // patch_size
patch_col = pretrain_img_size[1] // patch_size
self.absolute_pos_embed = nn.Parameter(
torch.zeros((1, embed_dims, patch_row, patch_col)))
self.drop_after_pos = nn.Dropout(p=drop_rate)
# set stochastic depth decay rule
total_depth = sum(depths)
dpr = [
x.item() for x in torch.linspace(0, drop_path_rate, total_depth)
]
self.stages = ModuleList()
in_channels = embed_dims
for i in range(num_layers):
if i < num_layers - 1:
downsample = PatchMerging(
in_channels=in_channels,
out_channels=2 * in_channels,
stride=strides[i + 1],
norm_cfg=norm_cfg if patch_norm else None,
init_cfg=None)
else:
downsample = None
stage = SwinBlockSequence(
embed_dims=in_channels,
num_heads=num_heads[i],
feedforward_channels=mlp_ratio * in_channels,
depth=depths[i],
window_size=window_size,
qkv_bias=qkv_bias,
qk_scale=qk_scale,
drop_rate=drop_rate,
attn_drop_rate=attn_drop_rate,
drop_path_rate=dpr[sum(depths[:i]):sum(depths[:i + 1])],
downsample=downsample,
act_cfg=act_cfg,
norm_cfg=norm_cfg,
with_cp=with_cp,
init_cfg=None)
self.stages.append(stage)
if downsample:
in_channels = downsample.out_channels
self.num_features = [int(embed_dims * 2**i) for i in range(num_layers)]
# Add a norm layer for each output
for i in out_indices:
layer = build_norm_layer(norm_cfg, self.num_features[i])[1]
layer_name = f'norm{i}'
self.add_module(layer_name, layer)
def train(self, mode=True):
"""Convert the model into training mode while keep layers freezed."""
super(SwinTransformer, self).train(mode)
self._freeze_stages()
def _freeze_stages(self):
if self.frozen_stages >= 0:
self.patch_embed.eval()
for param in self.patch_embed.parameters():
param.requires_grad = False
if self.use_abs_pos_embed:
self.absolute_pos_embed.requires_grad = False
self.drop_after_pos.eval()
for i in range(1, self.frozen_stages + 1):
if (i - 1) in self.out_indices:
norm_layer = getattr(self, f'norm{i-1}')
norm_layer.eval()
for param in norm_layer.parameters():
param.requires_grad = False
m = self.stages[i - 1]
m.eval()
for param in m.parameters():
param.requires_grad = False
def init_weights(self, pretrained=None):
logger = get_root_logger()
if pretrained is None:
logger.warning(f'No pre-trained weights for '
f'{self.__class__.__name__}, '
f'training starts from scratch')
if self.use_abs_pos_embed:
trunc_normal_(self.absolute_pos_embed, std=0.02)
for m in self.modules():
if isinstance(m, nn.Linear):
trunc_normal_init(m, std=.02, bias=0.)
elif isinstance(m, nn.LayerNorm):
constant_init(m, 1.0)
else:
# assert 'checkpoint' in self.init_cfg, f'Only support ' \
# f'specify `Pretrained` in ' \
# f'`init_cfg` in ' \
# f'{self.__class__.__name__} '
ckpt = _load_checkpoint(pretrained, logger=logger, map_location='cpu')
if 'state_dict' in ckpt:
_state_dict = ckpt['state_dict']
elif 'model' in ckpt:
_state_dict = ckpt['model']
else:
_state_dict = ckpt
if self.convert_weights:
# support loading weights from the original repository
_state_dict = swin_converter(_state_dict)
state_dict = OrderedDict()
for k, v in _state_dict.items():
if k.startswith('backbone.'):
state_dict[k[9:]] = v
# strip prefix of state_dict
if list(state_dict.keys())[0].startswith('module.'):
state_dict = {k[7:]: v for k, v in state_dict.items()}
# reshape absolute position embedding
if state_dict.get('absolute_pos_embed') is not None:
absolute_pos_embed = state_dict['absolute_pos_embed']
N1, L, C1 = absolute_pos_embed.size()
N2, C2, H, W = self.absolute_pos_embed.size()
if N1 != N2 or C1 != C2 or L != H * W:
logger.warning('Error in loading absolute_pos_embed, pass')
else:
state_dict['absolute_pos_embed'] = absolute_pos_embed.view(
N2, H, W, C2).permute(0, 3, 1, 2).contiguous()
# interpolate position bias table if needed
relative_position_bias_table_keys = [
k for k in state_dict.keys()
if 'relative_position_bias_table' in k
]
for table_key in relative_position_bias_table_keys:
table_pretrained = state_dict[table_key]
table_current = self.state_dict()[table_key]
L1, nH1 = table_pretrained.size()
L2, nH2 = table_current.size()
if nH1 != nH2:
logger.warning(f'Error in loading {table_key}, pass')
elif L1 != L2:
S1 = int(L1**0.5)
S2 = int(L2**0.5)
table_pretrained_resized = F.interpolate(
table_pretrained.permute(1, 0).reshape(1, nH1, S1, S1),
size=(S2, S2),
mode='bicubic')
state_dict[table_key] = table_pretrained_resized.view(
nH2, L2).permute(1, 0).contiguous()
# load state_dict
self.load_state_dict(state_dict, False)
def forward(self, x):
x, hw_shape = self.patch_embed(x)
if self.use_abs_pos_embed:
h, w = self.absolute_pos_embed.shape[1:3]
if hw_shape[0] != h or hw_shape[1] != w:
absolute_pos_embed = F.interpolate(
self.absolute_pos_embed,
size=hw_shape,
mode='bicubic',
align_corners=False).flatten(2).transpose(1, 2)
else:
absolute_pos_embed = self.absolute_pos_embed.flatten(
2).transpose(1, 2)
x = x + absolute_pos_embed
x = self.drop_after_pos(x)
outs = []
for i, stage in enumerate(self.stages):
x, hw_shape, out, out_hw_shape = stage(x, hw_shape)
if i in self.out_indices:
norm_layer = getattr(self, f'norm{i}')
out = norm_layer(out)
out = out.view(-1, *out_hw_shape,
self.num_features[i]).permute(0, 3, 1,
2).contiguous()
outs.append(out)
return outs
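# Illustrative usage sketch (added for clarity, not part of the original
# file): with the default Swin-T configuration a 224x224 image yields four
# feature maps at strides 4, 8, 16 and 32. The shapes below assume those
# defaults and randomly initialized weights.
if __name__ == '__main__':
    model = SwinTransformer()
    model.eval()
    with torch.no_grad():
        feats = model(torch.rand(1, 3, 224, 224))
    assert [tuple(f.shape) for f in feats] == [
        (1, 96, 56, 56), (1, 192, 28, 28), (1, 384, 14, 14), (1, 768, 7, 7)]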
================================================
FILE: mmdet3d/models/builder.py
================================================
import warnings
from mmdet.models.builder import (BACKBONES, DETECTORS, HEADS, LOSSES, NECKS,
ROI_EXTRACTORS, SHARED_HEADS, build)
from .registry import FUSION_LAYERS, MIDDLE_ENCODERS, VOXEL_ENCODERS
def build_backbone(cfg):
"""Build backbone."""
return build(cfg, BACKBONES)
def build_neck(cfg):
"""Build neck."""
return build(cfg, NECKS)
def build_roi_extractor(cfg):
"""Build RoI feature extractor."""
return build(cfg, ROI_EXTRACTORS)
def build_shared_head(cfg):
"""Build shared head of detector."""
return build(cfg, SHARED_HEADS)
def build_head(cfg):
"""Build head."""
return build(cfg, HEADS)
def build_loss(cfg):
"""Build loss function."""
return build(cfg, LOSSES)
def build_detector(cfg, train_cfg=None, test_cfg=None):
"""Build detector."""
if train_cfg is not None or test_cfg is not None:
warnings.warn(
'train_cfg and test_cfg is deprecated, '
'please specify them in model', UserWarning)
assert cfg.get('train_cfg') is None or train_cfg is None, \
'train_cfg specified in both outer field and model field '
assert cfg.get('test_cfg') is None or test_cfg is None, \
'test_cfg specified in both outer field and model field '
return build(cfg, DETECTORS, dict(train_cfg=train_cfg, test_cfg=test_cfg))
def build_voxel_encoder(cfg):
"""Build voxel encoder."""
return build(cfg, VOXEL_ENCODERS)
def build_middle_encoder(cfg):
"""Build middle level encoder."""
return build(cfg, MIDDLE_ENCODERS)
def build_fusion_layer(cfg):
"""Build fusion layer."""
return build(cfg, FUSION_LAYERS)
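# Illustrative usage sketch (added for clarity, not part of the original
# file). It assumes mmdet3d is installed and its model modules have been
# imported so that 'SECOND' is present in the shared BACKBONES registry.
if __name__ == '__main__':
    import mmdet3d.models  # noqa: F401  # populates the registries
    backbone_cfg = dict(
        type='SECOND',
        in_channels=128,
        out_channels=[128, 128, 256],
        layer_nums=[3, 5, 5],
        layer_strides=[2, 2, 2])
    backbone = build_backbone(backbone_cfg)
    print(type(backbone).__name__)  # -> SECOND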
================================================
FILE: mmdet3d/models/dense_heads/__init__.py
================================================
from .anchor3d_head import Anchor3DHead
from .base_conv_bbox_head import BaseConvBboxHead
from .centerpoint_head import CenterHead
from .free_anchor3d_head import FreeAnchor3DHead
from .parta2_rpn_head import PartA2RPNHead
from .shape_aware_head import ShapeAwareHead
from .ssd_3d_head import SSD3DHead
from .vote_head import VoteHead
from .transfusion_head import TransFusionHead
from .sparsefusion_head_deform import SparseFusionHead2D_Deform
__all__ = [
'Anchor3DHead', 'FreeAnchor3DHead', 'PartA2RPNHead', 'VoteHead',
'SSD3DHead', 'BaseConvBboxHead', 'CenterHead', 'ShapeAwareHead',
'TransFusionHead', 'SparseFusionHead2D_Deform'
]
================================================
FILE: mmdet3d/models/dense_heads/anchor3d_head.py
================================================
import numpy as np
import torch
from mmcv.cnn import bias_init_with_prob, normal_init
from mmcv.runner import force_fp32
from torch import nn as nn
from mmdet3d.core import (PseudoSampler, box3d_multiclass_nms, limit_period,
xywhr2xyxyr)
from mmdet.core import (build_anchor_generator, build_assigner,
build_bbox_coder, build_sampler, multi_apply)
from mmdet.models import HEADS
from ..builder import build_loss
from .train_mixins import AnchorTrainMixin
@HEADS.register_module()
class Anchor3DHead(nn.Module, AnchorTrainMixin):
"""Anchor head for SECOND/PointPillars/MVXNet/PartA2.
Args:
num_classes (int): Number of classes.
in_channels (int): Number of channels in the input feature map.
train_cfg (dict): Train configs.
test_cfg (dict): Test configs.
feat_channels (int): Number of channels of the feature map.
use_direction_classifier (bool): Whether to add a direction classifier.
anchor_generator(dict): Config dict of anchor generator.
assigner_per_size (bool): Whether to do assignment for each separate
anchor size.
assign_per_class (bool): Whether to do assignment for each class.
diff_rad_by_sin (bool): Whether to change the difference into sin
difference for box regression loss.
dir_offset (float | int): The offset of BEV rotation angles.
(TODO: may be moved into box coder)
dir_limit_offset (float | int): The limited range of BEV
rotation angles. (TODO: may be moved into box coder)
bbox_coder (dict): Config dict of box coders.
loss_cls (dict): Config of classification loss.
loss_bbox (dict): Config of localization loss.
loss_dir (dict): Config of direction classifier loss.
"""
def __init__(self,
num_classes,
in_channels,
train_cfg,
test_cfg,
feat_channels=256,
use_direction_classifier=True,
anchor_generator=dict(
type='Anchor3DRangeGenerator',
range=[0, -39.68, -1.78, 69.12, 39.68, -1.78],
strides=[2],
sizes=[[1.6, 3.9, 1.56]],
rotations=[0, 1.57],
custom_values=[],
reshape_out=False),
assigner_per_size=False,
assign_per_class=False,
diff_rad_by_sin=True,
dir_offset=0,
dir_limit_offset=1,
bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder'),
loss_cls=dict(
type='CrossEntropyLoss',
use_sigmoid=True,
loss_weight=1.0),
loss_bbox=dict(
type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=2.0),
loss_dir=dict(type='CrossEntropyLoss', loss_weight=0.2)):
super().__init__()
self.in_channels = in_channels
self.num_classes = num_classes
self.feat_channels = feat_channels
self.diff_rad_by_sin = diff_rad_by_sin
self.use_direction_classifier = use_direction_classifier
self.train_cfg = train_cfg
self.test_cfg = test_cfg
self.assigner_per_size = assigner_per_size
self.assign_per_class = assign_per_class
self.dir_offset = dir_offset
self.dir_limit_offset = dir_limit_offset
self.fp16_enabled = False
# build anchor generator
self.anchor_generator = build_anchor_generator(anchor_generator)
# In 3D detection, the anchor stride is connected with anchor size
self.num_anchors = self.anchor_generator.num_base_anchors
# build box coder
self.bbox_coder = build_bbox_coder(bbox_coder)
self.box_code_size = self.bbox_coder.code_size
# build loss function
self.use_sigmoid_cls = loss_cls.get('use_sigmoid', False)
self.sampling = loss_cls['type'] not in ['FocalLoss', 'GHMC']
if not self.use_sigmoid_cls:
self.num_classes += 1
self.loss_cls = build_loss(loss_cls)
self.loss_bbox = build_loss(loss_bbox)
self.loss_dir = build_loss(loss_dir)
self.fp16_enabled = False
self._init_layers()
self._init_assigner_sampler()
def _init_assigner_sampler(self):
"""Initialize the target assigner and sampler of the head."""
if self.train_cfg is None:
return
if self.sampling:
self.bbox_sampler = build_sampler(self.train_cfg.sampler)
else:
self.bbox_sampler = PseudoSampler()
if isinstance(self.train_cfg.assigner, dict):
self.bbox_assigner = build_assigner(self.train_cfg.assigner)
elif isinstance(self.train_cfg.assigner, list):
self.bbox_assigner = [
build_assigner(res) for res in self.train_cfg.assigner
]
def _init_layers(self):
"""Initialize neural network layers of the head."""
self.cls_out_channels = self.num_anchors * self.num_classes
self.conv_cls = nn.Conv2d(self.feat_channels, self.cls_out_channels, 1)
self.conv_reg = nn.Conv2d(self.feat_channels,
self.num_anchors * self.box_code_size, 1)
if self.use_direction_classifier:
self.conv_dir_cls = nn.Conv2d(self.feat_channels,
self.num_anchors * 2, 1)
def init_weights(self):
"""Initialize the weights of head."""
bias_cls = bias_init_with_prob(0.01)
normal_init(self.conv_cls, std=0.01, bias=bias_cls)
normal_init(self.conv_reg, std=0.01)
def forward_single(self, x):
"""Forward function on a single-scale feature map.
Args:
x (torch.Tensor): Input features.
Returns:
tuple[torch.Tensor]: Contain score of each class, bbox \
regression and direction classification predictions.
"""
cls_score = self.conv_cls(x)
bbox_pred = self.conv_reg(x)
dir_cls_preds = None
if self.use_direction_classifier:
dir_cls_preds = self.conv_dir_cls(x)
return cls_score, bbox_pred, dir_cls_preds
def forward(self, feats):
"""Forward pass.
Args:
feats (list[torch.Tensor]): Multi-level features, e.g.,
features produced by FPN.
Returns:
tuple[list[torch.Tensor]]: Multi-level class score, bbox \
and direction predictions.
"""
return multi_apply(self.forward_single, feats)
def get_anchors(self, featmap_sizes, input_metas, device='cuda'):
"""Get anchors according to feature map sizes.
Args:
featmap_sizes (list[tuple]): Multi-level feature map sizes.
input_metas (list[dict]): contain pcd and img's meta info.
device (str): device of current module.
Returns:
list[list[torch.Tensor]]: Anchors of each image, valid flags \
of each image.
"""
num_imgs = len(input_metas)
# since feature map sizes of all images are the same, we only compute
# anchors for one time
multi_level_anchors = self.anchor_generator.grid_anchors(
featmap_sizes, device=device)
anchor_list = [multi_level_anchors for _ in range(num_imgs)]
return anchor_list
def loss_single(self, cls_score, bbox_pred, dir_cls_preds, labels,
label_weights, bbox_targets, bbox_weights, dir_targets,
dir_weights, num_total_samples):
"""Calculate loss of Single-level results.
Args:
cls_score (torch.Tensor): Class score in single-level.
bbox_pred (torch.Tensor): Bbox prediction in single-level.
dir_cls_preds (torch.Tensor): Predictions of direction class
in single-level.
labels (torch.Tensor): Labels of class.
label_weights (torch.Tensor): Weights of class loss.
bbox_targets (torch.Tensor): Targets of bbox predictions.
bbox_weights (torch.Tensor): Weights of bbox loss.
dir_targets (torch.Tensor): Targets of direction predictions.
dir_weights (torch.Tensor): Weights of direction loss.
num_total_samples (int): The number of valid samples.
Returns:
tuple[torch.Tensor]: Losses of class, bbox \
and direction, respectively.
"""
# classification loss
if num_total_samples is None:
num_total_samples = int(cls_score.shape[0])
labels = labels.reshape(-1)
label_weights = label_weights.reshape(-1)
cls_score = cls_score.permute(0, 2, 3, 1).reshape(-1, self.num_classes)
assert labels.max().item() <= self.num_classes
loss_cls = self.loss_cls(
cls_score, labels, label_weights, avg_factor=num_total_samples)
# regression loss
bbox_pred = bbox_pred.permute(0, 2, 3,
1).reshape(-1, self.box_code_size)
bbox_targets = bbox_targets.reshape(-1, self.box_code_size)
bbox_weights = bbox_weights.reshape(-1, self.box_code_size)
bg_class_ind = self.num_classes
pos_inds = ((labels >= 0)
& (labels < bg_class_ind)).nonzero(
as_tuple=False).reshape(-1)
num_pos = len(pos_inds)
pos_bbox_pred = bbox_pred[pos_inds]
pos_bbox_targets = bbox_targets[pos_inds]
pos_bbox_weights = bbox_weights[pos_inds]
# dir loss
if self.use_direction_classifier:
dir_cls_preds = dir_cls_preds.permute(0, 2, 3, 1).reshape(-1, 2)
dir_targets = dir_targets.reshape(-1)
dir_weights = dir_weights.reshape(-1)
pos_dir_cls_preds = dir_cls_preds[pos_inds]
pos_dir_targets = dir_targets[pos_inds]
pos_dir_weights = dir_weights[pos_inds]
if num_pos > 0:
code_weight = self.train_cfg.get('code_weight', None)
if code_weight:
pos_bbox_weights = pos_bbox_weights * bbox_weights.new_tensor(
code_weight)
if self.diff_rad_by_sin:
pos_bbox_pred, pos_bbox_targets = self.add_sin_difference(
pos_bbox_pred, pos_bbox_targets)
loss_bbox = self.loss_bbox(
pos_bbox_pred,
pos_bbox_targets,
pos_bbox_weights,
avg_factor=num_total_samples)
# direction classification loss
loss_dir = None
if self.use_direction_classifier:
loss_dir = self.loss_dir(
pos_dir_cls_preds,
pos_dir_targets,
pos_dir_weights,
avg_factor=num_total_samples)
else:
loss_bbox = pos_bbox_pred.sum()
if self.use_direction_classifier:
loss_dir = pos_dir_cls_preds.sum()
return loss_cls, loss_bbox, loss_dir
@staticmethod
def add_sin_difference(boxes1, boxes2):
"""Convert the rotation difference to difference in sine function.
Args:
boxes1 (torch.Tensor): Original Boxes in shape (NxC), where C>=7
and the 7th dimension is rotation dimension.
boxes2 (torch.Tensor): Target boxes in shape (NxC), where C>=7 and
the 7th dimension is rotation dimension.
Returns:
tuple[torch.Tensor]: ``boxes1`` and ``boxes2`` whose 7th \
dimensions are changed.
"""
rad_pred_encoding = torch.sin(boxes1[..., 6:7]) * torch.cos(
boxes2[..., 6:7])
rad_tg_encoding = torch.cos(boxes1[..., 6:7]) * torch.sin(boxes2[...,
6:7])
boxes1 = torch.cat(
[boxes1[..., :6], rad_pred_encoding, boxes1[..., 7:]], dim=-1)
boxes2 = torch.cat([boxes2[..., :6], rad_tg_encoding, boxes2[..., 7:]],
dim=-1)
return boxes1, boxes2
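# Added commentary (not part of the upstream method): the returned tensors
# encode the rotation residual through the identity
#     sin(r_p - r_t) = sin(r_p) * cos(r_t) - cos(r_p) * sin(r_t),
# so an L1/Smooth-L1 loss on the 7th dimension of (boxes1 - boxes2) penalises
# sin(r_p - r_t) instead of the raw angle difference. This removes the 2*pi
# ambiguity of direct angle regression; the remaining pi ambiguity is handled
# by the separate direction classifier used in ``loss_single`` above.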
@force_fp32(apply_to=('cls_scores', 'bbox_preds', 'dir_cls_preds'))
def loss(self,
cls_scores,
bbox_preds,
dir_cls_preds,
gt_bboxes,
gt_labels,
input_metas,
gt_bboxes_ignore=None):
"""Calculate losses.
Args:
cls_scores (list[torch.Tensor]): Multi-level class scores.
bbox_preds (list[torch.Tensor]): Multi-level bbox predictions.
dir_cls_preds (list[torch.Tensor]): Multi-level direction
class predictions.
gt_bboxes (list[:obj:`BaseInstance3DBoxes`]): Gt bboxes
of each sample.
gt_labels (list[torch.Tensor]): Gt labels of each sample.
input_metas (list[dict]): Contain pcd and img's meta info.
gt_bboxes_ignore (None | list[torch.Tensor]): Specify
which bounding boxes to ignore.
Returns:
dict[str, list[torch.Tensor]]: Classification, bbox, and \
direction losses of each level.
- loss_cls (list[torch.Tensor]): Classification losses.
- loss_bbox (list[torch.Tensor]): Box regression losses.
- loss_dir (list[torch.Tensor]): Direction classification \
losses.
"""
featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores]
assert len(featmap_sizes) == self.anchor_generator.num_levels
device = cls_scores[0].device
anchor_list = self.get_anchors(
featmap_sizes, input_metas, device=device)
label_channels = self.cls_out_channels if self.use_sigmoid_cls else 1
cls_reg_targets = self.anchor_target_3d(
anchor_list,
gt_bboxes,
input_metas,
gt_bboxes_ignore_list=gt_bboxes_ignore,
gt_labels_list=gt_labels,
num_classes=self.num_classes,
label_channels=label_channels,
sampling=self.sampling)
if cls_reg_targets is None:
return None
(labels_list, label_weights_list, bbox_targets_list, bbox_weights_list,
dir_targets_list, dir_weights_list, num_total_pos,
num_total_neg) = cls_reg_targets
num_total_samples = (
num_total_pos + num_total_neg if self.sampling else num_total_pos)
# num_total_samples = None
losses_cls, losses_bbox, losses_dir = multi_apply(
self.loss_single,
cls_scores,
bbox_preds,
dir_cls_preds,
labels_list,
label_weights_list,
bbox_targets_list,
bbox_weights_list,
dir_targets_list,
dir_weights_list,
num_total_samples=num_total_samples)
return dict(
loss_cls=losses_cls, loss_bbox=losses_bbox, loss_dir=losses_dir)
def get_bboxes(self,
cls_scores,
bbox_preds,
dir_cls_preds,
input_metas,
cfg=None,
rescale=False):
"""Get bboxes of anchor head.
Args:
cls_scores (list[torch.Tensor]): Multi-level class scores.
bbox_preds (list[torch.Tensor]): Multi-level bbox predictions.
dir_cls_preds (list[torch.Tensor]): Multi-level direction
class predictions.
input_metas (list[dict]): Contain pcd and img's meta info.
cfg (None | :obj:`ConfigDict`): Training or testing config.
rescale (bool): Whether to rescale bboxes.
Returns:
list[tuple]: Prediction results of batches.
"""
assert len(cls_scores) == len(bbox_preds)
assert len(cls_scores) == len(dir_cls_preds)
num_levels = len(cls_scores)
featmap_sizes = [cls_scores[i].shape[-2:] for i in range(num_levels)]
device = cls_scores[0].device
mlvl_anchors = self.anchor_generator.grid_anchors(
featmap_sizes, device=device)
mlvl_anchors = [
anchor.reshape(-1, self.box_code_size) for anchor in mlvl_anchors
]
result_list = []
for img_id in range(len(input_metas)):
cls_score_list = [
cls_scores[i][img_id].detach() for i in range(num_levels)
]
bbox_pred_list = [
bbox_preds[i][img_id].detach() for i in range(num_levels)
]
dir_cls_pred_list = [
dir_cls_preds[i][img_id].detach() for i in range(num_levels)
]
input_meta = input_metas[img_id]
proposals = self.get_bboxes_single(cls_score_list, bbox_pred_list,
dir_cls_pred_list, mlvl_anchors,
input_meta, cfg, rescale)
result_list.append(proposals)
return result_list
def get_bboxes_single(self,
cls_scores,
bbox_preds,
dir_cls_preds,
mlvl_anchors,
input_meta,
cfg=None,
rescale=False):
"""Get bboxes of single branch.
Args:
cls_scores (torch.Tensor): Class score in single batch.
bbox_preds (torch.Tensor): Bbox prediction in single batch.
dir_cls_preds (torch.Tensor): Predictions of direction class
in single batch.
mlvl_anchors (List[torch.Tensor]): Multi-level anchors
in single batch.
input_meta (dict): Contains pcd and img's meta info.
cfg (None | :obj:`ConfigDict`): Training or testing config.
rescale (bool): Whether to rescale bboxes.
Returns:
tuple: Contain predictions of single batch.
- bboxes (:obj:`BaseInstance3DBoxes`): Predicted 3d bboxes.
- scores (torch.Tensor): Class score of each bbox.
- labels (torch.Tensor): Label of each bbox.
"""
cfg = self.test_cfg if cfg is None else cfg
assert len(cls_scores) == len(bbox_preds) == len(mlvl_anchors)
mlvl_bboxes = []
mlvl_scores = []
mlvl_dir_scores = []
for cls_score, bbox_pred, dir_cls_pred, anchors in zip(
cls_scores, bbox_preds, dir_cls_preds, mlvl_anchors):
assert cls_score.size()[-2:] == bbox_pred.size()[-2:]
assert cls_score.size()[-2:] == dir_cls_pred.size()[-2:]
dir_cls_pred = dir_cls_pred.permute(1, 2, 0).reshape(-1, 2)
dir_cls_score = torch.max(dir_cls_pred, dim=-1)[1]
cls_score = cls_score.permute(1, 2,
0).reshape(-1, self.num_classes)
if self.use_sigmoid_cls:
scores = cls_score.sigmoid()
else:
scores = cls_score.softmax(-1)
bbox_pred = bbox_pred.permute(1, 2,
0).reshape(-1, self.box_code_size)
nms_pre = cfg.get('nms_pre', -1)
if nms_pre > 0 and scores.shape[0] > nms_pre:
if self.use_sigmoid_cls:
max_scores, _ = scores.max(dim=1)
else:
max_scores, _ = scores[:, :-1].max(dim=1)
_, topk_inds = max_scores.topk(nms_pre)
anchors = anchors[topk_inds, :]
bbox_pred = bbox_pred[topk_inds, :]
scores = scores[topk_inds, :]
dir_cls_score = dir_cls_score[topk_inds]
bboxes = self.bbox_coder.decode(anchors, bbox_pred)
mlvl_bboxes.append(bboxes)
mlvl_scores.append(scores)
mlvl_dir_scores.append(dir_cls_score)
mlvl_bboxes = torch.cat(mlvl_bboxes)
mlvl_bboxes_for_nms = xywhr2xyxyr(input_meta['box_type_3d'](
mlvl_bboxes, box_dim=self.box_code_size).bev)
mlvl_scores = torch.cat(mlvl_scores)
mlvl_dir_scores = torch.cat(mlvl_dir_scores)
if self.use_sigmoid_cls:
# Add a dummy background class to the front when using sigmoid
padding = mlvl_scores.new_zeros(mlvl_scores.shape[0], 1)
mlvl_scores = torch.cat([mlvl_scores, padding], dim=1)
score_thr = cfg.get('score_thr', 0)
results = box3d_multiclass_nms(mlvl_bboxes, mlvl_bboxes_for_nms,
mlvl_scores, score_thr, cfg.max_num,
cfg, mlvl_dir_scores)
bboxes, scores, labels, dir_scores = results
if bboxes.shape[0] > 0:
dir_rot = limit_period(bboxes[..., 6] - self.dir_offset,
self.dir_limit_offset, np.pi)
bboxes[..., 6] = (
dir_rot + self.dir_offset +
np.pi * dir_scores.to(bboxes.dtype))
bboxes = input_meta['box_type_3d'](bboxes, box_dim=self.box_code_size)
return bboxes, scores, labels
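# ---------------------------------------------------------------------------
# Illustrative sketch (added; not part of the upstream file): how the final
# yaw is reconstructed from the decoded box angle and the direction-classifier
# bin, mirroring the last lines of ``get_bboxes_single`` above. The helper
# name and numeric values are made up for demonstration only.
# ---------------------------------------------------------------------------
def _demo_yaw_reconstruction():
    import numpy as np
    import torch
    from mmdet3d.core import limit_period

    dir_offset, dir_limit_offset = 0, 1     # example offsets (Anchor3DHead defaults)
    raw_yaw = torch.tensor([2.9, -1.2])     # decoded box angles in radians
    dir_label = torch.tensor([1, 0])        # argmax of the direction classifier

    # fold the angle into a half-rotation range, then add back the offset and
    # flip by pi where the classifier predicts the opposite direction
    dir_rot = limit_period(raw_yaw - dir_offset, dir_limit_offset, np.pi)
    yaw = dir_rot + dir_offset + np.pi * dir_label.to(raw_yaw.dtype)
    return yaw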
================================================
FILE: mmdet3d/models/dense_heads/base_conv_bbox_head.py
================================================
from mmcv.cnn import ConvModule
from mmcv.cnn.bricks import build_conv_layer
from torch import nn as nn
from mmdet.models.builder import HEADS
@HEADS.register_module()
class BaseConvBboxHead(nn.Module):
r"""More general bbox head, with shared conv layers and two optional
separated branches.
.. code-block:: none
/-> cls convs -> cls_score
shared convs
\-> reg convs -> bbox_pred
"""
def __init__(self,
in_channels=0,
shared_conv_channels=(),
cls_conv_channels=(),
num_cls_out_channels=0,
reg_conv_channels=(),
num_reg_out_channels=0,
conv_cfg=dict(type='Conv1d'),
norm_cfg=dict(type='BN1d'),
act_cfg=dict(type='ReLU'),
bias='auto',
*args,
**kwargs):
super(BaseConvBboxHead, self).__init__(*args, **kwargs)
assert in_channels > 0
assert num_cls_out_channels > 0
assert num_reg_out_channels > 0
self.in_channels = in_channels
self.shared_conv_channels = shared_conv_channels
self.cls_conv_channels = cls_conv_channels
self.num_cls_out_channels = num_cls_out_channels
self.reg_conv_channels = reg_conv_channels
self.num_reg_out_channels = num_reg_out_channels
self.conv_cfg = conv_cfg
self.norm_cfg = norm_cfg
self.act_cfg = act_cfg
self.bias = bias
# add shared convs
if len(self.shared_conv_channels) > 0:
self.shared_convs = self._add_conv_branch(
self.in_channels, self.shared_conv_channels)
out_channels = self.shared_conv_channels[-1]
else:
out_channels = self.in_channels
# add cls specific branch
prev_channel = out_channels
if len(self.cls_conv_channels) > 0:
self.cls_convs = self._add_conv_branch(prev_channel,
self.cls_conv_channels)
prev_channel = self.cls_conv_channels[-1]
self.conv_cls = build_conv_layer(
conv_cfg,
in_channels=prev_channel,
out_channels=num_cls_out_channels,
kernel_size=1)
# add reg specific branch
prev_channel = out_channels
if len(self.reg_conv_channels) > 0:
self.reg_convs = self._add_conv_branch(prev_channel,
self.reg_conv_channels)
prev_channel = self.reg_conv_channels[-1]
self.conv_reg = build_conv_layer(
conv_cfg,
in_channels=prev_channel,
out_channels=num_reg_out_channels,
kernel_size=1)
def _add_conv_branch(self, in_channels, conv_channels):
"""Add shared or separable branch."""
conv_spec = [in_channels] + list(conv_channels)
# add branch specific conv layers
conv_layers = nn.Sequential()
for i in range(len(conv_spec) - 1):
conv_layers.add_module(
f'layer{i}',
ConvModule(
conv_spec[i],
conv_spec[i + 1],
kernel_size=1,
padding=0,
conv_cfg=self.conv_cfg,
norm_cfg=self.norm_cfg,
act_cfg=self.act_cfg,
bias=self.bias,
inplace=True))
return conv_layers
def init_weights(self):
# conv layers are already initialized by ConvModule
pass
def forward(self, feats):
"""Forward.
Args:
feats (Tensor): Input features.
Returns:
Tensor: Class score predictions.
Tensor: Regression predictions.
"""
# shared part
if len(self.shared_conv_channels) > 0:
x = self.shared_convs(feats)
# separate branches
x_cls = x
x_reg = x
if len(self.cls_conv_channels) > 0:
x_cls = self.cls_convs(x_cls)
cls_score = self.conv_cls(x_cls)
if len(self.reg_conv_channels) > 0:
x_reg = self.reg_convs(x_reg)
bbox_pred = self.conv_reg(x_reg)
return cls_score, bbox_pred
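# ---------------------------------------------------------------------------
# Usage sketch (added; not part of the upstream file): a small head with one
# shared Conv1d stack and separate cls/reg branches, run on random point-wise
# features. All channel sizes below are arbitrary examples.
# ---------------------------------------------------------------------------
def _demo_base_conv_bbox_head():
    import torch

    head = BaseConvBboxHead(
        in_channels=256,
        shared_conv_channels=(128, 128),
        cls_conv_channels=(128, ),
        num_cls_out_channels=3,    # e.g. 3 object classes
        reg_conv_channels=(128, ),
        num_reg_out_channels=7)    # e.g. (x, y, z, dx, dy, dz, yaw)
    feats = torch.rand(2, 256, 1024)          # (batch, channels, num_points)
    cls_score, bbox_pred = head(feats)        # (2, 3, 1024), (2, 7, 1024)
    return cls_score.shape, bbox_pred.shape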
================================================
FILE: mmdet3d/models/dense_heads/centerpoint_head.py
================================================
import copy
import numpy as np
import torch
from mmcv.cnn import ConvModule, build_conv_layer, kaiming_init
from mmcv.runner import force_fp32
from torch import nn
from mmdet3d.core import (circle_nms, draw_heatmap_gaussian, gaussian_radius,
xywhr2xyxyr)
from mmdet3d.models import builder
from mmdet3d.models.builder import HEADS, build_loss
from mmdet3d.models.utils import clip_sigmoid
from mmdet3d.ops.iou3d.iou3d_utils import nms_gpu
from mmdet.core import build_bbox_coder, multi_apply
@HEADS.register_module()
class SeparateHead(nn.Module):
"""SeparateHead for CenterHead.
Args:
in_channels (int): Input channels for conv_layer.
heads (dict): Conv information.
head_conv (int): Output channels.
Default: 64.
final_kernel (int): Kernel size for the last conv layer.
Default: 1.
init_bias (float): Initial bias. Default: -2.19.
conv_cfg (dict): Config of conv layer.
Default: dict(type='Conv2d')
norm_cfg (dict): Config of norm layer.
Default: dict(type='BN2d').
bias (str): Type of bias. Default: 'auto'.
"""
def __init__(self,
in_channels,
heads,
head_conv=64,
final_kernel=1,
init_bias=-2.19,
conv_cfg=dict(type='Conv2d'),
norm_cfg=dict(type='BN2d'),
bias='auto',
**kwargs):
super(SeparateHead, self).__init__()
self.heads = heads
self.init_bias = init_bias
for head in self.heads:
classes, num_conv = self.heads[head]
conv_layers = []
c_in = in_channels
for i in range(num_conv - 1):
conv_layers.append(
ConvModule(
c_in,
head_conv,
kernel_size=final_kernel,
stride=1,
padding=final_kernel // 2,
bias=bias,
conv_cfg=conv_cfg,
norm_cfg=norm_cfg))
c_in = head_conv
conv_layers.append(
build_conv_layer(
conv_cfg,
head_conv,
classes,
kernel_size=final_kernel,
stride=1,
padding=final_kernel // 2,
bias=True))
conv_layers = nn.Sequential(*conv_layers)
self.__setattr__(head, conv_layers)
def init_weights(self):
"""Initialize weights."""
for head in self.heads:
if head == 'heatmap':
self.__getattr__(head)[-1].bias.data.fill_(self.init_bias)
else:
for m in self.__getattr__(head).modules():
if isinstance(m, nn.Conv2d):
kaiming_init(m)
def forward(self, x):
"""Forward function for SepHead.
Args:
x (torch.Tensor): Input feature map with the shape of
[B, 512, 128, 128].
Returns:
dict[str: torch.Tensor]: contains the following keys:
-reg (torch.Tensor): 2D regression value with the \
shape of [B, 2, H, W].
-height (torch.Tensor): Height value with the \
shape of [B, 1, H, W].
-dim (torch.Tensor): Size value with the shape \
of [B, 3, H, W].
-rot (torch.Tensor): Rotation value with the \
shape of [B, 2, H, W].
-vel (torch.Tensor): Velocity value with the \
shape of [B, 2, H, W].
-heatmap (torch.Tensor): Heatmap with the shape of \
[B, N, H, W].
"""
ret_dict = dict()
for head in self.heads:
ret_dict[head] = self.__getattr__(head)(x)
return ret_dict
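# Added commentary (not part of the upstream class): ``heads`` maps each output
# name to ``(out_channels, num_conv)``. A typical CenterPoint setup on
# nuScenes looks roughly like
#     heads = dict(reg=(2, 2), height=(1, 2), dim=(3, 2),
#                  rot=(2, 2), vel=(2, 2), heatmap=(num_classes, 2)),
# so the forward above returns one tensor per key with the listed number of
# channels at every BEV location.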
@HEADS.register_module()
class DCNSeparateHead(nn.Module):
r"""DCNSeparateHead for CenterHead.
.. code-block:: none
/-----> DCN for heatmap task -----> heatmap task.
feature
\-----> DCN for regression tasks -----> regression tasks
Args:
in_channels (int): Input channels for conv_layer.
heads (dict): Conv information.
dcn_config (dict): Config of dcn layer.
num_cls (int): Number of classes.
head_conv (int): Output channels. Default: 64.
final_kernel (int): Kernel size for the last conv layer.
Default: 1.
init_bias (float): Initial bias. Default: -2.19.
conv_cfg (dict): Config of conv layer.
Default: dict(type='Conv2d')
norm_cfg (dict): Config of norm layer.
Default: dict(type='BN2d').
bias (str): Type of bias. Default: 'auto'.
""" # noqa: W605
def __init__(self,
in_channels,
num_cls,
heads,
dcn_config,
head_conv=64,
final_kernel=1,
init_bias=-2.19,
conv_cfg=dict(type='Conv2d'),
norm_cfg=dict(type='BN2d'),
bias='auto',
**kwargs):
super(DCNSeparateHead, self).__init__()
if 'heatmap' in heads:
heads.pop('heatmap')
# feature adaptation with dcn
# use separate features for classification / regression
self.feature_adapt_cls = build_conv_layer(dcn_config)
self.feature_adapt_reg = build_conv_layer(dcn_config)
# heatmap prediction head
cls_head = [
ConvModule(
in_channels,
head_conv,
kernel_size=3,
padding=1,
conv_cfg=conv_cfg,
bias=bias,
norm_cfg=norm_cfg),
build_conv_layer(
conv_cfg,
head_conv,
num_cls,
kernel_size=3,
stride=1,
padding=1,
bias=bias)
]
self.cls_head = nn.Sequential(*cls_head)
self.init_bias = init_bias
# other regression target
self.task_head = SeparateHead(
in_channels,
heads,
head_conv=head_conv,
final_kernel=final_kernel,
bias=bias)
def init_weights(self):
"""Initialize weights."""
self.cls_head[-1].bias.data.fill_(self.init_bias)
self.task_head.init_weights()
def forward(self, x):
"""Forward function for DCNSepHead.
Args:
x (torch.Tensor): Input feature map with the shape of
[B, 512, 128, 128].
Returns:
dict[str: torch.Tensor]: contains the following keys:
-reg (torch.Tensor): 2D regression value with the \
shape of [B, 2, H, W].
-height (torch.Tensor): Height value with the \
shape of [B, 1, H, W].
-dim (torch.Tensor): Size value with the shape \
of [B, 3, H, W].
-rot (torch.Tensor): Rotation value with the \
shape of [B, 2, H, W].
-vel (torch.Tensor): Velocity value with the \
shape of [B, 2, H, W].
-heatmap (torch.Tensor): Heatmap with the shape of \
[B, N, H, W].
"""
center_feat = self.feature_adapt_cls(x)
reg_feat = self.feature_adapt_reg(x)
cls_score = self.cls_head(center_feat)
ret = self.task_head(reg_feat)
ret['heatmap'] = cls_score
return ret
@HEADS.register_module()
class CenterHead(nn.Module):
"""CenterHead for CenterPoint.
Args:
mode (str): Mode of the head. Default: '3d'.
in_channels (list[int] | int): Channels of the input feature map.
Default: [128].
tasks (list[dict]): Task information including class number
and class names. Default: None.
dataset (str): Name of the dataset. Default: 'nuscenes'.
weight (float): Weight for location loss. Default: 0.25.
code_weights (list[int]): Code weights for location loss. Default: [].
common_heads (dict): Conv information for common heads.
Default: dict().
loss_cls (dict): Config of classification loss function.
Default: dict(type='GaussianFocalLoss', reduction='mean').
loss_bbox (dict): Config of regression loss function.
Default: dict(type='L1Loss', reduction='none').
separate_head (dict): Config of separate head. Default: dict(
type='SeparateHead', init_bias=-2.19, final_kernel=3)
share_conv_channel (int): Output channels for share_conv_layer.
Default: 64.
num_heatmap_convs (int): Number of conv layers for heatmap conv layer.
Default: 2.
conv_cfg (dict): Config of conv layer.
Default: dict(type='Conv2d')
norm_cfg (dict): Config of norm layer.
Default: dict(type='BN2d').
bias (str): Type of bias. Default: 'auto'.
"""
def __init__(self,
in_channels=[128],
tasks=None,
train_cfg=None,
test_cfg=None,
bbox_coder=None,
common_heads=dict(),
loss_cls=dict(type='GaussianFocalLoss', reduction='mean'),
loss_bbox=dict(
type='L1Loss', reduction='none', loss_weight=0.25),
separate_head=dict(
type='SeparateHead', init_bias=-2.19, final_kernel=3),
share_conv_channel=64,
num_heatmap_convs=2,
conv_cfg=dict(type='Conv2d'),
norm_cfg=dict(type='BN2d'),
bias='auto',
norm_bbox=True):
super(CenterHead, self).__init__()
num_classes = [len(t['class_names']) for t in tasks]
self.class_names = [t['class_names'] for t in tasks]
self.train_cfg = train_cfg
self.test_cfg = test_cfg
self.in_channels = in_channels
self.num_classes = num_classes
self.norm_bbox = norm_bbox
self.loss_cls = build_loss(loss_cls)
self.loss_bbox = build_loss(loss_bbox)
self.bbox_coder = build_bbox_coder(bbox_coder)
self.num_anchor_per_locs = [n for n in num_classes]
self.fp16_enabled = False
# a shared convolution
self.shared_conv = ConvModule(
in_channels,
share_conv_channel,
kernel_size=3,
padding=1,
conv_cfg=conv_cfg,
norm_cfg=norm_cfg,
bias=bias)
self.task_heads = nn.ModuleList()
for num_cls in num_classes:
heads = copy.deepcopy(common_heads)
heads.update(dict(heatmap=(num_cls, num_heatmap_convs)))
separate_head.update(
in_channels=share_conv_channel, heads=heads, num_cls=num_cls)
self.task_heads.append(builder.build_head(separate_head))
def init_weights(self):
"""Initialize weights."""
for task_head in self.task_heads:
task_head.init_weights()
def forward_single(self, x):
"""Forward function for CenterPoint.
Args:
x (torch.Tensor): Input feature map with the shape of
[B, 512, 128, 128].
Returns:
list[dict]: Output results for tasks.
"""
ret_dicts = []
x = self.shared_conv(x)
for task in self.task_heads:
ret_dicts.append(task(x))
return ret_dicts
def forward(self, feats):
"""Forward pass.
Args:
feats (list[torch.Tensor]): Multi-level features, e.g.,
features produced by FPN.
Returns:
tuple(list[dict]): Output results for tasks.
"""
return multi_apply(self.forward_single, feats)
def _gather_feat(self, feat, ind, mask=None):
"""Gather feature map.
Given feature map and index, return indexed feature map.
Args:
feat (torch.tensor): Feature map with the shape of [B, H*W, 10].
ind (torch.Tensor): Index of the ground truth boxes with the
shape of [B, max_obj].
mask (torch.Tensor): Mask of the feature map with the shape
of [B, max_obj]. Default: None.
Returns:
torch.Tensor: Feature map after gathering with the shape
of [B, max_obj, 10].
"""
dim = feat.size(2)
ind = ind.unsqueeze(2).expand(ind.size(0), ind.size(1), dim)
feat = feat.gather(1, ind)
if mask is not None:
mask = mask.unsqueeze(2).expand_as(feat)
feat = feat[mask]
feat = feat.view(-1, dim)
return feat
def get_targets(self, gt_bboxes_3d, gt_labels_3d):
"""Generate targets.
Args:
gt_bboxes_3d (list[:obj:`LiDARInstance3DBoxes`]): Ground
truth gt boxes.
gt_labels_3d (list[torch.Tensor]): Labels of boxes.
Returns:
tuple[list[torch.Tensor]]: Tuple of target including \
the following results in order.
- list[torch.Tensor]: Heatmap scores.
- list[torch.Tensor]: Ground truth boxes.
- list[torch.Tensor]: Indexes indicating the \
position of the valid boxes.
- list[torch.Tensor]: Masks indicating which \
boxes are valid.
"""
heatmaps, anno_boxes, inds, masks = multi_apply(
self.get_targets_single, gt_bboxes_3d, gt_labels_3d)
# transpose heatmaps, because the dimension of tensors in each task is
# different, we have to use numpy instead of torch to do the transpose.
heatmaps = np.array(heatmaps).transpose(1, 0).tolist()
heatmaps = [torch.stack(hms_) for hms_ in heatmaps]
# transpose anno_boxes
anno_boxes = np.array(anno_boxes).transpose(1, 0).tolist()
anno_boxes = [torch.stack(anno_boxes_) for anno_boxes_ in anno_boxes]
# transpose inds
inds = np.array(inds).transpose(1, 0).tolist()
inds = [torch.stack(inds_) for inds_ in inds]
# transpose masks
masks = np.array(masks).transpose(1, 0).tolist()
masks = [torch.stack(masks_) for masks_ in masks]
return heatmaps, anno_boxes, inds, masks
def get_targets_single(self, gt_bboxes_3d, gt_labels_3d):
"""Generate training targets for a single sample.
Args:
gt_bboxes_3d (:obj:`LiDARInstance3DBoxes`): Ground truth gt boxes.
gt_labels_3d (torch.Tensor): Labels of boxes.
Returns:
tuple[list[torch.Tensor]]: Tuple of target including \
the following results in order.
- list[torch.Tensor]: Heatmap scores.
- list[torch.Tensor]: Ground truth boxes.
- list[torch.Tensor]: Indexes indicating the position \
of the valid boxes.
- list[torch.Tensor]: Masks indicating which boxes \
are valid.
"""
device = gt_labels_3d.device
gt_bboxes_3d = torch.cat(
(gt_bboxes_3d.gravity_center, gt_bboxes_3d.tensor[:, 3:]),
dim=1).to(device)
max_objs = self.train_cfg['max_objs'] * self.train_cfg['dense_reg']
grid_size = torch.tensor(self.train_cfg['grid_size'])
pc_range = torch.tensor(self.train_cfg['point_cloud_range'])
voxel_size = torch.tensor(self.train_cfg['voxel_size'])
feature_map_size = grid_size[:2] // self.train_cfg['out_size_factor']
# reorganize the gt_dict by tasks
task_masks = []
flag = 0
for class_name in self.class_names:
task_masks.append([
torch.where(gt_labels_3d == class_name.index(i) + flag)
for i in class_name
])
flag += len(class_name)
task_boxes = []
task_classes = []
flag2 = 0
for idx, mask in enumerate(task_masks):
task_box = []
task_class = []
for m in mask:
task_box.append(gt_bboxes_3d[m])
# 0 is background for each task, so we need to add 1 here.
task_class.append(gt_labels_3d[m] + 1 - flag2)
task_boxes.append(torch.cat(task_box, axis=0).to(device))
task_classes.append(torch.cat(task_class).long().to(device))
flag2 += len(mask)
draw_gaussian = draw_heatmap_gaussian
heatmaps, anno_boxes, inds, masks = [], [], [], []
for idx, task_head in enumerate(self.task_heads):
heatmap = gt_bboxes_3d.new_zeros(
(len(self.class_names[idx]), feature_map_size[1],
feature_map_size[0]))
anno_box = gt_bboxes_3d.new_zeros((max_objs, 10),
dtype=torch.float32)
ind = gt_labels_3d.new_zeros((max_objs), dtype=torch.int64)
mask = gt_bboxes_3d.new_zeros((max_objs), dtype=torch.uint8)
num_objs = min(task_boxes[idx].shape[0], max_objs)
for k in range(num_objs):
cls_id = task_classes[idx][k] - 1
width = task_boxes[idx][k][3]
length = task_boxes[idx][k][4]
width = width / voxel_size[0] / self.train_cfg[
'out_size_factor']
length = length / voxel_size[1] / self.train_cfg[
'out_size_factor']
if width > 0 and length > 0:
radius = gaussian_radius(
(length, width),
min_overlap=self.train_cfg['gaussian_overlap'])
radius = max(self.train_cfg['min_radius'], int(radius))
# be really careful about the coordinate system of
# your box annotation.
x, y, z = task_boxes[idx][k][0], task_boxes[idx][k][
1], task_boxes[idx][k][2]
coor_x = (
x - pc_range[0]
) / voxel_size[0] / self.train_cfg['out_size_factor']
coor_y = (
y - pc_range[1]
) / voxel_size[1] / self.train_cfg['out_size_factor']
center = torch.tensor([coor_x, coor_y],
dtype=torch.float32,
device=device)
center_int = center.to(torch.int32)
# throw out objects that are out of range to avoid an
# out-of-bounds access when creating the heatmap
if not (0 <= center_int[0] < feature_map_size[0]
and 0 <= center_int[1] < feature_map_size[1]):
continue
draw_gaussian(heatmap[cls_id], center_int, radius)
new_idx = k
x, y = center_int[0], center_int[1]
assert (y * feature_map_size[0] + x <
feature_map_size[0] * feature_map_size[1])
ind[new_idx] = y * feature_map_size[0] + x
mask[new_idx] = 1
# TODO: support other outdoor dataset
vx, vy = task_boxes[idx][k][7:]
rot = task_boxes[idx][k][6]
box_dim = task_boxes[idx][k][3:6]
if self.norm_bbox:
box_dim = box_dim.log()
anno_box[new_idx] = torch.cat([
center - torch.tensor([x, y], device=device),
z.unsqueeze(0), box_dim,
torch.sin(rot).unsqueeze(0),
torch.cos(rot).unsqueeze(0),
vx.unsqueeze(0),
vy.unsqueeze(0)
])
heatmaps.append(heatmap)
anno_boxes.append(anno_box)
masks.append(mask)
inds.append(ind)
return heatmaps, anno_boxes, inds, masks
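# Added commentary (not part of the upstream method): for every kept object k,
#     ind[k]      = y * W + x   (flattened index of the center cell)
#     mask[k]     = 1           (marks the slot as valid)
#     anno_box[k] = [dx, dy, z, log-size (3 dims, if norm_bbox),
#                    sin(rot), cos(rot), vx, vy]
# i.e. a 10-dim regression target: sub-pixel center offset, height, box size,
# rotation as sin/cos, and BEV velocity. ``loss`` gathers predictions at
# ``ind`` via ``_gather_feat`` and compares them against these targets.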
@force_fp32(apply_to=('preds_dicts'))
def loss(self, gt_bboxes_3d, gt_labels_3d, preds_dicts, **kwargs):
"""Loss function for CenterHead.
Args:
gt_bboxes_3d (list[:obj:`LiDARInstance3DBoxes`]): Ground
truth gt boxes.
gt_labels_3d (list[torch.Tensor]): Labels of boxes.
preds_dicts (dict): Output of forward function.
Returns:
dict[str:torch.Tensor]: Loss of heatmap and bbox of each task.
"""
heatmaps, anno_boxes, inds, masks = self.get_targets(
gt_bboxes_3d, gt_labels_3d)
loss_dict = dict()
for task_id, preds_dict in enumerate(preds_dicts):
# heatmap focal loss
preds_dict[0]['heatmap'] = clip_sigmoid(preds_dict[0]['heatmap'])
num_pos = heatmaps[task_id].eq(1).float().sum().item()
loss_heatmap = self.loss_cls(
preds_dict[0]['heatmap'],
heatmaps[task_id],
avg_factor=max(num_pos, 1))
target_box = anno_boxes[task_id]
# reconstruct the anno_box from multiple reg heads
preds_dict[0]['anno_box'] = torch.cat(
(preds_dict[0]['reg'], preds_dict[0]['height'],
preds_dict[0]['dim'], preds_dict[0]['rot'],
preds_dict[0]['vel']),
dim=1)
# Regression loss for dimension, offset, height, rotation
ind = inds[task_id]
num = masks[task_id].float().sum()
pred = preds_dict[0]['anno_box'].permute(0, 2, 3, 1).contiguous()
pred = pred.view(pred.size(0), -1, pred.size(3))
pred = self._gather_feat(pred, ind)
mask = masks[task_id].unsqueeze(2).expand_as(target_box).float()
isnotnan = (~torch.isnan(target_box)).float()
mask *= isnotnan
code_weights = self.train_cfg.get('code_weights', None)
bbox_weights = mask * mask.new_tensor(code_weights)
loss_bbox = self.loss_bbox(
pred, target_box, bbox_weights, avg_factor=(num + 1e-4))
loss_dict[f'task{task_id}.loss_heatmap'] = loss_heatmap
loss_dict[f'task{task_id}.loss_bbox'] = loss_bbox
return loss_dict
def get_bboxes(self, preds_dicts, img_metas, img=None, rescale=False):
"""Generate bboxes from bbox head predictions.
Args:
preds_dicts (tuple[list[dict]]): Prediction results.
img_metas (list[dict]): Point cloud and image's meta info.
Returns:
list[dict]: Decoded bbox, scores and labels after nms.
"""
rets = []
for task_id, preds_dict in enumerate(preds_dicts):
num_class_with_bg = self.num_classes[task_id]
batch_size = preds_dict[0]['heatmap'].shape[0]
batch_heatmap = preds_dict[0]['heatmap'].sigmoid()
batch_reg = preds_dict[0]['reg']
batch_hei = preds_dict[0]['height']
if self.norm_bbox:
batch_dim = torch.exp(preds_dict[0]['dim'])
else:
batch_dim = preds_dict[0]['dim']
batch_rots = preds_dict[0]['rot'][:, 0].unsqueeze(1)
batch_rotc = preds_dict[0]['rot'][:, 1].unsqueeze(1)
if 'vel' in preds_dict[0]:
batch_vel = preds_dict[0]['vel']
else:
batch_vel = None
temp = self.bbox_coder.decode(
batch_heatmap,
batch_rots,
batch_rotc,
batch_hei,
batch_dim,
batch_vel,
reg=batch_reg,
task_id=task_id)
assert self.test_cfg['nms_type'] in ['circle', 'rotate']
batch_reg_preds = [box['bboxes'] for box in temp]
batch_cls_preds = [box['scores'] for box in temp]
batch_cls_labels = [box['labels'] for box in temp]
if self.test_cfg['nms_type'] == 'circle':
ret_task = []
for i in range(batch_size):
boxes3d = temp[i]['bboxes']
scores = temp[i]['scores']
labels = temp[i]['labels']
centers = boxes3d[:, [0, 1]]
boxes = torch.cat([centers, scores.view(-1, 1)], dim=1)
keep = torch.tensor(
circle_nms(
boxes.detach().cpu().numpy(),
self.test_cfg['min_radius'][task_id],
post_max_size=self.test_cfg['post_max_size']),
dtype=torch.long,
device=boxes.device)
boxes3d = boxes3d[keep]
scores = scores[keep]
labels = labels[keep]
ret = dict(bboxes=boxes3d, scores=scores, labels=labels)
ret_task.append(ret)
rets.append(ret_task)
else:
rets.append(
self.get_task_detections(num_class_with_bg,
batch_cls_preds, batch_reg_preds,
batch_cls_labels, img_metas))
# Merge branches results
num_samples = len(rets[0])
ret_list = []
for i in range(num_samples):
for k in rets[0][i].keys():
if k == 'bboxes':
bboxes = torch.cat([ret[i][k] for ret in rets])
bboxes[:, 2] = bboxes[:, 2] - bboxes[:, 5] * 0.5
bboxes = img_metas[i]['box_type_3d'](
bboxes, self.bbox_coder.code_size)
elif k == 'scores':
scores = torch.cat([ret[i][k] for ret in rets])
elif k == 'labels':
flag = 0
for j, num_class in enumerate(self.num_classes):
rets[j][i][k] += flag
flag += num_class
labels = torch.cat([ret[i][k].int() for ret in rets])
ret_list.append([bboxes, scores, labels])
return ret_list
def get_task_detections(self, num_class_with_bg, batch_cls_preds,
batch_reg_preds, batch_cls_labels, img_metas):
"""Rotate nms for each task.
Args:
num_class_with_bg (int): Number of classes for the current task.
batch_cls_preds (list[torch.Tensor]): Prediction score with the
shape of [N].
batch_reg_preds (list[torch.Tensor]): Prediction bbox with the
shape of [N, 9].
batch_cls_labels (list[torch.Tensor]): Prediction label with the
shape of [N].
img_metas (list[dict]): Meta information of each sample.
Returns:
list[dict[str: torch.Tensor]]: contains the following keys:
-bboxes (torch.Tensor): Prediction bboxes after nms with the \
shape of [N, 9].
-scores (torch.Tensor): Prediction scores after nms with the \
shape of [N].
-labels (torch.Tensor): Prediction labels after nms with the \
shape of [N].
"""
predictions_dicts = []
post_center_range = self.test_cfg['post_center_limit_range']
if len(post_center_range) > 0:
post_center_range = torch.tensor(
post_center_range,
dtype=batch_reg_preds[0].dtype,
device=batch_reg_preds[0].device)
for i, (box_preds, cls_preds, cls_labels) in enumerate(
zip(batch_reg_preds, batch_cls_preds, batch_cls_labels)):
# Apply NMS in birdeye view
# get the highest score per prediction, then apply nms
# to remove overlapping boxes.
if num_class_with_bg == 1:
top_scores = cls_preds.squeeze(-1)
top_labels = torch.zeros(
cls_preds.shape[0],
device=cls_preds.device,
dtype=torch.long)
else:
top_labels = cls_labels.long()
top_scores = cls_preds.squeeze(-1)
if self.test_cfg['score_threshold'] > 0.0:
thresh = torch.tensor(
[self.test_cfg['score_threshold']],
device=cls_preds.device).type_as(cls_preds)
top_scores_keep = top_scores >= thresh
top_scores = top_scores.masked_select(top_scores_keep)
if top_scores.shape[0] != 0:
if self.test_cfg['score_threshold'] > 0.0:
box_preds = box_preds[top_scores_keep]
top_labels = top_labels[top_scores_keep]
boxes_for_nms = xywhr2xyxyr(img_metas[i]['box_type_3d'](
box_preds[:, :], self.bbox_coder.code_size).bev)
# the nms in 3d detection just removes overlapping boxes.
selected = nms_gpu(
boxes_for_nms,
top_scores,
thresh=self.test_cfg['nms_thr'],
pre_maxsize=self.test_cfg['pre_max_size'],
post_max_size=self.test_cfg['post_max_size'])
else:
selected = []
# if selected is not None:
selected_boxes = box_preds[selected]
selected_labels = top_labels[selected]
selected_scores = top_scores[selected]
# finally generate predictions.
if selected_boxes.shape[0] != 0:
box_preds = selected_boxes
scores = selected_scores
label_preds = selected_labels
final_box_preds = box_preds
final_scores = scores
final_labels = label_preds
if post_center_range is not None:
mask = (final_box_preds[:, :3] >=
post_center_range[:3]).all(1)
mask &= (final_box_preds[:, :3] <=
post_center_range[3:]).all(1)
predictions_dict = dict(
bboxes=final_box_preds[mask],
scores=final_scores[mask],
labels=final_labels[mask])
else:
predictions_dict = dict(
bboxes=final_box_preds,
scores=final_scores,
labels=final_labels)
else:
dtype = batch_reg_preds[0].dtype
device = batch_reg_preds[0].device
predictions_dict = dict(
bboxes=torch.zeros([0, self.bbox_coder.code_size],
dtype=dtype,
device=device),
scores=torch.zeros([0], dtype=dtype, device=device),
labels=torch.zeros([0],
dtype=top_labels.dtype,
device=device))
predictions_dicts.append(predictions_dict)
return predictions_dicts
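# ---------------------------------------------------------------------------
# Minimal sketch (added; not part of the upstream file) of what
# ``CenterHead._gather_feat`` does: pick the per-pixel regression vectors at
# the flattened ground-truth center indices. Shapes are arbitrary examples.
# ---------------------------------------------------------------------------
def _demo_gather_feat():
    import torch

    B, HW, C, max_obj = 2, 16, 10, 3
    feat = torch.arange(B * HW * C, dtype=torch.float32).view(B, HW, C)
    ind = torch.tensor([[0, 5, 7], [3, 3, 15]])   # (B, max_obj) center indices

    ind_exp = ind.unsqueeze(2).expand(B, max_obj, C)
    gathered = feat.gather(1, ind_exp)            # (B, max_obj, C)
    # row b, slot k of ``gathered`` equals feat[b, ind[b, k], :]
    return gathered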
================================================
FILE: mmdet3d/models/dense_heads/free_anchor3d_head.py
================================================
import torch
from mmcv.runner import force_fp32
from torch.nn import functional as F
from mmdet3d.core.bbox import bbox_overlaps_nearest_3d
from mmdet.models import HEADS
from .anchor3d_head import Anchor3DHead
from .train_mixins import get_direction_target
@HEADS.register_module()
class FreeAnchor3DHead(Anchor3DHead):
r"""`FreeAnchor `_ head for 3D detection.
Note:
This implementation is directly modified from the `mmdet implementation
`_ # noqa
We find it also works on 3D detection with minor modification, i.e.,
different hyper-parameters and a additional direction classifier.
Args:
pre_anchor_topk (int): Number of boxes to be taken into each bag.
bbox_thr (float): The threshold of the saturated linear function. It is
usually the same as the IoU threshold used in NMS.
gamma (float): Gamma parameter in focal loss.
alpha (float): Alpha parameter in focal loss.
kwargs (dict): Other arguments are the same as those in :class:`Anchor3DHead`.
"""
def __init__(self,
pre_anchor_topk=50,
bbox_thr=0.6,
gamma=2.0,
alpha=0.5,
**kwargs):
super().__init__(**kwargs)
self.pre_anchor_topk = pre_anchor_topk
self.bbox_thr = bbox_thr
self.gamma = gamma
self.alpha = alpha
@force_fp32(apply_to=('cls_scores', 'bbox_preds', 'dir_cls_preds'))
def loss(self,
cls_scores,
bbox_preds,
dir_cls_preds,
gt_bboxes,
gt_labels,
input_metas,
gt_bboxes_ignore=None):
"""Calculate loss of FreeAnchor head.
Args:
cls_scores (list[torch.Tensor]): Classification scores of
different samples.
bbox_preds (list[torch.Tensor]): Box predictions of
different samples
dir_cls_preds (list[torch.Tensor]): Direction predictions of
different samples
gt_bboxes (list[:obj:`BaseInstance3DBoxes`]): Ground truth boxes.
gt_labels (list[torch.Tensor]): Ground truth labels.
input_metas (list[dict]): List of input meta information.
gt_bboxes_ignore (list[:obj:`BaseInstance3DBoxes`], optional):
Ground truth boxes that should be ignored. Defaults to None.
Returns:
dict[str, torch.Tensor]: Loss items.
- positive_bag_loss (torch.Tensor): Loss of positive samples.
- negative_bag_loss (torch.Tensor): Loss of negative samples.
"""
featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores]
assert len(featmap_sizes) == self.anchor_generator.num_levels
anchor_list = self.get_anchors(featmap_sizes, input_metas)
anchors = [torch.cat(anchor) for anchor in anchor_list]
# concatenate each level
cls_scores = [
cls_score.permute(0, 2, 3, 1).reshape(
cls_score.size(0), -1, self.num_classes)
for cls_score in cls_scores
]
bbox_preds = [
bbox_pred.permute(0, 2, 3, 1).reshape(
bbox_pred.size(0), -1, self.box_code_size)
for bbox_pred in bbox_preds
]
dir_cls_preds = [
dir_cls_pred.permute(0, 2, 3,
1).reshape(dir_cls_pred.size(0), -1, 2)
for dir_cls_pred in dir_cls_preds
]
cls_scores = torch.cat(cls_scores, dim=1)
bbox_preds = torch.cat(bbox_preds, dim=1)
dir_cls_preds = torch.cat(dir_cls_preds, dim=1)
cls_prob = torch.sigmoid(cls_scores)
box_prob = []
num_pos = 0
positive_losses = []
for _, (anchors_, gt_labels_, gt_bboxes_, cls_prob_, bbox_preds_,
dir_cls_preds_) in enumerate(
zip(anchors, gt_labels, gt_bboxes, cls_prob, bbox_preds,
dir_cls_preds)):
gt_bboxes_ = gt_bboxes_.tensor.to(anchors_.device)
with torch.no_grad():
# box_localization: a_{j}^{loc}, shape: [j, 4]
pred_boxes = self.bbox_coder.decode(anchors_, bbox_preds_)
# object_box_iou: IoU_{ij}^{loc}, shape: [i, j]
object_box_iou = bbox_overlaps_nearest_3d(
gt_bboxes_, pred_boxes)
# object_box_prob: P{a_{j} -> b_{i}}, shape: [i, j]
t1 = self.bbox_thr
t2 = object_box_iou.max(
dim=1, keepdim=True).values.clamp(min=t1 + 1e-12)
object_box_prob = ((object_box_iou - t1) / (t2 - t1)).clamp(
min=0, max=1)
# object_cls_box_prob: P{a_{j} -> b_{i}}, shape: [i, c, j]
num_obj = gt_labels_.size(0)
indices = torch.stack(
[torch.arange(num_obj).type_as(gt_labels_), gt_labels_],
dim=0)
object_cls_box_prob = torch.sparse_coo_tensor(
indices, object_box_prob)
# image_box_iou: P{a_{j} \in A_{+}}, shape: [c, j]
"""
from "start" to "end" implement:
image_box_iou = torch.sparse.max(object_cls_box_prob,
dim=0).t()
"""
# start
box_cls_prob = torch.sparse.sum(
object_cls_box_prob, dim=0).to_dense()
indices = torch.nonzero(box_cls_prob, as_tuple=False).t_()
if indices.numel() == 0:
image_box_prob = torch.zeros(
anchors_.size(0),
self.num_classes).type_as(object_box_prob)
else:
nonzero_box_prob = torch.where(
(gt_labels_.unsqueeze(dim=-1) == indices[0]),
object_box_prob[:, indices[1]],
torch.tensor(
[0]).type_as(object_box_prob)).max(dim=0).values
# unmap to shape [j, c]
image_box_prob = torch.sparse_coo_tensor(
indices.flip([0]),
nonzero_box_prob,
size=(anchors_.size(0), self.num_classes)).to_dense()
# end
box_prob.append(image_box_prob)
# construct bags for objects
match_quality_matrix = bbox_overlaps_nearest_3d(
gt_bboxes_, anchors_)
_, matched = torch.topk(
match_quality_matrix,
self.pre_anchor_topk,
dim=1,
sorted=False)
del match_quality_matrix
# matched_cls_prob: P_{ij}^{cls}
matched_cls_prob = torch.gather(
cls_prob_[matched], 2,
gt_labels_.view(-1, 1, 1).repeat(1, self.pre_anchor_topk,
1)).squeeze(2)
# matched_box_prob: P_{ij}^{loc}
matched_anchors = anchors_[matched]
matched_object_targets = self.bbox_coder.encode(
matched_anchors,
gt_bboxes_.unsqueeze(dim=1).expand_as(matched_anchors))
# direction classification loss
loss_dir = None
if self.use_direction_classifier:
# also calculate direction prob: P_{ij}^{dir}
matched_dir_targets = get_direction_target(
matched_anchors,
matched_object_targets,
self.dir_offset,
one_hot=False)
loss_dir = self.loss_dir(
dir_cls_preds_[matched].transpose(-2, -1),
matched_dir_targets,
reduction_override='none')
# generate bbox weights
if self.diff_rad_by_sin:
bbox_preds_[matched], matched_object_targets = \
self.add_sin_difference(
bbox_preds_[matched], matched_object_targets)
bbox_weights = matched_anchors.new_ones(matched_anchors.size())
# Use pop is not right, check performance
code_weight = self.train_cfg.get('code_weight', None)
if code_weight:
bbox_weights = bbox_weights * bbox_weights.new_tensor(
code_weight)
loss_bbox = self.loss_bbox(
bbox_preds_[matched],
matched_object_targets,
bbox_weights,
reduction_override='none').sum(-1)
if loss_dir is not None:
loss_bbox += loss_dir
matched_box_prob = torch.exp(-loss_bbox)
# positive_losses: {-log( Mean-max(P_{ij}^{cls} * P_{ij}^{loc}) )}
num_pos += len(gt_bboxes_)
positive_losses.append(
self.positive_bag_loss(matched_cls_prob, matched_box_prob))
positive_loss = torch.cat(positive_losses).sum() / max(1, num_pos)
# box_prob: P{a_{j} \in A_{+}}
box_prob = torch.stack(box_prob, dim=0)
# negative_loss:
# \sum_{j}{ FL((1 - P{a_{j} \in A_{+}}) * (1 - P_{j}^{bg})) } / n||B||
negative_loss = self.negative_bag_loss(cls_prob, box_prob).sum() / max(
1, num_pos * self.pre_anchor_topk)
losses = {
'positive_bag_loss': positive_loss,
'negative_bag_loss': negative_loss
}
return losses
def positive_bag_loss(self, matched_cls_prob, matched_box_prob):
"""Generate positive bag loss.
Args:
matched_cls_prob (torch.Tensor): Classification probability
of matched positive samples.
matched_box_prob (torch.Tensor): Bounding box probability
of matched positive samples.
Returns:
torch.Tensor: Loss of positive samples.
"""
# bag_prob = Mean-max(matched_prob)
matched_prob = matched_cls_prob * matched_box_prob
weight = 1 / torch.clamp(1 - matched_prob, 1e-12, None)
weight /= weight.sum(dim=1).unsqueeze(dim=-1)
bag_prob = (weight * matched_prob).sum(dim=1)
# positive_bag_loss = -self.alpha * log(bag_prob)
bag_prob = bag_prob.clamp(0, 1) # to avoid bug of BCE, check
return self.alpha * F.binary_cross_entropy(
bag_prob, torch.ones_like(bag_prob), reduction='none')
def negative_bag_loss(self, cls_prob, box_prob):
"""Generate negative bag loss.
Args:
cls_prob (torch.Tensor): Classification probability
of negative samples.
box_prob (torch.Tensor): Bounding box probability
of negative samples.
Returns:
torch.Tensor: Loss of negative samples.
"""
prob = cls_prob * (1 - box_prob)
prob = prob.clamp(0, 1) # to avoid bug of BCE, check
negative_bag_loss = prob**self.gamma * F.binary_cross_entropy(
prob, torch.zeros_like(prob), reduction='none')
return (1 - self.alpha) * negative_bag_loss
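# ---------------------------------------------------------------------------
# Minimal numeric sketch (added; not part of the upstream file) of the
# Mean-max weighting used in ``positive_bag_loss``: a bag with one confident
# anchor is pulled toward that anchor, while a uniform bag reduces to the
# plain mean.
# ---------------------------------------------------------------------------
def _demo_mean_max():
    import torch

    matched_prob = torch.tensor([[0.9, 0.2, 0.1],    # one confident anchor
                                 [0.3, 0.3, 0.3]])   # a uniform bag
    weight = 1 / torch.clamp(1 - matched_prob, 1e-12, None)
    weight = weight / weight.sum(dim=1, keepdim=True)
    bag_prob = (weight * matched_prob).sum(dim=1)
    # first bag -> ~0.76 (pulled toward the max of 0.9; a plain mean is 0.4)
    # second bag -> exactly 0.3 (equal weights recover the mean)
    return bag_prob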
================================================
FILE: mmdet3d/models/dense_heads/parta2_rpn_head.py
================================================
from __future__ import division
import numpy as np
import torch
from mmcv.runner import force_fp32
from mmdet3d.core import limit_period, xywhr2xyxyr
from mmdet3d.ops.iou3d.iou3d_utils import nms_gpu, nms_normal_gpu
from mmdet.models import HEADS
from .anchor3d_head import Anchor3DHead
@HEADS.register_module()
class PartA2RPNHead(Anchor3DHead):
"""RPN head for PartA2.
Note:
The main difference between the PartA2 RPN head and the Anchor3DHead
lies in their output during inference. PartA2 RPN head further returns
the original classification score for the second stage since the bbox
head in RoI head does not do classification task.
Different from RPN heads in 2D detectors, this RPN head does
multi-class classification task and uses FocalLoss like the SECOND and
PointPillars do. But this head uses class agnostic nms rather than
multi-class nms.
Args:
num_classes (int): Number of classes.
in_channels (int): Number of channels in the input feature map.
train_cfg (dict): Train configs.
test_cfg (dict): Test configs.
feat_channels (int): Number of channels of the feature map.
use_direction_classifier (bool): Whether to add a direction classifier.
anchor_generator(dict): Config dict of anchor generator.
assigner_per_size (bool): Whether to do assignment for each separate
anchor size.
assign_per_class (bool): Whether to do assignment for each class.
diff_rad_by_sin (bool): Whether to change the difference into sin
difference for box regression loss.
dir_offset (float | int): The offset of BEV rotation angles
(TODO: may be moved into box coder)
dir_limit_offset (float | int): The limited range of BEV
rotation angles. (TODO: may be moved into box coder)
bbox_coder (dict): Config dict of box coders.
loss_cls (dict): Config of classification loss.
loss_bbox (dict): Config of localization loss.
loss_dir (dict): Config of direction classifier loss.
"""
def __init__(self,
num_classes,
in_channels,
train_cfg,
test_cfg,
feat_channels=256,
use_direction_classifier=True,
anchor_generator=dict(
type='Anchor3DRangeGenerator',
range=[0, -39.68, -1.78, 69.12, 39.68, -1.78],
strides=[2],
sizes=[[1.6, 3.9, 1.56]],
rotations=[0, 1.57],
custom_values=[],
reshape_out=False),
assigner_per_size=False,
assign_per_class=False,
diff_rad_by_sin=True,
dir_offset=0,
dir_limit_offset=1,
bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder'),
loss_cls=dict(
type='CrossEntropyLoss',
use_sigmoid=True,
loss_weight=1.0),
loss_bbox=dict(
type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=2.0),
loss_dir=dict(type='CrossEntropyLoss', loss_weight=0.2)):
super().__init__(num_classes, in_channels, train_cfg, test_cfg,
feat_channels, use_direction_classifier,
anchor_generator, assigner_per_size, assign_per_class,
diff_rad_by_sin, dir_offset, dir_limit_offset,
bbox_coder, loss_cls, loss_bbox, loss_dir)
@force_fp32(apply_to=('cls_scores', 'bbox_preds', 'dir_cls_preds'))
def loss(self,
cls_scores,
bbox_preds,
dir_cls_preds,
gt_bboxes,
gt_labels,
input_metas,
gt_bboxes_ignore=None):
"""Calculate losses.
Args:
cls_scores (list[torch.Tensor]): Multi-level class scores.
bbox_preds (list[torch.Tensor]): Multi-level bbox predictions.
dir_cls_preds (list[torch.Tensor]): Multi-level direction
class predictions.
gt_bboxes (list[:obj:`BaseInstance3DBoxes`]): Ground truth boxes \
of each sample.
gt_labels (list[torch.Tensor]): Labels of each sample.
input_metas (list[dict]): Point cloud and image's meta info.
gt_bboxes_ignore (None | list[torch.Tensor]): Specify
which bounding boxes to ignore.
Returns:
dict[str, list[torch.Tensor]]: Classification, bbox, and \
direction losses of each level.
- loss_rpn_cls (list[torch.Tensor]): Classification losses.
- loss_rpn_bbox (list[torch.Tensor]): Box regression losses.
- loss_rpn_dir (list[torch.Tensor]): Direction classification \
losses.
"""
loss_dict = super().loss(cls_scores, bbox_preds, dir_cls_preds,
gt_bboxes, gt_labels, input_metas,
gt_bboxes_ignore)
# change the loss key names to avoid conflict
return dict(
loss_rpn_cls=loss_dict['loss_cls'],
loss_rpn_bbox=loss_dict['loss_bbox'],
loss_rpn_dir=loss_dict['loss_dir'])
def get_bboxes_single(self,
cls_scores,
bbox_preds,
dir_cls_preds,
mlvl_anchors,
input_meta,
cfg,
rescale=False):
"""Get bboxes of single branch.
Args:
cls_scores (torch.Tensor): Class score in single batch.
bbox_preds (torch.Tensor): Bbox prediction in single batch.
dir_cls_preds (torch.Tensor): Predictions of direction class
in single batch.
mlvl_anchors (List[torch.Tensor]): Multi-level anchors
in single batch.
input_meta (dict): Contains pcd and img's meta info.
cfg (None | :obj:`ConfigDict`): Training or testing config.
rescale (bool): Whether to rescale bboxes.
Returns:
dict: Predictions of single batch containing the following keys:
- boxes_3d (:obj:`BaseInstance3DBoxes`): Predicted 3d bboxes.
- scores_3d (torch.Tensor): Score of each bbox.
- labels_3d (torch.Tensor): Label of each bbox.
- cls_preds (torch.Tensor): Class score of each bbox.
"""
assert len(cls_scores) == len(bbox_preds) == len(mlvl_anchors)
mlvl_bboxes = []
mlvl_max_scores = []
mlvl_label_pred = []
mlvl_dir_scores = []
mlvl_cls_score = []
for cls_score, bbox_pred, dir_cls_pred, anchors in zip(
cls_scores, bbox_preds, dir_cls_preds, mlvl_anchors):
assert cls_score.size()[-2:] == bbox_pred.size()[-2:]
assert cls_score.size()[-2:] == dir_cls_pred.size()[-2:]
dir_cls_pred = dir_cls_pred.permute(1, 2, 0).reshape(-1, 2)
dir_cls_score = torch.max(dir_cls_pred, dim=-1)[1]
cls_score = cls_score.permute(1, 2,
0).reshape(-1, self.num_classes)
if self.use_sigmoid_cls:
scores = cls_score.sigmoid()
else:
scores = cls_score.softmax(-1)
bbox_pred = bbox_pred.permute(1, 2,
0).reshape(-1, self.box_code_size)
nms_pre = cfg.get('nms_pre', -1)
if self.use_sigmoid_cls:
max_scores, pred_labels = scores.max(dim=1)
else:
max_scores, pred_labels = scores[:, :-1].max(dim=1)
# get topk
if nms_pre > 0 and scores.shape[0] > nms_pre:
topk_scores, topk_inds = max_scores.topk(nms_pre)
anchors = anchors[topk_inds, :]
bbox_pred = bbox_pred[topk_inds, :]
max_scores = topk_scores
cls_score = scores[topk_inds, :]
dir_cls_score = dir_cls_score[topk_inds]
pred_labels = pred_labels[topk_inds]
bboxes = self.bbox_coder.decode(anchors, bbox_pred)
mlvl_bboxes.append(bboxes)
mlvl_max_scores.append(max_scores)
mlvl_cls_score.append(cls_score)
mlvl_label_pred.append(pred_labels)
mlvl_dir_scores.append(dir_cls_score)
mlvl_bboxes = torch.cat(mlvl_bboxes)
mlvl_bboxes_for_nms = xywhr2xyxyr(input_meta['box_type_3d'](
mlvl_bboxes, box_dim=self.box_code_size).bev)
mlvl_max_scores = torch.cat(mlvl_max_scores)
mlvl_label_pred = torch.cat(mlvl_label_pred)
mlvl_dir_scores = torch.cat(mlvl_dir_scores)
# shape [k, num_class] before sigmoid
# PartA2 needs to keep the raw classification score
# because the bbox head in the second stage does not have
# a classification branch,
# so the roi head needs this score as the classification score
mlvl_cls_score = torch.cat(mlvl_cls_score)
score_thr = cfg.get('score_thr', 0)
result = self.class_agnostic_nms(mlvl_bboxes, mlvl_bboxes_for_nms,
mlvl_max_scores, mlvl_label_pred,
mlvl_cls_score, mlvl_dir_scores,
score_thr, cfg.nms_post, cfg,
input_meta)
return result
def class_agnostic_nms(self, mlvl_bboxes, mlvl_bboxes_for_nms,
mlvl_max_scores, mlvl_label_pred, mlvl_cls_score,
mlvl_dir_scores, score_thr, max_num, cfg,
input_meta):
"""Class agnostic nms for single batch.
Args:
mlvl_bboxes (torch.Tensor): Bboxes from Multi-level.
mlvl_bboxes_for_nms (torch.Tensor): Bboxes for nms
(bev or minmax boxes) from Multi-level.
mlvl_max_scores (torch.Tensor): Max scores of Multi-level bbox.
mlvl_label_pred (torch.Tensor): Class predictions
of Multi-level bbox.
mlvl_cls_score (torch.Tensor): Class scores of
Multi-level bbox.
mlvl_dir_scores (torch.Tensor): Direction scores of
Multi-level bbox.
score_thr (float): Score threshold.
max_num (int): Max number of bboxes after nms.
cfg (None | :obj:`ConfigDict`): Training or testing config.
input_meta (dict): Contain pcd and img's meta info.
Returns:
dict: Predictions of single batch. Contain the keys:
- boxes_3d (:obj:`BaseInstance3DBoxes`): Predicted 3d bboxes.
- scores_3d (torch.Tensor): Score of each bbox.
- labels_3d (torch.Tensor): Label of each bbox.
- cls_preds (torch.Tensor): Class score of each bbox.
"""
bboxes = []
scores = []
labels = []
dir_scores = []
cls_scores = []
score_thr_inds = mlvl_max_scores > score_thr
_scores = mlvl_max_scores[score_thr_inds]
_bboxes_for_nms = mlvl_bboxes_for_nms[score_thr_inds, :]
if cfg.use_rotate_nms:
nms_func = nms_gpu
else:
nms_func = nms_normal_gpu
selected = nms_func(_bboxes_for_nms, _scores, cfg.nms_thr)
_mlvl_bboxes = mlvl_bboxes[score_thr_inds, :]
_mlvl_dir_scores = mlvl_dir_scores[score_thr_inds]
_mlvl_label_pred = mlvl_label_pred[score_thr_inds]
_mlvl_cls_score = mlvl_cls_score[score_thr_inds]
if len(selected) > 0:
bboxes.append(_mlvl_bboxes[selected])
scores.append(_scores[selected])
labels.append(_mlvl_label_pred[selected])
cls_scores.append(_mlvl_cls_score[selected])
dir_scores.append(_mlvl_dir_scores[selected])
dir_rot = limit_period(bboxes[-1][..., 6] - self.dir_offset,
self.dir_limit_offset, np.pi)
bboxes[-1][..., 6] = (
dir_rot + self.dir_offset +
np.pi * dir_scores[-1].to(bboxes[-1].dtype))
if bboxes:
bboxes = torch.cat(bboxes, dim=0)
scores = torch.cat(scores, dim=0)
cls_scores = torch.cat(cls_scores, dim=0)
labels = torch.cat(labels, dim=0)
dir_scores = torch.cat(dir_scores, dim=0)
if bboxes.shape[0] > max_num:
_, inds = scores.sort(descending=True)
inds = inds[:max_num]
bboxes = bboxes[inds, :]
labels = labels[inds]
scores = scores[inds]
cls_scores = cls_scores[inds]
bboxes = input_meta['box_type_3d'](
bboxes, box_dim=self.box_code_size)
return dict(
boxes_3d=bboxes,
scores_3d=scores,
labels_3d=labels,
cls_preds=cls_scores # raw scores [max_num, cls_num]
)
else:
return dict(
boxes_3d=input_meta['box_type_3d'](
mlvl_bboxes.new_zeros([0, self.box_code_size]),
box_dim=self.box_code_size),
scores_3d=mlvl_bboxes.new_zeros([0]),
labels_3d=mlvl_bboxes.new_zeros([0]),
cls_preds=mlvl_bboxes.new_zeros([0, mlvl_cls_score.shape[-1]]))
================================================
FILE: mmdet3d/models/dense_heads/shape_aware_head.py
================================================
import numpy as np
import torch
from mmcv.cnn import ConvModule, bias_init_with_prob, normal_init
from torch import nn as nn
from mmdet3d.core import box3d_multiclass_nms, limit_period, xywhr2xyxyr
from mmdet.core import multi_apply
from mmdet.models import HEADS
from ..builder import build_head
from .anchor3d_head import Anchor3DHead
@HEADS.register_module()
class BaseShapeHead(nn.Module):
"""Base Shape-aware Head in Shape Signature Network.
Note:
This base shape-aware grouping head uses default settings for small
objects. For large and huge objects, it is recommended to use
heavier heads, like (64, 64, 64) and (128, 128, 64, 64, 64) in
shared conv channels, (2, 1, 1) and (2, 1, 2, 1, 1) in shared
conv strides. For tiny objects, we can use smaller heads, like
(32, 32) channels and (1, 1) strides.
Args:
num_cls (int): Number of classes.
num_base_anchors (int): Number of anchors per location.
box_code_size (int): The dimension of boxes to be encoded.
in_channels (int): Input channels for convolutional layers.
shared_conv_channels (tuple): Channels for shared convolutional \
layers. Default: (64, 64). \
shared_conv_strides (tuple): Strides for shared convolutional \
layers. Default: (1, 1).
use_direction_classifier (bool, optional): Whether to use direction \
classifier. Default: True.
conv_cfg (dict): Config of conv layer. Default: dict(type='Conv2d')
norm_cfg (dict): Config of norm layer. Default: dict(type='BN2d').
bias (bool|str, optional): Type of bias. Default: False.
"""
def __init__(self,
num_cls,
num_base_anchors,
box_code_size,
in_channels,
shared_conv_channels=(64, 64),
shared_conv_strides=(1, 1),
use_direction_classifier=True,
conv_cfg=dict(type='Conv2d'),
norm_cfg=dict(type='BN2d'),
bias=False):
super().__init__()
self.num_cls = num_cls
self.num_base_anchors = num_base_anchors
self.use_direction_classifier = use_direction_classifier
self.box_code_size = box_code_size
assert len(shared_conv_channels) == len(shared_conv_strides), \
'Lengths of channels and strides list should be equal.'
self.shared_conv_channels = [in_channels] + list(shared_conv_channels)
self.shared_conv_strides = list(shared_conv_strides)
shared_conv = []
for i in range(len(self.shared_conv_strides)):
shared_conv.append(
ConvModule(
self.shared_conv_channels[i],
self.shared_conv_channels[i + 1],
kernel_size=3,
stride=self.shared_conv_strides[i],
padding=1,
conv_cfg=conv_cfg,
bias=bias,
norm_cfg=norm_cfg))
self.shared_conv = nn.Sequential(*shared_conv)
out_channels = self.shared_conv_channels[-1]
self.conv_cls = nn.Conv2d(out_channels, num_base_anchors * num_cls, 1)
self.conv_reg = nn.Conv2d(out_channels,
num_base_anchors * box_code_size, 1)
if use_direction_classifier:
self.conv_dir_cls = nn.Conv2d(out_channels, num_base_anchors * 2,
1)
def init_weights(self):
"""Initialize weights."""
bias_cls = bias_init_with_prob(0.01)
# shared conv layers have already been initialized by ConvModule
normal_init(self.conv_cls, std=0.01, bias=bias_cls)
normal_init(self.conv_reg, std=0.01)
if self.use_direction_classifier:
normal_init(self.conv_dir_cls, std=0.01, bias=bias_cls)
def forward(self, x):
"""Forward function for SmallHead.
Args:
x (torch.Tensor): Input feature map with the shape of
[B, C, H, W].
Returns:
dict[torch.Tensor]: Contain score of each class, bbox \
regression and direction classification predictions. \
Note that all the returned tensors are reshaped as \
[bs*num_base_anchors*H*W, num_cls/box_code_size/dir_bins]. \
It is more convenient to concat anchors for different \
classes even though they have different feature map sizes.
"""
x = self.shared_conv(x)
cls_score = self.conv_cls(x)
bbox_pred = self.conv_reg(x)
featmap_size = bbox_pred.shape[-2:]
H, W = featmap_size
B = bbox_pred.shape[0]
cls_score = cls_score.view(-1, self.num_base_anchors, self.num_cls, H,
W).permute(0, 1, 3, 4,
2).reshape(B, -1, self.num_cls)
bbox_pred = bbox_pred.view(-1, self.num_base_anchors,
self.box_code_size, H, W).permute(
0, 1, 3, 4,
2).reshape(B, -1, self.box_code_size)
dir_cls_preds = None
if self.use_direction_classifier:
dir_cls_preds = self.conv_dir_cls(x)
dir_cls_preds = dir_cls_preds.view(-1, self.num_base_anchors, 2, H,
W).permute(0, 1, 3, 4,
2).reshape(B, -1, 2)
ret = dict(
cls_score=cls_score,
bbox_pred=bbox_pred,
dir_cls_preds=dir_cls_preds,
featmap_size=featmap_size)
return ret
@HEADS.register_module()
class ShapeAwareHead(Anchor3DHead):
"""Shape-aware grouping head for SSN.
Args:
tasks (dict): Shape-aware groups of multi-class objects.
assign_per_class (bool, optional): Whether to do assignment for each \
class. Default: True.
kwargs (dict): Other arguments are the same as those in \
:class:`Anchor3DHead`.
"""
def __init__(self, tasks, assign_per_class=True, **kwargs):
self.tasks = tasks
self.featmap_sizes = []
super().__init__(assign_per_class=assign_per_class, **kwargs)
def _init_layers(self):
"""Initialize neural network layers of the head."""
self.heads = nn.ModuleList()
cls_ptr = 0
for task in self.tasks:
sizes = self.anchor_generator.sizes[cls_ptr:cls_ptr +
task['num_class']]
num_size = torch.tensor(sizes).reshape(-1, 3).size(0)
num_rot = len(self.anchor_generator.rotations)
num_base_anchors = num_rot * num_size
branch = dict(
type='BaseShapeHead',
num_cls=self.num_classes,
num_base_anchors=num_base_anchors,
box_code_size=self.box_code_size,
in_channels=self.in_channels,
shared_conv_channels=task['shared_conv_channels'],
shared_conv_strides=task['shared_conv_strides'])
self.heads.append(build_head(branch))
cls_ptr += task['num_class']
def init_weights(self):
"""Initialize the weights of head."""
for head in self.heads:
head.init_weights()
def forward_single(self, x):
"""Forward function on a single-scale feature map.
Args:
x (torch.Tensor): Input features.
Returns:
tuple[torch.Tensor]: Contain score of each class, bbox \
regression and direction classification predictions.
"""
results = []
for head in self.heads:
results.append(head(x))
cls_score = torch.cat([result['cls_score'] for result in results],
dim=1)
bbox_pred = torch.cat([result['bbox_pred'] for result in results],
dim=1)
dir_cls_preds = None
if self.use_direction_classifier:
dir_cls_preds = torch.cat(
[result['dir_cls_preds'] for result in results], dim=1)
self.featmap_sizes = []
for i, task in enumerate(self.tasks):
for _ in range(task['num_class']):
self.featmap_sizes.append(results[i]['featmap_size'])
assert len(self.featmap_sizes) == len(self.anchor_generator.ranges), \
'Length of feature map sizes must be equal to length of ' + \
'different ranges of anchor generator.'
return cls_score, bbox_pred, dir_cls_preds
def loss_single(self, cls_score, bbox_pred, dir_cls_preds, labels,
label_weights, bbox_targets, bbox_weights, dir_targets,
dir_weights, num_total_samples):
"""Calculate loss of Single-level results.
Args:
cls_score (torch.Tensor): Class score in single-level.
bbox_pred (torch.Tensor): Bbox prediction in single-level.
dir_cls_preds (torch.Tensor): Predictions of direction class
in single-level.
labels (torch.Tensor): Labels of class.
label_weights (torch.Tensor): Weights of class loss.
bbox_targets (torch.Tensor): Targets of bbox predictions.
bbox_weights (torch.Tensor): Weights of bbox loss.
dir_targets (torch.Tensor): Targets of direction predictions.
dir_weights (torch.Tensor): Weights of direction loss.
num_total_samples (int): The number of valid samples.
Returns:
tuple[torch.Tensor]: Losses of class, bbox \
and direction, respectively.
"""
# classification loss
if num_total_samples is None:
num_total_samples = int(cls_score.shape[0])
labels = labels.reshape(-1)
label_weights = label_weights.reshape(-1)
cls_score = cls_score.reshape(-1, self.num_classes)
loss_cls = self.loss_cls(
cls_score, labels, label_weights, avg_factor=num_total_samples)
# regression loss
bbox_targets = bbox_targets.reshape(-1, self.box_code_size)
bbox_weights = bbox_weights.reshape(-1, self.box_code_size)
code_weight = self.train_cfg.get('code_weight', None)
if code_weight:
bbox_weights = bbox_weights * bbox_weights.new_tensor(code_weight)
bbox_pred = bbox_pred.reshape(-1, self.box_code_size)
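# Encode the yaw difference with sine/cosine terms (add_sin_difference) so the
# regression loss stays smooth across the angle wrap-around.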
if self.diff_rad_by_sin:
bbox_pred, bbox_targets = self.add_sin_difference(
bbox_pred, bbox_targets)
loss_bbox = self.loss_bbox(
bbox_pred,
bbox_targets,
bbox_weights,
avg_factor=num_total_samples)
# direction classification loss
loss_dir = None
if self.use_direction_classifier:
dir_cls_preds = dir_cls_preds.reshape(-1, 2)
dir_targets = dir_targets.reshape(-1)
dir_weights = dir_weights.reshape(-1)
loss_dir = self.loss_dir(
dir_cls_preds,
dir_targets,
dir_weights,
avg_factor=num_total_samples)
return loss_cls, loss_bbox, loss_dir
def loss(self,
cls_scores,
bbox_preds,
dir_cls_preds,
gt_bboxes,
gt_labels,
input_metas,
gt_bboxes_ignore=None):
"""Calculate losses.
Args:
cls_scores (list[torch.Tensor]): Multi-level class scores.
bbox_preds (list[torch.Tensor]): Multi-level bbox predictions.
dir_cls_preds (list[torch.Tensor]): Multi-level direction
class predictions.
gt_bboxes (list[:obj:`BaseInstance3DBoxes`]): Gt bboxes
of each sample.
gt_labels (list[torch.Tensor]): Gt labels of each sample.
input_metas (list[dict]): Contain pcd and img's meta info.
gt_bboxes_ignore (None | list[torch.Tensor]): Specify
which bounding boxes can be ignored when computing the loss.

Returns:
dict[str, list[torch.Tensor]]: Classification, bbox, and \
direction losses of each level.
- loss_cls (list[torch.Tensor]): Classification losses.
- loss_bbox (list[torch.Tensor]): Box regression losses.
- loss_dir (list[torch.Tensor]): Direction classification \
losses.
"""
device = cls_scores[0].device
anchor_list = self.get_anchors(
self.featmap_sizes, input_metas, device=device)
cls_reg_targets = self.anchor_target_3d(
anchor_list,
gt_bboxes,
input_metas,
gt_bboxes_ignore_list=gt_bboxes_ignore,
gt_labels_list=gt_labels,
num_classes=self.num_classes,
sampling=self.sampling)
if cls_reg_targets is None:
return None
(labels_list, label_weights_list, bbox_targets_list, bbox_weights_list,
dir_targets_list, dir_weights_list, num_total_pos,
num_total_neg) = cls_reg_targets
num_total_samples = (
num_total_pos + num_total_neg if self.sampling else num_total_pos)
# num_total_samples = None
losses_cls, losses_bbox, losses_dir = multi_apply(
self.loss_single,
cls_scores,
bbox_preds,
dir_cls_preds,
labels_list,
label_weights_list,
bbox_targets_list,
bbox_weights_list,
dir_targets_list,
dir_weights_list,
num_total_samples=num_total_samples)
return dict(
loss_cls=losses_cls, loss_bbox=losses_bbox, loss_dir=losses_dir)
def get_bboxes(self,
cls_scores,
bbox_preds,
dir_cls_preds,
input_metas,
cfg=None,
rescale=False):
"""Get bboxes of anchor head.
Args:
cls_scores (list[torch.Tensor]): Multi-level class scores.
bbox_preds (list[torch.Tensor]): Multi-level bbox predictions.
dir_cls_preds (list[torch.Tensor]): Multi-level direction
class predictions.
input_metas (list[dict]): Contain pcd and img's meta info.
cfg (None | :obj:`ConfigDict`): Training or testing config.
Default: None.
rescale (list[torch.Tensor], optional): Whether to rescale bbox.
Default: False.
Returns:
list[tuple]: Prediction results of batches.
"""
assert len(cls_scores) == len(bbox_preds)
assert len(cls_scores) == len(dir_cls_preds)
num_levels = len(cls_scores)
assert num_levels == 1, 'Only support single level inference.'
device = cls_scores[0].device
mlvl_anchors = self.anchor_generator.grid_anchors(
self.featmap_sizes, device=device)
# `anchor` is a list of anchors for different classes
mlvl_anchors = [torch.cat(anchor, dim=0) for anchor in mlvl_anchors]
result_list = []
for img_id in range(len(input_metas)):
cls_score_list = [
cls_scores[i][img_id].detach() for i in range(num_levels)
]
bbox_pred_list = [
bbox_preds[i][img_id].detach() for i in range(num_levels)
]
dir_cls_pred_list = [
dir_cls_preds[i][img_id].detach() for i in range(num_levels)
]
input_meta = input_metas[img_id]
proposals = self.get_bboxes_single(cls_score_list, bbox_pred_list,
dir_cls_pred_list, mlvl_anchors,
input_meta, cfg, rescale)
result_list.append(proposals)
return result_list
def get_bboxes_single(self,
cls_scores,
bbox_preds,
dir_cls_preds,
mlvl_anchors,
input_meta,
cfg=None,
rescale=False):
"""Get bboxes of single branch.
Args:
cls_scores (torch.Tensor): Class score in single batch.
bbox_preds (torch.Tensor): Bbox prediction in single batch.
dir_cls_preds (torch.Tensor): Predictions of direction class
in single batch.
mlvl_anchors (List[torch.Tensor]): Multi-level anchors
in single batch.
input_meta (list[dict]): Contain pcd and img's meta info.
cfg (None | :obj:`ConfigDict`): Training or testing config.
rescale (list[torch.Tensor], optional): whether to rescale bbox. \
Default: False.
Returns:
tuple: Contain predictions of single batch.
- bboxes (:obj:`BaseInstance3DBoxes`): Predicted 3d bboxes.
- scores (torch.Tensor): Class score of each bbox.
- labels (torch.Tensor): Label of each bbox.
"""
cfg = self.test_cfg if cfg is None else cfg
assert len(cls_scores) == len(bbox_preds) == len(mlvl_anchors)
mlvl_bboxes = []
mlvl_scores = []
mlvl_dir_scores = []
for cls_score, bbox_pred, dir_cls_pred, anchors in zip(
cls_scores, bbox_preds, dir_cls_preds, mlvl_anchors):
assert cls_score.size()[-2] == bbox_pred.size()[-2]
assert cls_score.size()[-2] == dir_cls_pred.size()[-2]
dir_cls_score = torch.max(dir_cls_pred, dim=-1)[1]
if self.use_sigmoid_cls:
scores = cls_score.sigmoid()
else:
scores = cls_score.softmax(-1)
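# Before decoding, keep only the top `nms_pre` proposals ranked by their best
# class score to bound the cost of the rotated multi-class NMS below.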
nms_pre = cfg.get('nms_pre', -1)
if nms_pre > 0 and scores.shape[0] > nms_pre:
if self.use_sigmoid_cls:
max_scores, _ = scores.max(dim=1)
else:
max_scores, _ = scores[:, :-1].max(dim=1)
_, topk_inds = max_scores.topk(nms_pre)
anchors = anchors[topk_inds, :]
bbox_pred = bbox_pred[topk_inds, :]
scores = scores[topk_inds, :]
dir_cls_score = dir_cls_score[topk_inds]
bboxes = self.bbox_coder.decode(anchors, bbox_pred)
mlvl_bboxes.append(bboxes)
mlvl_scores.append(scores)
mlvl_dir_scores.append(dir_cls_score)
mlvl_bboxes = torch.cat(mlvl_bboxes)
mlvl_bboxes_for_nms = xywhr2xyxyr(input_meta['box_type_3d'](
mlvl_bboxes, box_dim=self.box_code_size).bev)
mlvl_scores = torch.cat(mlvl_scores)
mlvl_dir_scores = torch.cat(mlvl_dir_scores)
if self.use_sigmoid_cls:
# Add a dummy background class to the front when using sigmoid
padding = mlvl_scores.new_zeros(mlvl_scores.shape[0], 1)
mlvl_scores = torch.cat([mlvl_scores, padding], dim=1)
score_thr = cfg.get('score_thr', 0)
results = box3d_multiclass_nms(mlvl_bboxes, mlvl_bboxes_for_nms,
mlvl_scores, score_thr, cfg.max_num,
cfg, mlvl_dir_scores)
bboxes, scores, labels, dir_scores = results
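# Recover the final yaw: limit the regressed angle (relative to dir_offset) to a
# pi period, then add pi when the direction classifier predicts the flipped bin.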
if bboxes.shape[0] > 0:
dir_rot = limit_period(bboxes[..., 6] - self.dir_offset,
self.dir_limit_offset, np.pi)
bboxes[..., 6] = (
dir_rot + self.dir_offset +
np.pi * dir_scores.to(bboxes.dtype))
bboxes = input_meta['box_type_3d'](bboxes, box_dim=self.box_code_size)
return bboxes, scores, labels
================================================
FILE: mmdet3d/models/dense_heads/sparsefusion_head_deform.py
================================================
import copy
import numpy as np
import torch
import functools
import pickle
import os
from mmcv.cnn import ConvModule, build_conv_layer, kaiming_init
from mmcv.runner import force_fp32
from torch import nn
import torch.nn.functional as F
import time
from mmdet3d.core import (circle_nms, draw_heatmap_gaussian, gaussian_radius,
xywhr2xyxyr, limit_period, PseudoSampler, BboxOverlaps3D)
from mmdet3d.models.builder import HEADS, build_loss
from mmdet3d.ops.iou3d.iou3d_utils import nms_gpu
from mmdet3d.models.utils import clip_sigmoid, inverse_sigmoid
from mmdet3d.models.fusion_layers import apply_3d_transformation
from mmdet.core import build_bbox_coder, multi_apply, build_assigner, build_sampler, AssignResult
from mmdet3d.models.utils import FFN, TransformerDecoderLayer, PositionEmbeddingLearned, PositionEmbeddingLearnedwoNorm,\
PointTransformer2D_3D, ImageTransformer_Cam_3D_MS, ProjectionLayerNorm, FusionTransformer2D_3D_Self, \
ViewTransformer, DepthEncoderResNet, LayerNorm, ConvLN, FFNLN, normalize_pos
from mmdet3d.models.utils.ops.modules import MSDeformAttn
from mmdet3d.models.utils.deformable_decoder import DeformableTransformerDecoderLayer
@HEADS.register_module()
class SparseFusionHead2D_Deform(nn.Module):
def __init__(self,
num_views=0,
in_channels_img=64,
out_size_factor_img=4,
num_proposals=128,
num_img_proposals=128,
in_channels=128 * 3,
hidden_channel=128,
num_classes=4,
# config for Transformer
num_pts_decoder_layers=1,
num_img_decoder_layers=1,
num_fusion_decoder_layers=1,
num_heads=8,
initialize_by_heatmap=True,
semantic_transfer=True,
cross_only=True,
range_num=5,
cross_heatmap_layer=1,
img_heatmap_layer=2,
img_reg_layer=3,
nms_kernel_size=3,
img_nms_kernel_size=3,
ffn_channel=256,
dropout=0.1,
bn_momentum=0.1,
activation='relu',
# config for FFN
common_heads=dict(),
conv_cfg=dict(type='Conv1d'),
norm_cfg=dict(type='BN1d'),
bias='auto',
# loss
loss_cls=dict(type='GaussianFocalLoss', reduction='mean'),
loss_bbox=dict(type='L1Loss', reduction='mean'),
loss_heatmap=dict(type='GaussianFocalLoss', reduction='mean'),
loss_heatmap_2d=dict(type='GaussianFocalLoss', reduction='mean'),
loss_center_2d=dict(type='L1Loss', reduction='mean'),
# others
train_cfg=None,
test_cfg=None,
bbox_coder=None,
bbox_2d_coder=None,
use_camera='se',
level_num=4,
img_reg_bn=False,
geometric_transfer=True,
view_transform=True,
depth_input_channel=2,
):
super(SparseFusionHead2D_Deform, self).__init__()
self.num_proposals = num_proposals
self.num_img_proposals = num_img_proposals
self.num_classes = num_classes
self.bbox_coder = build_bbox_coder(bbox_coder)
self.bbox_2d_coder = build_bbox_coder(bbox_2d_coder)
self.bn_momentum = bn_momentum
self.train_cfg = train_cfg
self.test_cfg = test_cfg
self.initialize_by_heatmap = initialize_by_heatmap
self.semantic_transfer = semantic_transfer
self.cross_only = cross_only
self.level_num = level_num
self.in_channels_img = in_channels_img
self.view_transform = view_transform
self.range_num = range_num
self.loss_cls = build_loss(loss_cls)
self.loss_bbox = build_loss(loss_bbox)
self.loss_heatmap = build_loss(loss_heatmap)
self.loss_heatmap_2d = build_loss(loss_heatmap_2d)
self.loss_center_2d = build_loss(loss_center_2d)
self.num_img_decoder_layers = num_img_decoder_layers
self.num_pts_decoder_layers = num_pts_decoder_layers
self.num_fusion_decoder_layers = num_fusion_decoder_layers
self.hidden_channel = hidden_channel
self.sampling = False
self.out_size_factor_img = out_size_factor_img
self.geometric_transfer = geometric_transfer
self.use_sigmoid_cls = loss_cls.get('use_sigmoid', False)
if not self.use_sigmoid_cls:
self.num_classes += 1
heads3d = copy.deepcopy(common_heads)
heads3d.update(dict(heatmap=(self.num_classes, 2)))
pts_prediction_heads = FFN(hidden_channel, heads3d, conv_cfg=conv_cfg, norm_cfg=norm_cfg, bias=bias)
fusion_heads = dict(center=(2, 2), height=(1, 2), dim=(3, 2), rot=(2, 2), vel=(2, 2), heatmap=(self.num_classes, 2))
fusion_prediction_heads = FFN(hidden_channel, fusion_heads, conv_cfg=conv_cfg, norm_cfg=norm_cfg, bias=bias)
heads2d = dict(center_2d=(2, img_reg_layer, img_reg_bn), depth_2d=(1, img_reg_layer, img_reg_bn), cls=(self.num_classes, 2),
dim_2d=(3, img_reg_layer, img_reg_bn), rot_2d=(2, img_reg_layer, img_reg_bn), vel_2d=(2, img_reg_layer, img_reg_bn)
)
# img_prediction_heads = FFN(hidden_channel, heads2d, conv_cfg=conv_cfg, norm_cfg=norm_cfg, bias=bias)
img_prediction_heads = FFNLN(hidden_channel, heads2d, conv_cfg=conv_cfg, norm_cfg=norm_cfg, bias=bias)
pts_query_pos_embed = [PositionEmbeddingLearned(2, hidden_channel) for _ in range(num_pts_decoder_layers)]
pts_key_pos_embed = [PositionEmbeddingLearned(2, hidden_channel) for _ in range(num_pts_decoder_layers)]
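# LiDAR branch: transformer decoder layers that refine the BEV object queries
# against the flattened LiDAR BEV features and predict 3D boxes through
# pts_prediction_heads.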
self.point_transformer = PointTransformer2D_3D(
hidden_channel=hidden_channel, num_heads=num_heads, num_decoder_layers=num_pts_decoder_layers,
prediction_heads=pts_prediction_heads, ffn_channel=ffn_channel, dropout=dropout, activation=activation, test_cfg=test_cfg,
query_pos=pts_query_pos_embed, key_pos=pts_key_pos_embed
)
img_query_pos_embed = [PositionEmbeddingLearnedwoNorm(2, hidden_channel) for _ in range(num_img_decoder_layers)]
img_key_pos_embed = [PositionEmbeddingLearnedwoNorm(2, hidden_channel) for _ in range(num_img_decoder_layers)]
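# Camera branch: multi-scale image transformer that refines the image queries
# against the per-view feature pyramid and regresses the camera-frame outputs
# defined in heads2d (center_2d, depth_2d, dim_2d, rot_2d, vel_2d, cls).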
self.img_transformer = ImageTransformer_Cam_3D_MS(
hidden_channel=hidden_channel, num_heads=num_heads, num_decoder_layers=num_img_decoder_layers, out_size_factor_img=out_size_factor_img,
num_views=num_views, prediction_heads=img_prediction_heads, ffn_channel=ffn_channel, dropout=dropout, activation=activation, test_cfg=test_cfg,
query_pos=img_query_pos_embed, key_pos=img_key_pos_embed
)
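# Optional view transformer: refines the image queries in BEV space using the
# per-query camera information (use_camera) and predicts BEV-frame box
# parameters through heads_view (center_view, height_view, dim_view, ...).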
if view_transform:
heads_view = dict(center_view=(2, 2), height_view=(1, 2), dim_view=(3, 2), rot_view=(2, 2),
vel_view=(2, 2), heatmap_view=(self.num_classes, 2))
view_prediction_heads = FFN(hidden_channel, heads_view, conv_cfg=conv_cfg, norm_cfg=norm_cfg, bias=bias)
# view_prediction_heads = FFNLN(hidden_channel, heads_view, conv_cfg=conv_cfg, norm_cfg=norm_cfg, bias=bias)
view_query_pos_embed = PositionEmbeddingLearnedwoNorm(9, hidden_channel)
view_key_pos_embed = PositionEmbeddingLearnedwoNorm(9, hidden_channel)
view_projection = ProjectionLayerNorm(hidden_channel)
self.view_transformer = ViewTransformer(
hidden_channel=hidden_channel, num_heads=num_heads, prediction_heads=view_prediction_heads,
ffn_channel=ffn_channel, dropout=dropout, activation=activation, test_cfg=test_cfg,
query_pos=view_query_pos_embed, key_pos=view_key_pos_embed, view_projection=view_projection,
use_camera=use_camera
)
fusion_query_pos_embed = [PositionEmbeddingLearned(2, hidden_channel) for _ in range(self.num_fusion_decoder_layers)]
fusion_key_pos_embed = [PositionEmbeddingLearned(2, hidden_channel) for _ in range(self.num_fusion_decoder_layers)]
fuse_pts_projection = ProjectionLayerNorm(hidden_channel)
fuse_img_projection = ProjectionLayerNorm(hidden_channel)
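# Fusion stage: LiDAR and image queries are first projected by their own
# LayerNorm projections, then jointly refined by the fusion transformer, which
# predicts the final 3D boxes through fusion_prediction_heads.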
self.fusion_transformer = FusionTransformer2D_3D_Self(
hidden_channel=hidden_channel, num_heads=num_heads, num_decoder_layers=num_fusion_decoder_layers,
prediction_heads=fusion_prediction_heads, ffn_channel=ffn_channel, dropout=dropout,
activation=activation, test_cfg=test_cfg, query_pos=fusion_query_pos_embed, key_pos=fusion_query_pos_embed,
pts_projection=fuse_pts_projection, img_projection=fuse_img_projection,
num_proposals=num_proposals
)
if self.initialize_by_heatmap and self.semantic_transfer:
self.heatmap_pts_proj = nn.Sequential(
nn.Linear(hidden_channel, hidden_channel),
nn.LayerNorm(hidden_channel)
)
self.heatmap_img_proj = nn.Sequential(
nn.Linear(hidden_channel, hidden_channel),
nn.LayerNorm(hidden_channel)
)
self.cross_heatmap_head = self.build_heatmap_LN(hidden_channel, bias, num_classes, layer_num=cross_heatmap_layer)
colattn_query_pos = PositionEmbeddingLearnedwoNorm(3, hidden_channel)
colattn_key_pos = PositionEmbeddingLearnedwoNorm(2, hidden_channel)
self.cross_heatmap_decoder = DeformableTransformerDecoderLayer(
hidden_channel, num_heads, dim_feedforward=ffn_channel, dropout=dropout, activation=activation,
self_posembed=colattn_query_pos, cross_posembed=colattn_key_pos, cross_only=False
)
self.reduce_conv = ConvLN(
hidden_channel+1, hidden_channel, kernel_size=3, padding=1
)
# a shared convolution
self.shared_conv = build_conv_layer(
dict(type='Conv2d'),
in_channels,
hidden_channel,
kernel_size=3,
padding=1,
bias=bias,
)
# transformer decoder layers for object query with LiDAR feature
self.num_views = num_views
if self.geometric_transfer:
self.shared_conv_img = nn.Identity()
blocks = [1] * self.level_num
assert len(blocks) == self.level_num
self.depth_resnet = DepthEncoderResNet(depth_input_channel, in_channels_img, hidden_channel, depth_layers=blocks)
else:
self.shared_conv_img = build_conv_layer(
dict(type='Conv2d'),
in_channels_img, # channel of img feature map
hidden_channel,
kernel_size=3,
padding=1,
bias=bias,
)
# Position Embedding for Cross-Attention, which is re-used during training
x_size = self.test_cfg['grid_size'][0] // self.test_cfg['out_size_factor']
y_size = self.test_cfg['grid_size'][1] // self.test_cfg['out_size_factor']
self.bev_pos = self.create_2D_grid(x_size, y_size)
if self.initialize_by_heatmap:
self.heatmap_head = self.build_heatmap(hidden_channel, bias, num_classes)
self.img_heatmap_head = nn.ModuleList()
for lvl in range(self.level_num):
self.img_heatmap_head.append(self.build_heatmap_LN(hidden_channel, bias, num_classes, layer_num=img_heatmap_layer))
self.class_encoding = nn.Conv1d(num_classes, hidden_channel, 1)
self.img_class_encoding = nn.Conv1d(num_classes, hidden_channel, 1)
else:
# query feature
self.pts_query_feat = nn.Parameter(torch.randn(1, hidden_channel, self.num_proposals))
self.pts_query_pos = nn.Parameter(torch.rand([1, self.num_proposals, 2])*torch.Tensor([x_size, y_size]).reshape(1, 1, 2), requires_grad=True)
self.img_query_feat = nn.Parameter(torch.randn(1, hidden_channel, self.num_img_proposals))
self.img_query_pos = nn.Parameter(torch.rand([1, self.num_img_proposals, 2]), requires_grad=True)
self.img_query_pos = inverse_sigmoid(self.img_query_pos)
self.nms_kernel_size = nms_kernel_size
self.img_nms_kernel_size = img_nms_kernel_size
self.img_feat_pos = None
self.img_feat_collapsed_pos = None
self.init_weights()
self._init_assigner_sampler()
def create_2D_grid(self, x_size, y_size):
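# Returns a (1, x_size*y_size, 2) grid of (x + 0.5, y + 0.5) cell-center
# coordinates, used as positional references for BEV and image feature maps.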
meshgrid = [[0, x_size - 1, x_size], [0, y_size - 1, y_size]]
batch_y, batch_x = torch.meshgrid(*[torch.linspace(it[0], it[1], it[2]) for it in meshgrid])
batch_x = batch_x + 0.5
batch_y = batch_y + 0.5
coord_base = torch.cat([batch_x[None], batch_y[None]], dim=0)[None]
coord_base = coord_base.view(1, 2, -1).permute(0, 2, 1)
return coord_base
def init_bn_momentum(self):
for m in self.modules():
if isinstance(m, (nn.BatchNorm2d, nn.BatchNorm1d)):
m.momentum = self.bn_momentum
def init_weights(self):
# initialize transformer
for m in self.parameters():
if m.dim() > 1:
nn.init.xavier_uniform_(m)
for m in self.modules():
if isinstance(m, MSDeformAttn):
m._reset_parameters()
self.init_bn_momentum()
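# Learnable per-level embedding added to each image pyramid level before
# shared_conv_img; its channel width matches the features at that point
# (hidden_channel after the depth encoder, otherwise the raw in_channels_img).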
if self.geometric_transfer:
level_pos = torch.zeros([self.level_num, self.hidden_channel])
else:
level_pos = torch.zeros([self.level_num, self.in_channels_img])
self.level_pos = nn.Parameter(level_pos, requires_grad=True)
torch.nn.init.normal_(self.level_pos)
def _init_assigner_sampler(self):
"""Initialize the target assigner and sampler of the head."""
if self.train_cfg is None:
return
if self.sampling:
self.bbox_sampler = build_sampler(self.train_cfg.sampler)
else:
self.bbox_sampler = PseudoSampler()
if isinstance(self.train_cfg.assigner, dict):
self.bbox_assigner = build_assigner(self.train_cfg.assigner)
elif isinstance(self.train_cfg.assigner, list):
self.bbox_assigner = [
build_assigner(res) for res in self.train_cfg.assigner
]
if isinstance(self.train_cfg.assigner_2d, dict):
self.bbox_assigner_2d = build_assigner(self.train_cfg.assigner_2d)
elif isinstance(self.train_cfg.assigner_2d, list):
self.bbox_assigner_2d = [
build_assigner(res) for res in self.train_cfg.assigner_2d
]
def forward_single(self, inputs, img_inputs, img_metas, sparse_depth):
"""
Args:
inputs (torch.Tensor): Input feature map with the shape of
[B, C, 128(H), 128(W)]. (consistent with L748)
img_inputs (torch.Tensor): Input feature map with the shape of
[B*num_view, C, image_H, image_W]
sparse_depth (torch.Tensor): Input normalized depth with the shape of
[B, num_views, num_scales, depth_C, depth_H, depth_W]
Returns:
list[dict]: Output results for tasks.
"""
batch_size = inputs.shape[0]
sparse_depth = sparse_depth[:, :, 0, :2]
if self.geometric_transfer:
sparse_depth = sparse_depth.view(batch_size*self.num_views, 1, -1, sparse_depth.shape[-2], sparse_depth.shape[-1])
img_inputs = self.depth_resnet(sparse_depth[:, 0], img_inputs)
img_feats = []
for i in range(self.level_num):
img_inputs_level = img_inputs[i] + self.level_pos[i].reshape(1, self.level_pos[i].shape[0], 1, 1)
img_feat = self.shared_conv_img(img_inputs_level)
img_feats.append(img_feat)
input_padding_mask = self.construct_input_padding_mask(img_feats, img_metas)
# input_padding_mask = None
img_feats_pos = []
normal_img_feats_pos = []
for lvl in range(self.level_num):
h, w = img_feats[lvl].shape[-2], img_feats[lvl].shape[-1]
img_feat_pos = self.create_2D_grid(h, w).to(img_feats[lvl].device) # (1, h*w, 2)
img_feats_pos.append(img_feat_pos)
normal_img_feat_pos = normalize_pos(img_feat_pos, w, h) # (1, h*w, 2)
normal_img_feats_pos.append(normal_img_feat_pos)
normal_img_feats_pos_stack = torch.cat(normal_img_feats_pos, dim=1) # (1, h*w (sum), 2)
self.normal_img_feats_pos_stack = normal_img_feats_pos_stack
normal_img_feats_pos_repeat = normal_img_feats_pos_stack.repeat(batch_size, 1, 1)
proj_matrix = self.construct_projection_matrix(img_metas, normal_img_feats_pos_stack.device)
inputs, min_voxel_height, max_voxel_height = inputs[:, :-2], inputs[:, -2], inputs[:, -1]
lidar_feat = self.shared_conv(inputs) # [BS, C, H, W]
#################################
# image to BEV
#################################
lidar_feat_flatten = lidar_feat.view(batch_size, lidar_feat.shape[1], -1) # [BS, C, H*W]
bev_pos = self.bev_pos.repeat(batch_size, 1, 1).to(lidar_feat.device) # [BS, H*W, 2]
if self.initialize_by_heatmap:
if self.semantic_transfer:
img_feat_cross = []
for level in range(self.level_num):
img_feat_cross.append(img_feats[level].clone())
else:
img_feat_cross = None
heatmap, dense_heatmap, pts_top_proposals_class, pts_top_proposals_index = self.generate_heatmap(lidar_feat.clone(), min_voxel_height, max_voxel_height, batch_size, img_metas, proj_matrix['lidar2img_rt'], img_feat_cross, input_padding_mask)
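# Gather the initial LiDAR query features/positions at the top-k heatmap peaks
# (indices over the flattened BEV grid) and tag each query with a one-hot class
# embedding of its peak category via class_encoding.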
pts_query_feat = lidar_feat_flatten.gather(
index=pts_top_proposals_index[:, None, :].expand(-1, lidar_feat_flatten.shape[1], -1), dim=-1
) # [BS, C, num_proposals]
# add category embedding
one_hot = F.one_hot(pts_top_proposals_class, num_classes=self.num_classes).permute(0, 2, 1) # [BS, num_classes, num_proposals]
query_cat_encoding = self.class_encoding(one_hot.float()) # [BS, C, num_proposals]
self.query_labels = pts_top_proposals_class
pts_query_feat += query_cat_encoding
pts_query_pos = bev_pos.gather(
index=pts_top_proposals_index[:, None, :].permute(0, 2, 1).expand(-1, -1, bev_pos.shape[-1]), dim=1
) # [BS, num_proposals, 2]
else:
pts_query_feat = self.pts_query_feat.repeat(batch_size, 1, 1) # [BS, C, num_proposals]
pts_query_pos = self.pts_query_pos.repeat(batch_size, 1, 1).to(lidar_feat.device) # [BS, num_proposals, 2]
if self.initialize_by_heatmap:
img_feats_heatmap = []
for lvl in range(self.level_num):
img_feats_heatmap.append(img_feats[lvl].clone())
img_heatmap, img_dense_heatmap, img_top_proposals_class, img_top_proposals_index, img_top_proposals_view_idx, img_top_proposals_pos_id = \
self.generate_heatmap_img(img_feats_heatmap, batch_size)
img_feats_flatten = []
for lvl in range(self.level_num):
img_feat = img_feats[lvl]
h, w = img_feat.shape[-2], img_feat.shape[-1]
img_feat_flatten = img_feat.reshape(batch_size, self.num_views, self.hidden_channel, h * w)
img_feat_flatten = img_feat_flatten.permute(0, 2, 1, 3) # [BS, C, num_view, h*w]
img_feats_flatten.append(img_feat_flatten)
img_feat_stack = torch.cat(img_feats_flatten, dim=-1) # [BS, C, num_view, h*w (sum)]
img_feat_stack = img_feat_stack.view(batch_size, self.hidden_channel, self.num_views*img_feat_stack.shape[-1])
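# Gather the image queries at the top image-heatmap peaks: positions come from
# the stacked multi-level grid, features from the per-view feature stack,
# together with each peak's view index and class (encoded by img_class_encoding).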
normal_img_query_pos = normal_img_feats_pos_repeat.gather(
index=img_top_proposals_pos_id[:, None, :].permute(0, 2, 1).expand(-1, -1, normal_img_feats_pos_stack.shape[-1]), dim=1
) # [BS, num_proposals, 2]
img_query_feat = img_feat_stack.gather(
index=img_top_proposals_index[:, None, :].expand(-1, img_feat_stack.shape[1], -1), dim=-1
) # [BS, C, num_proposals]
img_query_view = img_top_proposals_view_idx.clone() # [BS, num_proposals]
one_hot = F.one_hot(img_top_proposals_class, num_classes=self.num_classes).permute(0, 2, 1) # [BS, num_classes, num_proposals]
self.img_query_label = img_top_proposals_class
img_query_cat_encoding = self.img_class_encoding(one_hot.float()) # [BS, C, num_proposals]
img_query_feat += img_query_cat_encoding
else:
img_query_feat = self.img_query_feat.repeat(batch_size, 1, 1) # [BS, C, num_proposals]
normal_img_query_pos = self.img_query_pos.repeat(batch_size, 1, 1).to(img_feat.device) # [BS, num_proposals, 2]
img_query_pos_view = torch.arange(self.num_img_proposals).reshape(1, -1).repeat(batch_size, 1).to(img_feat.device)
img_query_view = img_query_pos_view % self.num_views
view_proj_matrix = self.construction_view_projection_matrix(proj_matrix, img_query_view)
#################################
# transformer decoder layer (LiDAR feature as K,V)
#################################
ret_dicts = []
pts_query_feat, pts_query_pos, pts_ret_dicts = self.point_transformer(pts_query_feat, pts_query_pos, lidar_feat_flatten, bev_pos)
ret_dicts.extend(pts_ret_dicts)
#################################
# transformer decoder layer (img feature as K,V)
#################################
img_query_feat, normal_img_query_pos, img_query_pos_bev, camera_info, img_ret_dicts = \
self.img_transformer(img_query_feat, normal_img_query_pos, img_query_view, img_feats, normal_img_feats_pos_stack, view_proj_matrix['lidar2cam_rt'], view_proj_matrix['cam_intrinsic'], img_metas, input_padding_mask)
#################################
# view transformation layer
#################################
if self.view_transform:
img_query_feat, img_query_pos_bev, view_ret_dicts = self.view_transformer(img_query_feat, img_query_pos_bev, normal_img_query_pos[..., :2], img_ret_dicts, camera_info)
img_query_pos_bev = img_query_pos_bev[..., :2]
#################################
# fusion layer
#################################
all_query_feat, all_query_pos, fusion_ret_dicts = self.fusion_transformer(pts_query_feat, pts_query_pos, img_query_feat, img_query_pos_bev)
ret_dicts.extend(fusion_ret_dicts)
if self.initialize_by_heatmap:
ret_dicts[0]['query_heatmap_score'] = heatmap.gather(index=pts_top_proposals_index[:, None, :].expand(-1, self.num_classes, -1), dim=-1) # [bs, num_classes, num_proposals]
ret_dicts[0]['dense_heatmap'] = dense_heatmap
ret_dicts[0]['img_query_heatmap_score'] = img_heatmap.gather(index=img_top_proposals_index[:, None, :].expand(-1, self.num_classes, -1), dim=-1) # [bs, num_classes, num_proposals]
ret_dicts[0]['img_dense_heatmap'] = img_dense_heatmap
# return all the layers' results for auxiliary supervision
new_res = {}
for key in ret_dicts[0].keys():
if key not in ['dense_heatmap', 'query_heatmap_score', 'img_query_heatmap_score', 'img_dense_heatmap']:
new_res[key] = torch.cat([ret_dict[key] for ret_dict in ret_dicts], dim=-1)
else:
new_res[key] = ret_dicts[0][key]
for key in img_ret_dicts[0].keys():
new_res[key] = torch.cat([ret_dict[key] for ret_dict in img_ret_dicts], dim=-1)
new_res['view'] = img_query_view.repeat(1, self.num_img_decoder_layers)
if self.view_transform:
for key in view_ret_dicts[0].keys():
new_res[key] = torch.cat([ret_dict[key] for ret_dict in view_ret_dicts], dim=-1)
return [new_res]
def forward(self, feats, img_feats, img_metas, sparse_depth=None):
"""Forward pass.
Args:
feats (list[torch.Tensor]): Multi-level features, e.g.,
features produced by FPN.
Returns:
tuple(list[dict]): Output results, indexed first by level and then by decoder layer.
"""
if img_feats is None:
img_feats = [None]
else:
img_feats = [img_feats[:self.level_num]]
if sparse_depth is None:
sparse_depth = [None]
else:
sparse_depth = [sparse_depth[:, :, :self.level_num]]
res = multi_apply(self.forward_single, feats, img_feats, [img_metas], sparse_depth)
assert len(res) == 1, "only support one level features."
return res
def construct_input_padding_mask(self, img_feats, img_metas):
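# Build a per-sample, per-view boolean mask over the flattened pyramid levels:
# True marks padded pixels outside each view's valid image region, so the
# attention layers can ignore them.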
batch_size = len(img_metas)
device = img_feats[0].device
img_h_lvl = []
img_w_lvl = []
for img_feat_lvl in img_feats:
img_h_lvl.append(img_feat_lvl.shape[-2])
img_w_lvl.append(img_feat_lvl.shape[-1])
padding_mask = []
for sample_idx in range(batch_size):
sample_mask = []
for view_idx in range(self.num_views):
view_mask = []
if 'valid_shape' in img_metas[sample_idx]:
valid_shape = img_metas[sample_idx]['valid_shape'][view_idx] / self.out_size_factor_img
else:
valid_shape = np.array([img_metas[sample_idx]['img_shape'][1], img_metas[sample_idx]['img_shape'][0]]) / self.out_size_factor_img
for lvl_idx in range(self.level_num):
lvl_mask = torch.ones([img_h_lvl[lvl_idx], img_w_lvl[lvl_idx]], dtype=torch.bool, device=device)
valid_shape_lvl = valid_shape // (2 ** lvl_idx)
valid_w_lvl = int(valid_shape_lvl[0])
valid_h_lvl = int(valid_shape_lvl[1])
lvl_mask[:valid_h_lvl, :valid_w_lvl] = False
view_mask.append(lvl_mask.view(-1))
view_mask = torch.cat(view_mask)
sample_mask.append(view_mask)
sample_mask = torch.stack(sample_mask, dim=0)
padding_mask.append(sample_mask)
padding_mask = torch.stack(padding_mask, dim=0)
return padding_mask
def construction_view_projection_matrix(self, proj_matrix, img_query_view):
view_proj_matrix = {}
batch_size = img_query_view.shape[0]
batch_ids = torch.arange(batch_size)[:, None].repeat(1, self.num_img_proposals)
batch_ids = batch_ids.to(img_query_view.device)
for key in proj_matrix:
view_proj_matrix[key] = proj_matrix[key][batch_ids, img_query_view]
return view_proj_matrix
def construct_projection_matrix(self, img_metas, device):
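# Assemble per-view 4x4 intrinsic and lidar-to-camera matrices from img_metas;
# lidar2img_rt = cam_intrinsic @ lidar2cam_rt projects LiDAR-frame points to pixels.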
batch_size = len(img_metas)
cam_ints = torch.zeros([batch_size, self.num_views, 4, 4], device=device)
cam_ints[:, :, 3, 3] = 1
for sample_id in range(batch_size):
cam_ints[sample_id, :, :3, :3] = torch.Tensor(img_metas[sample_id]['cam_intrinsic']).to(device)
lidar2cam_rt = torch.zeros([batch_size, self.num_views, 4, 4], device=device)
lidar2cam_rt[:, :, 3, 3] = 1
for sample_id in range(batch_size):
lidar2cam_rt[sample_id, :, :3, :3] = torch.Tensor(img_metas[sample_id]['lidar2cam_r']).to(device)
lidar2cam_rt[sample_id, :, :3, 3] = torch.Tensor(img_metas[sample_id]['lidar2cam_t']).to(device)
lidar2img_rt = torch.matmul(cam_ints, lidar2cam_rt)
proj_matrix = {"cam_intrinsic": cam_ints, "lidar2cam_rt": lidar2cam_rt, "lidar2img_rt": lidar2img_rt}
return proj_matrix
def build_heatmap_LN(self, hidden_channel, bias, num_classes, layer_num=2, kernel_size=3):
layers = []
for i in range(layer_num-1):
layers.append(ConvLN(
hidden_channel,
hidden_channel,
kernel_size=kernel_size,
padding=(kernel_size-1)//2,
))
layers.append(build_conv_layer(
dict(type='Conv2d'),
hidden_channel,
num_classes,
kernel_size=kernel_size,
padding=(kernel_size-1)//2,
bias=bias,
))
return nn.Sequential(*layers)
def build_heatmap(self, hidden_channel, bias, num_classes, layer_num=2, kernel_size=3):
layers = []
for i in range(layer_num-1):
layers.append(ConvModule(
hidden_channel,
hidden_channel,
kernel_size=kernel_size,
padding=(kernel_size-1)//2,
bias=bias,
conv_cfg=dict(type='Conv2d'),
norm_cfg=dict(type='BN2d'),
))
layers.append(build_conv_layer(
dict(type='Conv2d'),
hidden_channel,
num_classes,
kernel_size=kernel_size,
padding=(kernel_size-1)//2,
bias=bias,
))
return nn.Sequential(*layers)
def generate_heatmap_deform(self, lidar_feat, img_feat, voxel_height, img_metas, lidar2img_rt, input_padding_mask=None):
# img_feat [bs*num_view, C, img_h, img_w]
# lidar_feat [BS, C, H, W]
batch_size = lidar_feat.shape[0]
H, W = lidar_feat.shape[2], lidar_feat.shape[3]
voxel_height = voxel_height.view(batch_size, H*W)
valid_height_mask = voxel_height > -50
level_start_index = [0]
spatial_shapes = []
img_feats_flatten = []
for lvl in range(self.level_num):
img_h_lvl, img_w_lvl = img_feat[lvl].shape[-2], img_feat[lvl].shape[-1]
img_feat[lvl] = self.heatmap_img_proj(img_feat[lvl].permute(0, 2, 3, 1)).permute(0, 3, 1, 2)
# img_feat[lvl] = self.heatmap_img_proj(img_feat[lvl])
img_feat[lvl] = img_feat[lvl].view(batch_size, self.num_views, self.hidden_channel, img_h_lvl, img_w_lvl)
img_feat_flatten = img_feat[lvl].view(batch_size, self.num_views, self.hidden_channel, img_h_lvl*img_w_lvl)
img_feats_flatten.append(img_feat_flatten)
level_start_index.append(level_start_index[-1] + img_h_lvl * img_w_lvl)
spatial_shapes.append([img_h_lvl, img_w_lvl])
level_start_index = level_start_index[:-1]
level_start_index = torch.LongTensor(level_start_index).to(lidar_feat.device)
spatial_shapes = torch.LongTensor(spatial_shapes).to(lidar_feat.device)
img_feats_stack = torch.cat(img_feats_flatten, dim=3) # [bs, num_view, C, h*w (sum)]
normal_img_feats_pos_stack = self.normal_img_feats_pos_stack # [1, h*w (sum), 2]
lidar_feat = self.heatmap_pts_proj(lidar_feat.permute(0, 2, 3, 1)).permute(0, 3, 1, 2)
# lidar_feat = self.heatmap_pts_proj(lidar_feat)
lidar_feat_flatten = lidar_feat.reshape(batch_size, self.hidden_channel, H*W) # [bs, C, H*W]
lidar_feat_output = torch.zeros(batch_size, self.hidden_channel, H*W).to(lidar_feat.device)
lidar_feat_count = torch.zeros(batch_size, 1, H*W).to(lidar_feat.device)
bev_pos = self.bev_pos.repeat(batch_size, 1, 1).to(lidar_feat.device)
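# Project every BEV cell center (using the provided voxel_height as z) into each
# camera via lidar2img_rt; only cells that land inside a view's valid image area
# (and have a valid height) take part in the deformable cross-attention below.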
query_pos_realmetric = bev_pos.permute(0, 2, 1) * self.test_cfg['out_size_factor'] * \
self.test_cfg['voxel_size'][0] + self.test_cfg['pc_range'][0] # (bs, 2, H*W)
query_pos_3d = torch.cat([query_pos_realmetric, voxel_height[:, None]], dim=1) # (bs, 3, H*W)
points_4d = torch.cat([query_pos_3d, torch.ones_like(query_pos_3d[:, :1])], dim=1).permute(0, 2, 1) # (bs, H*W, 4)
points_2d = torch.matmul(points_4d[:, None], lidar2img_rt.transpose(-1, -2)) # (bs, num_view, H*W, 4)
points_2d[..., 2] = torch.clamp(points_2d[..., 2], min=1e-5)
points_2d[..., :2] = points_2d[..., :2] / points_2d[..., 2:3] / self.out_size_factor_img
if 'valid_shape' in img_metas[0]:
valid_shape = []
for sample_idx in range(batch_size):
sample_valid_shape = img_metas[sample_idx]['valid_shape'] / self.out_size_factor_img
valid_shape.append(sample_valid_shape)
valid_shape = np.array(valid_shape)
valid_img_w = valid_shape[..., 0]
valid_img_h = valid_shape[..., 1]
else:
valid_img_w = np.full([batch_size, self.num_views], img_feat[0].shape[-1])
valid_img_h = np.full([batch_size, self.num_views], img_feat[0].shape[-2])
valid_img_w = torch.from_numpy(valid_img_w).to(points_2d.device)
valid_img_h = torch.from_numpy(valid_img_h).to(points_2d.device)
img_h, img_w = img_feat[0].shape[-2], img_feat[0].shape[-1]
center_xs = points_2d[..., 0] # (bs, num_view, H*W)
center_ys = points_2d[..., 1]
on_the_image = (center_xs >= 0) & (center_xs < valid_img_w[..., None]) & (center_ys >= 0) & \
(center_ys < valid_img_h[..., None]) & valid_height_mask[:, None] # [bs, num_view, H*W]
depth = points_2d[..., 2] # [bs, num_view, H*W]
depth = torch.log(depth)
for sample_idx in range(batch_size):
on_the_image_sample = on_the_image[sample_idx] # [num_view, H*W]
bincount = torch.sum(on_the_image_sample, dim=1)
max_len = torch.max(bincount)
sample_query_feature = torch.zeros([self.num_views, self.hidden_channel, max_len], device=points_2d.device)
sample_query_pos = torch.zeros([self.num_views, max_len, 3], device=points_2d.device)
sample_reference_points = torch.zeros([self.num_views, max_len, 2], device=points_2d.device)
sample_padding_mask = torch.zeros([self.num_views, max_len], device=points_2d.device, dtype=torch.bool)
for view_idx in range(self.num_views):
on_the_image_view = on_the_image_sample[view_idx]
center_xs_view = center_xs[sample_idx, view_idx, on_the_image_view] # [N, ]
center_ys_view = center_ys[sample_idx, view_idx, on_the_image_view] # [N, ]
reference_points = torch.stack([center_xs_view / img_w, center_ys_view / img_h], dim=-1) # [N, 2]
view_count = bincount[view_idx]
sample_reference_points[view_idx, :view_count] = reference_points
sample_query_feature[view_idx, :, :view_count] = lidar_feat_flatten[sample_idx, :, on_the_image_view]
sample_query_pos[view_idx, :view_count, 2] = depth[sample_idx, view_idx, on_the_image_view]
sample_padding_mask[view_idx, view_count:] = True
sample_centers_normal = sample_reference_points * 2 - 1
sample_query_img_feat = []
for lvl in range(self.level_num):
img_feat_lvl = img_feat[lvl][sample_idx]
img_feat_lvl = F.grid_sample(img_feat_lvl, sample_centers_normal[:, None], mode='bilinear', padding_mode="border", align_corners=False)
img_feat_lvl = img_feat_lvl[:, :, 0]
sample_query_img_feat.append(img_feat_lvl)
sample_query_img_feat = torch.stack(sample_query_img_feat, dim=0)
sample_query_img_feat = torch.max(sample_query_img_feat, dim=0)[0] # [num_view, C, max_len]
sample_query_feature = sample_query_feature + sample_query_img_feat
sample_query_pos[..., :2] = inverse_sigmoid(sample_reference_points)
sample_reference_points = sample_reference_points[:, :, None].repeat(1, 1, self.level_num, 1)
if batch_size == 1: # whether it is doing evaluation or training
if input_padding_mask is None:
sample_input_padding_mask = None
else:
sample_input_padding_mask = input_padding_mask[sample_idx:sample_idx+1]
output = self.cross_heatmap_decoder(
sample_query_feature, img_feats_stack[sample_idx],
sample_query_pos, normal_img_feats_pos_stack.repeat(self.num_views, 1, 1),
reference_points=sample_reference_points, level_start_index=level_start_index, spatial_shapes=spatial_shapes,
query_padding_mask=sample_padding_mask, input_padding_mask=sample_input_padding_mask
)
else:
output = []
for view_idx in range(self.num_views):
view_query_feature = sample_query_feature[view_idx, :, torch.logical_not(sample_padding_mask[view_idx])]
view_query_pos = sample_query_pos[view_idx, torch.logical_not(sample_padding_mask[view_idx])]
view_reference_points = sample_reference_points[view_idx, torch.logical_not(sample_padding_mask[view_idx])]
if input_padding_mask is None:
view_input_padding_mask = None
else:
view_input_padding_mask = input_padding_mask[sample_idx, view_idx, None]
output_item = self.cross_heatmap_decoder(
view_query_feature[None], img_feats_stack[sample_idx, view_idx, None],
view_query_pos[None], normal_img_feats_pos_stack,
reference_points=view_reference_points[None], level_start_index=level_start_index, spatial_shapes=spatial_shapes,
input_padding_mask=view_input_padding_mask
)
output_item_pad = torch.zeros([output_item.shape[1], sample_padding_mask.shape[1]]).type_as(output_item)
output_item_pad[:, torch.logical_not(sample_padding_mask[view_idx])] = output_item[0]
output.append(output_item_pad)
output = torch.stack(output, dim=0)
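# Scatter the cross-attended features back onto the BEV grid; cells visible from
# multiple views keep the element-wise maximum of their per-view outputs.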
for view_idx in range(self.num_views):
view_count = bincount[view_idx]
on_the_image_view = on_the_image_sample[view_idx]
overlap_mask = lidar_feat_count[sample_idx, 0, on_the_image_view] > 0
output_view = output[view_idx, :, :view_count]
nonoverlap_mask = torch.logical_not(overlap_mask)
lidar_feat_output_view = lidar_feat_output[sample_idx, :, on_the_image_view]
lidar_feat_output_view[:, overlap_mask] = torch.maximum(lidar_feat_output_view[:, overlap_mask], output_view[:, overlap_mask])
lidar_feat_output_view[:, nonoverlap_mask] = output_view[:, nonoverlap_mask]
lidar_feat_output[sample_idx, :, on_the_image_view] = lidar_feat_output_view
lidar_feat_count[sample_idx, :, on_the_image_view] += 1
lidar_feat_output = lidar_feat_output.reshape(batch_size, lidar_feat_output.shape[1], H, W)
# lidar_feat_output = self.reduce_conv(lidar_feat_output)
lidar_feat_count = lidar_feat_count.reshape(batch_size, 1, H, W)
lidar_feat_flag = torch.where(lidar_feat_count>0, torch.ones_like(lidar_feat_count), torch.zeros_like(lidar_feat_count))
lidar_feat_output = lidar_feat_output + (1 - lidar_feat_flag) * lidar_feat
lidar_feat_output = torch.cat([lidar_feat_output, lidar_feat_flag], dim=1)
lidar_feat_output = self.reduce_conv(lidar_feat_output)
heatmap_output = self.cross_heatmap_head(lidar_feat_output.contiguous())
return heatmap_output
def generate_heatmap(self, lidar_feat, min_voxel_height, max_voxel_height, batch_size, img_metas, lidar2img_rt, img_feat=None, input_padding_mask=None):
dense_heatmap = self.heatmap_head(lidar_feat) # [BS, num_class, H, W]
if img_feat is None:
heatmap = dense_heatmap.detach().sigmoid() # [BS, num_class, H, W]
else:
voxel_height = (min_voxel_height + max_voxel_height) / 2
dense_heatmap_cross = self.generate_heatmap_deform(lidar_feat, img_feat, voxel_height, img_metas, lidar2img_rt, input_padding_mask)
if self.cross_only:
heatmap = dense_heatmap_cross.detach().sigmoid()
else:
heatmap = (dense_heatmap.detach().sigmoid() + dense_heatmap_cross.detach().sigmoid()) / 2
dense_heatmap = dense_heatmap_cross
padding = self.nms_kernel_size // 2
local_max = torch.zeros_like(heatmap)
# equivalent to an NMS radius of voxel_size * out_size_factor * kernel_size
local_max_inner = F.max_pool2d(heatmap, kernel_size=self.nms_kernel_size, stride=1, padding=0)
local_max[:, :, padding:(-padding), padding:(-padding)] = local_max_inner
## for Pedestrian & Traffic_cone in nuScenes
if self.test_cfg['dataset'] == 'nuScenes':
local_max[:, 8, ] = F.max_pool2d(heatmap[:, 8], kernel_size=1, stride=1, padding=0)
local_max[:, 9, ] = F.max_pool2d(heatmap[:, 9], kernel_size=1, stride=1, padding=0)
elif self.test_cfg['dataset'] == 'Waymo': # for Pedestrian & Cyclist in Waymo
local_max[:, 1, ] = F.max_pool2d(heatmap[:, 1], kernel_size=1, stride=1, padding=0)
local_max[:, 2, ] = F.max_pool2d(heatmap[:, 2], kernel_size=1, stride=1, padding=0)
heatmap = heatmap * (heatmap == local_max) # [BS, num_class, H, W]
heatmap = heatmap.view(batch_size, heatmap.shape[1], -1) # [BS, num_class, H*W]
# top #num_proposals among all classes
top_proposals = heatmap.reshape(batch_size, -1).argsort(dim=-1, descending=True)[..., :self.num_proposals] # [BS, num_proposals]
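# The flat argsort index runs over (class, BEV cell): integer-divide by H*W for
# the class id and take the remainder for the spatial index.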
top_proposals_class = top_proposals // heatmap.shape[-1] # [BS, num_proposals]
top_proposals_index = top_proposals % heatmap.shape[-1] # [BS, num_proposals]
return heatmap, dense_heatmap, top_proposals_class, top_proposals_index
def generate_heatmap_img(self, img_feats, batch_size):
img_dense_heatmaps = []
img_heatmaps = []
for lvl in range(self.level_num):
# img_dense_heatmap = self.img_heatmap_head(img_feats[lvl]) # [BS*num_view, num_class, h, w]
img_dense_heatmap = self.img_heatmap_head[lvl](img_feats[lvl]) # [BS*num_view, num_class, h, w]
img_heatmap = img_dense_heatmap.detach().sigmoid() # [BS*num_view, num_class, h, w]
padding = self.img_nms_kernel_size // 2
local_max = torch.zeros_like(img_heatmap)
# equivalent to an NMS radius of voxel_size * out_size_factor * kernel_size
local_max_inner = F.max_pool2d(img_heatmap, kernel_size=self.img_nms_kernel_size, stride=1, padding=0)
local_max[:, :, padding:(-padding), padding:(-padding)] = local_max_inner
img_heatmap = img_heatmap * (img_heatmap == local_max) # [BS*num_view, num_class, h, w]
img_heatmap = img_heatmap.view(batch_size, self.num_views, img_heatmap.shape[1], -1) # [BS, num_views, num_class, h*w]
img_heatmap = img_heatmap.permute(0, 2, 1, 3) # [BS, num_class, num_views, h*w]
img_heatmaps.append(img_heatmap)
img_dense_heatmap = img_dense_heatmap.view(batch_size, self.num_views, img_dense_heatmap.shape[1],
img_dense_heatmap.shape[2], img_dense_heatmap.shape[3]) # [BS, num_views, num_class, h, w]
img_dense_heatmap = img_dense_heatmap.permute(0, 2, 1, 3, 4) # [BS, num_class, num_views, h, w]
img_dense_heatmap = img_dense_heatmap.view(batch_size, self.num_classes, self.num_views, img_dense_heatmap.shape[-2]*img_dense_heatmap.shape[-1])
img_dense_heatmaps.append(img_dense_heatmap)
img_heatmap_stack = torch.cat(img_heatmaps, dim=3) # [BS, num_class, num_views, h*w (sum)]
# top #num_proposals among all classes
top_proposals = img_heatmap_stack.view(batch_size, -1).argsort(dim=-1, descending=True)[..., :self.num_img_proposals] # [BS, num_proposals]
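# Decompose the flat index into class (// (num_views * sum_hw)), view index
# ((% (num_views * sum_hw)) // sum_hw) and position within a view (% sum_hw),
# where sum_hw is the number of pixels summed over all pyramid levels.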
top_proposals_class = top_proposals // (img_heatmap_stack.shape[-1]*img_heatmap_stack.shape[-2]) # [BS, num_proposals]
top_proposals_view_index = top_proposals % (img_heatmap_stack.shape[-1]*img_heatmap_stack.shape[-2]) // img_heatmap_stack.shape[-1] # [BS, num_proposals]
top_proposals_pos_index = top_proposals % img_heatmap_stack.shape[-1] # [BS, num_proposals]
top_proposals_index = top_proposals % (img_heatmap_stack.shape[-1]*img_heatmap_stack.shape[-2]) # [BS, num_proposals]
img_heatmap_stack = img_heatmap_stack.contiguous().view(batch_size, img_heatmap_stack.shape[1], -1)
img_dense_heatmaps_stack = torch.cat(img_dense_heatmaps, dim=-1)
return img_heatmap_stack, img_dense_heatmaps_stack, top_proposals_class, top_proposals_index, top_proposals_view_index, top_proposals_pos_index
def get_targets(self, gt_bboxes_3d, gt_labels_3d, gt_bboxes, gt_labels, gt_img_centers_view, gt_bboxes_cam_view, gt_visible, gt_bboxes_lidar_view, preds_dict, img_metas):
"""Generate training targets.
Args:
gt_bboxes_3d (:obj:`LiDARInstance3DBoxes`): Ground truth gt boxes.
gt_labels_3d (torch.Tensor): Labels of boxes.
preds_dicts (tuple of dict): first index by layer (default 1)
Returns:
tuple[torch.Tensor]: Tuple of target including \
the following results in order.
- torch.Tensor: classification target. [BS, num_proposals]
- torch.Tensor: classification weights (mask) [BS, num_proposals]
- torch.Tensor: regression target. [BS, num_proposals, 8]
- torch.Tensor: regression weights. [BS, num_proposals, 8]
"""
# change preds_dict into list of dict (index by batch_id)
# preds_dict[0]['center'].shape [bs, 3, num_proposal]
list_of_pred_dict = []
for batch_idx in range(len(gt_bboxes_3d)):
pred_dict = {}
for key in preds_dict[0].keys():
pred_dict[key] = preds_dict[0][key][batch_idx:batch_idx + 1]
list_of_pred_dict.append(pred_dict)
assert len(gt_bboxes_3d) == len(list_of_pred_dict)
res_tuple = multi_apply(self.get_targets_single, gt_bboxes_3d, gt_labels_3d, gt_visible, list_of_pred_dict, np.arange(len(gt_labels_3d)))
labels = torch.cat(res_tuple[0], dim=0)
label_weights = torch.cat(res_tuple[1], dim=0)
bbox_targets = torch.cat(res_tuple[2], dim=0)
bbox_weights = torch.cat(res_tuple[3], dim=0)
ious = torch.cat(res_tuple[4], dim=0)
num_pos_layer = np.concatenate(res_tuple[5], axis=0) # [BS, num_layer]
# matched_ious = np.mean(res_tuple[6])
matched_ious = torch.cat(res_tuple[6], dim=0)
res_tuple_2d = multi_apply(self.get_targets_single_2d, gt_bboxes, gt_labels, gt_img_centers_view, gt_bboxes_cam_view, gt_bboxes_lidar_view, list_of_pred_dict, img_metas, np.arange(len(gt_bboxes)))
labels_2d = torch.cat(res_tuple_2d[0], dim=0)
label_weights_2d = torch.cat(res_tuple_2d[1], dim=0)
bbox_targets_2d = torch.cat(res_tuple_2d[2], dim=0)
bbox_weights_2d = torch.cat(res_tuple_2d[3], dim=0)
ious_2d = torch.cat(res_tuple_2d[4], dim=0)
num_pos_layer_2d = np.concatenate(res_tuple_2d[5], axis=0) # [BS, num_layer]
matched_ious_2d = torch.cat(res_tuple_2d[6], dim=0)
if self.view_transform:
res_tuple_view = multi_apply(self.get_targets_single_view, gt_bboxes_3d, gt_labels_3d, gt_visible, list_of_pred_dict, np.arange(len(gt_bboxes)))
labels_view = torch.cat(res_tuple_view[0], dim=0)
label_weights_view = torch.cat(res_tuple_view[1], dim=0)
bbox_targets_view = torch.cat(res_tuple_view[2], dim=0)
bbox_weights_view = torch.cat(res_tuple_view[3], dim=0)
ious_view = torch.cat(res_tuple_view[4], dim=0)
num_pos_layer_view = np.concatenate(res_tuple_view[5], axis=0) # [BS, num_layer]
matched_ious_view = torch.cat(res_tuple_view[6], dim=0)
if self.initialize_by_heatmap:
heatmap = torch.cat(res_tuple[7], dim=0)
heatmap_2d = torch.cat(res_tuple_2d[7], dim=0)
if self.view_transform:
return labels, label_weights, bbox_targets, bbox_weights, ious, num_pos_layer, matched_ious, heatmap, \
labels_2d, label_weights_2d, bbox_targets_2d, bbox_weights_2d, ious_2d, num_pos_layer_2d, \
matched_ious_2d, heatmap_2d, labels_view, label_weights_view, bbox_targets_view, bbox_weights_view, \
ious_view, num_pos_layer_view, matched_ious_view
else:
return labels, label_weights, bbox_targets, bbox_weights, ious, num_pos_layer, matched_ious, heatmap, \
labels_2d, label_weights_2d, bbox_targets_2d, bbox_weights_2d, ious_2d, num_pos_layer_2d, \
matched_ious_2d, heatmap_2d
else:
if self.view_transform:
return labels, label_weights, bbox_targets, bbox_weights, ious, num_pos_layer, matched_ious, \
labels_2d, label_weights_2d, bbox_targets_2d, bbox_weights_2d, ious_2d, num_pos_layer_2d, matched_ious_2d, \
labels_view, label_weights_view, bbox_targets_view, bbox_weights_view, ious_view, num_pos_layer_view, \
matched_ious_view
else:
return labels, label_weights, bbox_targets, bbox_weights, ious, num_pos_layer, matched_ious, \
labels_2d, label_weights_2d, bbox_targets_2d, bbox_weights_2d, ious_2d, num_pos_layer_2d, matched_ious_2d,
def get_targets_single_2d(self, gt_bboxes, gt_labels, gt_centers_2d, gt_bboxes_cam_view, gt_bboxes_lidar_view, preds_dict, img_metas, batch_idx):
num_proposals = preds_dict['cls'].shape[-1]
loc_cam_3d = copy.deepcopy(preds_dict['loc_cam_3d'].detach())
dim = copy.deepcopy(preds_dict['dim_2d'].detach())
rot = copy.deepcopy(preds_dict['rot_2d'].detach())
if 'vel_2d' in preds_dict.keys():
vel = copy.deepcopy(preds_dict['vel_2d'].detach())
else:
vel = None
view = copy.deepcopy(preds_dict['view'].detach())[0] # [num_proposals, ]
score = copy.deepcopy(preds_dict['cls'].detach())
bboxes_dict = self.bbox_2d_coder.decode(score, rot, dim, loc_cam_3d, vel)
bboxes_3d_tensor = bboxes_dict[0]['bboxes']
gt_bboxes_3d_tensor = gt_bboxes_cam_view.tensor.to(score.device)
gt_bboxes_lidar_view_tensor = gt_bboxes_lidar_view.tensor.to(score.device)
assert gt_bboxes_lidar_view_tensor.shape[0] == gt_bboxes_3d_tensor.shape[0]
img_shape = img_metas['pad_shape']
img_scale =[img_shape[1], img_shape[0], img_shape[1], img_shape[0]]
img_scale = torch.Tensor(img_scale).to(score.device).unsqueeze(0)
gt_centers_2d = gt_centers_2d.float()
normal_gt_centers = gt_centers_2d[..., :2] / img_scale[..., :2]
normal_gt_bboxes = gt_bboxes.float() / img_scale
assign_result_list = []
for idx_layer in range(self.num_img_decoder_layers):
bboxes_tensor_layer = bboxes_3d_tensor[idx_layer*self.num_img_proposals:(idx_layer+1)*self.num_img_proposals, :] # [num_proposals, 10]
score_layer = score[..., idx_layer*self.num_img_proposals:(idx_layer+1)*self.num_img_proposals] # [1, num_class, num_proposal]
view_layer = view[idx_layer*self.num_img_proposals:(idx_layer+1)*self.num_img_proposals] # [num_proposals]
assign_result = self.bbox_assigner_2d.assign(bboxes_tensor_layer, gt_bboxes_3d_tensor, gt_labels, score_layer, view_layer, self.train_cfg)
assign_result_list.append(assign_result)
# combine assign result of each layer
assign_result_ensemble = AssignResult(
num_gts=sum([res.num_gts for res in assign_result_list]),
gt_inds=torch.cat([res.gt_inds for res in assign_result_list]),
max_overlaps=torch.cat([res.max_overlaps for res in assign_result_list]),
labels=torch.cat([res.labels for res in assign_result_list]),
)
sampling_result = self.bbox_sampler.sample(assign_result_ensemble, bboxes_3d_tensor, gt_bboxes_3d_tensor)
pos_inds = sampling_result.pos_inds
neg_inds = sampling_result.neg_inds
assert len(pos_inds) + len(neg_inds) == num_proposals
start = 0
pos_num_layers = []
for idx_layer in range(self.num_img_decoder_layers):
layer_num_proposal = self.num_img_proposals
layer_mask = torch.logical_and(pos_inds >= start, pos_inds < start + layer_num_proposal)
pos_num_layers.append(int(layer_mask.sum()))
start += layer_num_proposal
pos_num_layers = np.array(pos_num_layers)
# NOTE: the assignments below assume the per-proposal target tensors (bbox_targets,
# bbox_weights, labels, label_weights, ious, view_targets, center_targets, center_weights,
# depth_labels, depth_weights and their *_lidar counterparts) have already been created.
if len(pos_inds) > 0:
# bbox_targets[pos_inds, :] = sampling_result.pos_gt_bboxes
bbox_weights[pos_inds, :] = 1.0
pos_gt_bboxes = sampling_result.pos_gt_bboxes
pos_bbox_targets = self.bbox_2d_coder.encode(pos_gt_bboxes)
bbox_targets[pos_inds, :pos_bbox_targets.shape[1]] = pos_bbox_targets
view_targets[pos_inds] = gt_labels[sampling_result.pos_assigned_gt_inds, 1]
if gt_labels is None:
labels[pos_inds] = 1
else:
labels[pos_inds] = gt_labels[sampling_result.pos_assigned_gt_inds, 0]
if self.train_cfg.pos_weight <= 0:
label_weights[pos_inds] = 1.0
else:
label_weights[pos_inds] = self.train_cfg.pos_weight
center_targets[pos_inds, :] = normal_gt_centers[sampling_result.pos_assigned_gt_inds, :2]
center_weights[pos_inds] = 1.0
depth = gt_centers_2d[sampling_result.pos_assigned_gt_inds, 2]
depth_labels[pos_inds] = depth
depth_weights[pos_inds] = 1
view_mask_ignore = view_targets != view
bbox_weights[view_mask_ignore, :] = 0
label_weights[view_mask_ignore] = 0
if len(neg_inds) > 0:
label_weights[neg_inds] = 1.0
bbox_targets[:, :2] = center_targets
bbox_targets[:, 2] = depth_labels
# # compute dense heatmap targets
if self.initialize_by_heatmap:
device = labels.device
feature_map_size = (img_shape[1] // self.out_size_factor_img, img_shape[0] // self.out_size_factor_img)
w, h = feature_map_size
heatmaps = []
for lvl in range(self.level_num):
heatmaps.append(score.new_zeros(self.num_classes, self.num_views, h, w))
h = h // 2
w = w // 2
for idx in range(len(gt_bboxes)):
width = gt_bboxes[idx][2]
length = gt_bboxes[idx][3]
max_l = max(length, width)
width = width / self.out_size_factor_img
length = length / self.out_size_factor_img
view_id = gt_labels[idx][1]
if width > 0 and length > 0:
radius = gaussian_radius((length, width), min_overlap=self.train_cfg['gaussian_overlap_2d'])
radius = max(self.train_cfg['min_radius'], radius)
radius = min(self.train_cfg['max_radius'], radius)
x, y = gt_centers_2d[idx][0], gt_centers_2d[idx][1]
# x, y = gt_bboxes[idx][0], gt_bboxes[idx][1]
coor_x = x / self.out_size_factor_img
coor_y = y / self.out_size_factor_img
center = torch.tensor([coor_x, coor_y], dtype=torch.float32, device=device)
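# Pick the heatmap pyramid level from the box's longer image-plane side (max_l):
# larger boxes go to coarser levels, and the center and gaussian radius are
# rescaled to that level's resolution.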
if self.level_num == 4:
if max_l < 48:
lvl = 0
elif max_l < 96:
lvl = 1
center = center / 2
radius = radius / 2
elif max_l < 192:
lvl = 2
center = center / 4
radius = radius / 4
else:
lvl = 3
center = center / 8
radius = radius / 8
elif self.level_num == 3:
if max_l < 48:
lvl = 0
elif max_l < 96:
lvl = 1
center = center / 2
radius = radius / 2
else:
lvl = 2
center = center / 4
radius = radius / 4
elif self.level_num == 2:
if max_l < 96:
lvl = 0
else:
lvl = 1
center = center / 2
radius = radius / 2
else:
assert self.level_num == 1
lvl = 0
center_int = center.to(torch.int32)
radius = int(radius)
draw_heatmap_gaussian(heatmaps[lvl][gt_labels[idx][0], view_id], center_int, radius)
for lvl in range(self.level_num):
heatmaps[lvl] = heatmaps[lvl].view(self.num_classes, self.num_views, heatmaps[lvl].shape[-2]*heatmaps[lvl].shape[-1])
heatmap = torch.cat(heatmaps, dim=-1)
matched_ious = torch.ones_like(ious) * -1
matched_ious[pos_inds] = ious[pos_inds]
return labels[None], label_weights[None], bbox_targets[None], bbox_weights[None], ious[None], pos_num_layers[None], matched_ious[None], heatmap[None], labels_lidar[None], label_lidar_weights[None], bbox_lidar_targets[None], bbox_lidar_weights[None], ious_lidar[None]
else:
matched_ious = torch.ones_like(ious) * -1
matched_ious[pos_inds] = ious[pos_inds]
return labels[None], label_weights[None], bbox_targets[None], bbox_weights[None], ious[None], pos_num_layers[None], matched_ious[None], labels_lidar[None], label_lidar_weights[None], bbox_lidar_targets[None], bbox_lidar_weights[None], ious_lidar[None]
def get_targets_single(self, gt_bboxes_3d, gt_labels_3d, gt_visible, preds_dict, batch_idx):
"""Generate training targets for a single sample.
Args:
gt_bboxes_3d (:obj:`LiDARInstance3DBoxes`): Ground truth gt boxes.
gt_labels_3d (torch.Tensor): Labels of boxes.
gt_bboxes (:obj:`LiDARInstance3DBoxes`): Ground truth gt boxes 2d.
gt_labels (torch.Tensor): Labels of boxes 2d.
preds_dict (dict): dict of prediction result for a single sample
Returns:
tuple[torch.Tensor]: Tuple of target including \
the following results in order.
- torch.Tensor: classification target. [1, num_proposals]
- torch.Tensor: classification weights (mask) [1, num_proposals]
- torch.Tensor: regression target. [1, num_proposals, 8]
- torch.Tensor: regression weights. [1, num_proposals, 8]
- torch.Tensor: iou target. [1, num_proposals]
- int: number of positive proposals
"""
num_proposals = preds_dict['center'].shape[-1]
        # get the predicted boxes; be careful not to modify the network outputs in place
score = copy.deepcopy(preds_dict['heatmap'].detach())
center = copy.deepcopy(preds_dict['center'].detach())
height = copy.deepcopy(preds_dict['height'].detach())
dim = copy.deepcopy(preds_dict['dim'].detach())
rot = copy.deepcopy(preds_dict['rot'].detach())
if 'vel' in preds_dict.keys():
vel = copy.deepcopy(preds_dict['vel'].detach())
else:
vel = None
boxes_dict = self.bbox_coder.decode(score, rot, dim, center, height, vel) # decode the prediction to real world metric bbox
bboxes_tensor = boxes_dict[0]['bboxes']
gt_bboxes_tensor = gt_bboxes_3d.tensor.to(score.device)
num_fusion_decoder_layers = self.num_fusion_decoder_layers
num_layer = self.num_pts_decoder_layers + num_fusion_decoder_layers
start = 0
pos_inds = []
neg_inds = []
pos_gt_bboxes = []
pos_gt_labels = []
ious = []
for idx_layer in range(num_layer):
layer_num_proposal = self.get_layer_num_proposal(idx_layer)
bboxes_tensor_layer = bboxes_tensor[start:start + layer_num_proposal, :]
score_layer = score[..., start:start + layer_num_proposal]
gt_bboxes_tensor_layer = gt_bboxes_tensor
gt_labels_3d_layer = gt_labels_3d
if self.train_cfg.assigner.type == 'HungarianAssigner3D':
assign_result = self.bbox_assigner.assign(bboxes_tensor_layer, gt_bboxes_tensor_layer, gt_labels_3d_layer, score_layer, self.train_cfg)
elif self.train_cfg.assigner.type == 'HeuristicAssigner':
assign_result = self.bbox_assigner.assign(bboxes_tensor_layer, gt_bboxes_tensor_layer, None, gt_labels_3d_layer, self.query_labels[batch_idx])
else:
raise NotImplementedError
# assign_result_list.append(assign_result)
sampling_result_layer = self.bbox_sampler.sample(assign_result, bboxes_tensor_layer, gt_bboxes_tensor_layer)
pos_inds_layer = sampling_result_layer.pos_inds + start
neg_inds_layer = sampling_result_layer.neg_inds + start
pos_inds.append(pos_inds_layer)
neg_inds.append(neg_inds_layer)
pos_gt_bboxes_layer = sampling_result_layer.pos_gt_bboxes
pos_gt_labels_layer = gt_labels_3d_layer[sampling_result_layer.pos_assigned_gt_inds]
pos_gt_bboxes.append(pos_gt_bboxes_layer)
pos_gt_labels.append(pos_gt_labels_layer)
ious_layer = assign_result.max_overlaps
ious.append(ious_layer)
start += layer_num_proposal
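            # Proposals of every decoder layer are matched to the ground truth
            # independently (one-to-one matching when HungarianAssigner3D is
            # configured); the per-layer indices are shifted by `start` so they can be
            # merged into a single proposal-index space below.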
pos_inds = torch.cat(pos_inds)
neg_inds = torch.cat(neg_inds)
pos_gt_bboxes = torch.cat(pos_gt_bboxes, dim=0)
pos_gt_labels = torch.cat(pos_gt_labels, dim=0)
assert len(pos_inds) + len(neg_inds) == num_proposals
start = 0
pos_num_layers = []
for idx_layer in range(num_layer):
layer_num_proposal = self.get_layer_num_proposal(idx_layer)
            count = pos_inds[torch.logical_and(pos_inds >= start, pos_inds < start + layer_num_proposal)].shape[0]
            pos_num_layers.append(count)
            start += layer_num_proposal
        pos_num_layers = np.array(pos_num_layers)
        assert np.sum(pos_num_layers) == len(pos_inds)
        ious = torch.cat(ious)
        ious = torch.clamp(ious, min=0.0, max=1.0)
        # create targets for the loss computation
        bbox_targets = torch.zeros([num_proposals, self.bbox_coder.code_size]).to(center.device)
        bbox_weights = torch.zeros([num_proposals, self.bbox_coder.code_size]).to(center.device)
        labels = bboxes_tensor.new_zeros(num_proposals, dtype=torch.long)
        label_weights = bboxes_tensor.new_zeros(num_proposals, dtype=torch.long)
        if gt_labels_3d is not None:  # the default label is the background class
            labels += self.num_classes
        # both positive and negative proposals contribute to the classification loss;
        # only positive proposals contribute to the regression loss
        if len(neg_inds) > 0:
label_weights[neg_inds] = 1.0
if len(pos_inds) > 0:
pos_bbox_targets = self.bbox_coder.encode(pos_gt_bboxes)
bbox_targets[pos_inds, :] = pos_bbox_targets
bbox_weights[pos_inds, :] = 1.0
if gt_labels_3d is None:
labels[pos_inds] = 1
else:
labels[pos_inds] = pos_gt_labels
if self.train_cfg.pos_weight <= 0:
label_weights[pos_inds] = 1.0
else:
label_weights[pos_inds] = self.train_cfg.pos_weight
# # compute dense heatmap targets
if self.initialize_by_heatmap:
device = labels.device
gt_bboxes_3d = torch.cat([gt_bboxes_3d.gravity_center, gt_bboxes_3d.tensor[:, 3:]], dim=1).to(device)
grid_size = torch.tensor(self.train_cfg['grid_size'])
pc_range = torch.tensor(self.train_cfg['point_cloud_range'])
voxel_size = torch.tensor(self.train_cfg['voxel_size'])
feature_map_size = grid_size[:2] // self.train_cfg['out_size_factor'] # [x_len, y_len]
heatmap = gt_bboxes_3d.new_zeros(self.num_classes, feature_map_size[1], feature_map_size[0])
for idx in range(len(gt_bboxes_3d)):
width = gt_bboxes_3d[idx][3]
length = gt_bboxes_3d[idx][4]
width = width / voxel_size[0] / self.train_cfg['out_size_factor']
length = length / voxel_size[1] / self.train_cfg['out_size_factor']
if width > 0 and length > 0:
radius = gaussian_radius((length, width), min_overlap=self.train_cfg['gaussian_overlap'])
radius = max(self.train_cfg['min_radius'], int(radius))
x, y = gt_bboxes_3d[idx][0], gt_bboxes_3d[idx][1]
coor_x = (x - pc_range[0]) / voxel_size[0] / self.train_cfg['out_size_factor']
coor_y = (y - pc_range[1]) / voxel_size[1] / self.train_cfg['out_size_factor']
center_img = torch.tensor([coor_x, coor_y], dtype=torch.float32, device=device)
center_int = center_img.to(torch.int32)
draw_heatmap_gaussian(heatmap[gt_labels_3d[idx]], center_int, radius)
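            # The BEV heatmap target places a class-wise Gaussian at every ground
            # truth center on the BEV feature grid. For example, if
            # point_cloud_range[0] were -54.0, voxel_size[0] were 0.075 and
            # out_size_factor were 8, a center at x = 0.0 would fall in grid column
            # (0.0 - (-54.0)) / 0.075 / 8 = 90.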
matched_ious = torch.ones_like(ious) * -1
matched_ious[pos_inds] = ious[pos_inds]
return labels[None], label_weights[None], bbox_targets[None], bbox_weights[None], ious[None], pos_num_layers[None], matched_ious[None], heatmap[None]
else:
matched_ious = torch.ones_like(ious) * -1
matched_ious[pos_inds] = ious[pos_inds]
return labels[None], label_weights[None], bbox_targets[None], bbox_weights[None], ious[None], pos_num_layers[None], matched_ious[None]
def get_targets_single_view(self, gt_bboxes_3d, gt_labels_3d, gt_visible_3d, preds_dict, batch_idx):
"""Generate training targets for a single sample.
Args:
            gt_bboxes_3d (:obj:`LiDARInstance3DBoxes`): Ground truth boxes.
            gt_labels_3d (torch.Tensor): Labels of the ground truth boxes.
            gt_visible_3d (torch.Tensor): Visibility flags of the ground truth boxes.
            preds_dict (dict): Prediction results for a single sample.
            batch_idx (int): Index of the sample in the batch.
Returns:
tuple[torch.Tensor]: Tuple of target including \
the following results in order.
- torch.Tensor: classification target. [1, num_proposals]
- torch.Tensor: classification weights (mask) [1, num_proposals]
- torch.Tensor: regression target. [1, num_proposals, 8]
- torch.Tensor: regression weights. [1, num_proposals, 8]
- torch.Tensor: iou target. [1, num_proposals]
- int: number of positive proposals
"""
num_proposals = preds_dict['center_view'].shape[-1]
        # get the predicted boxes; be careful not to modify the network outputs in place
score = copy.deepcopy(preds_dict['heatmap_view'].detach())
center = copy.deepcopy(preds_dict['center_view'].detach())
height = copy.deepcopy(preds_dict['height_view'].detach())
dim = copy.deepcopy(preds_dict['dim_view'].detach())
rot = copy.deepcopy(preds_dict['rot_view'].detach())
if 'vel_view' in preds_dict.keys():
vel = copy.deepcopy(preds_dict['vel_view'].detach())
else:
vel = None
boxes_dict = self.bbox_coder.decode(score, rot, dim, center, height, vel) # decode the prediction to real world metric bbox
bboxes_tensor = boxes_dict[0]['bboxes']
assert gt_visible_3d.shape[0] == gt_bboxes_3d.tensor.shape[0] == gt_labels_3d.shape[0]
gt_mask = gt_visible_3d == 1
gt_bboxes_3d = gt_bboxes_3d[gt_mask]
gt_labels_3d = gt_labels_3d[gt_mask]
gt_bboxes_tensor = gt_bboxes_3d.tensor.to(score.device)
num_layer = 1
assign_result_list = []
start = 0
for idx_layer in range(num_layer):
layer_num_proposal = self.get_layer_num_proposal(idx_layer)
bboxes_tensor_layer = bboxes_tensor[start:start + layer_num_proposal, :]
score_layer = score[..., start:start + layer_num_proposal]
start += layer_num_proposal
if self.train_cfg.assigner.type == 'HungarianAssigner3D':
assign_result = self.bbox_assigner.assign(bboxes_tensor_layer, gt_bboxes_tensor, gt_labels_3d, score_layer, self.train_cfg)
elif self.train_cfg.assigner.type == 'HeuristicAssigner':
assign_result = self.bbox_assigner.assign(bboxes_tensor_layer, gt_bboxes_tensor, None, gt_labels_3d, self.query_labels[batch_idx])
else:
raise NotImplementedError
assign_result_list.append(assign_result)
# combine assign result of each layer
assign_result_ensemble = AssignResult(
num_gts=sum([res.num_gts for res in assign_result_list]),
gt_inds=torch.cat([res.gt_inds for res in assign_result_list]),
max_overlaps=torch.cat([res.max_overlaps for res in assign_result_list]),
labels=torch.cat([res.labels for res in assign_result_list]),
)
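        # Merge the per-layer assignments into a single AssignResult so one sampling
        # call yields positive/negative indices over all proposals at once.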
sampling_result = self.bbox_sampler.sample(assign_result_ensemble, bboxes_tensor, gt_bboxes_tensor)
pos_inds = sampling_result.pos_inds
neg_inds = sampling_result.neg_inds
assert len(pos_inds) + len(neg_inds) == num_proposals
start = 0
pos_num_layers = []
for idx_layer in range(num_layer):
layer_num_proposal = self.get_layer_num_proposal(idx_layer)
            count = pos_inds[torch.logical_and(pos_inds >= start, pos_inds < start + layer_num_proposal)].shape[0]
            pos_num_layers.append(count)
            start += layer_num_proposal
        pos_num_layers = np.array(pos_num_layers)
        assert np.sum(pos_num_layers) == len(pos_inds)
        ious = assign_result_ensemble.max_overlaps
        ious = torch.clamp(ious, min=0.0, max=1.0)
        # create targets for the loss computation
        bbox_targets = torch.zeros([num_proposals, self.bbox_coder.code_size]).to(center.device)
        bbox_weights = torch.zeros([num_proposals, self.bbox_coder.code_size]).to(center.device)
        labels = bboxes_tensor.new_zeros(num_proposals, dtype=torch.long)
        label_weights = bboxes_tensor.new_zeros(num_proposals, dtype=torch.long)
        if gt_labels_3d is not None:  # the default label is the background class
            labels += self.num_classes
        # both positive and negative proposals contribute to the classification loss;
        # only positive proposals contribute to the regression loss
        if len(pos_inds) > 0:
pos_bbox_targets = self.bbox_coder.encode(sampling_result.pos_gt_bboxes)
bbox_targets[pos_inds, :] = pos_bbox_targets
bbox_weights[pos_inds, :] = 1.0
if gt_labels_3d is None:
labels[pos_inds] = 1
else:
labels[pos_inds] = gt_labels_3d[sampling_result.pos_assigned_gt_inds]
if self.train_cfg.pos_weight <= 0:
label_weights[pos_inds] = 1.0
else:
label_weights[pos_inds] = self.train_cfg.pos_weight
if len(neg_inds) > 0:
label_weights[neg_inds] = 1.0
matched_ious = torch.ones_like(ious) * -1
matched_ious[pos_inds] = ious[pos_inds]
return labels[None], label_weights[None], bbox_targets[None], bbox_weights[None], ious[None], pos_num_layers[None], matched_ious[None]
@force_fp32(apply_to=('preds_dicts'))
def loss(self, gt_bboxes_3d, gt_labels_3d, gt_bboxes, gt_labels, gt_pts_centers_view, gt_img_centers_view, gt_bboxes_cam_view, gt_visible_3d, gt_bboxes_lidar_view, img_metas, preds_dicts, **kwargs):
"""Loss function for CenterHead.
Args:
            **The following arguments are in the same order as "gt_bboxes_3d":**
            gt_bboxes_3d (list[:obj:`LiDARInstance3DBoxes`]): Ground
                truth boxes.
            gt_labels_3d (list[torch.Tensor]): Labels of the boxes.
            gt_visible_3d (list[torch.Tensor]): Visibility of the LiDAR boxes in the camera views.
            **The following arguments are in the same order as "gt_bboxes":**
            gt_bboxes (list[torch.Tensor]): Ground truth projected 2D boxes
                (a LiDAR box may be projected to zero, one or two camera views,
                so "gt_bboxes" may differ in length from "gt_bboxes_3d").
            gt_labels (list[torch.Tensor]): Labels and camera view ids of the projected 2D boxes.
            gt_pts_centers_view (list[torch.Tensor]): 3D center of each box in the LiDAR coordinate frame.
            gt_img_centers_view (list[torch.Tensor]): 3D center of each box in the corresponding camera coordinate frame.
            gt_bboxes_cam_view (list[:obj:`CameraInstance3DBoxes`]): Ground truth boxes in the corresponding camera coordinate frame.
            gt_bboxes_lidar_view (list[:obj:`LiDARInstance3DBoxes`]): Ground truth boxes in the LiDAR coordinate frame.
preds_dicts (list[list[dict]]): Output of forward function.
Returns:
dict[str:torch.Tensor]: Loss of heatmap and bbox of each task.
"""
if self.initialize_by_heatmap:
if self.view_transform:
labels, label_weights, bbox_targets, bbox_weights, ious, num_pos_layer, matched_ious, heatmap, \
labels_2d, label_weights_2d, bbox_targets_2d, bbox_weights_2d, ious_2d, num_pos_layer_2d, \
matched_ious_2d, heatmap_2d, labels_view, label_weights_view, bbox_targets_view, bbox_weights_view, ious_view, \
num_pos_layer_view, matched_ious_view = self.get_targets(gt_bboxes_3d, gt_labels_3d, gt_bboxes, gt_labels, gt_img_centers_view, gt_bboxes_cam_view, gt_visible_3d, gt_bboxes_lidar_view, preds_dicts[0], img_metas)
else:
labels, label_weights, bbox_targets, bbox_weights, ious, num_pos_layer, matched_ious, heatmap, \
labels_2d, label_weights_2d, bbox_targets_2d, bbox_weights_2d, ious_2d, num_pos_layer_2d, \
matched_ious_2d, heatmap_2d = self.get_targets(gt_bboxes_3d, gt_labels_3d, gt_bboxes, gt_labels, gt_img_centers_view, gt_bboxes_cam_view, gt_visible_3d, gt_bboxes_lidar_view, preds_dicts[0], img_metas)
else:
if self.view_transform:
labels, label_weights, bbox_targets, bbox_weights, ious, num_pos_layer, matched_ious, \
labels_2d, label_weights_2d, bbox_targets_2d, bbox_weights_2d, ious_2d, num_pos_layer_2d, \
matched_ious_2d, labels_view, label_weights_view, bbox_targets_view, bbox_weights_view, ious_view, \
num_pos_layer_view, matched_ious_view = self.get_targets(gt_bboxes_3d, gt_labels_3d, gt_bboxes, gt_labels, gt_img_centers_view, gt_bboxes_cam_view, gt_visible_3d, preds_dicts[0], img_metas)
else:
labels, label_weights, bbox_targets, bbox_weights, ious, num_pos_layer, matched_ious, \
labels_2d, label_weights_2d, bbox_targets_2d, bbox_weights_2d, ious_2d, num_pos_layer_2d, matched_ious_2d = \
self.get_targets(gt_bboxes_3d, gt_labels_3d, gt_bboxes, gt_labels, gt_img_centers_view, gt_bboxes_cam_view, gt_visible_3d, preds_dicts[0], img_metas) # if hasattr(self, 'on_the_image_mask'):
preds_dict = preds_dicts[0][0]
loss_dict = dict()
if self.initialize_by_heatmap:
# compute heatmap loss
loss_heatmap = self.loss_heatmap(clip_sigmoid(preds_dict['dense_heatmap']), heatmap, avg_factor=max(heatmap.eq(1).float().sum().item(), 1))
if 'valid_shape' in img_metas[0].keys():
bs = heatmap_2d.shape[0]
num_view = heatmap_2d.shape[2]
# heatmap_2d_weight = torch.zeros_like(heatmap_2d)
heatmaps_2d_weight = []
img_w, img_h = self.test_cfg['img_scale']
img_w = img_w // self.out_size_factor_img
img_h = img_h // self.out_size_factor_img
for lvl in range(self.level_num):
heatmap_2d_weight = torch.zeros(heatmap_2d.shape[0], self.num_classes, self.num_views, img_h, img_w).to(heatmap_2d.device)
heatmaps_2d_weight.append(heatmap_2d_weight)
img_h = img_h // 2
img_w = img_w // 2
for sample_idx in range(bs):
for view_idx in range(num_view):
valid_shape = img_metas[sample_idx]['valid_shape'][view_idx] / self.out_size_factor_img
red_width = int(valid_shape[0])
red_height = int(valid_shape[1])
for lvl in range(self.level_num):
heatmaps_2d_weight[lvl][sample_idx, :, view_idx, :red_height, :red_width] = 1
red_width = red_width // 2
red_height = red_height // 2
for lvl in range(self.level_num):
heatmaps_2d_weight[lvl] = heatmaps_2d_weight[lvl].view(heatmaps_2d_weight[lvl].shape[0], self.num_classes, self.num_views, heatmaps_2d_weight[lvl].shape[-2]*heatmaps_2d_weight[lvl].shape[-1])
heatmap_2d_weight = torch.cat(heatmaps_2d_weight, dim=-1)
loss_heatmap_2d = self.loss_heatmap_2d(clip_sigmoid(preds_dict['img_dense_heatmap']), heatmap_2d, weight=heatmap_2d_weight, avg_factor=max(heatmap_2d.eq(1).float().sum().item(), 1))
else:
loss_heatmap_2d = self.loss_heatmap_2d(clip_sigmoid(preds_dict['img_dense_heatmap']), heatmap_2d, avg_factor=max(heatmap_2d.eq(1).float().sum().item(), 1))
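            # When `valid_shape` is present in the image metas, the 2D heatmap loss is
            # masked so that padded image regions (outside the valid width/height at
            # each pyramid level) receive zero weight.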
loss_dict['loss_heatmap'] = loss_heatmap
loss_dict['loss_heatmap_2d'] = loss_heatmap_2d
# compute loss for each layer
start = 0
num_pos_layer = np.sum(num_pos_layer, axis=0)
num_pos_layer_2d = np.sum(num_pos_layer_2d, axis=0)
if self.view_transform:
num_pos_layer_view = np.sum(num_pos_layer_view, axis=0)
num_fusion_decoder_layers = self.num_fusion_decoder_layers
num_layer = self.num_pts_decoder_layers + num_fusion_decoder_layers
for idx_layer in range(num_layer):
layer_num_proposals = self.get_layer_num_proposal(idx_layer)
if idx_layer < self.num_pts_decoder_layers:
prefix = f'layer_pts_{idx_layer}'
else:
prefix = f'layer_fusion_{idx_layer-self.num_pts_decoder_layers}'
layer_labels = labels[..., start:start + layer_num_proposals].reshape(-1)
layer_label_weights = label_weights[..., start:start + layer_num_proposals].reshape(-1)
layer_score = preds_dict['heatmap'][..., start:start + layer_num_proposals]
layer_cls_score = layer_score.permute(0, 2, 1).reshape(-1, self.num_classes)
layer_loss_cls = self.loss_cls(layer_cls_score, layer_labels, layer_label_weights, avg_factor=max(num_pos_layer[idx_layer], 1))
layer_center = preds_dict['center'][..., start:start + layer_num_proposals]
layer_height = preds_dict['height'][..., start:start + layer_num_proposals]
layer_rot = preds_dict['rot'][..., start:start + layer_num_proposals]
layer_dim = preds_dict['dim'][..., start:start + layer_num_proposals]
preds = torch.cat([layer_center, layer_height, layer_dim, layer_rot], dim=1).permute(0, 2, 1) # [BS, num_proposals, code_size]
if 'vel' in preds_dict.keys():
layer_vel = preds_dict['vel'][..., start:start + layer_num_proposals]
preds = torch.cat([layer_center, layer_height, layer_dim, layer_rot, layer_vel], dim=1).permute(0, 2, 1) # [BS, num_proposals, code_size]
code_weights = self.train_cfg.get('code_weights', None)
layer_bbox_weights = bbox_weights[:, start:start + layer_num_proposals, :]
layer_reg_weights = layer_bbox_weights * layer_bbox_weights.new_tensor(code_weights)
layer_bbox_targets = bbox_targets[:, start:start + layer_num_proposals, :]
layer_loss_bbox = self.loss_bbox(preds, layer_bbox_targets, layer_reg_weights, avg_factor=max(num_pos_layer[idx_layer], 1))
layer_match_ious = matched_ious[..., start:start + layer_num_proposals]
layer_match_ious = torch.sum(layer_match_ious*(layer_match_ious>=0), dim=-1) / torch.sum(layer_match_ious>=0, dim=-1)
layer_match_ious = torch.mean(layer_match_ious)
start += layer_num_proposals
loss_dict[f'{prefix}_loss_cls'] = layer_loss_cls
loss_dict[f'{prefix}_loss_bbox'] = layer_loss_bbox
loss_dict[f'{prefix}_matched_ious'] = layer_match_ious
start = 0
for idx_layer in range(self.num_img_decoder_layers):
prefix = f'layer_img_{idx_layer}'
layer_num_proposals = self.num_img_proposals
layer_labels_2d = labels_2d[..., start:start + layer_num_proposals].reshape(-1)
layer_label_weights_2d = label_weights_2d[..., start:start + layer_num_proposals].reshape(-1)
layer_score_2d = preds_dict['cls'][..., start:start + layer_num_proposals]
layer_cls_score_2d = layer_score_2d.permute(0, 2, 1).reshape(-1, self.num_classes)
layer_loss_cls_2d = self.loss_cls(layer_cls_score_2d, layer_labels_2d, layer_label_weights_2d, avg_factor=max(num_pos_layer_2d[idx_layer], 1))
preds_2d_center = preds_dict['center_2d'][..., start:start + layer_num_proposals] # [bs, 2, num_proposal]
preds_2d_depth = preds_dict['depth_2d'][..., start:start + layer_num_proposals] # [bs, 1, num_proposal]
preds_2d_dim = preds_dict['dim_2d'][..., start:start + layer_num_proposals] # [bs, 3, num_proposal]
preds_2d_rot = preds_dict['rot_2d'][..., start:start + layer_num_proposals] # [bs, 2, num_proposal]
preds_2d_vel = preds_dict['vel_2d'][..., start:start + layer_num_proposals] # [bs, 2, num_proposal]
preds_2d = torch.cat([preds_2d_center, preds_2d_depth[:, :1], preds_2d_dim, preds_2d_rot, preds_2d_vel], dim=1).permute(0, 2, 1) # [bs, num_proposal, 10]
layer_bbox_targets_2d = bbox_targets_2d[:, start:start + layer_num_proposals, :preds_2d.shape[2]]
layer_reg_weights_2d = bbox_weights_2d[:, start:start + layer_num_proposals, :preds_2d.shape[2]]
code_weights = self.train_cfg.get('img_code_weights', None)
layer_reg_weights_2d = layer_reg_weights_2d * layer_reg_weights_2d.new_tensor(code_weights)
layer_loss_center_2d = self.loss_center_2d(preds_2d[...,:2], layer_bbox_targets_2d[...,:2], layer_reg_weights_2d[...,:2], avg_factor=max(num_pos_layer_2d[idx_layer], 1))
layer_loss_depth_2d = self.loss_bbox(preds_2d[...,2:3], layer_bbox_targets_2d[...,2:3], layer_reg_weights_2d[...,2:3], avg_factor=max(num_pos_layer_2d[idx_layer], 1))
layer_loss_dim_2d = self.loss_bbox(preds_2d[...,3:6], layer_bbox_targets_2d[...,3:6], layer_reg_weights_2d[...,3:6], avg_factor=max(num_pos_layer_2d[idx_layer], 1))
layer_loss_rot_2d = self.loss_bbox(preds_2d[...,6:8], layer_bbox_targets_2d[...,6:8], layer_reg_weights_2d[...,6:8], avg_factor=max(num_pos_layer_2d[idx_layer], 1))
layer_match_ious_2d = matched_ious_2d[..., start:start + layer_num_proposals]
layer_match_ious_2d = torch.sum(layer_match_ious_2d*(layer_match_ious_2d>=0), dim=-1) / (torch.sum(layer_match_ious_2d>=0, dim=-1) + 1e-2)
layer_match_ious_2d = torch.mean(layer_match_ious_2d)
start += layer_num_proposals
loss_dict[f'{prefix}_loss_cls_2d'] = layer_loss_cls_2d
loss_dict[f'{prefix}_loss_center_2d'] = layer_loss_center_2d
loss_dict[f'{prefix}_loss_depth_2d'] = layer_loss_depth_2d
loss_dict[f'{prefix}_loss_dim_2d'] = layer_loss_dim_2d
loss_dict[f'{prefix}_loss_rot_2d'] = layer_loss_rot_2d
if preds_2d.shape[-1] > 8:
layer_loss_vel_2d = self.loss_bbox(preds_2d[...,8:10], layer_bbox_targets_2d[...,8:10], layer_reg_weights_2d[...,8:10], avg_factor=max(num_pos_layer_2d[idx_layer], 1))
loss_dict[f'{prefix}_loss_vel_2d'] = layer_loss_vel_2d
else:
layer_loss_vel_2d = 0
loss_dict[f'{prefix}_matched_ious_2d'] = layer_match_ious_2d
loss_dict[f'{prefix}_reg_bbox_2d'] = (layer_loss_center_2d+layer_loss_depth_2d+layer_loss_dim_2d+layer_loss_rot_2d+layer_loss_vel_2d).detach()
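            # `reg_bbox_2d` is detached and only logged for monitoring; the individual
            # center/depth/dim/rot/vel terms above are the ones that carry gradients.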
if self.view_transform:
layer_labels_view = labels_view.reshape(-1)
layer_label_weights_view = label_weights_view.reshape(-1)
layer_cls_score = preds_dict['heatmap_view'].permute(0, 2, 1).reshape(-1, self.num_classes)
layer_loss_cls_view = self.loss_cls(
layer_cls_score, layer_labels_view, layer_label_weights_view, avg_factor=max(num_pos_layer_view[0], 1)
)
layer_center_view = preds_dict['center_view']
layer_height_view = preds_dict['height_view']
layer_rot_view = preds_dict['rot_view']
layer_dim_view = preds_dict['dim_view']
preds_view = torch.cat([layer_center_view, layer_height_view, layer_dim_view, layer_rot_view],
dim=1).permute(0, 2, 1) # [BS, num_proposals, code_size]
if 'vel' in preds_dict.keys():
layer_vel_view = preds_dict['vel_view']
preds_view = torch.cat([layer_center_view, layer_height_view, layer_dim_view, layer_rot_view, layer_vel_view],
dim=1).permute(0, 2, 1) # [BS, num_proposals, code_size]
code_weights = self.train_cfg.get('code_weights', None)
layer_reg_weights_view = bbox_weights_view * bbox_weights_view.new_tensor(code_weights)
layer_loss_bbox_view = self.loss_bbox(preds_view, bbox_targets_view, layer_reg_weights_view, avg_factor=max(num_pos_layer_view[0], 1))
layer_match_ious_view = matched_ious_view
layer_match_ious_view = torch.sum(layer_match_ious_view * (layer_match_ious_view >= 0), dim=-1) / torch.sum(
layer_match_ious_view >= 0, dim=-1)
layer_match_ious_view = torch.mean(layer_match_ious_view)
loss_dict['view_loss_cls'] = layer_loss_cls_view
loss_dict['view_loss_bbox'] = layer_loss_bbox_view
loss_dict['view_matched_ious'] = layer_match_ious_view
return loss_dict
def get_bboxes(self, preds_dicts, img_metas, img=None, rescale=False, for_roi=False):
"""Generate bboxes from bbox head predictions.
Args:
preds_dicts (tuple[list[dict]]): Prediction results.
Returns:
list[list[dict]]: Decoded bbox, scores and labels for each layer & each batch
"""
rets = []
for id, preds_dict in enumerate(preds_dicts):
layer_num_proposal = self.num_proposals + self.num_img_proposals
batch_size = preds_dict[0]['heatmap'].shape[0]
batch_score_raw = preds_dict[0]['heatmap'][..., -layer_num_proposal:].sigmoid()
one_hot = F.one_hot(self.query_labels, num_classes=self.num_classes).permute(0, 2, 1)
query_heatmap_score = preds_dict[0]['query_heatmap_score'] * one_hot
one_hot_img = F.one_hot(self.img_query_label, num_classes=self.num_classes).permute(0, 2, 1)
img_query_label_decoder = torch.max(preds_dict[0]['cls'], dim=1)[1]
one_hot_img_decoder = F.one_hot(img_query_label_decoder, num_classes=self.num_classes).permute(0, 2, 1)
img_query_heatmap_score = preds_dict[0]['img_query_heatmap_score'] * one_hot_img * one_hot_img_decoder * 0.5
query_heatmap_score = torch.cat([query_heatmap_score, img_query_heatmap_score], dim=2)
batch_score = batch_score_raw * query_heatmap_score
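            # Final detection score = decoder classification score x class-aligned
            # query heatmap score; image-branch queries are additionally gated by both
            # the initial image-query class and the image decoder's predicted class,
            # and down-weighted by 0.5.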
batch_center = preds_dict[0]['center'][..., -layer_num_proposal:]
batch_height = preds_dict[0]['height'][..., -layer_num_proposal:]
batch_dim = preds_dict[0]['dim'][..., -layer_num_proposal:]
batch_rot = preds_dict[0]['rot'][..., -layer_num_proposal:]
batch_vel = None
if 'vel' in preds_dict[0]:
batch_vel = preds_dict[0]['vel'][..., -layer_num_proposal:]
temp = self.bbox_coder.decode(batch_score, batch_rot, batch_dim, batch_center, batch_height, batch_vel, filter=True)
if self.test_cfg['dataset'] == 'nuScenes':
self.tasks = [
dict(num_class=1, class_names=['car'], indices=[0], radius=0.35),
dict(num_class=1, class_names=['truck'], indices=[1], radius=0.35),
dict(num_class=1, class_names=['construction_vehicle'], indices=[2], radius=0.35),
dict(num_class=1, class_names=['bus'], indices=[3], radius=0.35),
dict(num_class=1, class_names=['trailer'], indices=[4], radius=0.35),
dict(num_class=1, class_names=['barrier'], indices=[5], radius=0.175),
dict(num_class=1, class_names=['motorcycle'], indices=[6], radius=0.1),
dict(num_class=1, class_names=['bicycle'], indices=[7], radius=-1),
dict(num_class=1, class_names=['pedestrian'], indices=[8], radius=0.1),
dict(num_class=1, class_names=['traffic_cone'], indices=[9], radius=0.1),
]
# self.tasks = [
# dict(num_class=8, class_names=[], indices=[0, 1, 2, 3, 4, 5, 6, 7], radius=-1),
# dict(num_class=1, class_names=['pedestrian'], indices=[8], radius=0.175),
# dict(num_class=1, class_names=['traffic_cone'], indices=[9], radius=0.175),
# ]
elif self.test_cfg['dataset'] == 'Waymo':
self.tasks = [
dict(num_class=1, class_names=['Car'], indices=[0], radius=0.7),
dict(num_class=1, class_names=['Pedestrian'], indices=[1], radius=0.7),
dict(num_class=1, class_names=['Cyclist'], indices=[2], radius=0.7),
]
ret_layer = []
for i in range(batch_size):
boxes3d = temp[i]['bboxes']
scores = temp[i]['scores']
labels = temp[i]['labels']
## adopt circle nms for different categories
                if self.test_cfg['nms_type'] is not None:
keep_mask = torch.zeros_like(scores)
for task in self.tasks:
task_mask = torch.zeros_like(scores)
for cls_idx in task['indices']:
task_mask += labels == cls_idx
task_mask = task_mask.bool()
if task['radius'] > 0 and task_mask.sum() > 0:
if self.test_cfg['nms_type'] == 'circle':
boxes_for_nms = torch.cat([boxes3d[task_mask][:, :2], scores[:, None][task_mask]], dim=1)
task_keep_indices = torch.tensor(
circle_nms(
boxes_for_nms.detach().cpu().numpy(),
task['radius'],
# 5,
post_max_size=500
)
)
else:
boxes_for_nms = xywhr2xyxyr(img_metas[i]['box_type_3d'](boxes3d[task_mask][:, :7], 7).bev)
top_scores = scores[task_mask]
task_keep_indices = nms_gpu(
boxes_for_nms,
top_scores,
thresh=task['radius'],
# pre_maxsize=self.test_cfg['pre_maxsize'],
# post_max_size=self.test_cfg['post_maxsize'],
)
else:
task_keep_indices = torch.arange(task_mask.sum())
if task_keep_indices.shape[0] != 0:
keep_indices = torch.where(task_mask != 0)[0][task_keep_indices]
keep_mask[keep_indices] = 1
keep_mask = keep_mask.bool()
ret = dict(bboxes=boxes3d[keep_mask], scores=scores[keep_mask], labels=labels[keep_mask])
else: # no nms
ret = dict(bboxes=boxes3d, scores=scores, labels=labels)
ret_layer.append(ret)
rets.append(ret_layer)
assert len(rets) == 1
assert len(rets[0]) == 1
res = [[
img_metas[0]['box_type_3d'](rets[0][0]['bboxes'], box_dim=rets[0][0]['bboxes'].shape[-1]),
rets[0][0]['scores'],
rets[0][0]['labels'].int()
]]
return res
def get_layer_num_proposal(self, idx_layer):
if idx_layer >= self.num_pts_decoder_layers:
layer_num_proposal = self.num_proposals + self.num_img_proposals
else:
layer_num_proposal = self.num_proposals
return layer_num_proposal
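        # LiDAR-only decoder layers operate on `num_proposals` queries; from the first
        # fusion layer onward the image queries are appended, giving
        # `num_proposals + num_img_proposals` proposals per layer.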
================================================
FILE: mmdet3d/models/dense_heads/ssd_3d_head.py
================================================
import torch
from mmcv.ops.nms import batched_nms
from mmcv.runner import force_fp32
from torch.nn import functional as F
from mmdet3d.core.bbox.structures import (DepthInstance3DBoxes,
LiDARInstance3DBoxes,
rotation_3d_in_axis)
from mmdet3d.models.builder import build_loss
from mmdet.core import multi_apply
from mmdet.models import HEADS
from .vote_head import VoteHead
@HEADS.register_module()
class SSD3DHead(VoteHead):
r"""Bbox head of `3DSSD `_.
Args:
num_classes (int): The number of class.
bbox_coder (:obj:`BaseBBoxCoder`): Bbox coder for encoding and
decoding boxes.
in_channels (int): The number of input feature channel.
train_cfg (dict): Config for training.
test_cfg (dict): Config for testing.
vote_module_cfg (dict): Config of VoteModule for point-wise votes.
vote_aggregation_cfg (dict): Config of vote aggregation layer.
        pred_layer_cfg (dict): Config of classification and regression
prediction layers.
conv_cfg (dict): Config of convolution in prediction layer.
norm_cfg (dict): Config of BN in prediction layer.
act_cfg (dict): Config of activation in prediction layer.
objectness_loss (dict): Config of objectness loss.
center_loss (dict): Config of center loss.
dir_class_loss (dict): Config of direction classification loss.
dir_res_loss (dict): Config of direction residual regression loss.
size_res_loss (dict): Config of size residual regression loss.
corner_loss (dict): Config of bbox corners regression loss.
vote_loss (dict): Config of candidate points regression loss.
"""
def __init__(self,
num_classes,
bbox_coder,
in_channels=256,
train_cfg=None,
test_cfg=None,
vote_module_cfg=None,
vote_aggregation_cfg=None,
pred_layer_cfg=None,
conv_cfg=dict(type='Conv1d'),
norm_cfg=dict(type='BN1d'),
act_cfg=dict(type='ReLU'),
objectness_loss=None,
center_loss=None,
dir_class_loss=None,
dir_res_loss=None,
size_res_loss=None,
corner_loss=None,
vote_loss=None):
super(SSD3DHead, self).__init__(
num_classes,
bbox_coder,
train_cfg=train_cfg,
test_cfg=test_cfg,
vote_module_cfg=vote_module_cfg,
vote_aggregation_cfg=vote_aggregation_cfg,
pred_layer_cfg=pred_layer_cfg,
conv_cfg=conv_cfg,
norm_cfg=norm_cfg,
objectness_loss=objectness_loss,
center_loss=center_loss,
dir_class_loss=dir_class_loss,
dir_res_loss=dir_res_loss,
size_class_loss=None,
size_res_loss=size_res_loss,
semantic_loss=None)
self.corner_loss = build_loss(corner_loss)
self.vote_loss = build_loss(vote_loss)
self.num_candidates = vote_module_cfg['num_points']
def _get_cls_out_channels(self):
"""Return the channel number of classification outputs."""
        # Class numbers (k)
return self.num_classes
def _get_reg_out_channels(self):
"""Return the channel number of regression outputs."""
# Bbox classification and regression
# (center residual (3), size regression (3)
# heading class+residual (num_dir_bins*2)),
return 3 + 3 + self.num_dir_bins * 2
def _extract_input(self, feat_dict):
"""Extract inputs from features dictionary.
Args:
feat_dict (dict): Feature dict from backbone.
Returns:
torch.Tensor: Coordinates of input points.
torch.Tensor: Features of input points.
torch.Tensor: Indices of input points.
"""
seed_points = feat_dict['sa_xyz'][-1]
seed_features = feat_dict['sa_features'][-1]
seed_indices = feat_dict['sa_indices'][-1]
return seed_points, seed_features, seed_indices
@force_fp32(apply_to=('bbox_preds', ))
def loss(self,
bbox_preds,
points,
gt_bboxes_3d,
gt_labels_3d,
pts_semantic_mask=None,
pts_instance_mask=None,
img_metas=None,
gt_bboxes_ignore=None):
"""Compute loss.
Args:
bbox_preds (dict): Predictions from forward of SSD3DHead.
points (list[torch.Tensor]): Input points.
gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth \
bboxes of each sample.
gt_labels_3d (list[torch.Tensor]): Labels of each sample.
pts_semantic_mask (None | list[torch.Tensor]): Point-wise
semantic mask.
pts_instance_mask (None | list[torch.Tensor]): Point-wise
instance mask.
img_metas (list[dict]): Contain pcd and img's meta info.
            gt_bboxes_ignore (None | list[torch.Tensor]): Specify
                which bounding boxes to ignore.
Returns:
dict: Losses of 3DSSD.
"""
targets = self.get_targets(points, gt_bboxes_3d, gt_labels_3d,
pts_semantic_mask, pts_instance_mask,
bbox_preds)
(vote_targets, center_targets, size_res_targets, dir_class_targets,
dir_res_targets, mask_targets, centerness_targets, corner3d_targets,
vote_mask, positive_mask, negative_mask, centerness_weights,
box_loss_weights, heading_res_loss_weight) = targets
# calculate centerness loss
centerness_loss = self.objectness_loss(
bbox_preds['obj_scores'].transpose(2, 1),
centerness_targets,
weight=centerness_weights)
# calculate center loss
center_loss = self.center_loss(
bbox_preds['center_offset'],
center_targets,
weight=box_loss_weights.unsqueeze(-1))
# calculate direction class loss
dir_class_loss = self.dir_class_loss(
bbox_preds['dir_class'].transpose(1, 2),
dir_class_targets,
weight=box_loss_weights)
# calculate direction residual loss
dir_res_loss = self.dir_res_loss(
bbox_preds['dir_res_norm'],
dir_res_targets.unsqueeze(-1).repeat(1, 1, self.num_dir_bins),
weight=heading_res_loss_weight)
# calculate size residual loss
size_loss = self.size_res_loss(
bbox_preds['size'],
size_res_targets,
weight=box_loss_weights.unsqueeze(-1))
# calculate corner loss
one_hot_dir_class_targets = dir_class_targets.new_zeros(
bbox_preds['dir_class'].shape)
one_hot_dir_class_targets.scatter_(2, dir_class_targets.unsqueeze(-1),
1)
pred_bbox3d = self.bbox_coder.decode(
dict(
center=bbox_preds['center'],
dir_res=bbox_preds['dir_res'],
dir_class=one_hot_dir_class_targets,
size=bbox_preds['size']))
pred_bbox3d = pred_bbox3d.reshape(-1, pred_bbox3d.shape[-1])
pred_bbox3d = img_metas[0]['box_type_3d'](
pred_bbox3d.clone(),
box_dim=pred_bbox3d.shape[-1],
with_yaw=self.bbox_coder.with_rot,
origin=(0.5, 0.5, 0.5))
pred_corners3d = pred_bbox3d.corners.reshape(-1, 8, 3)
corner_loss = self.corner_loss(
pred_corners3d,
corner3d_targets.reshape(-1, 8, 3),
weight=box_loss_weights.view(-1, 1, 1))
# calculate vote loss
vote_loss = self.vote_loss(
bbox_preds['vote_offset'].transpose(1, 2),
vote_targets,
weight=vote_mask.unsqueeze(-1))
losses = dict(
centerness_loss=centerness_loss,
center_loss=center_loss,
dir_class_loss=dir_class_loss,
dir_res_loss=dir_res_loss,
size_res_loss=size_loss,
corner_loss=corner_loss,
vote_loss=vote_loss)
return losses
def get_targets(self,
points,
gt_bboxes_3d,
gt_labels_3d,
pts_semantic_mask=None,
pts_instance_mask=None,
bbox_preds=None):
"""Generate targets of ssd3d head.
Args:
points (list[torch.Tensor]): Points of each batch.
gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth \
bboxes of each batch.
gt_labels_3d (list[torch.Tensor]): Labels of each batch.
pts_semantic_mask (None | list[torch.Tensor]): Point-wise semantic
label of each batch.
pts_instance_mask (None | list[torch.Tensor]): Point-wise instance
label of each batch.
bbox_preds (torch.Tensor): Bounding box predictions of ssd3d head.
Returns:
tuple[torch.Tensor]: Targets of ssd3d head.
"""
# find empty example
for index in range(len(gt_labels_3d)):
if len(gt_labels_3d[index]) == 0:
fake_box = gt_bboxes_3d[index].tensor.new_zeros(
1, gt_bboxes_3d[index].tensor.shape[-1])
gt_bboxes_3d[index] = gt_bboxes_3d[index].new_box(fake_box)
gt_labels_3d[index] = gt_labels_3d[index].new_zeros(1)
if pts_semantic_mask is None:
pts_semantic_mask = [None for i in range(len(gt_labels_3d))]
pts_instance_mask = [None for i in range(len(gt_labels_3d))]
aggregated_points = [
bbox_preds['aggregated_points'][i]
for i in range(len(gt_labels_3d))
]
seed_points = [
bbox_preds['seed_points'][i, :self.num_candidates].detach()
for i in range(len(gt_labels_3d))
]
(vote_targets, center_targets, size_res_targets, dir_class_targets,
dir_res_targets, mask_targets, centerness_targets, corner3d_targets,
vote_mask, positive_mask, negative_mask) = multi_apply(
self.get_targets_single, points, gt_bboxes_3d, gt_labels_3d,
pts_semantic_mask, pts_instance_mask, aggregated_points,
seed_points)
center_targets = torch.stack(center_targets)
positive_mask = torch.stack(positive_mask)
negative_mask = torch.stack(negative_mask)
dir_class_targets = torch.stack(dir_class_targets)
dir_res_targets = torch.stack(dir_res_targets)
size_res_targets = torch.stack(size_res_targets)
mask_targets = torch.stack(mask_targets)
centerness_targets = torch.stack(centerness_targets).detach()
corner3d_targets = torch.stack(corner3d_targets)
vote_targets = torch.stack(vote_targets)
vote_mask = torch.stack(vote_mask)
center_targets -= bbox_preds['aggregated_points']
centerness_weights = (positive_mask +
negative_mask).unsqueeze(-1).repeat(
1, 1, self.num_classes).float()
centerness_weights = centerness_weights / \
(centerness_weights.sum() + 1e-6)
vote_mask = vote_mask / (vote_mask.sum() + 1e-6)
box_loss_weights = positive_mask / (positive_mask.sum() + 1e-6)
batch_size, proposal_num = dir_class_targets.shape[:2]
heading_label_one_hot = dir_class_targets.new_zeros(
(batch_size, proposal_num, self.num_dir_bins))
heading_label_one_hot.scatter_(2, dir_class_targets.unsqueeze(-1), 1)
heading_res_loss_weight = heading_label_one_hot * \
box_loss_weights.unsqueeze(-1)
return (vote_targets, center_targets, size_res_targets,
dir_class_targets, dir_res_targets, mask_targets,
centerness_targets, corner3d_targets, vote_mask, positive_mask,
negative_mask, centerness_weights, box_loss_weights,
heading_res_loss_weight)
def get_targets_single(self,
points,
gt_bboxes_3d,
gt_labels_3d,
pts_semantic_mask=None,
pts_instance_mask=None,
aggregated_points=None,
seed_points=None):
"""Generate targets of ssd3d head for single batch.
Args:
points (torch.Tensor): Points of each batch.
gt_bboxes_3d (:obj:`BaseInstance3DBoxes`): Ground truth \
boxes of each batch.
gt_labels_3d (torch.Tensor): Labels of each batch.
pts_semantic_mask (None | torch.Tensor): Point-wise semantic
label of each batch.
pts_instance_mask (None | torch.Tensor): Point-wise instance
label of each batch.
aggregated_points (torch.Tensor): Aggregated points from
candidate points layer.
seed_points (torch.Tensor): Seed points of candidate points.
Returns:
tuple[torch.Tensor]: Targets of ssd3d head.
"""
assert self.bbox_coder.with_rot or pts_semantic_mask is not None
gt_bboxes_3d = gt_bboxes_3d.to(points.device)
valid_gt = gt_labels_3d != -1
gt_bboxes_3d = gt_bboxes_3d[valid_gt]
gt_labels_3d = gt_labels_3d[valid_gt]
# Generate fake GT for empty scene
if valid_gt.sum() == 0:
vote_targets = points.new_zeros(self.num_candidates, 3)
center_targets = points.new_zeros(self.num_candidates, 3)
size_res_targets = points.new_zeros(self.num_candidates, 3)
dir_class_targets = points.new_zeros(
self.num_candidates, dtype=torch.int64)
dir_res_targets = points.new_zeros(self.num_candidates)
mask_targets = points.new_zeros(
self.num_candidates, dtype=torch.int64)
centerness_targets = points.new_zeros(self.num_candidates,
self.num_classes)
corner3d_targets = points.new_zeros(self.num_candidates, 8, 3)
vote_mask = points.new_zeros(self.num_candidates, dtype=torch.bool)
positive_mask = points.new_zeros(
self.num_candidates, dtype=torch.bool)
negative_mask = points.new_ones(
self.num_candidates, dtype=torch.bool)
return (vote_targets, center_targets, size_res_targets,
dir_class_targets, dir_res_targets, mask_targets,
centerness_targets, corner3d_targets, vote_mask,
positive_mask, negative_mask)
gt_corner3d = gt_bboxes_3d.corners
(center_targets, size_targets, dir_class_targets,
dir_res_targets) = self.bbox_coder.encode(gt_bboxes_3d, gt_labels_3d)
points_mask, assignment = self._assign_targets_by_points_inside(
gt_bboxes_3d, aggregated_points)
center_targets = center_targets[assignment]
size_res_targets = size_targets[assignment]
mask_targets = gt_labels_3d[assignment]
dir_class_targets = dir_class_targets[assignment]
dir_res_targets = dir_res_targets[assignment]
corner3d_targets = gt_corner3d[assignment]
top_center_targets = center_targets.clone()
top_center_targets[:, 2] += size_res_targets[:, 2]
dist = torch.norm(aggregated_points - top_center_targets, dim=1)
dist_mask = dist < self.train_cfg.pos_distance_thr
positive_mask = (points_mask.max(1)[0] > 0) * dist_mask
negative_mask = (points_mask.max(1)[0] == 0)
# Centerness loss targets
canonical_xyz = aggregated_points - center_targets
if self.bbox_coder.with_rot:
# TODO: Align points rotation implementation of
# LiDARInstance3DBoxes and DepthInstance3DBoxes
canonical_xyz = rotation_3d_in_axis(
canonical_xyz.unsqueeze(0).transpose(0, 1),
-gt_bboxes_3d.yaw[assignment], 2).squeeze(1)
distance_front = torch.clamp(
size_res_targets[:, 0] - canonical_xyz[:, 0], min=0)
distance_back = torch.clamp(
size_res_targets[:, 0] + canonical_xyz[:, 0], min=0)
distance_left = torch.clamp(
size_res_targets[:, 1] - canonical_xyz[:, 1], min=0)
distance_right = torch.clamp(
size_res_targets[:, 1] + canonical_xyz[:, 1], min=0)
distance_top = torch.clamp(
size_res_targets[:, 2] - canonical_xyz[:, 2], min=0)
distance_bottom = torch.clamp(
size_res_targets[:, 2] + canonical_xyz[:, 2], min=0)
centerness_l = torch.min(distance_front, distance_back) / torch.max(
distance_front, distance_back)
centerness_w = torch.min(distance_left, distance_right) / torch.max(
distance_left, distance_right)
centerness_h = torch.min(distance_bottom, distance_top) / torch.max(
distance_bottom, distance_top)
centerness_targets = torch.clamp(
centerness_l * centerness_w * centerness_h, min=0)
centerness_targets = centerness_targets.pow(1 / 3.0)
centerness_targets = torch.clamp(centerness_targets, min=0, max=1)
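        # Centerness is the cube root of the product of the per-axis ratios
        # min/max of the distances to the two opposite box faces: a point at the box
        # center scores 1.0, while a point lying on a face scores 0.0 on that axis.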
proposal_num = centerness_targets.shape[0]
one_hot_centerness_targets = centerness_targets.new_zeros(
(proposal_num, self.num_classes))
one_hot_centerness_targets.scatter_(1, mask_targets.unsqueeze(-1), 1)
centerness_targets = centerness_targets.unsqueeze(
1) * one_hot_centerness_targets
# Vote loss targets
enlarged_gt_bboxes_3d = gt_bboxes_3d.enlarged_box(
self.train_cfg.expand_dims_length)
enlarged_gt_bboxes_3d.tensor[:, 2] -= self.train_cfg.expand_dims_length
vote_mask, vote_assignment = self._assign_targets_by_points_inside(
enlarged_gt_bboxes_3d, seed_points)
vote_targets = gt_bboxes_3d.gravity_center
vote_targets = vote_targets[vote_assignment] - seed_points
vote_mask = vote_mask.max(1)[0] > 0
return (vote_targets, center_targets, size_res_targets,
dir_class_targets, dir_res_targets, mask_targets,
centerness_targets, corner3d_targets, vote_mask, positive_mask,
negative_mask)
def get_bboxes(self, points, bbox_preds, input_metas, rescale=False):
"""Generate bboxes from sdd3d head predictions.
Args:
points (torch.Tensor): Input points.
            bbox_preds (dict): Predictions from ssd3d head.
input_metas (list[dict]): Point cloud and image's meta info.
rescale (bool): Whether to rescale bboxes.
Returns:
list[tuple[torch.Tensor]]: Bounding boxes, scores and labels.
"""
# decode boxes
sem_scores = F.sigmoid(bbox_preds['obj_scores']).transpose(1, 2)
obj_scores = sem_scores.max(-1)[0]
bbox3d = self.bbox_coder.decode(bbox_preds)
batch_size = bbox3d.shape[0]
results = list()
for b in range(batch_size):
bbox_selected, score_selected, labels = self.multiclass_nms_single(
obj_scores[b], sem_scores[b], bbox3d[b], points[b, ..., :3],
input_metas[b])
bbox = input_metas[b]['box_type_3d'](
bbox_selected.clone(),
box_dim=bbox_selected.shape[-1],
with_yaw=self.bbox_coder.with_rot)
results.append((bbox, score_selected, labels))
return results
def multiclass_nms_single(self, obj_scores, sem_scores, bbox, points,
input_meta):
"""Multi-class nms in single batch.
Args:
obj_scores (torch.Tensor): Objectness score of bounding boxes.
sem_scores (torch.Tensor): semantic class score of bounding boxes.
bbox (torch.Tensor): Predicted bounding boxes.
points (torch.Tensor): Input points.
input_meta (dict): Point cloud and image's meta info.
Returns:
tuple[torch.Tensor]: Bounding boxes, scores and labels.
"""
num_bbox = bbox.shape[0]
bbox = input_meta['box_type_3d'](
bbox.clone(),
box_dim=bbox.shape[-1],
with_yaw=self.bbox_coder.with_rot,
origin=(0.5, 0.5, 1.0))
if isinstance(bbox, LiDARInstance3DBoxes):
box_idx = bbox.points_in_boxes(points)
box_indices = box_idx.new_zeros([num_bbox + 1])
box_idx[box_idx == -1] = num_bbox
box_indices.scatter_add_(0, box_idx.long(),
box_idx.new_ones(box_idx.shape))
box_indices = box_indices[:-1]
nonempty_box_mask = box_indices >= 0
elif isinstance(bbox, DepthInstance3DBoxes):
box_indices = bbox.points_in_boxes(points)
nonempty_box_mask = box_indices.T.sum(1) >= 0
else:
raise NotImplementedError('Unsupported bbox type!')
corner3d = bbox.corners
minmax_box3d = corner3d.new(torch.Size((corner3d.shape[0], 6)))
minmax_box3d[:, :3] = torch.min(corner3d, dim=1)[0]
minmax_box3d[:, 3:] = torch.max(corner3d, dim=1)[0]
bbox_classes = torch.argmax(sem_scores, -1)
nms_selected = batched_nms(
minmax_box3d[nonempty_box_mask][:, [0, 1, 3, 4]],
obj_scores[nonempty_box_mask], bbox_classes[nonempty_box_mask],
self.test_cfg.nms_cfg)[1]
if nms_selected.shape[0] > self.test_cfg.max_output_num:
nms_selected = nms_selected[:self.test_cfg.max_output_num]
# filter empty boxes and boxes with low score
scores_mask = (obj_scores >= self.test_cfg.score_thr)
nonempty_box_inds = torch.nonzero(
nonempty_box_mask, as_tuple=False).flatten()
nonempty_mask = torch.zeros_like(bbox_classes).scatter(
0, nonempty_box_inds[nms_selected], 1)
selected = (nonempty_mask.bool() & scores_mask.bool())
if self.test_cfg.per_class_proposal:
bbox_selected, score_selected, labels = [], [], []
for k in range(sem_scores.shape[-1]):
bbox_selected.append(bbox[selected].tensor)
score_selected.append(obj_scores[selected])
labels.append(
torch.zeros_like(bbox_classes[selected]).fill_(k))
bbox_selected = torch.cat(bbox_selected, 0)
score_selected = torch.cat(score_selected, 0)
labels = torch.cat(labels, 0)
else:
bbox_selected = bbox[selected].tensor
score_selected = obj_scores[selected]
labels = bbox_classes[selected]
return bbox_selected, score_selected, labels
def _assign_targets_by_points_inside(self, bboxes_3d, points):
"""Compute assignment by checking whether point is inside bbox.
Args:
bboxes_3d (BaseInstance3DBoxes): Instance of bounding boxes.
points (torch.Tensor): Points of a batch.
Returns:
tuple[torch.Tensor]: Flags indicating whether each point is
                inside a bbox, and the index of the box each point lies in.
"""
# TODO: align points_in_boxes function in each box_structures
num_bbox = bboxes_3d.tensor.shape[0]
if isinstance(bboxes_3d, LiDARInstance3DBoxes):
assignment = bboxes_3d.points_in_boxes(points).long()
points_mask = assignment.new_zeros(
[assignment.shape[0], num_bbox + 1])
assignment[assignment == -1] = num_bbox
points_mask.scatter_(1, assignment.unsqueeze(1), 1)
points_mask = points_mask[:, :-1]
assignment[assignment == num_bbox] = num_bbox - 1
elif isinstance(bboxes_3d, DepthInstance3DBoxes):
points_mask = bboxes_3d.points_in_boxes(points)
assignment = points_mask.argmax(dim=-1)
else:
raise NotImplementedError('Unsupported bbox type!')
return points_mask, assignment
================================================
FILE: mmdet3d/models/dense_heads/train_mixins.py
================================================
import numpy as np
import torch
from mmdet3d.core import limit_period
from mmdet.core import images_to_levels, multi_apply
class AnchorTrainMixin(object):
"""Mixin class for target assigning of dense heads."""
def anchor_target_3d(self,
anchor_list,
gt_bboxes_list,
input_metas,
gt_bboxes_ignore_list=None,
gt_labels_list=None,
label_channels=1,
num_classes=1,
sampling=True):
"""Compute regression and classification targets for anchors.
Args:
anchor_list (list[list]): Multi level anchors of each image.
gt_bboxes_list (list[:obj:`BaseInstance3DBoxes`]): Ground truth
bboxes of each image.
input_metas (list[dict]): Meta info of each image.
gt_bboxes_ignore_list (None | list): Ignore list of gt bboxes.
gt_labels_list (list[torch.Tensor]): Gt labels of batches.
label_channels (int): The channel of labels.
num_classes (int): The number of classes.
sampling (bool): Whether to sample anchors.
Returns:
tuple (list, list, list, list, list, list, int, int):
Anchor targets, including labels, label weights,
bbox targets, bbox weights, direction targets,
                direction weights, number of positive anchors and
number of negative anchors.
"""
num_imgs = len(input_metas)
assert len(anchor_list) == num_imgs
if isinstance(anchor_list[0][0], list):
# sizes of anchors are different
# anchor number of a single level
num_level_anchors = [
sum([anchor.size(0) for anchor in anchors])
for anchors in anchor_list[0]
]
for i in range(num_imgs):
anchor_list[i] = anchor_list[i][0]
else:
# anchor number of multi levels
num_level_anchors = [
anchors.view(-1, self.box_code_size).size(0)
for anchors in anchor_list[0]
]
# concat all level anchors and flags to a single tensor
for i in range(num_imgs):
anchor_list[i] = torch.cat(anchor_list[i])
# compute targets for each image
if gt_bboxes_ignore_list is None:
gt_bboxes_ignore_list = [None for _ in range(num_imgs)]
if gt_labels_list is None:
gt_labels_list = [None for _ in range(num_imgs)]
(all_labels, all_label_weights, all_bbox_targets, all_bbox_weights,
all_dir_targets, all_dir_weights, pos_inds_list,
neg_inds_list) = multi_apply(
self.anchor_target_3d_single,
anchor_list,
gt_bboxes_list,
gt_bboxes_ignore_list,
gt_labels_list,
input_metas,
label_channels=label_channels,
num_classes=num_classes,
sampling=sampling)
# no valid anchors
if any([labels is None for labels in all_labels]):
return None
# sampled anchors of all images
num_total_pos = sum([max(inds.numel(), 1) for inds in pos_inds_list])
num_total_neg = sum([max(inds.numel(), 1) for inds in neg_inds_list])
# split targets to a list w.r.t. multiple levels
labels_list = images_to_levels(all_labels, num_level_anchors)
label_weights_list = images_to_levels(all_label_weights,
num_level_anchors)
bbox_targets_list = images_to_levels(all_bbox_targets,
num_level_anchors)
bbox_weights_list = images_to_levels(all_bbox_weights,
num_level_anchors)
dir_targets_list = images_to_levels(all_dir_targets, num_level_anchors)
dir_weights_list = images_to_levels(all_dir_weights, num_level_anchors)
return (labels_list, label_weights_list, bbox_targets_list,
bbox_weights_list, dir_targets_list, dir_weights_list,
num_total_pos, num_total_neg)
def anchor_target_3d_single(self,
anchors,
gt_bboxes,
gt_bboxes_ignore,
gt_labels,
input_meta,
label_channels=1,
num_classes=1,
sampling=True):
"""Compute targets of anchors in single batch.
Args:
anchors (torch.Tensor): Concatenated multi-level anchor.
gt_bboxes (:obj:`BaseInstance3DBoxes`): Gt bboxes.
gt_bboxes_ignore (torch.Tensor): Ignored gt bboxes.
gt_labels (torch.Tensor): Gt class labels.
input_meta (dict): Meta info of each image.
label_channels (int): The channel of labels.
num_classes (int): The number of classes.
sampling (bool): Whether to sample anchors.
Returns:
tuple[torch.Tensor]: Anchor targets.
"""
if isinstance(self.bbox_assigner,
list) and (not isinstance(anchors, list)):
feat_size = anchors.size(0) * anchors.size(1) * anchors.size(2)
rot_angles = anchors.size(-2)
assert len(self.bbox_assigner) == anchors.size(-3)
(total_labels, total_label_weights, total_bbox_targets,
total_bbox_weights, total_dir_targets, total_dir_weights,
total_pos_inds, total_neg_inds) = [], [], [], [], [], [], [], []
current_anchor_num = 0
for i, assigner in enumerate(self.bbox_assigner):
current_anchors = anchors[..., i, :, :].reshape(
-1, self.box_code_size)
current_anchor_num += current_anchors.size(0)
if self.assign_per_class:
gt_per_cls = (gt_labels == i)
anchor_targets = self.anchor_target_single_assigner(
assigner, current_anchors, gt_bboxes[gt_per_cls, :],
gt_bboxes_ignore, gt_labels[gt_per_cls], input_meta,
num_classes, sampling)
else:
anchor_targets = self.anchor_target_single_assigner(
assigner, current_anchors, gt_bboxes, gt_bboxes_ignore,
gt_labels, input_meta, num_classes, sampling)
(labels, label_weights, bbox_targets, bbox_weights,
dir_targets, dir_weights, pos_inds, neg_inds) = anchor_targets
total_labels.append(labels.reshape(feat_size, 1, rot_angles))
total_label_weights.append(
label_weights.reshape(feat_size, 1, rot_angles))
total_bbox_targets.append(
bbox_targets.reshape(feat_size, 1, rot_angles,
anchors.size(-1)))
total_bbox_weights.append(
bbox_weights.reshape(feat_size, 1, rot_angles,
anchors.size(-1)))
total_dir_targets.append(
dir_targets.reshape(feat_size, 1, rot_angles))
total_dir_weights.append(
dir_weights.reshape(feat_size, 1, rot_angles))
total_pos_inds.append(pos_inds)
total_neg_inds.append(neg_inds)
total_labels = torch.cat(total_labels, dim=-2).reshape(-1)
total_label_weights = torch.cat(
total_label_weights, dim=-2).reshape(-1)
total_bbox_targets = torch.cat(
total_bbox_targets, dim=-3).reshape(-1, anchors.size(-1))
total_bbox_weights = torch.cat(
total_bbox_weights, dim=-3).reshape(-1, anchors.size(-1))
total_dir_targets = torch.cat(
total_dir_targets, dim=-2).reshape(-1)
total_dir_weights = torch.cat(
total_dir_weights, dim=-2).reshape(-1)
total_pos_inds = torch.cat(total_pos_inds, dim=0).reshape(-1)
total_neg_inds = torch.cat(total_neg_inds, dim=0).reshape(-1)
return (total_labels, total_label_weights, total_bbox_targets,
total_bbox_weights, total_dir_targets, total_dir_weights,
total_pos_inds, total_neg_inds)
elif isinstance(self.bbox_assigner, list) and isinstance(
anchors, list):
# class-aware anchors with different feature map sizes
assert len(self.bbox_assigner) == len(anchors), \
'The number of bbox assigners and anchors should be the same.'
(total_labels, total_label_weights, total_bbox_targets,
total_bbox_weights, total_dir_targets, total_dir_weights,
total_pos_inds, total_neg_inds) = [], [], [], [], [], [], [], []
current_anchor_num = 0
for i, assigner in enumerate(self.bbox_assigner):
current_anchors = anchors[i]
current_anchor_num += current_anchors.size(0)
if self.assign_per_class:
gt_per_cls = (gt_labels == i)
anchor_targets = self.anchor_target_single_assigner(
assigner, current_anchors, gt_bboxes[gt_per_cls, :],
gt_bboxes_ignore, gt_labels[gt_per_cls], input_meta,
num_classes, sampling)
else:
anchor_targets = self.anchor_target_single_assigner(
assigner, current_anchors, gt_bboxes, gt_bboxes_ignore,
gt_labels, input_meta, num_classes, sampling)
(labels, label_weights, bbox_targets, bbox_weights,
dir_targets, dir_weights, pos_inds, neg_inds) = anchor_targets
total_labels.append(labels)
total_label_weights.append(label_weights)
total_bbox_targets.append(
bbox_targets.reshape(-1, anchors[i].size(-1)))
total_bbox_weights.append(
bbox_weights.reshape(-1, anchors[i].size(-1)))
total_dir_targets.append(dir_targets)
total_dir_weights.append(dir_weights)
total_pos_inds.append(pos_inds)
total_neg_inds.append(neg_inds)
total_labels = torch.cat(total_labels, dim=0)
total_label_weights = torch.cat(total_label_weights, dim=0)
total_bbox_targets = torch.cat(total_bbox_targets, dim=0)
total_bbox_weights = torch.cat(total_bbox_weights, dim=0)
total_dir_targets = torch.cat(total_dir_targets, dim=0)
total_dir_weights = torch.cat(total_dir_weights, dim=0)
total_pos_inds = torch.cat(total_pos_inds, dim=0)
total_neg_inds = torch.cat(total_neg_inds, dim=0)
return (total_labels, total_label_weights, total_bbox_targets,
total_bbox_weights, total_dir_targets, total_dir_weights,
total_pos_inds, total_neg_inds)
else:
return self.anchor_target_single_assigner(self.bbox_assigner,
anchors, gt_bboxes,
gt_bboxes_ignore,
gt_labels, input_meta,
num_classes, sampling)
def anchor_target_single_assigner(self,
bbox_assigner,
anchors,
gt_bboxes,
gt_bboxes_ignore,
gt_labels,
input_meta,
num_classes=1,
sampling=True):
"""Assign anchors and encode positive anchors.
Args:
bbox_assigner (BaseAssigner): assign positive and negative boxes.
anchors (torch.Tensor): Concatenated multi-level anchor.
gt_bboxes (:obj:`BaseInstance3DBoxes`): Gt bboxes.
gt_bboxes_ignore (torch.Tensor): Ignored gt bboxes.
gt_labels (torch.Tensor): Gt class labels.
input_meta (dict): Meta info of each image.
num_classes (int): The number of classes.
sampling (bool): Whether to sample anchors.
Returns:
tuple[torch.Tensor]: Anchor targets.
"""
anchors = anchors.reshape(-1, anchors.size(-1))
num_valid_anchors = anchors.shape[0]
bbox_targets = torch.zeros_like(anchors)
bbox_weights = torch.zeros_like(anchors)
dir_targets = anchors.new_zeros((anchors.shape[0]), dtype=torch.long)
dir_weights = anchors.new_zeros((anchors.shape[0]), dtype=torch.float)
labels = anchors.new_zeros(num_valid_anchors, dtype=torch.long)
label_weights = anchors.new_zeros(num_valid_anchors, dtype=torch.float)
if len(gt_bboxes) > 0:
if not isinstance(gt_bboxes, torch.Tensor):
gt_bboxes = gt_bboxes.tensor.to(anchors.device)
assign_result = bbox_assigner.assign(anchors, gt_bboxes,
gt_bboxes_ignore, gt_labels)
sampling_result = self.bbox_sampler.sample(assign_result, anchors,
gt_bboxes)
pos_inds = sampling_result.pos_inds
neg_inds = sampling_result.neg_inds
else:
pos_inds = torch.nonzero(
anchors.new_zeros((anchors.shape[0], ), dtype=torch.bool) > 0,
as_tuple=False).squeeze(-1).unique()
neg_inds = torch.nonzero(
anchors.new_zeros((anchors.shape[0], ), dtype=torch.bool) == 0,
as_tuple=False).squeeze(-1).unique()
if gt_labels is not None:
labels += num_classes
if len(pos_inds) > 0:
pos_bbox_targets = self.bbox_coder.encode(
sampling_result.pos_bboxes, sampling_result.pos_gt_bboxes)
pos_dir_targets = get_direction_target(
sampling_result.pos_bboxes,
pos_bbox_targets,
self.dir_offset,
one_hot=False)
bbox_targets[pos_inds, :] = pos_bbox_targets
bbox_weights[pos_inds, :] = 1.0
dir_targets[pos_inds] = pos_dir_targets
dir_weights[pos_inds] = 1.0
if gt_labels is None:
labels[pos_inds] = 1
else:
labels[pos_inds] = gt_labels[
sampling_result.pos_assigned_gt_inds]
if self.train_cfg.pos_weight <= 0:
label_weights[pos_inds] = 1.0
else:
label_weights[pos_inds] = self.train_cfg.pos_weight
if len(neg_inds) > 0:
label_weights[neg_inds] = 1.0
return (labels, label_weights, bbox_targets, bbox_weights, dir_targets,
dir_weights, pos_inds, neg_inds)
def get_direction_target(anchors,
reg_targets,
dir_offset=0,
num_bins=2,
one_hot=True):
"""Encode direction to 0 ~ num_bins-1.
Args:
anchors (torch.Tensor): Concatenated multi-level anchor.
reg_targets (torch.Tensor): Bbox regression targets.
dir_offset (int): Direction offset.
num_bins (int): Number of bins to divide 2*PI.
one_hot (bool): Whether to encode as one hot.
Returns:
torch.Tensor: Encoded direction targets.
"""
rot_gt = reg_targets[..., 6] + anchors[..., 6]
offset_rot = limit_period(rot_gt - dir_offset, 0, 2 * np.pi)
dir_cls_targets = torch.floor(offset_rot / (2 * np.pi / num_bins)).long()
dir_cls_targets = torch.clamp(dir_cls_targets, min=0, max=num_bins - 1)
if one_hot:
dir_targets = torch.zeros(
*list(dir_cls_targets.shape),
num_bins,
dtype=anchors.dtype,
device=dir_cls_targets.device)
        dir_targets.scatter_(-1, dir_cls_targets.unsqueeze(dim=-1).long(), 1.0)
dir_cls_targets = dir_targets
return dir_cls_targets
================================================
FILE: mmdet3d/models/dense_heads/transfusion_head.py
================================================
import copy
import warnings
import numpy as np
import torch
from mmcv.cnn import ConvModule, build_conv_layer, kaiming_init
from mmcv.runner import force_fp32
from torch import nn
import torch.nn.functional as F
from torch.nn.parameter import Parameter
from torch.nn import Linear
from torch.nn.init import xavier_uniform_, xavier_normal_, constant_
from mmdet3d.core import (circle_nms, draw_heatmap_gaussian, gaussian_radius,
xywhr2xyxyr, limit_period, PseudoSampler)
from mmdet3d.core.bbox.structures import rotation_3d_in_axis
from mmdet3d.core import Box3DMode, LiDARInstance3DBoxes
from mmdet3d.models import builder
from mmdet3d.models.builder import HEADS, build_loss
from mmdet3d.models.utils import clip_sigmoid
from mmdet3d.models.fusion_layers import apply_3d_transformation
from mmdet3d.ops.iou3d.iou3d_utils import nms_gpu
from mmdet.core import build_bbox_coder, multi_apply, build_assigner, build_sampler, AssignResult
from mmdet3d.ops.roiaware_pool3d import points_in_boxes_batch
class PositionEmbeddingLearned(nn.Module):
"""
Absolute pos embedding, learned.
"""
def __init__(self, input_channel, num_pos_feats=288):
super().__init__()
self.position_embedding_head = nn.Sequential(
nn.Conv1d(input_channel, num_pos_feats, kernel_size=1),
nn.BatchNorm1d(num_pos_feats),
nn.ReLU(inplace=True),
nn.Conv1d(num_pos_feats, num_pos_feats, kernel_size=1))
def forward(self, xyz):
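        # xyz: (B, N, input_channel) -> (B, input_channel, N) so the Conv1d MLP can
        # embed every point/query position independently.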
xyz = xyz.transpose(1, 2).contiguous()
position_embedding = self.position_embedding_head(xyz)
return position_embedding
class TransformerDecoderLayer(nn.Module):
def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1, activation="relu",
self_posembed=None, cross_posembed=None, cross_only=False):
super().__init__()
self.cross_only = cross_only
if not self.cross_only:
self.self_attn = MultiheadAttention(d_model, nhead, dropout=dropout)
self.multihead_attn = MultiheadAttention(d_model, nhead, dropout=dropout)
# Implementation of Feedforward model
self.linear1 = nn.Linear(d_model, dim_feedforward)
self.dropout = nn.Dropout(dropout)
self.linear2 = nn.Linear(dim_feedforward, d_model)
self.norm1 = nn.LayerNorm(d_model)
self.norm2 = nn.LayerNorm(d_model)
self.norm3 = nn.LayerNorm(d_model)
self.dropout1 = nn.Dropout(dropout)
self.dropout2 = nn.Dropout(dropout)
self.dropout3 = nn.Dropout(dropout)
def _get_activation_fn(activation):
"""Return an activation function given a string"""
if activation == "relu":
return F.relu
if activation == "gelu":
return F.gelu
if activation == "glu":
return F.glu
            raise RuntimeError(f"activation should be relu/gelu/glu, not {activation}.")
self.activation = _get_activation_fn(activation)
self.self_posembed = self_posembed
self.cross_posembed = cross_posembed
def with_pos_embed(self, tensor, pos_embed):
return tensor if pos_embed is None else tensor + pos_embed
def forward(self, query, key, query_pos, key_pos, attn_mask=None):
"""
:param query: B C Pq
:param key: B C Pk
:param query_pos: B Pq 3/6
:param key_pos: B Pk 3/6
        :param attn_mask: optional additive attention mask over key positions
:return:
"""
# NxCxP to PxNxC
if self.self_posembed is not None:
query_pos_embed = self.self_posembed(query_pos).permute(2, 0, 1)
else:
query_pos_embed = None
if self.cross_posembed is not None:
key_pos_embed = self.cross_posembed(key_pos).permute(2, 0, 1)
else:
key_pos_embed = None
query = query.permute(2, 0, 1)
key = key.permute(2, 0, 1)
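        # query/key arrive channel-first as (B, C, P); the attention module expects
        # sequence-first (P, B, C), so permute here and permute back before returning.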
if not self.cross_only:
q = k = v = self.with_pos_embed(query, query_pos_embed)
query2 = self.self_attn(q, k, value=v)[0]
query = query + self.dropout1(query2)
query = self.norm1(query)
query2 = self.multihead_attn(query=self.with_pos_embed(query, query_pos_embed),
key=self.with_pos_embed(key, key_pos_embed),
value=self.with_pos_embed(key, key_pos_embed), attn_mask=attn_mask)[0]
query = query + self.dropout2(query2)
query = self.norm2(query)
query2 = self.linear2(self.dropout(self.activation(self.linear1(query))))
query = query + self.dropout3(query2)
query = self.norm3(query)
# NxCxP to PxNxC
query = query.permute(1, 2, 0)
return query
class MultiheadAttention(nn.Module):
r"""Allows the model to jointly attend to information
from different representation subspaces.
See reference: Attention Is All You Need
.. math::
\text{MultiHead}(Q, K, V) = \text{Concat}(head_1,\dots,head_h)W^O
\text{where} head_i = \text{Attention}(QW_i^Q, KW_i^K, VW_i^V)
Args:
embed_dim: total dimension of the model.
num_heads: parallel attention heads.
dropout: a Dropout layer on attn_output_weights. Default: 0.0.
bias: add bias as module parameter. Default: True.
add_bias_kv: add bias to the key and value sequences at dim=0.
add_zero_attn: add a new batch of zeros to the key and
value sequences at dim=1.
kdim: total number of features in key. Default: None.
        vdim: total number of features in value. Default: None.
Note: if kdim and vdim are None, they will be set to embed_dim such that
query, key, and value have the same number of features.
Examples::
>>> multihead_attn = nn.MultiheadAttention(embed_dim, num_heads)
>>> attn_output, attn_output_weights = multihead_attn(query, key, value)
"""
def __init__(self, embed_dim, num_heads, dropout=0., bias=True, add_bias_kv=False, add_zero_attn=False, kdim=None,
vdim=None):
super(MultiheadAttention, self).__init__()
self.embed_dim = embed_dim
self.kdim = kdim if kdim is not None else embed_dim
self.vdim = vdim if vdim is not None else embed_dim
self._qkv_same_embed_dim = self.kdim == embed_dim and self.vdim == embed_dim
self.num_heads = num_heads
self.dropout = dropout
self.head_dim = embed_dim // num_heads
assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads"
self.in_proj_weight = Parameter(torch.empty(3 * embed_dim, embed_dim))
if self._qkv_same_embed_dim is False:
self.q_proj_weight = Parameter(torch.Tensor(embed_dim, embed_dim))
self.k_proj_weight = Parameter(torch.Tensor(embed_dim, self.kdim))
self.v_proj_weight = Parameter(torch.Tensor(embed_dim, self.vdim))
if bias:
self.in_proj_bias = Parameter(torch.empty(3 * embed_dim))
else:
self.register_parameter('in_proj_bias', None)
self.out_proj = Linear(embed_dim, embed_dim, bias=bias)
if add_bias_kv:
self.bias_k = Parameter(torch.empty(1, 1, embed_dim))
self.bias_v = Parameter(torch.empty(1, 1, embed_dim))
else:
self.bias_k = self.bias_v = None
self.add_zero_attn = add_zero_attn
self._reset_parameters()
def _reset_parameters(self):
if self._qkv_same_embed_dim:
xavier_uniform_(self.in_proj_weight)
else:
xavier_uniform_(self.q_proj_weight)
xavier_uniform_(self.k_proj_weight)
xavier_uniform_(self.v_proj_weight)
if self.in_proj_bias is not None:
constant_(self.in_proj_bias, 0.)
constant_(self.out_proj.bias, 0.)
if self.bias_k is not None:
xavier_normal_(self.bias_k)
if self.bias_v is not None:
xavier_normal_(self.bias_v)
def forward(self, query, key, value, key_padding_mask=None, need_weights=True, attn_mask=None):
r"""
Args:
query, key, value: map a query and a set of key-value pairs to an output.
See "Attention Is All You Need" for more details.
key_padding_mask: if provided, specified padding elements in the key will
            be ignored by the attention. This is a binary mask. When the value is True,
the corresponding value on the attention layer will be filled with -inf.
need_weights: output attn_output_weights.
attn_mask: mask that prevents attention to certain positions. This is an additive mask
(i.e. the values will be added to the attention layer).
Shape:
- Inputs:
- query: :math:`(L, N, E)` where L is the target sequence length, N is the batch size, E is
the embedding dimension.
- key: :math:`(S, N, E)`, where S is the source sequence length, N is the batch size, E is
the embedding dimension.
- value: :math:`(S, N, E)` where S is the source sequence length, N is the batch size, E is
the embedding dimension.
- key_padding_mask: :math:`(N, S)`, ByteTensor, where N is the batch size, S is the source sequence length.
- attn_mask: :math:`(L, S)` where L is the target sequence length, S is the source sequence length.
- Outputs:
- attn_output: :math:`(L, N, E)` where L is the target sequence length, N is the batch size,
E is the embedding dimension.
- attn_output_weights: :math:`(N, L, S)` where N is the batch size,
L is the target sequence length, S is the source sequence length.
"""
if hasattr(self, '_qkv_same_embed_dim') and self._qkv_same_embed_dim is False:
return multi_head_attention_forward(
query, key, value, self.embed_dim, self.num_heads,
self.in_proj_weight, self.in_proj_bias,
self.bias_k, self.bias_v, self.add_zero_attn,
self.dropout, self.out_proj.weight, self.out_proj.bias,
training=self.training,
key_padding_mask=key_padding_mask, need_weights=need_weights,
attn_mask=attn_mask, use_separate_proj_weight=True,
q_proj_weight=self.q_proj_weight, k_proj_weight=self.k_proj_weight,
v_proj_weight=self.v_proj_weight)
else:
if not hasattr(self, '_qkv_same_embed_dim'):
warnings.warn('A new version of MultiheadAttention module has been implemented. \
Please re-train your model with the new module',
UserWarning)
return multi_head_attention_forward(
query, key, value, self.embed_dim, self.num_heads,
self.in_proj_weight, self.in_proj_bias,
self.bias_k, self.bias_v, self.add_zero_attn,
self.dropout, self.out_proj.weight, self.out_proj.bias,
training=self.training,
key_padding_mask=key_padding_mask, need_weights=need_weights,
attn_mask=attn_mask)
def multi_head_attention_forward(query, # type: Tensor
key, # type: Tensor
value, # type: Tensor
embed_dim_to_check, # type: int
num_heads, # type: int
in_proj_weight, # type: Tensor
in_proj_bias, # type: Tensor
bias_k, # type: Optional[Tensor]
bias_v, # type: Optional[Tensor]
add_zero_attn, # type: bool
dropout_p, # type: float
out_proj_weight, # type: Tensor
out_proj_bias, # type: Tensor
training=True, # type: bool
key_padding_mask=None, # type: Optional[Tensor]
need_weights=True, # type: bool
attn_mask=None, # type: Optional[Tensor]
use_separate_proj_weight=False, # type: bool
q_proj_weight=None, # type: Optional[Tensor]
k_proj_weight=None, # type: Optional[Tensor]
v_proj_weight=None, # type: Optional[Tensor]
static_k=None, # type: Optional[Tensor]
static_v=None, # type: Optional[Tensor]
):
# type: (...) -> Tuple[Tensor, Optional[Tensor]]
r"""
Args:
query, key, value: map a query and a set of key-value pairs to an output.
See "Attention Is All You Need" for more details.
embed_dim_to_check: total dimension of the model.
num_heads: parallel attention heads.
in_proj_weight, in_proj_bias: input projection weight and bias.
bias_k, bias_v: bias of the key and value sequences to be added at dim=0.
add_zero_attn: add a new batch of zeros to the key and
value sequences at dim=1.
dropout_p: probability of an element to be zeroed.
out_proj_weight, out_proj_bias: the output projection weight and bias.
training: apply dropout if is ``True``.
key_padding_mask: if provided, specified padding elements in the key will
            be ignored by the attention. This is a binary mask. When the value is True,
the corresponding value on the attention layer will be filled with -inf.
need_weights: output attn_output_weights.
attn_mask: mask that prevents attention to certain positions. This is an additive mask
(i.e. the values will be added to the attention layer).
        use_separate_proj_weight: the function accepts the projection weights for query, key,
            and value in different forms. If false, in_proj_weight will be used, which is
a combination of q_proj_weight, k_proj_weight, v_proj_weight.
q_proj_weight, k_proj_weight, v_proj_weight, in_proj_bias: input projection weight and bias.
static_k, static_v: static key and value used for attention operators.
Shape:
Inputs:
- query: :math:`(L, N, E)` where L is the target sequence length, N is the batch size, E is
the embedding dimension.
- key: :math:`(S, N, E)`, where S is the source sequence length, N is the batch size, E is
the embedding dimension.
- value: :math:`(S, N, E)` where S is the source sequence length, N is the batch size, E is
the embedding dimension.
- key_padding_mask: :math:`(N, S)`, ByteTensor, where N is the batch size, S is the source sequence length.
- attn_mask: :math:`(L, S)` where L is the target sequence length, S is the source sequence length.
- static_k: :math:`(N*num_heads, S, E/num_heads)`, where S is the source sequence length,
N is the batch size, E is the embedding dimension. E/num_heads is the head dimension.
- static_v: :math:`(N*num_heads, S, E/num_heads)`, where S is the source sequence length,
N is the batch size, E is the embedding dimension. E/num_heads is the head dimension.
Outputs:
- attn_output: :math:`(L, N, E)` where L is the target sequence length, N is the batch size,
E is the embedding dimension.
- attn_output_weights: :math:`(N, L, S)` where N is the batch size,
L is the target sequence length, S is the source sequence length.
"""
qkv_same = torch.equal(query, key) and torch.equal(key, value)
kv_same = torch.equal(key, value)
tgt_len, bsz, embed_dim = query.size()
assert embed_dim == embed_dim_to_check
assert list(query.size()) == [tgt_len, bsz, embed_dim]
assert key.size() == value.size()
head_dim = embed_dim // num_heads
assert head_dim * num_heads == embed_dim, "embed_dim must be divisible by num_heads"
scaling = float(head_dim) ** -0.5
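    # scaled dot-product attention: queries are pre-multiplied by 1/sqrt(head_dim)
    # so the softmax logits stay well-conditioned as the head dimension grows.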
if use_separate_proj_weight is not True:
if qkv_same:
# self-attention
q, k, v = F.linear(query, in_proj_weight, in_proj_bias).chunk(3, dim=-1)
elif kv_same:
# encoder-decoder attention
# This is inline in_proj function with in_proj_weight and in_proj_bias
_b = in_proj_bias
_start = 0
_end = embed_dim
_w = in_proj_weight[_start:_end, :]
if _b is not None:
_b = _b[_start:_end]
q = F.linear(query, _w, _b)
if key is None:
assert value is None
k = None
v = None
else:
# This is inline in_proj function with in_proj_weight and in_proj_bias
_b = in_proj_bias
_start = embed_dim
_end = None
_w = in_proj_weight[_start:, :]
if _b is not None:
_b = _b[_start:]
k, v = F.linear(key, _w, _b).chunk(2, dim=-1)
else:
# This is inline in_proj function with in_proj_weight and in_proj_bias
_b = in_proj_bias
_start = 0
_end = embed_dim
_w = in_proj_weight[_start:_end, :]
if _b is not None:
_b = _b[_start:_end]
q = F.linear(query, _w, _b)
# This is inline in_proj function with in_proj_weight and in_proj_bias
_b = in_proj_bias
_start = embed_dim
_end = embed_dim * 2
_w = in_proj_weight[_start:_end, :]
if _b is not None:
_b = _b[_start:_end]
k = F.linear(key, _w, _b)
# This is inline in_proj function with in_proj_weight and in_proj_bias
_b = in_proj_bias
_start = embed_dim * 2
_end = None
_w = in_proj_weight[_start:, :]
if _b is not None:
_b = _b[_start:]
v = F.linear(value, _w, _b)
else:
q_proj_weight_non_opt = torch.jit._unwrap_optional(q_proj_weight)
len1, len2 = q_proj_weight_non_opt.size()
assert len1 == embed_dim and len2 == query.size(-1)
k_proj_weight_non_opt = torch.jit._unwrap_optional(k_proj_weight)
len1, len2 = k_proj_weight_non_opt.size()
assert len1 == embed_dim and len2 == key.size(-1)
v_proj_weight_non_opt = torch.jit._unwrap_optional(v_proj_weight)
len1, len2 = v_proj_weight_non_opt.size()
assert len1 == embed_dim and len2 == value.size(-1)
if in_proj_bias is not None:
q = F.linear(query, q_proj_weight_non_opt, in_proj_bias[0:embed_dim])
k = F.linear(key, k_proj_weight_non_opt, in_proj_bias[embed_dim:(embed_dim * 2)])
v = F.linear(value, v_proj_weight_non_opt, in_proj_bias[(embed_dim * 2):])
else:
q = F.linear(query, q_proj_weight_non_opt, in_proj_bias)
k = F.linear(key, k_proj_weight_non_opt, in_proj_bias)
v = F.linear(value, v_proj_weight_non_opt, in_proj_bias)
q = q * scaling
if bias_k is not None and bias_v is not None:
if static_k is None and static_v is None:
k = torch.cat([k, bias_k.repeat(1, bsz, 1)])
v = torch.cat([v, bias_v.repeat(1, bsz, 1)])
if attn_mask is not None:
attn_mask = torch.cat([attn_mask,
torch.zeros((attn_mask.size(0), 1),
dtype=attn_mask.dtype,
device=attn_mask.device)], dim=1)
if key_padding_mask is not None:
key_padding_mask = torch.cat(
[key_padding_mask, torch.zeros((key_padding_mask.size(0), 1),
dtype=key_padding_mask.dtype,
device=key_padding_mask.device)], dim=1)
else:
assert static_k is None, "bias cannot be added to static key."
assert static_v is None, "bias cannot be added to static value."
else:
assert bias_k is None
assert bias_v is None
q = q.contiguous().view(tgt_len, bsz * num_heads, head_dim).transpose(0, 1)
if k is not None:
k = k.contiguous().view(-1, bsz * num_heads, head_dim).transpose(0, 1)
if v is not None:
v = v.contiguous().view(-1, bsz * num_heads, head_dim).transpose(0, 1)
if static_k is not None:
assert static_k.size(0) == bsz * num_heads
assert static_k.size(2) == head_dim
k = static_k
if static_v is not None:
assert static_v.size(0) == bsz * num_heads
assert static_v.size(2) == head_dim
v = static_v
src_len = k.size(1)
if key_padding_mask is not None:
assert key_padding_mask.size(0) == bsz
assert key_padding_mask.size(1) == src_len
if add_zero_attn:
src_len += 1
k = torch.cat([k, torch.zeros((k.size(0), 1) + k.size()[2:], dtype=k.dtype, device=k.device)], dim=1)
v = torch.cat([v, torch.zeros((v.size(0), 1) + v.size()[2:], dtype=v.dtype, device=v.device)], dim=1)
if attn_mask is not None:
if len(attn_mask.shape) == 2:
attn_mask = torch.cat([attn_mask, torch.zeros((attn_mask.size(0), 1),
dtype=attn_mask.dtype,
device=attn_mask.device)], dim=1)
else:
attn_mask = torch.cat([attn_mask, torch.zeros((attn_mask.size(0), attn_mask.size(1), 1),
dtype=attn_mask.dtype,
device=attn_mask.device)], dim=2)
if key_padding_mask is not None:
key_padding_mask = torch.cat(
[key_padding_mask, torch.zeros((key_padding_mask.size(0), 1),
dtype=key_padding_mask.dtype,
device=key_padding_mask.device)], dim=1)
attn_output_weights = torch.bmm(q, k.transpose(1, 2))
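    # (bsz*num_heads, tgt_len, head_dim) x (bsz*num_heads, head_dim, src_len)
    #   -> (bsz*num_heads, tgt_len, src_len) raw attention logits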
assert list(attn_output_weights.size()) == [bsz * num_heads, tgt_len, src_len]
if attn_mask is not None:
if len(attn_mask.shape) == 2:
attn_mask = attn_mask.unsqueeze(0)
else:
attn_mask = attn_mask.unsqueeze(1).repeat(1, num_heads, 1, 1)
attn_mask = attn_mask.reshape(attn_mask.size(0)*num_heads, attn_mask.size(2), attn_mask.size(3))
attn_output_weights += attn_mask
if key_padding_mask is not None:
attn_output_weights = attn_output_weights.view(bsz, num_heads, tgt_len, src_len)
attn_output_weights = attn_output_weights.masked_fill(
key_padding_mask.unsqueeze(1).unsqueeze(2),
float('-inf'),
)
attn_output_weights = attn_output_weights.view(bsz * num_heads, tgt_len, src_len)
attn_output_weights = F.softmax(
attn_output_weights, dim=-1)
attn_output_weights = F.dropout(attn_output_weights, p=dropout_p, training=training)
attn_output = torch.bmm(attn_output_weights, v)
assert list(attn_output.size()) == [bsz * num_heads, tgt_len, head_dim]
attn_output = attn_output.transpose(0, 1).contiguous().view(tgt_len, bsz, embed_dim)
attn_output = F.linear(attn_output, out_proj_weight, out_proj_bias)
if need_weights:
# average attention weights over heads
attn_output_weights = attn_output_weights.view(bsz, num_heads, tgt_len, src_len)
return attn_output, attn_output_weights.sum(dim=1) / num_heads
else:
return attn_output, None
class FFN(nn.Module):
def __init__(self,
in_channels,
heads,
head_conv=64,
final_kernel=1,
init_bias=-2.19,
conv_cfg=dict(type='Conv1d'),
norm_cfg=dict(type='BN1d'),
bias='auto',
**kwargs):
super(FFN, self).__init__()
self.heads = heads
self.init_bias = init_bias
for head in self.heads:
classes, num_conv = self.heads[head]
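            # each head is a small Conv1d stack: (num_conv - 1) ConvModules with norm and
            # activation, followed by a plain conv that outputs `classes` channels per proposal.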
conv_layers = []
c_in = in_channels
for i in range(num_conv - 1):
conv_layers.append(
ConvModule(
c_in,
head_conv,
kernel_size=final_kernel,
stride=1,
padding=final_kernel // 2,
bias=bias,
conv_cfg=conv_cfg,
norm_cfg=norm_cfg))
c_in = head_conv
conv_layers.append(
build_conv_layer(
conv_cfg,
head_conv,
classes,
kernel_size=final_kernel,
stride=1,
padding=final_kernel // 2,
bias=True))
conv_layers = nn.Sequential(*conv_layers)
self.__setattr__(head, conv_layers)
def init_weights(self):
"""Initialize weights."""
for head in self.heads:
if head == 'heatmap':
self.__getattr__(head)[-1].bias.data.fill_(self.init_bias)
else:
for m in self.__getattr__(head).modules():
if isinstance(m, nn.Conv2d):
kaiming_init(m)
def forward(self, x):
"""Forward function for SepHead.
Args:
x (torch.Tensor): Input feature map with the shape of
[B, 512, 128, 128].
Returns:
dict[str: torch.Tensor]: contains the following keys:
-reg (torch.Tensor): 2D regression value with the \
shape of [B, 2, H, W].
-height (torch.Tensor): Height value with the \
shape of [B, 1, H, W].
-dim (torch.Tensor): Size value with the shape \
of [B, 3, H, W].
-rot (torch.Tensor): Rotation value with the \
shape of [B, 1, H, W].
-vel (torch.Tensor): Velocity value with the \
shape of [B, 2, H, W].
-heatmap (torch.Tensor): Heatmap with the shape of \
[B, N, H, W].
"""
ret_dict = dict()
for head in self.heads:
ret_dict[head] = self.__getattr__(head)(x)
return ret_dict
@HEADS.register_module()
class TransFusionHead(nn.Module):
def __init__(self,
fuse_img=False,
num_views=0,
in_channels_img=64,
out_size_factor_img=4,
num_proposals=128,
auxiliary=True,
in_channels=128 * 3,
hidden_channel=128,
num_classes=4,
# config for Transformer
num_decoder_layers=3,
num_heads=8,
learnable_query_pos=False,
initialize_by_heatmap=False,
nms_kernel_size=1,
ffn_channel=256,
dropout=0.1,
bn_momentum=0.1,
activation='relu',
# config for FFN
common_heads=dict(),
num_heatmap_convs=2,
conv_cfg=dict(type='Conv1d'),
norm_cfg=dict(type='BN1d'),
bias='auto',
# loss
loss_cls=dict(type='GaussianFocalLoss', reduction='mean'),
loss_iou=dict(type='VarifocalLoss', use_sigmoid=True, iou_weighted=True, reduction='mean'),
loss_bbox=dict(type='L1Loss', reduction='mean'),
loss_heatmap=dict(type='GaussianFocalLoss', reduction='mean'),
# others
train_cfg=None,
test_cfg=None,
bbox_coder=None,
):
super(TransFusionHead, self).__init__()
self.num_classes = num_classes
self.num_proposals = num_proposals
self.auxiliary = auxiliary
self.in_channels = in_channels
self.num_heads = num_heads
self.num_decoder_layers = num_decoder_layers
self.bn_momentum = bn_momentum
self.learnable_query_pos = learnable_query_pos
self.initialize_by_heatmap = initialize_by_heatmap
self.nms_kernel_size = nms_kernel_size
if self.initialize_by_heatmap is True:
            assert self.learnable_query_pos is False, "initialize_by_heatmap conflicts with learnable query position"
self.train_cfg = train_cfg
self.test_cfg = test_cfg
self.use_sigmoid_cls = loss_cls.get('use_sigmoid', False)
if not self.use_sigmoid_cls:
self.num_classes += 1
self.loss_cls = build_loss(loss_cls)
self.loss_bbox = build_loss(loss_bbox)
self.loss_iou = build_loss(loss_iou)
self.loss_heatmap = build_loss(loss_heatmap)
self.bbox_coder = build_bbox_coder(bbox_coder)
self.sampling = False
# a shared convolution
self.shared_conv = build_conv_layer(
dict(type='Conv2d'),
in_channels,
hidden_channel,
kernel_size=3,
padding=1,
bias=bias,
)
if self.initialize_by_heatmap:
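            # heatmap-based query initialization: a small conv head predicts a per-class BEV
            # heatmap whose top responses seed the object queries in forward_single;
            # class_encoding embeds each selected query's class as an additive feature.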
layers = []
layers.append(ConvModule(
hidden_channel,
hidden_channel,
kernel_size=3,
padding=1,
bias=bias,
conv_cfg=dict(type='Conv2d'),
norm_cfg=dict(type='BN2d'),
))
layers.append(build_conv_layer(
dict(type='Conv2d'),
hidden_channel,
num_classes,
kernel_size=3,
padding=1,
bias=bias,
))
self.heatmap_head = nn.Sequential(*layers)
self.class_encoding = nn.Conv1d(num_classes, hidden_channel, 1)
else:
# query feature
self.query_feat = nn.Parameter(torch.randn(1, hidden_channel, self.num_proposals))
self.query_pos = nn.Parameter(torch.rand([1, self.num_proposals, 2]), requires_grad=learnable_query_pos)
# transformer decoder layers for object query with LiDAR feature
self.decoder = nn.ModuleList()
for i in range(self.num_decoder_layers):
self.decoder.append(
TransformerDecoderLayer(
hidden_channel, num_heads, ffn_channel, dropout, activation,
self_posembed=PositionEmbeddingLearned(2, hidden_channel),
cross_posembed=PositionEmbeddingLearned(2, hidden_channel),
))
# Prediction Head
self.prediction_heads = nn.ModuleList()
for i in range(self.num_decoder_layers):
heads = copy.deepcopy(common_heads)
heads.update(dict(heatmap=(self.num_classes, num_heatmap_convs)))
self.prediction_heads.append(FFN(hidden_channel, heads, conv_cfg=conv_cfg, norm_cfg=norm_cfg, bias=bias))
self.fuse_img = fuse_img
if self.fuse_img:
self.num_views = num_views
self.out_size_factor_img = out_size_factor_img
self.shared_conv_img = build_conv_layer(
dict(type='Conv2d'),
in_channels_img, # channel of img feature map
hidden_channel,
kernel_size=3,
padding=1,
bias=bias,
)
if self.initialize_by_heatmap:
self.heatmap_head_img = copy.deepcopy(self.heatmap_head)
# transformer decoder layers for img fusion
self.decoder.append(
TransformerDecoderLayer(
hidden_channel, num_heads, ffn_channel, dropout, activation,
self_posembed=PositionEmbeddingLearned(2, hidden_channel),
cross_posembed=PositionEmbeddingLearned(2, hidden_channel),
))
# cross-attention only layers for projecting img feature onto BEV
for i in range(num_views):
self.decoder.append(
TransformerDecoderLayer(
hidden_channel, num_heads, ffn_channel, dropout, activation,
self_posembed=PositionEmbeddingLearned(2, hidden_channel),
cross_posembed=PositionEmbeddingLearned(2, hidden_channel),
cross_only=True,
))
self.fc = nn.Sequential(*[nn.Conv1d(hidden_channel, hidden_channel, kernel_size=1)])
heads = copy.deepcopy(common_heads)
heads.update(dict(heatmap=(self.num_classes, num_heatmap_convs)))
self.prediction_heads.append(FFN(hidden_channel * 2, heads, conv_cfg=conv_cfg, norm_cfg=norm_cfg, bias=bias))
self.init_weights()
self._init_assigner_sampler()
# Position Embedding for Cross-Attention, which is re-used during training
x_size = self.test_cfg['grid_size'][0] // self.test_cfg['out_size_factor']
y_size = self.test_cfg['grid_size'][1] // self.test_cfg['out_size_factor']
self.bev_pos = self.create_2D_grid(x_size, y_size)
self.img_feat_pos = None
self.img_feat_collapsed_pos = None
def create_2D_grid(self, x_size, y_size):
meshgrid = [[0, x_size - 1, x_size], [0, y_size - 1, y_size]]
batch_y, batch_x = torch.meshgrid(*[torch.linspace(it[0], it[1], it[2]) for it in meshgrid])
batch_x = batch_x + 0.5
batch_y = batch_y + 0.5
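        # +0.5 shifts the grid coordinates to cell centers; the result has shape
        # (1, x_size * y_size, 2) and serves as the BEV positional reference.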
coord_base = torch.cat([batch_x[None], batch_y[None]], dim=0)[None]
coord_base = coord_base.view(1, 2, -1).permute(0, 2, 1)
return coord_base
def init_weights(self):
# initialize transformer
for m in self.decoder.parameters():
if m.dim() > 1:
nn.init.xavier_uniform_(m)
if hasattr(self, 'query'):
nn.init.xavier_normal_(self.query)
self.init_bn_momentum()
def init_bn_momentum(self):
for m in self.modules():
if isinstance(m, (nn.BatchNorm2d, nn.BatchNorm1d)):
m.momentum = self.bn_momentum
def _init_assigner_sampler(self):
"""Initialize the target assigner and sampler of the head."""
if self.train_cfg is None:
return
if self.sampling:
self.bbox_sampler = build_sampler(self.train_cfg.sampler)
else:
self.bbox_sampler = PseudoSampler()
if isinstance(self.train_cfg.assigner, dict):
self.bbox_assigner = build_assigner(self.train_cfg.assigner)
elif isinstance(self.train_cfg.assigner, list):
self.bbox_assigner = [
build_assigner(res) for res in self.train_cfg.assigner
]
def forward_single(self, inputs, img_inputs, img_metas):
"""Forward function for CenterPoint.
Args:
inputs (torch.Tensor): Input feature map with the shape of
[B, 512, 128(H), 128(W)]. (consistent with L748)
Returns:
list[dict]: Output results for tasks.
"""
batch_size = inputs.shape[0]
lidar_feat = self.shared_conv(inputs)
#################################
# image to BEV
#################################
lidar_feat_flatten = lidar_feat.view(batch_size, lidar_feat.shape[1], -1) # [BS, C, H*W]
bev_pos = self.bev_pos.repeat(batch_size, 1, 1).to(lidar_feat.device)
if self.fuse_img:
img_feat = self.shared_conv_img(img_inputs) # [BS * n_views, C, H, W]
img_h, img_w, num_channel = img_inputs.shape[-2], img_inputs.shape[-1], img_feat.shape[1]
raw_img_feat = img_feat.view(batch_size, self.num_views, num_channel, img_h, img_w).permute(0, 2, 3, 1, 4) # [BS, C, H, n_views, W]
img_feat = raw_img_feat.reshape(batch_size, num_channel, img_h, img_w * self.num_views) # [BS, C, H, n_views*W]
img_feat_collapsed = img_feat.max(2).values
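            # collapse the image height dimension by max-pooling, leaving one feature
            # vector per image column; these columns act as keys/values when refining
            # the LiDAR BEV features view by view below.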
img_feat_collapsed = self.fc(img_feat_collapsed).view(batch_size, num_channel, img_w * self.num_views)
# positional encoding for image guided query initialization
if self.img_feat_collapsed_pos is None:
img_feat_collapsed_pos = self.img_feat_collapsed_pos = self.create_2D_grid(1, img_feat_collapsed.shape[-1]).to(img_feat.device)
else:
img_feat_collapsed_pos = self.img_feat_collapsed_pos
bev_feat = lidar_feat_flatten
for idx_view in range(self.num_views):
bev_feat = self.decoder[2 + idx_view](bev_feat, img_feat_collapsed[..., img_w * idx_view:img_w * (idx_view + 1)], bev_pos, img_feat_collapsed_pos[:, img_w * idx_view:img_w * (idx_view + 1)])
#################################
# image guided query initialization
#################################
if self.initialize_by_heatmap:
dense_heatmap = self.heatmap_head(lidar_feat)
dense_heatmap_img = None
if self.fuse_img:
dense_heatmap_img = self.heatmap_head_img(bev_feat.view(lidar_feat.shape)) # [BS, num_classes, H, W]
heatmap = (dense_heatmap.detach().sigmoid() + dense_heatmap_img.detach().sigmoid()) / 2
else:
heatmap = dense_heatmap.detach().sigmoid()
padding = self.nms_kernel_size // 2
local_max = torch.zeros_like(heatmap)
            # equivalent NMS radius = voxel_size * out_size_factor * kernel_size
local_max_inner = F.max_pool2d(heatmap, kernel_size=self.nms_kernel_size, stride=1, padding=0)
local_max[:, :, padding:(-padding), padding:(-padding)] = local_max_inner
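            # keep only local peaks: a cell survives if it equals the maximum within its
            # nms_kernel_size window (max-pool pseudo-NMS); the small-object classes below
            # use kernel 1, i.e. effectively no suppression.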
## for Pedestrian & Traffic_cone in nuScenes
if self.test_cfg['dataset'] == 'nuScenes':
local_max[:, 8, ] = F.max_pool2d(heatmap[:, 8], kernel_size=1, stride=1, padding=0)
local_max[:, 9, ] = F.max_pool2d(heatmap[:, 9], kernel_size=1, stride=1, padding=0)
elif self.test_cfg['dataset'] == 'Waymo': # for Pedestrian & Cyclist in Waymo
local_max[:, 1, ] = F.max_pool2d(heatmap[:, 1], kernel_size=1, stride=1, padding=0)
local_max[:, 2, ] = F.max_pool2d(heatmap[:, 2], kernel_size=1, stride=1, padding=0)
heatmap = heatmap * (heatmap == local_max)
heatmap = heatmap.view(batch_size, heatmap.shape[1], -1)
# top #num_proposals among all classes
top_proposals = heatmap.view(batch_size, -1).argsort(dim=-1, descending=True)[..., :self.num_proposals]
top_proposals_class = top_proposals // heatmap.shape[-1]
top_proposals_index = top_proposals % heatmap.shape[-1]
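            # the heatmap was flattened over (class, position), so the top responses are
            # picked jointly across classes; class and spatial indices are recovered by
            # integer division and modulo.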
query_feat = lidar_feat_flatten.gather(index=top_proposals_index[:, None, :].expand(-1, lidar_feat_flatten.shape[1], -1), dim=-1)
self.query_labels = top_proposals_class
# add category embedding
one_hot = F.one_hot(top_proposals_class, num_classes=self.num_classes).permute(0, 2, 1)
query_cat_encoding = self.class_encoding(one_hot.float())
query_feat += query_cat_encoding
query_pos = bev_pos.gather(index=top_proposals_index[:, None, :].permute(0, 2, 1).expand(-1, -1, bev_pos.shape[-1]), dim=1)
else:
query_feat = self.query_feat.repeat(batch_size, 1, 1) # [BS, C, num_proposals]
query_pos = self.query_pos.repeat(batch_size, 1, 1).to(lidar_feat.device) # [BS, num_proposals, 2]
#################################
# transformer decoder layer (LiDAR feature as K,V)
#################################
ret_dicts = []
for i in range(self.num_decoder_layers):
prefix = 'last_' if (i == self.num_decoder_layers - 1) else f'{i}head_'
# Transformer Decoder Layer
# :param query: B C Pq :param query_pos: B Pq 3/6
query_feat = self.decoder[i](query_feat, lidar_feat_flatten, query_pos, bev_pos)
# Prediction
res_layer = self.prediction_heads[i](query_feat)
res_layer['center'] = res_layer['center'] + query_pos.permute(0, 2, 1)
first_res_layer = res_layer
if not self.fuse_img:
ret_dicts.append(res_layer)
# for next level positional embedding
query_pos = res_layer['center'].detach().clone().permute(0, 2, 1)
#################################
# transformer decoder layer (img feature as K,V)
#################################
if self.fuse_img:
# positional encoding for image fusion
img_feat = raw_img_feat.permute(0, 3, 1, 2, 4) # [BS, n_views, C, H, W]
img_feat_flatten = img_feat.view(batch_size, self.num_views, num_channel, -1) # [BS, n_views, C, H*W]
if self.img_feat_pos is None:
(h, w) = img_inputs.shape[-2], img_inputs.shape[-1]
img_feat_pos = self.img_feat_pos = self.create_2D_grid(h, w).to(img_feat_flatten.device)
else:
img_feat_pos = self.img_feat_pos
prev_query_feat = query_feat.detach().clone()
query_feat = torch.zeros_like(query_feat) # create new container for img query feature
query_pos_realmetric = query_pos.permute(0, 2, 1) * self.test_cfg['out_size_factor'] * self.test_cfg['voxel_size'][0] + self.test_cfg['pc_range'][0]
query_pos_3d = torch.cat([query_pos_realmetric, res_layer['height']], dim=1).detach().clone()
if 'vel' in res_layer:
vel = copy.deepcopy(res_layer['vel'].detach())
else:
vel = None
pred_boxes = self.bbox_coder.decode(
copy.deepcopy(res_layer['heatmap'].detach()),
copy.deepcopy(res_layer['rot'].detach()),
copy.deepcopy(res_layer['dim'].detach()),
copy.deepcopy(res_layer['center'].detach()),
copy.deepcopy(res_layer['height'].detach()),
vel,
)
on_the_image_mask = torch.ones([batch_size, self.num_proposals]).to(query_pos_3d.device) * -1
for sample_idx in range(batch_size if self.fuse_img else 0):
lidar2img_rt = query_pos_3d.new_tensor(img_metas[sample_idx]['lidar2img'])
img_scale_factor = (
query_pos_3d.new_tensor(img_metas[sample_idx]['scale_factor'][:2]
if 'scale_factor' in img_metas[sample_idx].keys() else [1.0, 1.0])
)
img_flip = img_metas[sample_idx]['flip'] if 'flip' in img_metas[sample_idx].keys() else False
img_crop_offset = (
query_pos_3d.new_tensor(img_metas[sample_idx]['img_crop_offset'])
if 'img_crop_offset' in img_metas[sample_idx].keys() else 0)
img_shape = img_metas[sample_idx]['img_shape'][:2]
img_pad_shape = img_metas[sample_idx]['input_shape'][:2]
boxes = LiDARInstance3DBoxes(pred_boxes[sample_idx]['bboxes'][:, :7], box_dim=7)
query_pos_3d_with_corners = torch.cat([query_pos_3d[sample_idx], boxes.corners.permute(2, 0, 1).view(3, -1)], dim=-1) # [3, num_proposals] + [3, num_proposals*8]
# transform point clouds back to original coordinate system by reverting the data augmentation
if batch_size == 1: # skip during inference to save time
points = query_pos_3d_with_corners.T
else:
points = apply_3d_transformation(query_pos_3d_with_corners.T, 'LIDAR', img_metas[sample_idx], reverse=True).detach()
num_points = points.shape[0]
for view_idx in range(self.num_views):
pts_4d = torch.cat([points, points.new_ones(size=(num_points, 1))], dim=-1)
pts_2d = pts_4d @ lidar2img_rt[view_idx].t()
pts_2d[:, 2] = torch.clamp(pts_2d[:, 2], min=1e-5)
pts_2d[:, 0] /= pts_2d[:, 2]
pts_2d[:, 1] /= pts_2d[:, 2]
# img transformation: scale -> crop -> flip
# the image is resized by img_scale_factor
img_coors = pts_2d[:, 0:2] * img_scale_factor # Nx2
img_coors -= img_crop_offset
                # split projected pixel coordinates into x / y; validity w.r.t. the image is checked below
coor_x, coor_y = torch.split(img_coors, 1, dim=1) # each is Nx1
if img_flip:
# by default we take it as horizontal flip
# use img_shape before padding for flip
orig_h, orig_w = img_shape
coor_x = orig_w - coor_x
coor_x, coor_corner_x = coor_x[0:self.num_proposals, :], coor_x[self.num_proposals:, :]
coor_y, coor_corner_y = coor_y[0:self.num_proposals, :], coor_y[self.num_proposals:, :]
coor_corner_x = coor_corner_x.reshape(self.num_proposals, 8, 1)
coor_corner_y = coor_corner_y.reshape(self.num_proposals, 8, 1)
coor_corner_xy = torch.cat([coor_corner_x, coor_corner_y], dim=-1)
h, w = img_pad_shape
on_the_image = (coor_x > 0) * (coor_x < w) * (coor_y > 0) * (coor_y < h)
on_the_image = on_the_image.squeeze()
                # skip the following computation if no object query falls on the current image
if on_the_image.sum() <= 1:
continue
on_the_image_mask[sample_idx, on_the_image] = view_idx
# add spatial constraint
center_ys = (coor_y[on_the_image] / self.out_size_factor_img)
center_xs = (coor_x[on_the_image] / self.out_size_factor_img)
centers = torch.cat([center_xs, center_ys], dim=-1).int() # center on the feature map
corners = (coor_corner_xy[on_the_image].max(1).values - coor_corner_xy[on_the_image].min(1).values) / self.out_size_factor_img
radius = torch.ceil(corners.norm(dim=-1, p=2) / 2).int() # radius of the minimum circumscribed circle of the wireframe
sigma = (radius * 2 + 1) / 6.0
distance = (centers[:, None, :] - (img_feat_pos - 0.5)).norm(dim=-1) ** 2
gaussian_mask = (-distance / (2 * sigma[:, None] ** 2)).exp()
gaussian_mask[gaussian_mask < torch.finfo(torch.float32).eps] = 0
attn_mask = gaussian_mask
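                # restrict each query's cross-attention to image locations near its projected
                # box: a Gaussian falloff whose sigma follows the projected corner extent;
                # log() at the call site turns it into an additive mask on the attention logits.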
query_feat_view = prev_query_feat[sample_idx, :, on_the_image]
query_pos_view = torch.cat([center_xs, center_ys], dim=-1)
query_feat_view = self.decoder[self.num_decoder_layers](query_feat_view[None], img_feat_flatten[sample_idx:sample_idx + 1, view_idx], query_pos_view[None], img_feat_pos, attn_mask=attn_mask.log())
query_feat[sample_idx, :, on_the_image] = query_feat_view.clone()
self.on_the_image_mask = (on_the_image_mask != -1)
res_layer = self.prediction_heads[self.num_decoder_layers](torch.cat([query_feat, prev_query_feat], dim=1))
res_layer['center'] = res_layer['center'] + query_pos.permute(0, 2, 1)
for key, value in res_layer.items():
pred_dim = value.shape[1]
res_layer[key][~self.on_the_image_mask.unsqueeze(1).repeat(1, pred_dim, 1)] = first_res_layer[key][~self.on_the_image_mask.unsqueeze(1).repeat(1, pred_dim, 1)]
ret_dicts.append(res_layer)
if self.initialize_by_heatmap:
ret_dicts[0]['query_heatmap_score'] = heatmap.gather(index=top_proposals_index[:, None, :].expand(-1, self.num_classes, -1), dim=-1) # [bs, num_classes, num_proposals]
if self.fuse_img:
ret_dicts[0]['dense_heatmap'] = dense_heatmap_img
else:
ret_dicts[0]['dense_heatmap'] = dense_heatmap
if self.auxiliary is False:
# only return the results of last decoder layer
return [ret_dicts[-1]]
        # return all the layers' results for auxiliary supervision
new_res = {}
for key in ret_dicts[0].keys():
if key not in ['dense_heatmap', 'dense_heatmap_old', 'query_heatmap_score']:
new_res[key] = torch.cat([ret_dict[key] for ret_dict in ret_dicts], dim=-1)
else:
new_res[key] = ret_dicts[0][key]
return [new_res]
def forward(self, feats, img_feats, img_metas):
"""Forward pass.
Args:
feats (list[torch.Tensor]): Multi-level features, e.g.,
features produced by FPN.
Returns:
tuple(list[dict]): Output results. first index by level, second index by layer
"""
if img_feats is None:
img_feats = [None]
res = multi_apply(self.forward_single, feats, img_feats, [img_metas])
assert len(res) == 1, "only support one level features."
return res
def get_targets(self, gt_bboxes_3d, gt_labels_3d, preds_dict):
"""Generate training targets.
Args:
            gt_bboxes_3d (:obj:`LiDARInstance3DBoxes`): Ground truth boxes.
gt_labels_3d (torch.Tensor): Labels of boxes.
            preds_dict (tuple of dict): Prediction results, indexed first by layer (default 1).
Returns:
tuple[torch.Tensor]: Tuple of target including \
the following results in order.
- torch.Tensor: classification target. [BS, num_proposals]
- torch.Tensor: classification weights (mask) [BS, num_proposals]
- torch.Tensor: regression target. [BS, num_proposals, 8]
- torch.Tensor: regression weights. [BS, num_proposals, 8]
"""
# change preds_dict into list of dict (index by batch_id)
# preds_dict[0]['center'].shape [bs, 3, num_proposal]
list_of_pred_dict = []
for batch_idx in range(len(gt_bboxes_3d)):
pred_dict = {}
for key in preds_dict[0].keys():
pred_dict[key] = preds_dict[0][key][batch_idx:batch_idx + 1]
list_of_pred_dict.append(pred_dict)
assert len(gt_bboxes_3d) == len(list_of_pred_dict)
res_tuple = multi_apply(self.get_targets_single, gt_bboxes_3d, gt_labels_3d, list_of_pred_dict, np.arange(len(gt_labels_3d)))
labels = torch.cat(res_tuple[0], dim=0)
label_weights = torch.cat(res_tuple[1], dim=0)
bbox_targets = torch.cat(res_tuple[2], dim=0)
bbox_weights = torch.cat(res_tuple[3], dim=0)
ious = torch.cat(res_tuple[4], dim=0)
num_pos = np.sum(res_tuple[5])
matched_ious = np.mean(res_tuple[6])
if self.initialize_by_heatmap:
heatmap = torch.cat(res_tuple[7], dim=0)
return labels, label_weights, bbox_targets, bbox_weights, ious, num_pos, matched_ious, heatmap
else:
return labels, label_weights, bbox_targets, bbox_weights, ious, num_pos, matched_ious
def get_targets_single(self, gt_bboxes_3d, gt_labels_3d, preds_dict, batch_idx):
"""Generate training targets for a single sample.
Args:
            gt_bboxes_3d (:obj:`LiDARInstance3DBoxes`): Ground truth boxes.
gt_labels_3d (torch.Tensor): Labels of boxes.
preds_dict (dict): dict of prediction result for a single sample
Returns:
tuple[torch.Tensor]: Tuple of target including \
the following results in order.
- torch.Tensor: classification target. [1, num_proposals]
- torch.Tensor: classification weights (mask) [1, num_proposals]
- torch.Tensor: regression target. [1, num_proposals, 8]
- torch.Tensor: regression weights. [1, num_proposals, 8]
- torch.Tensor: iou target. [1, num_proposals]
- int: number of positive proposals
"""
num_proposals = preds_dict['center'].shape[-1]
        # get predicted boxes; be careful not to modify the network outputs in place
score = copy.deepcopy(preds_dict['heatmap'].detach())
center = copy.deepcopy(preds_dict['center'].detach())
height = copy.deepcopy(preds_dict['height'].detach())
dim = copy.deepcopy(preds_dict['dim'].detach())
rot = copy.deepcopy(preds_dict['rot'].detach())
if 'vel' in preds_dict.keys():
vel = copy.deepcopy(preds_dict['vel'].detach())
else:
vel = None
boxes_dict = self.bbox_coder.decode(score, rot, dim, center, height, vel) # decode the prediction to real world metric bbox
bboxes_tensor = boxes_dict[0]['bboxes']
gt_bboxes_tensor = gt_bboxes_3d.tensor.to(score.device)
        # each layer should do label assignment separately.
if self.auxiliary:
num_layer = self.num_decoder_layers
else:
num_layer = 1
assign_result_list = []
for idx_layer in range(num_layer):
bboxes_tensor_layer = bboxes_tensor[self.num_proposals * idx_layer:self.num_proposals * (idx_layer + 1), :]
score_layer = score[..., self.num_proposals * idx_layer:self.num_proposals * (idx_layer + 1)]
if self.train_cfg.assigner.type == 'HungarianAssigner3D':
assign_result = self.bbox_assigner.assign(bboxes_tensor_layer, gt_bboxes_tensor, gt_labels_3d, score_layer, self.train_cfg)
elif self.train_cfg.assigner.type == 'HeuristicAssigner':
assign_result = self.bbox_assigner.assign(bboxes_tensor_layer, gt_bboxes_tensor, None, gt_labels_3d, self.query_labels[batch_idx])
else:
raise NotImplementedError
assign_result_list.append(assign_result)
# combine assign result of each layer
assign_result_ensemble = AssignResult(
num_gts=sum([res.num_gts for res in assign_result_list]),
gt_inds=torch.cat([res.gt_inds for res in assign_result_list]),
max_overlaps=torch.cat([res.max_overlaps for res in assign_result_list]),
labels=torch.cat([res.labels for res in assign_result_list]),
)
sampling_result = self.bbox_sampler.sample(assign_result_ensemble, bboxes_tensor, gt_bboxes_tensor)
pos_inds = sampling_result.pos_inds
neg_inds = sampling_result.neg_inds
assert len(pos_inds) + len(neg_inds) == num_proposals
# create target for loss computation
bbox_targets = torch.zeros([num_proposals, self.bbox_coder.code_size]).to(center.device)
bbox_weights = torch.zeros([num_proposals, self.bbox_coder.code_size]).to(center.device)
ious = assign_result_ensemble.max_overlaps
ious = torch.clamp(ious, min=0.0, max=1.0)
labels = bboxes_tensor.new_zeros(num_proposals, dtype=torch.long)
label_weights = bboxes_tensor.new_zeros(num_proposals, dtype=torch.long)
        if gt_labels_3d is not None:  # the default (background) label is num_classes
labels += self.num_classes
# both pos and neg have classification loss, only pos has regression and iou loss
if len(pos_inds) > 0:
pos_bbox_targets = self.bbox_coder.encode(sampling_result.pos_gt_bboxes)
bbox_targets[pos_inds, :] = pos_bbox_targets
bbox_weights[pos_inds, :] = 1.0
if gt_labels_3d is None:
labels[pos_inds] = 1
else:
labels[pos_inds] = gt_labels_3d[sampling_result.pos_assigned_gt_inds]
if self.train_cfg.pos_weight <= 0:
label_weights[pos_inds] = 1.0
else:
label_weights[pos_inds] = self.train_cfg.pos_weight
if len(neg_inds) > 0:
label_weights[neg_inds] = 1.0
        # compute dense heatmap targets
if self.initialize_by_heatmap:
device = labels.device
gt_bboxes_3d = torch.cat([gt_bboxes_3d.gravity_center, gt_bboxes_3d.tensor[:, 3:]], dim=1).to(device)
grid_size = torch.tensor(self.train_cfg['grid_size'])
pc_range = torch.tensor(self.train_cfg['point_cloud_range'])
voxel_size = torch.tensor(self.train_cfg['voxel_size'])
feature_map_size = grid_size[:2] // self.train_cfg['out_size_factor'] # [x_len, y_len]
heatmap = gt_bboxes_3d.new_zeros(self.num_classes, feature_map_size[1], feature_map_size[0])
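            # draw one 2D Gaussian per ground-truth box on the BEV grid; the radius is
            # derived from the box footprint (gaussian_radius) and floored by min_radius.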
for idx in range(len(gt_bboxes_3d)):
width = gt_bboxes_3d[idx][3]
length = gt_bboxes_3d[idx][4]
width = width / voxel_size[0] / self.train_cfg['out_size_factor']
length = length / voxel_size[1] / self.train_cfg['out_size_factor']
if width > 0 and length > 0:
radius = gaussian_radius((length, width), min_overlap=self.train_cfg['gaussian_overlap'])
radius = max(self.train_cfg['min_radius'], int(radius))
x, y = gt_bboxes_3d[idx][0], gt_bboxes_3d[idx][1]
coor_x = (x - pc_range[0]) / voxel_size[0] / self.train_cfg['out_size_factor']
coor_y = (y - pc_range[1]) / voxel_size[1] / self.train_cfg['out_size_factor']
center = torch.tensor([coor_x, coor_y], dtype=torch.float32, device=device)
center_int = center.to(torch.int32)
draw_heatmap_gaussian(heatmap[gt_labels_3d[idx]], center_int, radius)
mean_iou = ious[pos_inds].sum() / max(len(pos_inds), 1)
return labels[None], label_weights[None], bbox_targets[None], bbox_weights[None], ious[None], int(pos_inds.shape[0]), float(mean_iou), heatmap[None]
else:
mean_iou = ious[pos_inds].sum() / max(len(pos_inds), 1)
return labels[None], label_weights[None], bbox_targets[None], bbox_weights[None], ious[None], int(pos_inds.shape[0]), float(mean_iou)
@force_fp32(apply_to=('preds_dicts'))
def loss(self, gt_bboxes_3d, gt_labels_3d, preds_dicts, **kwargs):
"""Loss function for CenterHead.
Args:
            gt_bboxes_3d (list[:obj:`LiDARInstance3DBoxes`]): Ground
                truth boxes.
gt_labels_3d (list[torch.Tensor]): Labels of boxes.
preds_dicts (list[list[dict]]): Output of forward function.
Returns:
dict[str:torch.Tensor]: Loss of heatmap and bbox of each task.
"""
if self.initialize_by_heatmap:
labels, label_weights, bbox_targets, bbox_weights, ious, num_pos, matched_ious, heatmap = self.get_targets(gt_bboxes_3d, gt_labels_3d, preds_dicts[0])
else:
labels, label_weights, bbox_targets, bbox_weights, ious, num_pos, matched_ious = self.get_targets(gt_bboxes_3d, gt_labels_3d, preds_dicts[0])
if hasattr(self, 'on_the_image_mask'):
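            # with image fusion enabled, proposals whose centers do not project onto any
            # camera view are masked out of the classification / regression losses, and
            # num_pos is recomputed from the surviving bbox weights.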
label_weights = label_weights * self.on_the_image_mask
bbox_weights = bbox_weights * self.on_the_image_mask[:, :, None]
num_pos = bbox_weights.max(-1).values.sum()
preds_dict = preds_dicts[0][0]
loss_dict = dict()
if self.initialize_by_heatmap:
# compute heatmap loss
loss_heatmap = self.loss_heatmap(clip_sigmoid(preds_dict['dense_heatmap']), heatmap, avg_factor=max(heatmap.eq(1).float().sum().item(), 1))
loss_dict['loss_heatmap'] = loss_heatmap
# compute loss for each layer
for idx_layer in range(self.num_decoder_layers if self.auxiliary else 1):
if idx_layer == self.num_decoder_layers - 1 or (idx_layer == 0 and self.auxiliary is False):
prefix = 'layer_-1'
else:
prefix = f'layer_{idx_layer}'
layer_labels = labels[..., idx_layer * self.num_proposals:(idx_layer + 1) * self.num_proposals].reshape(-1)
layer_label_weights = label_weights[..., idx_layer * self.num_proposals:(idx_layer + 1) * self.num_proposals].reshape(-1)
layer_score = preds_dict['heatmap'][..., idx_layer * self.num_proposals:(idx_layer + 1) * self.num_proposals]
layer_cls_score = layer_score.permute(0, 2, 1).reshape(-1, self.num_classes)
layer_loss_cls = self.loss_cls(layer_cls_score, layer_labels, layer_label_weights, avg_factor=max(num_pos, 1))
layer_center = preds_dict['center'][..., idx_layer * self.num_proposals:(idx_layer + 1) * self.num_proposals]
layer_height = preds_dict['height'][..., idx_layer * self.num_proposals:(idx_layer + 1) * self.num_proposals]
layer_rot = preds_dict['rot'][..., idx_layer * self.num_proposals:(idx_layer + 1) * self.num_proposals]
layer_dim = preds_dict['dim'][..., idx_layer * self.num_proposals:(idx_layer + 1) * self.num_proposals]
preds = torch.cat([layer_center, layer_height, layer_dim, layer_rot], dim=1).permute(0, 2, 1) # [BS, num_proposals, code_size]
if 'vel' in preds_dict.keys():
layer_vel = preds_dict['vel'][..., idx_layer * self.num_proposals:(idx_layer + 1) * self.num_proposals]
preds = torch.cat([layer_center, layer_height, layer_dim, layer_rot, layer_vel], dim=1).permute(0, 2, 1) # [BS, num_proposals, code_size]
code_weights = self.train_cfg.get('code_weights', None)
layer_bbox_weights = bbox_weights[:, idx_layer * self.num_proposals:(idx_layer + 1) * self.num_proposals, :]
layer_reg_weights = layer_bbox_weights * layer_bbox_weights.new_tensor(code_weights)
layer_bbox_targets = bbox_targets[:, idx_layer * self.num_proposals:(idx_layer + 1) * self.num_proposals, :]
layer_loss_bbox = self.loss_bbox(preds, layer_bbox_targets, layer_reg_weights, avg_factor=max(num_pos, 1))
# layer_iou = preds_dict['iou'][..., idx_layer*self.num_proposals:(idx_layer+1)*self.num_proposals].squeeze(1)
# layer_iou_target = ious[..., idx_layer*self.num_proposals:(idx_layer+1)*self.num_proposals]
# layer_loss_iou = self.loss_iou(layer_iou, layer_iou_target, layer_bbox_weights.max(-1).values, avg_factor=max(num_pos, 1))
loss_dict[f'{prefix}_loss_cls'] = layer_loss_cls
loss_dict[f'{prefix}_loss_bbox'] = layer_loss_bbox
# loss_dict[f'{prefix}_loss_iou'] = layer_loss_iou
            loss_dict['matched_ious'] = layer_loss_cls.new_tensor(matched_ious)
return loss_dict
def get_bboxes(self, preds_dicts, img_metas, img=None, rescale=False, for_roi=False):
"""Generate bboxes from bbox head predictions.
Args:
preds_dicts (tuple[list[dict]]): Prediction results.
Returns:
list[list[dict]]: Decoded bbox, scores and labels for each layer & each batch
"""
rets = []
for layer_id, preds_dict in enumerate(preds_dicts):
batch_size = preds_dict[0]['heatmap'].shape[0]
batch_score = preds_dict[0]['heatmap'][..., -self.num_proposals:].sigmoid()
# if self.loss_iou.loss_weight != 0:
# batch_score = torch.sqrt(batch_score * preds_dict[0]['iou'][..., -self.num_proposals:].sigmoid())
one_hot = F.one_hot(self.query_labels, num_classes=self.num_classes).permute(0, 2, 1)
batch_score = batch_score * preds_dict[0]['query_heatmap_score'] * one_hot
batch_center = preds_dict[0]['center'][..., -self.num_proposals:]
batch_height = preds_dict[0]['height'][..., -self.num_proposals:]
batch_dim = preds_dict[0]['dim'][..., -self.num_proposals:]
batch_rot = preds_dict[0]['rot'][..., -self.num_proposals:]
batch_vel = None
if 'vel' in preds_dict[0]:
batch_vel = preds_dict[0]['vel'][..., -self.num_proposals:]
temp = self.bbox_coder.decode(batch_score, batch_rot, batch_dim, batch_center, batch_height, batch_vel, filter=True)
if self.test_cfg['dataset'] == 'nuScenes':
self.tasks = [
dict(num_class=8, class_names=[], indices=[0, 1, 2, 3, 4, 5, 6, 7], radius=-1),
dict(num_class=1, class_names=['pedestrian'], indices=[8], radius=0.175),
dict(num_class=1, class_names=['traffic_cone'], indices=[9], radius=0.175),
]
elif self.test_cfg['dataset'] == 'Waymo':
self.tasks = [
dict(num_class=1, class_names=['Car'], indices=[0], radius=0.7),
dict(num_class=1, class_names=['Pedestrian'], indices=[1], radius=0.7),
dict(num_class=1, class_names=['Cyclist'], indices=[2], radius=0.7),
]
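            # NMS is applied per task group: classes sharing an entry are suppressed together
            # with the group's radius (circle NMS or rotated box NMS); a non-positive radius
            # keeps all boxes in that group.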
ret_layer = []
for i in range(batch_size):
boxes3d = temp[i]['bboxes']
scores = temp[i]['scores']
labels = temp[i]['labels']
## adopt circle nms for different categories
                if self.test_cfg['nms_type'] is not None:
keep_mask = torch.zeros_like(scores)
for task in self.tasks:
task_mask = torch.zeros_like(scores)
for cls_idx in task['indices']:
task_mask += labels == cls_idx
task_mask = task_mask.bool()
if task['radius'] > 0:
if self.test_cfg['nms_type'] == 'circle':
boxes_for_nms = torch.cat([boxes3d[task_mask][:, :2], scores[:, None][task_mask]], dim=1)
task_keep_indices = torch.tensor(
circle_nms(
boxes_for_nms.detach().cpu().numpy(),
task['radius'],
)
)
else:
boxes_for_nms = xywhr2xyxyr(img_metas[i]['box_type_3d'](boxes3d[task_mask][:, :7], 7).bev)
top_scores = scores[task_mask]
task_keep_indices = nms_gpu(
boxes_for_nms,
top_scores,
thresh=task['radius'],
pre_maxsize=self.test_cfg['pre_maxsize'],
post_max_size=self.test_cfg['post_maxsize'],
)
else:
task_keep_indices = torch.arange(task_mask.sum())
if task_keep_indices.shape[0] != 0:
keep_indices = torch.where(task_mask != 0)[0][task_keep_indices]
keep_mask[keep_indices] = 1
keep_mask = keep_mask.bool()
ret = dict(bboxes=boxes3d[keep_mask], scores=scores[keep_mask], labels=labels[keep_mask])
else: # no nms
ret = dict(bboxes=boxes3d, scores=scores, labels=labels)
ret_layer.append(ret)
rets.append(ret_layer)
assert len(rets) == 1
assert len(rets[0]) == 1
res = [[
img_metas[0]['box_type_3d'](rets[0][0]['bboxes'], box_dim=rets[0][0]['bboxes'].shape[-1]),
rets[0][0]['scores'],
rets[0][0]['labels'].int()
]]
return res
================================================
FILE: mmdet3d/models/dense_heads/vote_head.py
================================================
import numpy as np
import torch
from mmcv.runner import force_fp32
from torch import nn as nn
from torch.nn import functional as F
from mmdet3d.core.post_processing import aligned_3d_nms
from mmdet3d.models.builder import build_loss
from mmdet3d.models.losses import chamfer_distance
from mmdet3d.models.model_utils import VoteModule
from mmdet3d.ops import build_sa_module, furthest_point_sample
from mmdet.core import build_bbox_coder, multi_apply
from mmdet.models import HEADS
from .base_conv_bbox_head import BaseConvBboxHead
@HEADS.register_module()
class VoteHead(nn.Module):
r"""Bbox head of `Votenet `_.
Args:
num_classes (int): The number of class.
bbox_coder (:obj:`BaseBBoxCoder`): Bbox coder for encoding and
decoding boxes.
train_cfg (dict): Config for training.
test_cfg (dict): Config for testing.
vote_module_cfg (dict): Config of VoteModule for point-wise votes.
vote_aggregation_cfg (dict): Config of vote aggregation layer.
        pred_layer_cfg (dict): Config of classification and regression
prediction layers.
conv_cfg (dict): Config of convolution in prediction layer.
norm_cfg (dict): Config of BN in prediction layer.
objectness_loss (dict): Config of objectness loss.
center_loss (dict): Config of center loss.
dir_class_loss (dict): Config of direction classification loss.
dir_res_loss (dict): Config of direction residual regression loss.
size_class_loss (dict): Config of size classification loss.
size_res_loss (dict): Config of size residual regression loss.
semantic_loss (dict): Config of point-wise semantic segmentation loss.
"""
def __init__(self,
num_classes,
bbox_coder,
train_cfg=None,
test_cfg=None,
vote_module_cfg=None,
vote_aggregation_cfg=None,
pred_layer_cfg=None,
conv_cfg=dict(type='Conv1d'),
norm_cfg=dict(type='BN1d'),
objectness_loss=None,
center_loss=None,
dir_class_loss=None,
dir_res_loss=None,
size_class_loss=None,
size_res_loss=None,
semantic_loss=None,
iou_loss=None):
super(VoteHead, self).__init__()
self.num_classes = num_classes
self.train_cfg = train_cfg
self.test_cfg = test_cfg
self.gt_per_seed = vote_module_cfg['gt_per_seed']
self.num_proposal = vote_aggregation_cfg['num_point']
self.objectness_loss = build_loss(objectness_loss)
self.center_loss = build_loss(center_loss)
self.dir_res_loss = build_loss(dir_res_loss)
self.dir_class_loss = build_loss(dir_class_loss)
self.size_res_loss = build_loss(size_res_loss)
if size_class_loss is not None:
self.size_class_loss = build_loss(size_class_loss)
if semantic_loss is not None:
self.semantic_loss = build_loss(semantic_loss)
if iou_loss is not None:
self.iou_loss = build_loss(iou_loss)
else:
self.iou_loss = None
self.bbox_coder = build_bbox_coder(bbox_coder)
self.num_sizes = self.bbox_coder.num_sizes
self.num_dir_bins = self.bbox_coder.num_dir_bins
self.vote_module = VoteModule(**vote_module_cfg)
self.vote_aggregation = build_sa_module(vote_aggregation_cfg)
self.fp16_enabled = False
# Bbox classification and regression
self.conv_pred = BaseConvBboxHead(
**pred_layer_cfg,
num_cls_out_channels=self._get_cls_out_channels(),
num_reg_out_channels=self._get_reg_out_channels())
def init_weights(self):
"""Initialize weights of VoteHead."""
pass
def _get_cls_out_channels(self):
"""Return the channel number of classification outputs."""
# Class numbers (k) + objectness (2)
return self.num_classes + 2
def _get_reg_out_channels(self):
"""Return the channel number of regression outputs."""
# Center residual (3),
# heading class+residual (num_dir_bins*2),
# size class+residual (num_sizes*4)
return 3 + self.num_dir_bins * 2 + self.num_sizes * 4
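# Worked example (a sketch only, assuming the common VoteNet SUN RGB-D setup
# with num_classes=10, num_sizes=10 and num_dir_bins=12; these numbers come
# from the bbox_coder/dataset config, not from this file):
#   cls channels = num_classes + 2                    -> 10 + 2 = 12
#   reg channels = 3 + num_dir_bins * 2 + num_sizes * 4
#                = 3 + 24 + 40                        -> 67
# Under that assumption conv_pred would emit (B, 12, num_proposal)
# classification logits and (B, 67, num_proposal) regression outputs.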
def _extract_input(self, feat_dict):
"""Extract inputs from features dictionary.
Args:
feat_dict (dict): Feature dict from backbone.
Returns:
torch.Tensor: Coordinates of input points.
torch.Tensor: Features of input points.
torch.Tensor: Indices of input points.
"""
# for imvotenet
if 'seed_points' in feat_dict and \
'seed_features' in feat_dict and \
'seed_indices' in feat_dict:
seed_points = feat_dict['seed_points']
seed_features = feat_dict['seed_features']
seed_indices = feat_dict['seed_indices']
# for votenet
else:
seed_points = feat_dict['fp_xyz'][-1]
seed_features = feat_dict['fp_features'][-1]
seed_indices = feat_dict['fp_indices'][-1]
return seed_points, seed_features, seed_indices
def forward(self, feat_dict, sample_mod):
"""Forward pass.
Note:
The forward of VoteHead is divided into 4 steps:
1. Generate vote_points from seed_points.
2. Aggregate vote_points.
3. Predict bbox and score.
4. Decode predictions.
Args:
feat_dict (dict): Feature dict from backbone.
sample_mod (str): Sample mode for vote aggregation layer.
valid modes are "vote", "seed", "random" and "spec".
Returns:
dict: Predictions of vote head.
"""
assert sample_mod in ['vote', 'seed', 'random', 'spec']
seed_points, seed_features, seed_indices = self._extract_input(
feat_dict)
# 1. generate vote_points from seed_points
vote_points, vote_features, vote_offset = self.vote_module(
seed_points, seed_features)
results = dict(
seed_points=seed_points,
seed_indices=seed_indices,
vote_points=vote_points,
vote_features=vote_features,
vote_offset=vote_offset)
# 2. aggregate vote_points
if sample_mod == 'vote':
# use fps in vote_aggregation
aggregation_inputs = dict(
points_xyz=vote_points, features=vote_features)
elif sample_mod == 'seed':
# FPS on seed and choose the votes corresponding to the seeds
sample_indices = furthest_point_sample(seed_points,
self.num_proposal)
aggregation_inputs = dict(
points_xyz=vote_points,
features=vote_features,
indices=sample_indices)
elif sample_mod == 'random':
# Random sampling from the votes
batch_size, num_seed = seed_points.shape[:2]
sample_indices = seed_points.new_tensor(
torch.randint(0, num_seed, (batch_size, self.num_proposal)),
dtype=torch.int32)
aggregation_inputs = dict(
points_xyz=vote_points,
features=vote_features,
indices=sample_indices)
elif sample_mod == 'spec':
# Specify the new center in vote_aggregation
aggregation_inputs = dict(
points_xyz=seed_points,
features=seed_features,
target_xyz=vote_points)
else:
raise NotImplementedError(
f'Sample mode {sample_mod} is not supported!')
vote_aggregation_ret = self.vote_aggregation(**aggregation_inputs)
aggregated_points, features, aggregated_indices = vote_aggregation_ret
results['aggregated_points'] = aggregated_points
results['aggregated_features'] = features
results['aggregated_indices'] = aggregated_indices
# 3. predict bbox and score
cls_predictions, reg_predictions = self.conv_pred(features)
# 4. decode predictions
decode_res = self.bbox_coder.split_pred(cls_predictions,
reg_predictions,
aggregated_points)
results.update(decode_res)
return results
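# Minimal usage sketch (hypothetical variable names; `head` is a built
# VoteHead and `feat_dict` comes from a PointNet++-style backbone):
#
#     feat_dict = backbone(points)          # provides 'fp_xyz', 'fp_features', ...
#     results = head(feat_dict, sample_mod='vote')
#     # results holds the vote tensors plus the decoded predictions from
#     # bbox_coder.split_pred, e.g. 'obj_scores', 'center', 'dir_class',
#     # 'dir_res_norm', 'size_class', 'size_res_norm', 'sem_scores'.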
@force_fp32(apply_to=('bbox_preds', ))
def loss(self,
bbox_preds,
points,
gt_bboxes_3d,
gt_labels_3d,
pts_semantic_mask=None,
pts_instance_mask=None,
img_metas=None,
gt_bboxes_ignore=None,
ret_target=False):
"""Compute loss.
Args:
bbox_preds (dict): Predictions from forward of vote head.
points (list[torch.Tensor]): Input points.
gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth \
bboxes of each sample.
gt_labels_3d (list[torch.Tensor]): Labels of each sample.
pts_semantic_mask (None | list[torch.Tensor]): Point-wise
semantic mask.
pts_instance_mask (None | list[torch.Tensor]): Point-wise
instance mask.
img_metas (list[dict]): Contain pcd and img's meta info.
gt_bboxes_ignore (None | list[torch.Tensor]): Specify
which bounding boxes to ignore.
ret_target (Bool): Return targets or not.
Returns:
dict: Losses of Votenet.
"""
targets = self.get_targets(points, gt_bboxes_3d, gt_labels_3d,
pts_semantic_mask, pts_instance_mask,
bbox_preds)
(vote_targets, vote_target_masks, size_class_targets, size_res_targets,
dir_class_targets, dir_res_targets, center_targets,
assigned_center_targets, mask_targets, valid_gt_masks,
objectness_targets, objectness_weights, box_loss_weights,
valid_gt_weights) = targets
# calculate vote loss
vote_loss = self.vote_module.get_loss(bbox_preds['seed_points'],
bbox_preds['vote_points'],
bbox_preds['seed_indices'],
vote_target_masks, vote_targets)
# calculate objectness loss
objectness_loss = self.objectness_loss(
bbox_preds['obj_scores'].transpose(2, 1),
objectness_targets,
weight=objectness_weights)
# calculate center loss
source2target_loss, target2source_loss = self.center_loss(
bbox_preds['center'],
center_targets,
src_weight=box_loss_weights,
dst_weight=valid_gt_weights)
center_loss = source2target_loss + target2source_loss
# calculate direction class loss
dir_class_loss = self.dir_class_loss(
bbox_preds['dir_class'].transpose(2, 1),
dir_class_targets,
weight=box_loss_weights)
# calculate direction residual loss
batch_size, proposal_num = size_class_targets.shape[:2]
heading_label_one_hot = vote_targets.new_zeros(
(batch_size, proposal_num, self.num_dir_bins))
heading_label_one_hot.scatter_(2, dir_class_targets.unsqueeze(-1), 1)
dir_res_norm = torch.sum(
bbox_preds['dir_res_norm'] * heading_label_one_hot, -1)
dir_res_loss = self.dir_res_loss(
dir_res_norm, dir_res_targets, weight=box_loss_weights)
# calculate size class loss
size_class_loss = self.size_class_loss(
bbox_preds['size_class'].transpose(2, 1),
size_class_targets,
weight=box_loss_weights)
# calculate size residual loss
one_hot_size_targets = vote_targets.new_zeros(
(batch_size, proposal_num, self.num_sizes))
one_hot_size_targets.scatter_(2, size_class_targets.unsqueeze(-1), 1)
one_hot_size_targets_expand = one_hot_size_targets.unsqueeze(
-1).repeat(1, 1, 1, 3).contiguous()
size_residual_norm = torch.sum(
bbox_preds['size_res_norm'] * one_hot_size_targets_expand, 2)
box_loss_weights_expand = box_loss_weights.unsqueeze(-1).repeat(
1, 1, 3)
size_res_loss = self.size_res_loss(
size_residual_norm,
size_res_targets,
weight=box_loss_weights_expand)
# calculate semantic loss
semantic_loss = self.semantic_loss(
bbox_preds['sem_scores'].transpose(2, 1),
mask_targets,
weight=box_loss_weights)
losses = dict(
vote_loss=vote_loss,
objectness_loss=objectness_loss,
semantic_loss=semantic_loss,
center_loss=center_loss,
dir_class_loss=dir_class_loss,
dir_res_loss=dir_res_loss,
size_class_loss=size_class_loss,
size_res_loss=size_res_loss)
if self.iou_loss:
corners_pred = self.bbox_coder.decode_corners(
bbox_preds['center'], size_residual_norm,
one_hot_size_targets_expand)
corners_target = self.bbox_coder.decode_corners(
assigned_center_targets, size_res_targets,
one_hot_size_targets_expand)
iou_loss = self.iou_loss(
corners_pred, corners_target, weight=box_loss_weights)
losses['iou_loss'] = iou_loss
if ret_target:
losses['targets'] = targets
return losses
def get_targets(self,
points,
gt_bboxes_3d,
gt_labels_3d,
pts_semantic_mask=None,
pts_instance_mask=None,
bbox_preds=None):
"""Generate targets of vote head.
Args:
points (list[torch.Tensor]): Points of each batch.
gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth \
bboxes of each batch.
gt_labels_3d (list[torch.Tensor]): Labels of each batch.
pts_semantic_mask (None | list[torch.Tensor]): Point-wise semantic
label of each batch.
pts_instance_mask (None | list[torch.Tensor]): Point-wise instance
label of each batch.
bbox_preds (torch.Tensor): Bounding box predictions of vote head.
Returns:
tuple[torch.Tensor]: Targets of vote head.
"""
# find empty example
valid_gt_masks = list()
gt_num = list()
for index in range(len(gt_labels_3d)):
if len(gt_labels_3d[index]) == 0:
fake_box = gt_bboxes_3d[index].tensor.new_zeros(
1, gt_bboxes_3d[index].tensor.shape[-1])
gt_bboxes_3d[index] = gt_bboxes_3d[index].new_box(fake_box)
gt_labels_3d[index] = gt_labels_3d[index].new_zeros(1)
valid_gt_masks.append(gt_labels_3d[index].new_zeros(1))
gt_num.append(1)
else:
valid_gt_masks.append(gt_labels_3d[index].new_ones(
gt_labels_3d[index].shape))
gt_num.append(gt_labels_3d[index].shape[0])
max_gt_num = max(gt_num)
if pts_semantic_mask is None:
pts_semantic_mask = [None for i in range(len(gt_labels_3d))]
pts_instance_mask = [None for i in range(len(gt_labels_3d))]
aggregated_points = [
bbox_preds['aggregated_points'][i]
for i in range(len(gt_labels_3d))
]
(vote_targets, vote_target_masks, size_class_targets, size_res_targets,
dir_class_targets, dir_res_targets, center_targets,
assigned_center_targets, mask_targets, objectness_targets,
objectness_masks) = multi_apply(self.get_targets_single, points,
gt_bboxes_3d, gt_labels_3d,
pts_semantic_mask, pts_instance_mask,
aggregated_points)
# pad targets as original code of votenet.
for index in range(len(gt_labels_3d)):
pad_num = max_gt_num - gt_labels_3d[index].shape[0]
center_targets[index] = F.pad(center_targets[index],
(0, 0, 0, pad_num))
valid_gt_masks[index] = F.pad(valid_gt_masks[index], (0, pad_num))
vote_targets = torch.stack(vote_targets)
vote_target_masks = torch.stack(vote_target_masks)
center_targets = torch.stack(center_targets)
valid_gt_masks = torch.stack(valid_gt_masks)
assigned_center_targets = torch.stack(assigned_center_targets)
objectness_targets = torch.stack(objectness_targets)
objectness_weights = torch.stack(objectness_masks)
objectness_weights /= (torch.sum(objectness_weights) + 1e-6)
box_loss_weights = objectness_targets.float() / (
torch.sum(objectness_targets).float() + 1e-6)
valid_gt_weights = valid_gt_masks.float() / (
torch.sum(valid_gt_masks.float()) + 1e-6)
dir_class_targets = torch.stack(dir_class_targets)
dir_res_targets = torch.stack(dir_res_targets)
size_class_targets = torch.stack(size_class_targets)
size_res_targets = torch.stack(size_res_targets)
mask_targets = torch.stack(mask_targets)
return (vote_targets, vote_target_masks, size_class_targets,
size_res_targets, dir_class_targets, dir_res_targets,
center_targets, assigned_center_targets, mask_targets,
valid_gt_masks, objectness_targets, objectness_weights,
box_loss_weights, valid_gt_weights)
def get_targets_single(self,
points,
gt_bboxes_3d,
gt_labels_3d,
pts_semantic_mask=None,
pts_instance_mask=None,
aggregated_points=None):
"""Generate targets of vote head for single batch.
Args:
points (torch.Tensor): Points of each batch.
gt_bboxes_3d (:obj:`BaseInstance3DBoxes`): Ground truth \
boxes of each batch.
gt_labels_3d (torch.Tensor): Labels of each batch.
pts_semantic_mask (None | torch.Tensor): Point-wise semantic
label of each batch.
pts_instance_mask (None | torch.Tensor): Point-wise instance
label of each batch.
aggregated_points (torch.Tensor): Aggregated points from
vote aggregation layer.
Returns:
tuple[torch.Tensor]: Targets of vote head.
"""
assert self.bbox_coder.with_rot or pts_semantic_mask is not None
gt_bboxes_3d = gt_bboxes_3d.to(points.device)
# generate votes target
num_points = points.shape[0]
if self.bbox_coder.with_rot:
vote_targets = points.new_zeros([num_points, 3 * self.gt_per_seed])
vote_target_masks = points.new_zeros([num_points],
dtype=torch.long)
vote_target_idx = points.new_zeros([num_points], dtype=torch.long)
box_indices_all = gt_bboxes_3d.points_in_boxes(points)
for i in range(gt_labels_3d.shape[0]):
box_indices = box_indices_all[:, i]
indices = torch.nonzero(
box_indices, as_tuple=False).squeeze(-1)
selected_points = points[indices]
vote_target_masks[indices] = 1
vote_targets_tmp = vote_targets[indices]
votes = gt_bboxes_3d.gravity_center[i].unsqueeze(
0) - selected_points[:, :3]
for j in range(self.gt_per_seed):
column_indices = torch.nonzero(
vote_target_idx[indices] == j,
as_tuple=False).squeeze(-1)
vote_targets_tmp[column_indices,
int(j * 3):int(j * 3 +
3)] = votes[column_indices]
if j == 0:
vote_targets_tmp[column_indices] = votes[
column_indices].repeat(1, self.gt_per_seed)
vote_targets[indices] = vote_targets_tmp
vote_target_idx[indices] = torch.clamp(
vote_target_idx[indices] + 1, max=2)
elif pts_semantic_mask is not None:
vote_targets = points.new_zeros([num_points, 3])
vote_target_masks = points.new_zeros([num_points],
dtype=torch.long)
for i in torch.unique(pts_instance_mask):
indices = torch.nonzero(
pts_instance_mask == i, as_tuple=False).squeeze(-1)
if pts_semantic_mask[indices[0]] < self.num_classes:
selected_points = points[indices, :3]
center = 0.5 * (
selected_points.min(0)[0] + selected_points.max(0)[0])
vote_targets[indices, :] = center - selected_points
vote_target_masks[indices] = 1
vote_targets = vote_targets.repeat((1, self.gt_per_seed))
else:
raise NotImplementedError
(center_targets, size_class_targets, size_res_targets,
dir_class_targets,
dir_res_targets) = self.bbox_coder.encode(gt_bboxes_3d, gt_labels_3d)
proposal_num = aggregated_points.shape[0]
distance1, _, assignment, _ = chamfer_distance(
aggregated_points.unsqueeze(0),
center_targets.unsqueeze(0),
reduction='none')
assignment = assignment.squeeze(0)
euclidean_distance1 = torch.sqrt(distance1.squeeze(0) + 1e-6)
objectness_targets = points.new_zeros((proposal_num), dtype=torch.long)
objectness_targets[
euclidean_distance1 < self.train_cfg['pos_distance_thr']] = 1
objectness_masks = points.new_zeros((proposal_num))
objectness_masks[
euclidean_distance1 < self.train_cfg['pos_distance_thr']] = 1.0
objectness_masks[
euclidean_distance1 > self.train_cfg['neg_distance_thr']] = 1.0
dir_class_targets = dir_class_targets[assignment]
dir_res_targets = dir_res_targets[assignment]
dir_res_targets /= (np.pi / self.num_dir_bins)
size_class_targets = size_class_targets[assignment]
size_res_targets = size_res_targets[assignment]
one_hot_size_targets = gt_bboxes_3d.tensor.new_zeros(
(proposal_num, self.num_sizes))
one_hot_size_targets.scatter_(1, size_class_targets.unsqueeze(-1), 1)
one_hot_size_targets = one_hot_size_targets.unsqueeze(-1).repeat(
1, 1, 3)
mean_sizes = size_res_targets.new_tensor(
self.bbox_coder.mean_sizes).unsqueeze(0)
pos_mean_sizes = torch.sum(one_hot_size_targets * mean_sizes, 1)
size_res_targets /= pos_mean_sizes
mask_targets = gt_labels_3d[assignment]
assigned_center_targets = center_targets[assignment]
return (vote_targets, vote_target_masks, size_class_targets,
size_res_targets, dir_class_targets,
dir_res_targets, center_targets, assigned_center_targets,
mask_targets.long(), objectness_targets, objectness_masks)
def get_bboxes(self,
points,
bbox_preds,
input_metas,
rescale=False,
use_nms=True):
"""Generate bboxes from vote head predictions.
Args:
points (torch.Tensor): Input points.
bbox_preds (dict): Predictions from vote head.
input_metas (list[dict]): Point cloud and image's meta info.
rescale (bool): Whether to rescale bboxes.
use_nms (bool): Whether to apply NMS, skip nms postprocessing
while using vote head in rpn stage.
Returns:
list[tuple[torch.Tensor]]: Bounding boxes, scores and labels.
"""
# decode boxes
obj_scores = F.softmax(bbox_preds['obj_scores'], dim=-1)[..., -1]
sem_scores = F.softmax(bbox_preds['sem_scores'], dim=-1)
bbox3d = self.bbox_coder.decode(bbox_preds)
if use_nms:
batch_size = bbox3d.shape[0]
results = list()
for b in range(batch_size):
bbox_selected, score_selected, labels = \
self.multiclass_nms_single(obj_scores[b], sem_scores[b],
bbox3d[b], points[b, ..., :3],
input_metas[b])
bbox = input_metas[b]['box_type_3d'](
bbox_selected,
box_dim=bbox_selected.shape[-1],
with_yaw=self.bbox_coder.with_rot)
results.append((bbox, score_selected, labels))
return results
else:
return bbox3d
def multiclass_nms_single(self, obj_scores, sem_scores, bbox, points,
input_meta):
"""Multi-class nms in single batch.
Args:
obj_scores (torch.Tensor): Objectness score of bounding boxes.
sem_scores (torch.Tensor): semantic class score of bounding boxes.
bbox (torch.Tensor): Predicted bounding boxes.
points (torch.Tensor): Input points.
input_meta (dict): Point cloud and image's meta info.
Returns:
tuple[torch.Tensor]: Bounding boxes, scores and labels.
"""
bbox = input_meta['box_type_3d'](
bbox,
box_dim=bbox.shape[-1],
with_yaw=self.bbox_coder.with_rot,
origin=(0.5, 0.5, 0.5))
box_indices = bbox.points_in_boxes(points)
corner3d = bbox.corners
minmax_box3d = corner3d.new(torch.Size((corner3d.shape[0], 6)))
minmax_box3d[:, :3] = torch.min(corner3d, dim=1)[0]
minmax_box3d[:, 3:] = torch.max(corner3d, dim=1)[0]
nonempty_box_mask = box_indices.T.sum(1) > 5
bbox_classes = torch.argmax(sem_scores, -1)
nms_selected = aligned_3d_nms(minmax_box3d[nonempty_box_mask],
obj_scores[nonempty_box_mask],
bbox_classes[nonempty_box_mask],
self.test_cfg.nms_thr)
# filter empty boxes and boxes with low score
scores_mask = (obj_scores > self.test_cfg.score_thr)
nonempty_box_inds = torch.nonzero(
nonempty_box_mask, as_tuple=False).flatten()
nonempty_mask = torch.zeros_like(bbox_classes).scatter(
0, nonempty_box_inds[nms_selected], 1)
selected = (nonempty_mask.bool() & scores_mask.bool())
if self.test_cfg.per_class_proposal:
bbox_selected, score_selected, labels = [], [], []
for k in range(sem_scores.shape[-1]):
bbox_selected.append(bbox[selected].tensor)
score_selected.append(obj_scores[selected] *
sem_scores[selected][:, k])
labels.append(
torch.zeros_like(bbox_classes[selected]).fill_(k))
bbox_selected = torch.cat(bbox_selected, 0)
score_selected = torch.cat(score_selected, 0)
labels = torch.cat(labels, 0)
else:
bbox_selected = bbox[selected].tensor
score_selected = obj_scores[selected]
labels = bbox_classes[selected]
return bbox_selected, score_selected, labels
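# Note on the per_class_proposal branch above (a sketch of the output shape,
# not additional behaviour): when test_cfg.per_class_proposal is True, every
# surviving box is replicated once per semantic class k and scored with
# obj_score * sem_score[:, k], so S selected boxes and C classes yield S * C
# output rows; otherwise only the S rows with their argmax-class labels are
# returned.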
================================================
FILE: mmdet3d/models/detectors/__init__.py
================================================
from .base import Base3DDetector
from .centerpoint import CenterPoint
from .dynamic_voxelnet import DynamicVoxelNet
from .h3dnet import H3DNet
from .imvotenet import ImVoteNet
from .mvx_faster_rcnn import DynamicMVXFasterRCNN, MVXFasterRCNN
from .mvx_two_stage import MVXTwoStageDetector
from .parta2 import PartA2
from .ssd3dnet import SSD3DNet
from .votenet import VoteNet
from .voxelnet import VoxelNet
from .transfusion import TransFusionDetector
from .sparsefusion import SparseFusionDetector
__all__ = [
'Base3DDetector',
'VoxelNet',
'DynamicVoxelNet',
'MVXTwoStageDetector',
'DynamicMVXFasterRCNN',
'MVXFasterRCNN',
'PartA2',
'VoteNet',
'H3DNet',
'CenterPoint',
'SSD3DNet',
'ImVoteNet',
'TransFusionDetector',
'SparseFusionDetector',
]
================================================
FILE: mmdet3d/models/detectors/base.py
================================================
import mmcv
import torch
from mmcv.parallel import DataContainer as DC
from mmcv.runner import auto_fp16
from os import path as osp
from mmdet3d.core import Box3DMode, Coord3DMode, show_result
from mmdet.models.detectors import BaseDetector
class Base3DDetector(BaseDetector):
"""Base class for detectors."""
def forward_test(self, points, img_metas, img=None, **kwargs):
"""
Args:
points (list[torch.Tensor]): the outer list indicates test-time
augmentations and inner torch.Tensor should have a shape NxC,
which contains all points in the batch.
img_metas (list[list[dict]]): the outer list indicates test-time
augs (multiscale, flip, etc.) and the inner list indicates
images in a batch
img (list[torch.Tensor], optional): the outer
list indicates test-time augmentations and inner
torch.Tensor should have a shape NxCxHxW, which contains
all images in the batch. Defaults to None.
"""
for var, name in [(points, 'points'), (img_metas, 'img_metas')]:
if not isinstance(var, list):
raise TypeError('{} must be a list, but got {}'.format(
name, type(var)))
num_augs = len(points)
if num_augs != len(img_metas):
raise ValueError(
'num of augmentations ({}) != num of image meta ({})'.format(
len(points), len(img_metas)))
if num_augs == 1:
img = [img] if img is None else img
return self.simple_test(points[0], img_metas[0], img[0], **kwargs)
else:
return self.aug_test(points, img_metas, img, **kwargs)
@auto_fp16(apply_to=('img', 'points'))
def forward(self, return_loss=True, **kwargs):
"""Calls either forward_train or forward_test depending on whether
return_loss=True.
Note this setting will change the expected inputs. When
`return_loss=True`, img and img_metas are single-nested (i.e.
torch.Tensor and list[dict]), and when `return_loss=False`, img and
img_metas should be double nested (i.e. list[torch.Tensor],
list[list[dict]]), with the outer list indicating test time
augmentations.
"""
if return_loss:
return self.forward_train(**kwargs)
else:
return self.forward_test(**kwargs)
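# Illustration of the nesting convention described in the docstring
# (hypothetical tensors, batch of one sample, single test-time transform):
#
#     # training: single-nested inputs
#     losses = detector(return_loss=True, points=[pts], img_metas=[meta], ...)
#     # testing: double-nested inputs, outer list = test-time augmentations
#     results = detector(return_loss=False, points=[[pts]], img_metas=[[meta]])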
def show_results(self, data, result, out_dir):
"""Results visualization.
Args:
data (list[dict]): Input points and the information of the sample.
result (list[dict]): Prediction results.
out_dir (str): Output directory of visualization result.
"""
for batch_id in range(len(result)):
if isinstance(data['points'][0], DC):
points = data['points'][0]._data[0][batch_id].numpy()
elif mmcv.is_list_of(data['points'][0], torch.Tensor):
points = data['points'][0][batch_id]
else:
raise ValueError(f"Unsupported data type {type(data['points'][0])} "
f'for visualization!')
if isinstance(data['img_metas'][0], DC):
pts_filename = data['img_metas'][0]._data[0][batch_id][
'pts_filename']
box_mode_3d = data['img_metas'][0]._data[0][batch_id][
'box_mode_3d']
elif mmcv.is_list_of(data['img_metas'][0], dict):
pts_filename = data['img_metas'][0][batch_id]['pts_filename']
box_mode_3d = data['img_metas'][0][batch_id]['box_mode_3d']
else:
raise ValueError(
f"Unsupported data type {type(data['img_metas'][0])} "
f'for visualization!')
file_name = osp.split(pts_filename)[-1].split('.')[0]
assert out_dir is not None, 'Expect out_dir, got none.'
pred_bboxes = result[batch_id]['boxes_3d']
# for now we convert points and bbox into depth mode
if (box_mode_3d == Box3DMode.CAM) or (box_mode_3d
== Box3DMode.LIDAR):
points = Coord3DMode.convert_point(points, Coord3DMode.LIDAR,
Coord3DMode.DEPTH)
pred_bboxes = Box3DMode.convert(pred_bboxes, box_mode_3d,
Box3DMode.DEPTH)
elif box_mode_3d != Box3DMode.DEPTH:
raise ValueError(
f'Unsupported box_mode_3d {box_mode_3d} for conversion!')
pred_bboxes = pred_bboxes.tensor.cpu().numpy()
show_result(points, None, pred_bboxes, out_dir, file_name)
================================================
FILE: mmdet3d/models/detectors/centerpoint.py
================================================
import torch
from mmdet3d.core import bbox3d2result, merge_aug_bboxes_3d
from mmdet.models import DETECTORS
from .mvx_two_stage import MVXTwoStageDetector
@DETECTORS.register_module()
class CenterPoint(MVXTwoStageDetector):
"""Base class of Multi-modality VoxelNet."""
def __init__(self,
pts_voxel_layer=None,
pts_voxel_encoder=None,
pts_middle_encoder=None,
pts_fusion_layer=None,
img_backbone=None,
pts_backbone=None,
img_neck=None,
pts_neck=None,
pts_bbox_head=None,
img_roi_head=None,
img_rpn_head=None,
train_cfg=None,
test_cfg=None,
pretrained=None):
super(CenterPoint,
self).__init__(pts_voxel_layer, pts_voxel_encoder,
pts_middle_encoder, pts_fusion_layer,
img_backbone, pts_backbone, img_neck, pts_neck,
pts_bbox_head, img_roi_head, img_rpn_head,
train_cfg, test_cfg, pretrained)
def extract_pts_feat(self, pts, img_feats, img_metas):
"""Extract features of points."""
if not self.with_pts_bbox:
return None
voxels, num_points, coors = self.voxelize(pts)
voxel_features = self.pts_voxel_encoder(voxels, num_points, coors)
batch_size = coors[-1, 0] + 1
x = self.pts_middle_encoder(voxel_features, coors, batch_size)
x = self.pts_backbone(x)
if self.with_pts_neck:
x = self.pts_neck(x)
return x
def forward_pts_train(self,
pts_feats,
gt_bboxes_3d,
gt_labels_3d,
img_metas,
gt_bboxes_ignore=None):
"""Forward function for point cloud branch.
Args:
pts_feats (list[torch.Tensor]): Features of point cloud branch
gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth
boxes for each sample.
gt_labels_3d (list[torch.Tensor]): Ground truth labels for
boxes of each sample.
img_metas (list[dict]): Meta information of samples.
gt_bboxes_ignore (list[torch.Tensor], optional): Ground truth
boxes to be ignored. Defaults to None.
Returns:
dict: Losses of each branch.
"""
outs = self.pts_bbox_head(pts_feats)
loss_inputs = [gt_bboxes_3d, gt_labels_3d, outs]
losses = self.pts_bbox_head.loss(*loss_inputs)
return losses
def simple_test_pts(self, x, img_metas, rescale=False):
"""Test function of point cloud branch."""
outs = self.pts_bbox_head(x)
bbox_list = self.pts_bbox_head.get_bboxes(
outs, img_metas, rescale=rescale)
bbox_results = [
bbox3d2result(bboxes, scores, labels)
for bboxes, scores, labels in bbox_list
]
return bbox_results
def aug_test_pts(self, feats, img_metas, rescale=False):
"""Test function of point cloud branch with augmentation.
The procedure is as follows:
- step 1: map features back for double-flip augmentation.
- step 2: merge all features and generate boxes.
- step 3: map boxes back for scale augmentation.
- step 4: merge results.
Args:
feats (list[torch.Tensor]): Feature of point cloud.
img_metas (list[dict]): Meta information of samples.
rescale (bool): Whether to rescale bboxes. Default: False.
Returns:
dict: Returned bboxes consists of the following keys:
- boxes_3d (:obj:`LiDARInstance3DBoxes`): Predicted bboxes.
- scores_3d (torch.Tensor): Scores of predicted boxes.
- labels_3d (torch.Tensor): Labels of predicted boxes.
"""
# only support aug_test for one sample
outs_list = []
for x, img_meta in zip(feats, img_metas):
outs = self.pts_bbox_head(x)
# merge augmented outputs before decoding bboxes
for task_id, out in enumerate(outs):
for key in out[0].keys():
if img_meta[0]['pcd_horizontal_flip']:
outs[task_id][0][key] = torch.flip(
outs[task_id][0][key], dims=[2])
if key == 'reg':
outs[task_id][0][key][:, 1, ...] = 1 - outs[
task_id][0][key][:, 1, ...]
elif key == 'rot':
outs[task_id][0][
key][:, 1,
...] = -outs[task_id][0][key][:, 1, ...]
elif key == 'vel':
outs[task_id][0][
key][:, 1,
...] = -outs[task_id][0][key][:, 1, ...]
if img_meta[0]['pcd_vertical_flip']:
outs[task_id][0][key] = torch.flip(
outs[task_id][0][key], dims=[3])
if key == 'reg':
outs[task_id][0][key][:, 0, ...] = 1 - outs[
task_id][0][key][:, 0, ...]
elif key == 'rot':
outs[task_id][0][
key][:, 0,
...] = -outs[task_id][0][key][:, 0, ...]
elif key == 'vel':
outs[task_id][0][
key][:, 0,
...] = -outs[task_id][0][key][:, 0, ...]
outs_list.append(outs)
preds_dicts = dict()
scale_img_metas = []
# concat outputs sharing the same pcd_scale_factor
for i, (img_meta, outs) in enumerate(zip(img_metas, outs_list)):
pcd_scale_factor = img_meta[0]['pcd_scale_factor']
if pcd_scale_factor not in preds_dicts.keys():
preds_dicts[pcd_scale_factor] = outs
scale_img_metas.append(img_meta)
else:
for task_id, out in enumerate(outs):
for key in out[0].keys():
preds_dicts[pcd_scale_factor][task_id][0][key] += out[
0][key]
aug_bboxes = []
for pcd_scale_factor, preds_dict in preds_dicts.items():
for task_id, pred_dict in enumerate(preds_dict):
# merge outputs with different flips before decoding bboxes
for key in pred_dict[0].keys():
preds_dict[task_id][0][key] /= len(outs_list) / len(
preds_dicts.keys())
bbox_list = self.pts_bbox_head.get_bboxes(
preds_dict, img_metas[0], rescale=rescale)
bbox_list = [
dict(boxes_3d=bboxes, scores_3d=scores, labels_3d=labels)
for bboxes, scores, labels in bbox_list
]
aug_bboxes.append(bbox_list[0])
if len(preds_dicts.keys()) > 1:
# merge outputs with different scales after decoding bboxes
merged_bboxes = merge_aug_bboxes_3d(aug_bboxes, scale_img_metas,
self.pts_bbox_head.test_cfg)
return merged_bboxes
else:
for key in bbox_list[0].keys():
bbox_list[0][key] = bbox_list[0][key].to('cpu')
return bbox_list[0]
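# Summary sketch of the TTA merging above (no new behaviour): flipped
# outputs are flipped back along the corresponding spatial axis, with the
# matching 'reg' offset channel mirrored (v -> 1 - v) and the matching
# 'rot'/'vel' channels negated; outputs sharing a pcd_scale_factor are then
# averaged over their flip variants before decoding, and boxes from
# different scales are finally merged with merge_aug_bboxes_3d.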
def aug_test(self, points, img_metas, imgs=None, rescale=False):
"""Test function with augmentation."""
img_feats, pts_feats = self.extract_feats(points, img_metas, imgs)
bbox_list = dict()
if pts_feats and self.with_pts_bbox:
pts_bbox = self.aug_test_pts(pts_feats, img_metas, rescale)
bbox_list.update(pts_bbox=pts_bbox)
return [bbox_list]
================================================
FILE: mmdet3d/models/detectors/dynamic_voxelnet.py
================================================
import torch
from mmcv.runner import force_fp32
from torch.nn import functional as F
from mmdet.models import DETECTORS
from .voxelnet import VoxelNet
@DETECTORS.register_module()
class DynamicVoxelNet(VoxelNet):
r"""VoxelNet using `dynamic voxelization <https://arxiv.org/abs/1910.06528>`_.
"""
def __init__(self,
voxel_layer,
voxel_encoder,
middle_encoder,
backbone,
neck=None,
bbox_head=None,
train_cfg=None,
test_cfg=None,
pretrained=None):
super(DynamicVoxelNet, self).__init__(
voxel_layer=voxel_layer,
voxel_encoder=voxel_encoder,
middle_encoder=middle_encoder,
backbone=backbone,
neck=neck,
bbox_head=bbox_head,
train_cfg=train_cfg,
test_cfg=test_cfg,
pretrained=pretrained,
)
def extract_feat(self, points, img_metas):
"""Extract features from points."""
voxels, coors = self.voxelize(points)
voxel_features, feature_coors = self.voxel_encoder(voxels, coors)
batch_size = coors[-1, 0].item() + 1
x = self.middle_encoder(voxel_features, feature_coors, batch_size)
x = self.backbone(x)
if self.with_neck:
x = self.neck(x)
return x
@torch.no_grad()
@force_fp32()
def voxelize(self, points):
"""Apply dynamic voxelization to points.
Args:
points (list[torch.Tensor]): Points of each sample.
Returns:
tuple[torch.Tensor]: Concatenated points and coordinates.
"""
coors = []
# dynamic voxelization only provide a coors mapping
for res in points:
res_coors = self.voxel_layer(res)
coors.append(res_coors)
points = torch.cat(points, dim=0)
coors_batch = []
for i, coor in enumerate(coors):
coor_pad = F.pad(coor, (1, 0), mode='constant', value=i)
coors_batch.append(coor_pad)
coors_batch = torch.cat(coors_batch, dim=0)
return points, coors_batch
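# Sketch of the returned layout (assuming the usual mmdet3d convention that
# the voxel layer emits per-point voxel coordinates in (z, y, x) order):
# `points` is the concatenation of all samples, shape (sum_i N_i, C), and
# `coors_batch` has shape (sum_i N_i, 4) where F.pad prepends the sample
# index, i.e. each row reads [batch_idx, z, y, x] aligned with `points`.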
================================================
FILE: mmdet3d/models/detectors/h3dnet.py
================================================
import torch
from mmdet3d.core import merge_aug_bboxes_3d
from mmdet.models import DETECTORS
from .two_stage import TwoStage3DDetector
@DETECTORS.register_module()
class H3DNet(TwoStage3DDetector):
r"""H3DNet model.
Please refer to the `paper <https://arxiv.org/abs/2006.05682>`_
"""
def __init__(self,
backbone,
neck=None,
rpn_head=None,
roi_head=None,
train_cfg=None,
test_cfg=None,
pretrained=None):
super(H3DNet, self).__init__(
backbone=backbone,
neck=neck,
rpn_head=rpn_head,
roi_head=roi_head,
train_cfg=train_cfg,
test_cfg=test_cfg,
pretrained=pretrained)
def forward_train(self,
points,
img_metas,
gt_bboxes_3d,
gt_labels_3d,
pts_semantic_mask=None,
pts_instance_mask=None,
gt_bboxes_ignore=None):
"""Forward of training.
Args:
points (list[torch.Tensor]): Points of each batch.
img_metas (list): Image metas.
gt_bboxes_3d (:obj:`BaseInstance3DBoxes`): gt bboxes of each batch.
gt_labels_3d (list[torch.Tensor]): gt class labels of each batch.
pts_semantic_mask (None | list[torch.Tensor]): point-wise semantic
label of each batch.
pts_instance_mask (None | list[torch.Tensor]): point-wise instance
label of each batch.
gt_bboxes_ignore (None | list[torch.Tensor]): Specify
which bounding boxes to ignore.
Returns:
dict: Losses.
"""
points_cat = torch.stack(points)
feats_dict = self.extract_feat(points_cat)
feats_dict['fp_xyz'] = [feats_dict['fp_xyz_net0'][-1]]
feats_dict['fp_features'] = [feats_dict['hd_feature']]
feats_dict['fp_indices'] = [feats_dict['fp_indices_net0'][-1]]
losses = dict()
if self.with_rpn:
rpn_outs = self.rpn_head(feats_dict, self.train_cfg.rpn.sample_mod)
feats_dict.update(rpn_outs)
rpn_loss_inputs = (points, gt_bboxes_3d, gt_labels_3d,
pts_semantic_mask, pts_instance_mask, img_metas)
rpn_losses = self.rpn_head.loss(
rpn_outs,
*rpn_loss_inputs,
gt_bboxes_ignore=gt_bboxes_ignore,
ret_target=True)
feats_dict['targets'] = rpn_losses.pop('targets')
losses.update(rpn_losses)
# Generate rpn proposals
proposal_cfg = self.train_cfg.get('rpn_proposal',
self.test_cfg.rpn)
proposal_inputs = (points, rpn_outs, img_metas)
proposal_list = self.rpn_head.get_bboxes(
*proposal_inputs, use_nms=proposal_cfg.use_nms)
feats_dict['proposal_list'] = proposal_list
else:
raise NotImplementedError
roi_losses = self.roi_head.forward_train(feats_dict, img_metas, points,
gt_bboxes_3d, gt_labels_3d,
pts_semantic_mask,
pts_instance_mask,
gt_bboxes_ignore)
losses.update(roi_losses)
return losses
def simple_test(self, points, img_metas, imgs=None, rescale=False):
"""Forward of testing.
Args:
points (list[torch.Tensor]): Points of each sample.
img_metas (list): Image metas.
rescale (bool): Whether to rescale results.
Returns:
list: Predicted 3d boxes.
"""
points_cat = torch.stack(points)
feats_dict = self.extract_feat(points_cat)
feats_dict['fp_xyz'] = [feats_dict['fp_xyz_net0'][-1]]
feats_dict['fp_features'] = [feats_dict['hd_feature']]
feats_dict['fp_indices'] = [feats_dict['fp_indices_net0'][-1]]
if self.with_rpn:
proposal_cfg = self.test_cfg.rpn
rpn_outs = self.rpn_head(feats_dict, proposal_cfg.sample_mod)
feats_dict.update(rpn_outs)
# Generate rpn proposals
proposal_list = self.rpn_head.get_bboxes(
points, rpn_outs, img_metas, use_nms=proposal_cfg.use_nms)
feats_dict['proposal_list'] = proposal_list
else:
raise NotImplementedError
return self.roi_head.simple_test(
feats_dict, img_metas, points_cat, rescale=rescale)
def aug_test(self, points, img_metas, imgs=None, rescale=False):
"""Test with augmentation."""
points_cat = [torch.stack(pts) for pts in points]
feats_dict = self.extract_feats(points_cat, img_metas)
for feat_dict in feats_dict:
feat_dict['fp_xyz'] = [feat_dict['fp_xyz_net0'][-1]]
feat_dict['fp_features'] = [feat_dict['hd_feature']]
feat_dict['fp_indices'] = [feat_dict['fp_indices_net0'][-1]]
# only support aug_test for one sample
aug_bboxes = []
for feat_dict, pts_cat, img_meta in zip(feats_dict, points_cat,
img_metas):
if self.with_rpn:
proposal_cfg = self.test_cfg.rpn
rpn_outs = self.rpn_head(feat_dict, proposal_cfg.sample_mod)
feat_dict.update(rpn_outs)
# Generate rpn proposals
proposal_list = self.rpn_head.get_bboxes(
points, rpn_outs, img_metas, use_nms=proposal_cfg.use_nms)
feat_dict['proposal_list'] = proposal_list
else:
raise NotImplementedError
bbox_results = self.roi_head.simple_test(
feat_dict,
self.test_cfg.rcnn.sample_mod,
img_meta,
pts_cat,
rescale=rescale)
aug_bboxes.append(bbox_results)
# after merging, bboxes will be rescaled to the original image size
merged_bboxes = merge_aug_bboxes_3d(aug_bboxes, img_metas,
self.bbox_head.test_cfg)
return [merged_bboxes]
def extract_feats(self, points, img_metas):
"""Extract features of multiple samples."""
return [
self.extract_feat(pts, img_meta)
for pts, img_meta in zip(points, img_metas)
]
================================================
FILE: mmdet3d/models/detectors/imvotenet.py
================================================
import numpy as np
import torch
from torch import nn as nn
from mmdet3d.core import bbox3d2result, merge_aug_bboxes_3d
from mmdet3d.models.utils import MLP
from mmdet.models import DETECTORS
from .. import builder
from .base import Base3DDetector
def sample_valid_seeds(mask, num_sampled_seed=1024):
"""Randomly sample seeds from all imvotes.
Args:
mask (torch.Tensor): Bool tensor in shape
(batch_size, seed_num*max_imvote_per_pixel), indicates
whether this imvote corresponds to a 2D bbox.
num_sampled_seed (int): How many to sample from all imvotes.
Returns:
torch.Tensor: Sampled indices with shape (batch_size, num_sampled_seed).
"""
device = mask.device
batch_size = mask.shape[0]
sample_inds = mask.new_zeros((batch_size, num_sampled_seed),
dtype=torch.int64)
for bidx in range(batch_size):
# return index of non zero elements
valid_inds = torch.nonzero(mask[bidx, :]).squeeze(-1)
if len(valid_inds) < num_sampled_seed:
# compute set t1 - t2
t1 = torch.arange(num_sampled_seed, device=device)
t2 = valid_inds % num_sampled_seed
combined = torch.cat((t1, t2))
uniques, counts = combined.unique(return_counts=True)
difference = uniques[counts == 1]
rand_inds = torch.randperm(
len(difference),
device=device)[:num_sampled_seed - len(valid_inds)]
cur_sample_inds = difference[rand_inds]
cur_sample_inds = torch.cat((valid_inds, cur_sample_inds))
else:
rand_inds = torch.randperm(
len(valid_inds), device=device)[:num_sampled_seed]
cur_sample_inds = valid_inds[rand_inds]
sample_inds[bidx, :] = cur_sample_inds
return sample_inds
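# Illustration of the set-difference trick above (hypothetical numbers):
# with num_sampled_seed=6 and valid_inds=[1, 4], t1=[0..5] and
# t2 = valid_inds % 6 = [1, 4]; concatenating and keeping the elements seen
# exactly once gives difference=[0, 2, 3, 5], from which 6 - 2 = 4 extra
# indices are drawn at random, so every sample still yields exactly
# num_sampled_seed indices even when too few imvotes are valid.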
@DETECTORS.register_module()
class ImVoteNet(Base3DDetector):
r"""`ImVoteNet <https://arxiv.org/abs/2001.10692>`_ for 3D detection."""
def __init__(self,
pts_backbone=None,
pts_bbox_heads=None,
pts_neck=None,
img_backbone=None,
img_neck=None,
img_roi_head=None,
img_rpn_head=None,
img_mlp=None,
freeze_img_branch=False,
fusion_layer=None,
num_sampled_seed=None,
train_cfg=None,
test_cfg=None,
pretrained=None):
super(ImVoteNet, self).__init__()
# point branch
if pts_backbone is not None:
self.pts_backbone = builder.build_backbone(pts_backbone)
if pts_neck is not None:
self.pts_neck = builder.build_neck(pts_neck)
if pts_bbox_heads is not None:
pts_bbox_head_common = pts_bbox_heads.common
pts_bbox_head_common.update(
train_cfg=train_cfg.pts if train_cfg is not None else None)
pts_bbox_head_common.update(test_cfg=test_cfg.pts)
pts_bbox_head_joint = pts_bbox_head_common.copy()
pts_bbox_head_joint.update(pts_bbox_heads.joint)
pts_bbox_head_pts = pts_bbox_head_common.copy()
pts_bbox_head_pts.update(pts_bbox_heads.pts)
pts_bbox_head_img = pts_bbox_head_common.copy()
pts_bbox_head_img.update(pts_bbox_heads.img)
self.pts_bbox_head_joint = builder.build_head(pts_bbox_head_joint)
self.pts_bbox_head_pts = builder.build_head(pts_bbox_head_pts)
self.pts_bbox_head_img = builder.build_head(pts_bbox_head_img)
self.pts_bbox_heads = [
self.pts_bbox_head_joint, self.pts_bbox_head_pts,
self.pts_bbox_head_img
]
self.loss_weights = pts_bbox_heads.loss_weights
# image branch
if img_backbone:
self.img_backbone = builder.build_backbone(img_backbone)
if img_neck is not None:
self.img_neck = builder.build_neck(img_neck)
if img_rpn_head is not None:
rpn_train_cfg = train_cfg.img_rpn if train_cfg \
is not None else None
img_rpn_head_ = img_rpn_head.copy()
img_rpn_head_.update(
train_cfg=rpn_train_cfg, test_cfg=test_cfg.img_rpn)
self.img_rpn_head = builder.build_head(img_rpn_head_)
if img_roi_head is not None:
rcnn_train_cfg = train_cfg.img_rcnn if train_cfg \
is not None else None
img_roi_head.update(
train_cfg=rcnn_train_cfg, test_cfg=test_cfg.img_rcnn)
self.img_roi_head = builder.build_head(img_roi_head)
# fusion
if fusion_layer is not None:
self.fusion_layer = builder.build_fusion_layer(fusion_layer)
self.max_imvote_per_pixel = fusion_layer.max_imvote_per_pixel
self.freeze_img_branch = freeze_img_branch
if freeze_img_branch:
self.freeze_img_branch_params()
if img_mlp is not None:
self.img_mlp = MLP(**img_mlp)
self.num_sampled_seed = num_sampled_seed
self.train_cfg = train_cfg
self.test_cfg = test_cfg
self.init_weights(pretrained=pretrained)
def init_weights(self, pretrained=None):
"""Initialize model weights."""
super(ImVoteNet, self).init_weights(pretrained)
if pretrained is None:
img_pretrained = None
pts_pretrained = None
elif isinstance(pretrained, dict):
img_pretrained = pretrained.get('img', None)
pts_pretrained = pretrained.get('pts', None)
else:
raise ValueError(
f'pretrained should be a dict, got {type(pretrained)}')
if self.with_img_backbone:
self.img_backbone.init_weights(pretrained=img_pretrained)
if self.with_img_neck:
if isinstance(self.img_neck, nn.Sequential):
for m in self.img_neck:
m.init_weights()
else:
self.img_neck.init_weights()
if self.with_img_roi_head:
self.img_roi_head.init_weights(img_pretrained)
if self.with_img_rpn:
self.img_rpn_head.init_weights()
if self.with_pts_backbone:
self.pts_backbone.init_weights(pretrained=pts_pretrained)
if self.with_pts_bbox:
self.pts_bbox_head.init_weights()
if self.with_pts_neck:
if isinstance(self.pts_neck, nn.Sequential):
for m in self.pts_neck:
m.init_weights()
else:
self.pts_neck.init_weights()
def freeze_img_branch_params(self):
"""Freeze all image branch parameters."""
if self.with_img_bbox_head:
for param in self.img_bbox_head.parameters():
param.requires_grad = False
if self.with_img_backbone:
for param in self.img_backbone.parameters():
param.requires_grad = False
if self.with_img_neck:
for param in self.img_neck.parameters():
param.requires_grad = False
if self.with_img_rpn:
for param in self.img_rpn_head.parameters():
param.requires_grad = False
if self.with_img_roi_head:
for param in self.img_roi_head.parameters():
param.requires_grad = False
def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict,
missing_keys, unexpected_keys, error_msgs):
"""Overload in order to load img network ckpts into img branch."""
module_names = ['backbone', 'neck', 'roi_head', 'rpn_head']
for key in list(state_dict):
for module_name in module_names:
if key.startswith(module_name) and ('img_' +
key) not in state_dict:
state_dict['img_' + key] = state_dict.pop(key)
super()._load_from_state_dict(state_dict, prefix, local_metadata,
strict, missing_keys, unexpected_keys,
error_msgs)
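# Example of the key remapping performed above (hypothetical checkpoint
# keys): a plain 2D detector checkpoint entry such as
# 'backbone.layer1.0.conv1.weight' is renamed to
# 'img_backbone.layer1.0.conv1.weight' before the parent loader runs, so a
# standard mmdet checkpoint can initialise the image branch directly.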
def train(self, mode=True):
"""Overload in order to keep image branch modules in eval mode."""
super(ImVoteNet, self).train(mode)
if self.freeze_img_branch:
if self.with_img_bbox_head:
self.img_bbox_head.eval()
if self.with_img_backbone:
self.img_backbone.eval()
if self.with_img_neck:
self.img_neck.eval()
if self.with_img_rpn:
self.img_rpn_head.eval()
if self.with_img_roi_head:
self.img_roi_head.eval()
@property
def with_img_bbox(self):
"""bool: Whether the detector has a 2D image box head."""
return ((hasattr(self, 'img_roi_head') and self.img_roi_head.with_bbox)
or (hasattr(self, 'img_bbox_head')
and self.img_bbox_head is not None))
@property
def with_img_bbox_head(self):
"""bool: Whether the detector has a 2D image box head (not roi)."""
return hasattr(self,
'img_bbox_head') and self.img_bbox_head is not None
@property
def with_img_backbone(self):
"""bool: Whether the detector has a 2D image backbone."""
return hasattr(self, 'img_backbone') and self.img_backbone is not None
@property
def with_img_neck(self):
"""bool: Whether the detector has a neck in image branch."""
return hasattr(self, 'img_neck') and self.img_neck is not None
@property
def with_img_rpn(self):
"""bool: Whether the detector has a 2D RPN in image detector branch."""
return hasattr(self, 'img_rpn_head') and self.img_rpn_head is not None
@property
def with_img_roi_head(self):
"""bool: Whether the detector has a RoI Head in image branch."""
return hasattr(self, 'img_roi_head') and self.img_roi_head is not None
@property
def with_pts_bbox(self):
"""bool: Whether the detector has a 3D box head."""
return hasattr(self,
'pts_bbox_head') and self.pts_bbox_head is not None
@property
def with_pts_backbone(self):
"""bool: Whether the detector has a 3D backbone."""
return hasattr(self, 'pts_backbone') and self.pts_backbone is not None
@property
def with_pts_neck(self):
"""bool: Whether the detector has a neck in 3D detector branch."""
return hasattr(self, 'pts_neck') and self.pts_neck is not None
def extract_feat(self, imgs):
"""Just to inherit from abstract method."""
pass
def extract_img_feat(self, img):
"""Directly extract features from the img backbone+neck."""
x = self.img_backbone(img)
if self.with_img_neck:
x = self.img_neck(x)
return x
def extract_img_feats(self, imgs):
"""Extract features from multiple images.
Args:
imgs (list[torch.Tensor]): A list of images. The images are
augmented from the same image but in different ways.
Returns:
list[torch.Tensor]: Features of different images
"""
assert isinstance(imgs, list)
return [self.extract_img_feat(img) for img in imgs]
def extract_pts_feat(self, pts):
"""Extract features of points."""
x = self.pts_backbone(pts)
if self.with_pts_neck:
x = self.pts_neck(x)
seed_points = x['fp_xyz'][-1]
seed_features = x['fp_features'][-1]
seed_indices = x['fp_indices'][-1]
return (seed_points, seed_features, seed_indices)
def extract_pts_feats(self, pts):
"""Extract features of points from multiple samples."""
assert isinstance(pts, list)
return [self.extract_pts_feat(pt) for pt in pts]
@torch.no_grad()
def extract_bboxes_2d(self,
img,
img_metas,
train=True,
bboxes_2d=None,
**kwargs):
"""Extract bounding boxes from 2d detector.
Args:
img (torch.Tensor): of shape (N, C, H, W) encoding input images.
Typically these should be mean centered and std scaled.
img_metas (list[dict]): Image meta info.
train (bool): train-time or not.
bboxes_2d (list[torch.Tensor]): provided 2d bboxes,
not supported yet.
Returns:
list[torch.Tensor]: A list of processed 2d bounding boxes.
"""
if bboxes_2d is None:
x = self.extract_img_feat(img)
proposal_list = self.img_rpn_head.simple_test_rpn(x, img_metas)
rets = self.img_roi_head.simple_test(
x, proposal_list, img_metas, rescale=False)
rets_processed = []
for ret in rets:
tmp = np.concatenate(ret, axis=0)
sem_class = img.new_zeros((len(tmp)))
start = 0
for i, bboxes in enumerate(ret):
sem_class[start:start + len(bboxes)] = i
start += len(bboxes)
ret = img.new_tensor(tmp)
# append class index
ret = torch.cat([ret, sem_class[:, None]], dim=-1)
inds = torch.argsort(ret[:, 4], descending=True)
ret = ret.index_select(0, inds)
# drop half bboxes during training for better generalization
if train:
rand_drop = torch.randperm(len(ret))[:(len(ret) + 1) // 2]
rand_drop = torch.sort(rand_drop)[0]
ret = ret[rand_drop]
rets_processed.append(ret.float())
return rets_processed
else:
rets_processed = []
for ret in bboxes_2d:
if len(ret) > 0 and train:
rand_drop = torch.randperm(len(ret))[:(len(ret) + 1) // 2]
rand_drop = torch.sort(rand_drop)[0]
ret = ret[rand_drop]
rets_processed.append(ret.float())
return rets_processed
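# Layout of each returned tensor (derived from the concatenation above, not
# an extra contract): every row reads [tl_x, tl_y, br_x, br_y, score, class],
# rows are sorted by score in descending order, and during training roughly
# half of the rows are randomly dropped for better generalization.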
def forward_train(self,
points=None,
img=None,
img_metas=None,
gt_bboxes=None,
gt_labels=None,
gt_bboxes_ignore=None,
gt_masks=None,
proposals=None,
calib=None,
bboxes_2d=None,
gt_bboxes_3d=None,
gt_labels_3d=None,
pts_semantic_mask=None,
pts_instance_mask=None,
**kwargs):
"""Forward training for image-branch pretraining or stage-2 training.
Args:
points (list[torch.Tensor]): Points of each batch.
img (torch.Tensor): of shape (N, C, H, W) encoding input images.
Typically these should be mean centered and std scaled.
img_metas (list[dict]): list of image and point cloud meta info
dict. For example, keys include 'ori_shape', 'img_norm_cfg',
and 'transformation_3d_flow'. For details on the values of
the keys see `mmdet/datasets/pipelines/formatting.py:Collect`.
gt_bboxes (list[torch.Tensor]): Ground truth bboxes for each image
with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
gt_labels (list[torch.Tensor]): class indices for each
2d bounding box.
gt_bboxes_ignore (None | list[torch.Tensor]): specify which
2d bounding boxes can be ignored when computing the loss.
gt_masks (None | torch.Tensor): true segmentation masks for each
2d bbox, used if the architecture supports a segmentation task.
proposals: override rpn proposals (2d) with custom proposals.
Use when `with_rpn` is False.
calib (dict[str, torch.Tensor]): camera calibration matrices,
Rt and K.
bboxes_2d (list[torch.Tensor]): provided 2d bboxes,
not supported yet.
gt_bboxes_3d (:obj:`BaseInstance3DBoxes`): 3d gt bboxes.
gt_labels_3d (list[torch.Tensor]): gt class labels for 3d bboxes.
pts_semantic_mask (None | list[torch.Tensor]): point-wise semantic
label of each batch.
pts_instance_mask (None | list[torch.Tensor]): point-wise instance
label of each batch.
Returns:
dict[str, torch.Tensor]: a dictionary of loss components.
"""
if points is None:
x = self.extract_img_feat(img)
losses = dict()
# RPN forward and loss
if self.with_img_rpn:
proposal_cfg = self.train_cfg.get('img_rpn_proposal',
self.test_cfg.img_rpn)
rpn_losses, proposal_list = self.img_rpn_head.forward_train(
x,
img_metas,
gt_bboxes,
gt_labels=None,
gt_bboxes_ignore=gt_bboxes_ignore,
proposal_cfg=proposal_cfg)
losses.update(rpn_losses)
else:
proposal_list = proposals
roi_losses = self.img_roi_head.forward_train(
x, img_metas, proposal_list, gt_bboxes, gt_labels,
gt_bboxes_ignore, gt_masks, **kwargs)
losses.update(roi_losses)
return losses
else:
bboxes_2d = self.extract_bboxes_2d(
img, img_metas, bboxes_2d=bboxes_2d, **kwargs)
points = torch.stack(points)
seeds_3d, seed_3d_features, seed_indices = \
self.extract_pts_feat(points)
img_features, masks = self.fusion_layer(img, bboxes_2d, seeds_3d,
img_metas, calib)
inds = sample_valid_seeds(masks, self.num_sampled_seed)
batch_size, img_feat_size = img_features.shape[:2]
pts_feat_size = seed_3d_features.shape[1]
inds_img = inds.view(batch_size, 1,
-1).expand(-1, img_feat_size, -1)
img_features = img_features.gather(-1, inds_img)
inds = inds % inds.shape[1]
inds_seed_xyz = inds.view(batch_size, -1, 1).expand(-1, -1, 3)
seeds_3d = seeds_3d.gather(1, inds_seed_xyz)
inds_seed_feats = inds.view(batch_size, 1,
-1).expand(-1, pts_feat_size, -1)
seed_3d_features = seed_3d_features.gather(-1, inds_seed_feats)
seed_indices = seed_indices.gather(1, inds)
img_features = self.img_mlp(img_features)
fused_features = torch.cat([seed_3d_features, img_features], dim=1)
feat_dict_joint = dict(
seed_points=seeds_3d,
seed_features=fused_features,
seed_indices=seed_indices)
feat_dict_pts = dict(
seed_points=seeds_3d,
seed_features=seed_3d_features,
seed_indices=seed_indices)
feat_dict_img = dict(
seed_points=seeds_3d,
seed_features=img_features,
seed_indices=seed_indices)
loss_inputs = (points, gt_bboxes_3d, gt_labels_3d,
pts_semantic_mask, pts_instance_mask, img_metas)
bbox_preds_joints = self.pts_bbox_head_joint(
feat_dict_joint, self.train_cfg.pts.sample_mod)
bbox_preds_pts = self.pts_bbox_head_pts(
feat_dict_pts, self.train_cfg.pts.sample_mod)
bbox_preds_img = self.pts_bbox_head_img(
feat_dict_img, self.train_cfg.pts.sample_mod)
losses_towers = []
losses_joint = self.pts_bbox_head_joint.loss(
bbox_preds_joints,
*loss_inputs,
gt_bboxes_ignore=gt_bboxes_ignore)
losses_pts = self.pts_bbox_head_pts.loss(
bbox_preds_pts,
*loss_inputs,
gt_bboxes_ignore=gt_bboxes_ignore)
losses_img = self.pts_bbox_head_img.loss(
bbox_preds_img,
*loss_inputs,
gt_bboxes_ignore=gt_bboxes_ignore)
losses_towers.append(losses_joint)
losses_towers.append(losses_pts)
losses_towers.append(losses_img)
combined_losses = dict()
for loss_term in losses_joint:
if 'loss' in loss_term:
combined_losses[loss_term] = 0
for i in range(len(losses_towers)):
combined_losses[loss_term] += \
losses_towers[i][loss_term] * \
self.loss_weights[i]
else:
# only save the metric of the joint head
# if it is not a loss
combined_losses[loss_term] = \
losses_towers[0][loss_term]
return combined_losses
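# Sketch of the tower combination above (hypothetical weight names): with
# loss_weights = [w_joint, w_pts, w_img], every entry whose name contains
# 'loss' becomes
#     combined = w_joint * L_joint + w_pts * L_pts + w_img * L_img,
# while non-loss metrics are reported from the joint head only.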
def forward_test(self,
points=None,
img_metas=None,
img=None,
calib=None,
bboxes_2d=None,
**kwargs):
"""Forward testing for image-branch pretraining or the stage-2 model.
Args:
points (list[list[torch.Tensor]], optional): the outer
list indicates test-time augmentations and the inner
list contains all points in the batch, where each Tensor
should have a shape NxC. Defaults to None.
img_metas (list[list[dict]], optional): the outer list
indicates test-time augs (multiscale, flip, etc.)
and the inner list indicates images in a batch.
Defaults to None.
img (list[list[torch.Tensor]], optional): the outer
list indicates test-time augmentations and inner Tensor
should have a shape NxCxHxW, which contains all images
in the batch. Defaults to None.
calib (list[dict[str, torch.Tensor]], optional): camera
calibration matrices, Rt and K.
List indicates test-time augs. Defaults to None.
bboxes_2d (list[list[torch.Tensor]], optional):
Provided 2d bboxes, not supported yet. Defaults to None.
Returns:
list[list[torch.Tensor]]|list[dict]: Predicted 2d or 3d boxes.
"""
if points is None:
for var, name in [(img, 'img'), (img_metas, 'img_metas')]:
if not isinstance(var, list):
raise TypeError(
f'{name} must be a list, but got {type(var)}')
num_augs = len(img)
if num_augs != len(img_metas):
raise ValueError(f'num of augmentations ({len(img)}) '
f'!= num of image meta ({len(img_metas)})')
if num_augs == 1:
# proposals (List[List[Tensor]]): the outer list indicates
# test-time augs (multiscale, flip, etc.) and the inner list
# indicates images in a batch.
# The Tensor should have a shape Px4, where P is the number of
# proposals.
if 'proposals' in kwargs:
kwargs['proposals'] = kwargs['proposals'][0]
return self.simple_test_img_only(
img=img[0], img_metas=img_metas[0], **kwargs)
else:
assert img[0].size(0) == 1, 'aug test does not support ' \
'inference with batch size ' \
f'{img[0].size(0)}'
# TODO: support test augmentation for predefined proposals
assert 'proposals' not in kwargs
return self.aug_test_img_only(
img=img, img_metas=img_metas, **kwargs)
else:
for var, name in [(points, 'points'), (img_metas, 'img_metas')]:
if not isinstance(var, list):
raise TypeError('{} must be a list, but got {}'.format(
name, type(var)))
num_augs = len(points)
if num_augs != len(img_metas):
raise ValueError(
'num of augmentations ({}) != num of image meta ({})'.
format(len(points), len(img_metas)))
if num_augs == 1:
return self.simple_test(
points[0],
img_metas[0],
img[0],
calibs=calib[0],
bboxes_2d=bboxes_2d[0] if bboxes_2d is not None else None,
**kwargs)
else:
return self.aug_test(points, img_metas, img, calib, bboxes_2d,
**kwargs)
def simple_test_img_only(self,
img,
img_metas,
proposals=None,
rescale=False):
"""Test without augmentation, image network pretrain. May refer to
https://github.com/open-mmlab/mmdetection/blob/master/mmdet/models/detectors/two_stage.py  # noqa
Args:
img (torch.Tensor): Should have a shape NxCxHxW, which contains
all images in the batch.
img_metas (list[dict]):
proposals (list[Tensor], optional): override rpn proposals
with custom proposals. Defaults to None.
rescale (bool, optional): Whether or not rescale bboxes to the
original shape of input image. Defaults to False.
Returns:
list[list[torch.Tensor]]: Predicted 2d boxes.
"""
assert self.with_img_bbox, 'Img bbox head must be implemented.'
assert self.with_img_backbone, 'Img backbone must be implemented.'
assert self.with_img_rpn, 'Img rpn must be implemented.'
assert self.with_img_roi_head, 'Img roi head must be implemented.'
x = self.extract_img_feat(img)
if proposals is None:
proposal_list = self.img_rpn_head.simple_test_rpn(x, img_metas)
else:
proposal_list = proposals
ret = self.img_roi_head.simple_test(
x, proposal_list, img_metas, rescale=rescale)
return ret
def simple_test(self,
points=None,
img_metas=None,
img=None,
calibs=None,
bboxes_2d=None,
rescale=False,
**kwargs):
"""Test without augmentation, stage 2.
Args:
points (list[torch.Tensor], optional): Elements in the list
should have a shape NxC, the list indicates all point-clouds
in the batch. Defaults to None.
img_metas (list[dict], optional): List indicates
images in a batch. Defaults to None.
img (torch.Tensor, optional): Should have a shape NxCxHxW,
which contains all images in the batch. Defaults to None.
calibs (dict[str, torch.Tensor], optional): camera
calibration matrices, Rt and K. Defaults to None.
bboxes_2d (list[torch.Tensor], optional):
Provided 2d bboxes, not supported yet. Defaults to None.
rescale (bool, optional): Whether or not rescale bboxes.
Defaults to False.
Returns:
list[dict]: Predicted 3d boxes.
"""
bboxes_2d = self.extract_bboxes_2d(
img, img_metas, train=False, bboxes_2d=bboxes_2d, **kwargs)
points = torch.stack(points)
seeds_3d, seed_3d_features, seed_indices = \
self.extract_pts_feat(points)
img_features, masks = self.fusion_layer(img, bboxes_2d, seeds_3d,
img_metas, calibs)
inds = sample_valid_seeds(masks, self.num_sampled_seed)
batch_size, img_feat_size = img_features.shape[:2]
pts_feat_size = seed_3d_features.shape[1]
inds_img = inds.view(batch_size, 1, -1).expand(-1, img_feat_size, -1)
img_features = img_features.gather(-1, inds_img)
inds = inds % inds.shape[1]
inds_seed_xyz = inds.view(batch_size, -1, 1).expand(-1, -1, 3)
seeds_3d = seeds_3d.gather(1, inds_seed_xyz)
inds_seed_feats = inds.view(batch_size, 1,
-1).expand(-1, pts_feat_size, -1)
seed_3d_features = seed_3d_features.gather(-1, inds_seed_feats)
seed_indices = seed_indices.gather(1, inds)
img_features = self.img_mlp(img_features)
fused_features = torch.cat([seed_3d_features, img_features], dim=1)
feat_dict = dict(
seed_points=seeds_3d,
seed_features=fused_features,
seed_indices=seed_indices)
bbox_preds = self.pts_bbox_head_joint(feat_dict,
self.test_cfg.pts.sample_mod)
bbox_list = self.pts_bbox_head_joint.get_bboxes(
points, bbox_preds, img_metas, rescale=rescale)
bbox_results = [
bbox3d2result(bboxes, scores, labels)
for bboxes, scores, labels in bbox_list
]
return bbox_results
def aug_test_img_only(self, img, img_metas, rescale=False):
"""Test function with augmentation, image network pretrain. May refer to
https://github.com/open-mmlab/mmdetection/blob/master/mmdet/models/detectors/two_stage.py  # noqa
Args:
img (list[list[torch.Tensor]], optional): the outer
list indicates test-time augmentations and inner Tensor
should have a shape NxCxHxW, which contains all images
in the batch. Defaults to None.
img_metas (list[list[dict]], optional): the outer list
indicates test-time augs (multiscale, flip, etc.)
and the inner list indicates images in a batch.
Defaults to None.
rescale (bool, optional): Whether or not to rescale bboxes to the
original shape of input image. If rescale is False, then
returned bboxes and masks will fit the scale of imgs[0].
Defaults to False.
Returns:
list[list[torch.Tensor]]: Predicted 2d boxes.
"""
assert self.with_img_bbox, 'Img bbox head must be implemented.'
assert self.with_img_backbone, 'Img backbone must be implemented.'
assert self.with_img_rpn, 'Img rpn must be implemented.'
assert self.with_img_roi_head, 'Img roi head must be implemented.'
x = self.extract_img_feats(img)
proposal_list = self.img_rpn_head.aug_test_rpn(x, img_metas)
return self.img_roi_head.aug_test(
x, proposal_list, img_metas, rescale=rescale)
def aug_test(self,
points=None,
img_metas=None,
imgs=None,
calibs=None,
bboxes_2d=None,
rescale=False,
**kwargs):
"""Test function with augmentation, stage 2.
Args:
points (list[list[torch.Tensor]], optional): the outer
list indicates test-time augmentations and the inner
list contains all points in the batch, where each Tensor
should have a shape NxC. Defaults to None.
img_metas (list[list[dict]], optional): the outer list
indicates test-time augs (multiscale, flip, etc.)
and the inner list indicates images in a batch.
Defaults to None.
imgs (list[list[torch.Tensor]], optional): the outer
list indicates test-time augmentations and inner Tensor
should have a shape NxCxHxW, which contains all images
in the batch. Defaults to None.
calibs (list[dict[str, torch.Tensor]], optional): camera
calibration matrices, Rt and K.
List indicates test-time augs. Defaults to None.
bboxes_2d (list[list[torch.Tensor]], optional):
Provided 2d bboxes, not supported yet. Defaults to None.
rescale (bool, optional): Whether or not to rescale bboxes.
Defaults to False.
Returns:
list[dict]: Predicted 3d boxes.
"""
points_cat = [torch.stack(pts) for pts in points]
feats = self.extract_pts_feats(points_cat, img_metas)
# only support aug_test for one sample
aug_bboxes = []
for x, pts_cat, img_meta, bbox_2d, img, calib in zip(
feats, points_cat, img_metas, bboxes_2d, imgs, calibs):
bbox_2d = self.extract_bboxes_2d(
img, img_metas, train=False, bboxes_2d=bbox_2d, **kwargs)
seeds_3d, seed_3d_features, seed_indices = x
img_features, masks = self.fusion_layer(img, bbox_2d, seeds_3d,
img_metas, calib)
inds = sample_valid_seeds(masks, self.num_sampled_seed)
batch_size, img_feat_size = img_features.shape[:2]
pts_feat_size = seed_3d_features.shape[1]
inds_img = inds.view(batch_size, 1,
-1).expand(-1, img_feat_size, -1)
img_features = img_features.gather(-1, inds_img)
inds = inds % inds.shape[1]
inds_seed_xyz = inds.view(batch_size, -1, 1).expand(-1, -1, 3)
seeds_3d = seeds_3d.gather(1, inds_seed_xyz)
inds_seed_feats = inds.view(batch_size, 1,
-1).expand(-1, pts_feat_size, -1)
seed_3d_features = seed_3d_features.gather(-1, inds_seed_feats)
seed_indices = seed_indices.gather(1, inds)
img_features = self.img_mlp(img_features)
fused_features = torch.cat([seed_3d_features, img_features], dim=1)
feat_dict = dict(
seed_points=seeds_3d,
seed_features=fused_features,
seed_indices=seed_indices)
bbox_preds = self.pts_bbox_head_joint(feat_dict,
self.test_cfg.pts.sample_mod)
bbox_list = self.pts_bbox_head_joint.get_bboxes(
pts_cat, bbox_preds, img_metas, rescale=rescale)
bbox_list = [
dict(boxes_3d=bboxes, scores_3d=scores, labels_3d=labels)
for bboxes, scores, labels in bbox_list
]
aug_bboxes.append(bbox_list[0])
# after merging, bboxes will be rescaled to the original image size
merged_bboxes = merge_aug_bboxes_3d(aug_bboxes, img_metas,
self.bbox_head.test_cfg)
return [merged_bboxes]
================================================
FILE: mmdet3d/models/detectors/mvx_faster_rcnn.py
================================================
import torch
from mmcv.runner import force_fp32
from torch.nn import functional as F
from mmdet.models import DETECTORS
from .mvx_two_stage import MVXTwoStageDetector
@DETECTORS.register_module()
class MVXFasterRCNN(MVXTwoStageDetector):
"""Multi-modality VoxelNet using Faster R-CNN."""
def __init__(self, **kwargs):
super(MVXFasterRCNN, self).__init__(**kwargs)
@DETECTORS.register_module()
class DynamicMVXFasterRCNN(MVXTwoStageDetector):
"""Multi-modality VoxelNet using Faster R-CNN and dynamic voxelization."""
def __init__(self, **kwargs):
super(DynamicMVXFasterRCNN, self).__init__(**kwargs)
@torch.no_grad()
@force_fp32()
def voxelize(self, points):
"""Apply dynamic voxelization to points.
Args:
points (list[torch.Tensor]): Points of each sample.
Returns:
tuple[torch.Tensor]: Concatenated points and coordinates.
"""
coors = []
# dynamic voxelization only provides a coors mapping
for res in points:
res_coors = self.pts_voxel_layer(res)
coors.append(res_coors)
points = torch.cat(points, dim=0)
coors_batch = []
for i, coor in enumerate(coors):
coor_pad = F.pad(coor, (1, 0), mode='constant', value=i)
coors_batch.append(coor_pad)
coors_batch = torch.cat(coors_batch, dim=0)
return points, coors_batch
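# A small illustration of the batch-index padding performed above: each
# per-sample (z, y, x) coordinate tensor gets one extra leading column holding
# the sample index, so all samples can share one concatenated tensor. The
# coordinate values below are made up.
import torch
from torch.nn import functional as F

coors = [torch.tensor([[0, 4, 7], [1, 4, 8], [0, 5, 7]]),   # sample 0: 3 voxels
         torch.tensor([[2, 1, 1], [0, 9, 3]])]              # sample 1: 2 voxels
coors_batch = torch.cat(
    [F.pad(c, (1, 0), mode='constant', value=i) for i, c in enumerate(coors)],
    dim=0)
# coors_batch[:, 0] now holds the sample index: tensor([0, 0, 0, 1, 1])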
def extract_pts_feat(self, points, img_feats, img_metas):
"""Extract point features."""
if not self.with_pts_bbox:
return None
voxels, coors = self.voxelize(points)
voxel_features, feature_coors = self.pts_voxel_encoder(
voxels, coors, points, img_feats, img_metas)
batch_size = coors[-1, 0] + 1
x = self.pts_middle_encoder(voxel_features, feature_coors, batch_size)
x = self.pts_backbone(x)
if self.with_pts_neck:
x = self.pts_neck(x)
return x
================================================
FILE: mmdet3d/models/detectors/mvx_two_stage.py
================================================
import mmcv
import torch
from mmcv.parallel import DataContainer as DC
from mmcv.runner import force_fp32
from os import path as osp
from torch import nn as nn
from torch.nn import functional as F
import time
from mmdet3d.core import (Box3DMode, Coord3DMode, bbox3d2result,
merge_aug_bboxes_3d, show_result)
from mmdet3d.ops import Voxelization
from mmdet.core import multi_apply
from mmdet.models import DETECTORS
from .. import builder
from .base import Base3DDetector
@DETECTORS.register_module()
class MVXTwoStageDetector(Base3DDetector):
"""Base class of Multi-modality VoxelNet."""
def __init__(self,
freeze_img=True,
freeze_img_head=False,
pts_voxel_layer=None,
pts_voxel_encoder=None,
pts_middle_encoder=None,
pts_fusion_layer=None,
img_backbone=None,
pts_backbone=None,
img_neck=None,
pts_neck=None,
pts_bbox_head=None,
img_roi_head=None,
img_rpn_head=None,
train_cfg=None,
test_cfg=None,
pretrained=None,
):
super(MVXTwoStageDetector, self).__init__()
self.freeze_img = freeze_img
self.freeze_img_head = freeze_img_head
if pts_voxel_layer:
self.pts_voxel_layer = Voxelization(**pts_voxel_layer)
if pts_voxel_encoder:
self.pts_voxel_encoder = builder.build_voxel_encoder(
pts_voxel_encoder)
if pts_middle_encoder:
self.pts_middle_encoder = builder.build_middle_encoder(
pts_middle_encoder)
if pts_backbone:
self.pts_backbone = builder.build_backbone(pts_backbone)
if pts_fusion_layer:
self.pts_fusion_layer = builder.build_fusion_layer(
pts_fusion_layer)
if pts_neck is not None:
self.pts_neck = builder.build_neck(pts_neck)
if pts_bbox_head:
pts_train_cfg = train_cfg.pts if train_cfg else None
pts_bbox_head.update(train_cfg=pts_train_cfg)
pts_test_cfg = test_cfg.pts if test_cfg else None
pts_bbox_head.update(test_cfg=pts_test_cfg)
self.pts_bbox_head = builder.build_head(pts_bbox_head)
if img_backbone:
self.img_backbone = builder.build_backbone(img_backbone)
if img_neck is not None:
self.img_neck = builder.build_neck(img_neck)
if img_rpn_head is not None:
self.img_rpn_head = builder.build_head(img_rpn_head)
if img_roi_head is not None:
self.img_roi_head = builder.build_head(img_roi_head)
self.train_cfg = train_cfg
self.test_cfg = test_cfg
self.init_weights(pretrained=pretrained)
def init_weights(self, pretrained=None):
"""Initialize model weights."""
super(MVXTwoStageDetector, self).init_weights(pretrained)
if pretrained is None:
img_pretrained = None
pts_pretrained = None
elif isinstance(pretrained, dict):
img_pretrained = pretrained.get('img', None)
pts_pretrained = pretrained.get('pts', None)
else:
raise ValueError(
f'pretrained should be a dict, got {type(pretrained)}')
if self.with_img_backbone:
self.img_backbone.init_weights(pretrained=img_pretrained)
if self.with_pts_backbone:
self.pts_backbone.init_weights(pretrained=pts_pretrained)
if self.with_img_neck:
if isinstance(self.img_neck, nn.Sequential):
for m in self.img_neck:
m.init_weights()
else:
self.img_neck.init_weights()
if self.with_img_roi_head:
self.img_roi_head.init_weights(img_pretrained)
if self.with_img_rpn:
self.img_rpn_head.init_weights()
if self.with_pts_bbox:
self.pts_bbox_head.init_weights()
if self.with_pts_roi_head:
self.pts_roi_head.init_weights()
if self.freeze_img:
if self.with_img_backbone:
for param in self.img_backbone.parameters():
param.requires_grad = False
if self.with_img_neck:
for param in self.img_neck.parameters():
param.requires_grad = False
@property
def with_pts_roi_head(self):
"""bool: Whether the detector has a roi head in pts branch."""
return hasattr(self,
'pts_roi_head') and self.pts_roi_head is not None
@property
def with_img_shared_head(self):
"""bool: Whether the detector has a shared head in image branch."""
return hasattr(self,
'img_shared_head') and self.img_shared_head is not None
@property
def with_pts_bbox(self):
"""bool: Whether the detector has a 3D box head."""
return hasattr(self,
'pts_bbox_head') and self.pts_bbox_head is not None
@property
def with_img_bbox(self):
"""bool: Whether the detector has a 2D image box head."""
return hasattr(self,
'img_bbox_head') and self.img_bbox_head is not None
@property
def with_img_backbone(self):
"""bool: Whether the detector has a 2D image backbone."""
return hasattr(self, 'img_backbone') and self.img_backbone is not None
@property
def with_pts_backbone(self):
"""bool: Whether the detector has a 3D backbone."""
return hasattr(self, 'pts_backbone') and self.pts_backbone is not None
@property
def with_fusion(self):
"""bool: Whether the detector has a fusion layer."""
return hasattr(self,
'pts_fusion_layer') and self.pts_fusion_layer is not None
@property
def with_img_neck(self):
"""bool: Whether the detector has a neck in image branch."""
return hasattr(self, 'img_neck') and self.img_neck is not None
@property
def with_pts_neck(self):
"""bool: Whether the detector has a neck in 3D detector branch."""
return hasattr(self, 'pts_neck') and self.pts_neck is not None
@property
def with_img_rpn(self):
"""bool: Whether the detector has a 2D RPN in image detector branch."""
return hasattr(self, 'img_rpn_head') and self.img_rpn_head is not None
@property
def with_img_roi_head(self):
"""bool: Whether the detector has a RoI Head in image branch."""
return hasattr(self, 'img_roi_head') and self.img_roi_head is not None
@property
def with_voxel_encoder(self):
"""bool: Whether the detector has a voxel encoder."""
return hasattr(self,
'voxel_encoder') and self.voxel_encoder is not None
@property
def with_middle_encoder(self):
"""bool: Whether the detector has a middle encoder."""
return hasattr(self,
'middle_encoder') and self.middle_encoder is not None
def extract_img_feat(self, img, img_metas):
"""Extract features of images."""
if self.with_img_backbone and img is not None:
input_shape = img.shape[-2:]
# update real input shape of each single img
for img_meta in img_metas:
img_meta.update(input_shape=input_shape)
if img.dim() == 5 and img.size(0) == 1:
img.squeeze_(0)
elif img.dim() == 5 and img.size(0) > 1:
B, N, C, H, W = img.size()
img = img.view(B * N, C, H, W)
img_feats = self.img_backbone(img.float())
else:
return None
if self.with_img_neck:
img_feats = self.img_neck(img_feats)
return img_feats
def extract_pts_feat(self, pts, img_feats, img_metas):
"""Extract features of points."""
if not self.with_pts_bbox:
return None
voxels, num_points, coors = self.voxelize(pts)
voxel_features = self.pts_voxel_encoder(voxels, num_points, coors)
batch_size = coors[-1, 0] + 1
x = self.pts_middle_encoder(voxel_features, coors, batch_size)
x = self.pts_backbone(x)
if self.with_pts_neck:
x = self.pts_neck(x)
return x
def extract_feat(self, points, img, img_metas):
"""Extract features from images and points."""
img_feats = self.extract_img_feat(img, img_metas)
pts_feats = self.extract_pts_feat(points, img_feats, img_metas)
return (img_feats, pts_feats)
@torch.no_grad()
@force_fp32()
def voxelize(self, points):
"""Apply dynamic voxelization to points.
Args:
points (list[torch.Tensor]): Points of each sample.
Returns:
tuple[torch.Tensor]: Concatenated points, number of points
per voxel, and coordinates.
"""
voxels, coors, num_points = [], [], []
for res in points:
res_voxels, res_coors, res_num_points = self.pts_voxel_layer(res)
voxels.append(res_voxels)
coors.append(res_coors)
num_points.append(res_num_points)
voxels = torch.cat(voxels, dim=0)
num_points = torch.cat(num_points, dim=0)
coors_batch = []
for i, coor in enumerate(coors):
coor_pad = F.pad(coor, (1, 0), mode='constant', value=i)
coors_batch.append(coor_pad)
coors_batch = torch.cat(coors_batch, dim=0)
return voxels, num_points, coors_batch
def forward_train(self,
points=None,
img_metas=None,
gt_bboxes_3d=None,
gt_labels_3d=None,
gt_labels=None,
gt_bboxes=None,
img=None,
proposals=None,
gt_bboxes_ignore=None):
"""Forward training function.
Args:
points (list[torch.Tensor], optional): Points of each sample.
Defaults to None.
img_metas (list[dict], optional): Meta information of each sample.
Defaults to None.
gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`], optional):
Ground truth 3D boxes. Defaults to None.
gt_labels_3d (list[torch.Tensor], optional): Ground truth labels
of 3D boxes. Defaults to None.
gt_labels (list[torch.Tensor], optional): Ground truth labels
of 2D boxes in images. Defaults to None.
gt_bboxes (list[torch.Tensor], optional): Ground truth 2D boxes in
images. Defaults to None.
img (torch.Tensor, optional): Images of each sample with shape
(N, C, H, W). Defaults to None.
proposals (list[torch.Tensor], optional): Predicted proposals
used for training Fast RCNN. Defaults to None.
gt_bboxes_ignore (list[torch.Tensor], optional): Ground truth
2D boxes in images to be ignored. Defaults to None.
Returns:
dict: Losses of different branches.
"""
img_feats, pts_feats = self.extract_feat(
points, img=img, img_metas=img_metas)
losses = dict()
if pts_feats:
losses_pts = self.forward_pts_train(pts_feats, img_feats, gt_bboxes_3d,
gt_labels_3d, img_metas,
gt_bboxes_ignore)
losses.update(losses_pts)
if img_feats:
losses_img = self.forward_img_train(
img_feats,
img_metas=img_metas,
gt_bboxes=gt_bboxes,
gt_labels=gt_labels,
gt_bboxes_ignore=gt_bboxes_ignore,
proposals=proposals)
losses.update(losses_img)
return losses
def forward_pts_train(self,
pts_feats,
img_feats,
gt_bboxes_3d,
gt_labels_3d,
img_metas,
gt_bboxes_ignore=None):
"""Forward function for point cloud branch.
Args:
pts_feats (list[torch.Tensor]): Features of point cloud branch
gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth
boxes for each sample.
gt_labels_3d (list[torch.Tensor]): Ground truth labels for
boxes of each sample.
img_metas (list[dict]): Meta information of samples.
gt_bboxes_ignore (list[torch.Tensor], optional): Ground truth
boxes to be ignored. Defaults to None.
Returns:
dict: Losses of each branch.
"""
outs = self.pts_bbox_head(pts_feats, img_feats, img_metas)
loss_inputs = [gt_bboxes_3d, gt_labels_3d, outs]
losses = self.pts_bbox_head.loss(*loss_inputs)
return losses
def forward_img_train(self,
x,
img_metas,
gt_bboxes,
gt_labels,
gt_bboxes_ignore=None,
proposals=None,
**kwargs):
"""Forward function for image branch.
This function works similarly to the forward function of Faster R-CNN.
Args:
x (list[torch.Tensor]): Image features of shape (B, C, H, W)
of multiple levels.
img_metas (list[dict]): Meta information of images.
gt_bboxes (list[torch.Tensor]): Ground truth boxes of each image
sample.
gt_labels (list[torch.Tensor]): Ground truth labels of boxes.
gt_bboxes_ignore (list[torch.Tensor], optional): Ground truth
boxes to be ignored. Defaults to None.
proposals (list[torch.Tensor], optional): Proposals of each sample.
Defaults to None.
Returns:
dict: Losses of each branch.
"""
losses = dict()
# RPN forward and loss
if self.with_img_rpn:
rpn_outs = self.img_rpn_head(x)
rpn_loss_inputs = rpn_outs + (gt_bboxes, img_metas,
self.train_cfg.img_rpn)
rpn_losses = self.img_rpn_head.loss(
*rpn_loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore)
losses.update(rpn_losses)
proposal_cfg = self.train_cfg.get('img_rpn_proposal',
self.test_cfg.img_rpn)
proposal_inputs = rpn_outs + (img_metas, proposal_cfg)
proposal_list = self.img_rpn_head.get_bboxes(*proposal_inputs)
else:
proposal_list = proposals
# bbox head forward and loss
if self.with_img_bbox:
# bbox head forward and loss
img_roi_losses = self.img_roi_head.forward_train(
x, img_metas, proposal_list, gt_bboxes, gt_labels,
gt_bboxes_ignore, **kwargs)
losses.update(img_roi_losses)
return losses
def simple_test_img(self, x, img_metas, proposals=None, rescale=False):
"""Test without augmentation."""
if proposals is None:
proposal_list = self.simple_test_rpn(x, img_metas,
self.test_cfg.img_rpn)
else:
proposal_list = proposals
return self.img_roi_head.simple_test(
x, proposal_list, img_metas, rescale=rescale)
def simple_test_rpn(self, x, img_metas, rpn_test_cfg):
"""RPN test function."""
rpn_outs = self.img_rpn_head(x)
proposal_inputs = rpn_outs + (img_metas, rpn_test_cfg)
proposal_list = self.img_rpn_head.get_bboxes(*proposal_inputs)
return proposal_list
def simple_test_pts(self, x, x_img, img_metas, rescale=False):
"""Test function of point cloud branch."""
outs = self.pts_bbox_head(x, x_img, img_metas)
bbox_list = self.pts_bbox_head.get_bboxes(
outs, img_metas, rescale=rescale)
bbox_results = [
bbox3d2result(bboxes, scores, labels)
for bboxes, scores, labels in bbox_list
]
return bbox_results
def simple_test(self, points, img_metas, img=None, rescale=False):
"""Test function without augmentaiton."""
img_feats, pts_feats = self.extract_feat(
points, img=img, img_metas=img_metas)
bbox_list = [dict() for i in range(len(img_metas))]
if pts_feats and self.with_pts_bbox:
bbox_pts = self.simple_test_pts(
pts_feats, img_feats, img_metas, rescale=rescale)
for result_dict, pts_bbox in zip(bbox_list, bbox_pts):
result_dict['pts_bbox'] = pts_bbox
if img_feats and self.with_img_bbox:
bbox_img = self.simple_test_img(
img_feats, img_metas, rescale=rescale)
for result_dict, img_bbox in zip(bbox_list, bbox_img):
result_dict['img_bbox'] = img_bbox
return bbox_list
def aug_test(self, points, img_metas, imgs=None, rescale=False):
"""Test function with augmentaiton."""
img_feats, pts_feats = self.extract_feats(points, img_metas, imgs)
bbox_list = dict()
if pts_feats and self.with_pts_bbox:
bbox_pts = self.aug_test_pts(pts_feats, img_metas, rescale)
bbox_list.update(pts_bbox=bbox_pts)
return [bbox_list]
def extract_feats(self, points, img_metas, imgs=None):
"""Extract point and image features of multiple samples."""
if imgs is None:
imgs = [None] * len(img_metas)
img_feats, pts_feats = multi_apply(self.extract_feat, points, imgs,
img_metas)
return img_feats, pts_feats
def aug_test_pts(self, feats, img_metas, rescale=False):
"""Test function of point cloud branch with augmentaiton."""
# only support aug_test for one sample
aug_bboxes = []
for x, img_meta in zip(feats, img_metas):
outs = self.pts_bbox_head(x)
bbox_list = self.pts_bbox_head.get_bboxes(
*outs, img_meta, rescale=rescale)
bbox_list = [
dict(boxes_3d=bboxes, scores_3d=scores, labels_3d=labels)
for bboxes, scores, labels in bbox_list
]
aug_bboxes.append(bbox_list[0])
# after merging, bboxes will be rescaled to the original image size
merged_bboxes = merge_aug_bboxes_3d(aug_bboxes, img_metas,
self.pts_bbox_head.test_cfg)
return merged_bboxes
def show_results(self, data, result, out_dir):
"""Results visualization.
Args:
data (dict): Input points and the information of the sample.
result (dict): Prediction results.
out_dir (str): Output directory of visualization result.
"""
for batch_id in range(len(result)):
if isinstance(data['points'][0], DC):
points = data['points'][0]._data[0][batch_id].numpy()
elif mmcv.is_list_of(data['points'][0], torch.Tensor):
points = data['points'][0][batch_id]
else:
ValueError(f"Unsupported data type {type(data['points'][0])} "
f'for visualization!')
if isinstance(data['img_metas'][0], DC):
pts_filename = data['img_metas'][0]._data[0][batch_id][
'pts_filename']
box_mode_3d = data['img_metas'][0]._data[0][batch_id][
'box_mode_3d']
elif mmcv.is_list_of(data['img_metas'][0], dict):
pts_filename = data['img_metas'][0][batch_id]['pts_filename']
box_mode_3d = data['img_metas'][0][batch_id]['box_mode_3d']
else:
raise ValueError(
f"Unsupported data type {type(data['img_metas'][0])} "
f'for visualization!')
file_name = osp.split(pts_filename)[-1].split('.')[0]
assert out_dir is not None, 'Expect out_dir, got none.'
inds = result[batch_id]['pts_bbox']['scores_3d'] > 0.1
pred_bboxes = result[batch_id]['pts_bbox']['boxes_3d'][inds]
# for now we convert points and bbox into depth mode
if (box_mode_3d == Box3DMode.CAM) or (box_mode_3d
== Box3DMode.LIDAR):
points = Coord3DMode.convert_point(points, Coord3DMode.LIDAR,
Coord3DMode.DEPTH)
pred_bboxes = Box3DMode.convert(pred_bboxes, box_mode_3d,
Box3DMode.DEPTH)
elif box_mode_3d != Box3DMode.DEPTH:
raise ValueError(
f'Unsupported box_mode_3d {box_mode_3d} for conversion!')
pred_bboxes = pred_bboxes.tensor.cpu().numpy()
show_result(points, None, pred_bboxes, out_dir, file_name)
================================================
FILE: mmdet3d/models/detectors/parta2.py
================================================
import torch
from torch.nn import functional as F
from mmdet3d.ops import Voxelization
from mmdet.models import DETECTORS
from .. import builder
from .two_stage import TwoStage3DDetector
@DETECTORS.register_module()
class PartA2(TwoStage3DDetector):
r"""Part-A2 detector.
Please refer to the `paper <https://arxiv.org/abs/1907.03670>`_
"""
def __init__(self,
voxel_layer,
voxel_encoder,
middle_encoder,
backbone,
neck=None,
rpn_head=None,
roi_head=None,
train_cfg=None,
test_cfg=None,
pretrained=None):
super(PartA2, self).__init__(
backbone=backbone,
neck=neck,
rpn_head=rpn_head,
roi_head=roi_head,
train_cfg=train_cfg,
test_cfg=test_cfg,
pretrained=pretrained,
)
self.voxel_layer = Voxelization(**voxel_layer)
self.voxel_encoder = builder.build_voxel_encoder(voxel_encoder)
self.middle_encoder = builder.build_middle_encoder(middle_encoder)
def extract_feat(self, points, img_metas):
"""Extract features from points."""
voxel_dict = self.voxelize(points)
voxel_features = self.voxel_encoder(voxel_dict['voxels'],
voxel_dict['num_points'],
voxel_dict['coors'])
batch_size = voxel_dict['coors'][-1, 0].item() + 1
feats_dict = self.middle_encoder(voxel_features, voxel_dict['coors'],
batch_size)
x = self.backbone(feats_dict['spatial_features'])
if self.with_neck:
neck_feats = self.neck(x)
feats_dict.update({'neck_feats': neck_feats})
return feats_dict, voxel_dict
@torch.no_grad()
def voxelize(self, points):
"""Apply hard voxelization to points."""
voxels, coors, num_points, voxel_centers = [], [], [], []
for res in points:
res_voxels, res_coors, res_num_points = self.voxel_layer(res)
res_voxel_centers = (
res_coors[:, [2, 1, 0]] + 0.5) * res_voxels.new_tensor(
self.voxel_layer.voxel_size) + res_voxels.new_tensor(
self.voxel_layer.point_cloud_range[0:3])
voxels.append(res_voxels)
coors.append(res_coors)
num_points.append(res_num_points)
voxel_centers.append(res_voxel_centers)
voxels = torch.cat(voxels, dim=0)
num_points = torch.cat(num_points, dim=0)
voxel_centers = torch.cat(voxel_centers, dim=0)
coors_batch = []
for i, coor in enumerate(coors):
coor_pad = F.pad(coor, (1, 0), mode='constant', value=i)
coors_batch.append(coor_pad)
coors_batch = torch.cat(coors_batch, dim=0)
voxel_dict = dict(
voxels=voxels,
num_points=num_points,
coors=coors_batch,
voxel_centers=voxel_centers)
return voxel_dict
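# A toy version of the voxel-centre arithmetic above: coordinates come back in
# (z, y, x) order, so they are reordered to (x, y, z), shifted to the cell
# centre, scaled by the voxel size and offset by the point-cloud range origin.
# The voxel size and range values below are illustrative, not the real config.
import torch

voxel_size = torch.tensor([0.05, 0.05, 0.1])        # (x, y, z) voxel extents
pc_range_min = torch.tensor([0.0, -40.0, -3.0])     # (x, y, z) lower bound
coors = torch.tensor([[4, 100, 200],                # (z, y, x) integer coordinates
                      [0, 0, 0]])
centers = (coors[:, [2, 1, 0]].float() + 0.5) * voxel_size + pc_range_min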
def forward_train(self,
points,
img_metas,
gt_bboxes_3d,
gt_labels_3d,
gt_bboxes_ignore=None,
proposals=None):
"""Training forward function.
Args:
points (list[torch.Tensor]): Point cloud of each sample.
img_metas (list[dict]): Meta information of each sample
gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth
boxes for each sample.
gt_labels_3d (list[torch.Tensor]): Ground truth labels for
boxes of each sample.
gt_bboxes_ignore (list[torch.Tensor], optional): Ground truth
boxes to be ignored. Defaults to None.
Returns:
dict: Losses of each branch.
"""
feats_dict, voxels_dict = self.extract_feat(points, img_metas)
losses = dict()
if self.with_rpn:
rpn_outs = self.rpn_head(feats_dict['neck_feats'])
rpn_loss_inputs = rpn_outs + (gt_bboxes_3d, gt_labels_3d,
img_metas)
rpn_losses = self.rpn_head.loss(
*rpn_loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore)
losses.update(rpn_losses)
proposal_cfg = self.train_cfg.get('rpn_proposal',
self.test_cfg.rpn)
proposal_inputs = rpn_outs + (img_metas, proposal_cfg)
proposal_list = self.rpn_head.get_bboxes(*proposal_inputs)
else:
proposal_list = proposals
roi_losses = self.roi_head.forward_train(feats_dict, voxels_dict,
img_metas, proposal_list,
gt_bboxes_3d, gt_labels_3d)
losses.update(roi_losses)
return losses
def simple_test(self, points, img_metas, proposals=None, rescale=False):
"""Test function without augmentaiton."""
feats_dict, voxels_dict = self.extract_feat(points, img_metas)
if self.with_rpn:
rpn_outs = self.rpn_head(feats_dict['neck_feats'])
proposal_cfg = self.test_cfg.rpn
bbox_inputs = rpn_outs + (img_metas, proposal_cfg)
proposal_list = self.rpn_head.get_bboxes(*bbox_inputs)
else:
proposal_list = proposals
return self.roi_head.simple_test(feats_dict, voxels_dict, img_metas,
proposal_list)
================================================
FILE: mmdet3d/models/detectors/single_stage.py
================================================
from torch import nn as nn
from mmdet.models import DETECTORS, build_backbone, build_head, build_neck
from .base import Base3DDetector
@DETECTORS.register_module()
class SingleStage3DDetector(Base3DDetector):
"""SingleStage3DDetector.
This class serves as a base class for single-stage 3D detectors.
Args:
backbone (dict): Config dict of detector's backbone.
neck (dict, optional): Config dict of neck. Defaults to None.
bbox_head (dict, optional): Config dict of box head. Defaults to None.
train_cfg (dict, optional): Config dict of training hyper-parameters.
Defaults to None.
test_cfg (dict, optional): Config dict of test hyper-parameters.
Defaults to None.
pretrained (str, optional): Path of pretrained models.
Defaults to None.
"""
def __init__(self,
backbone,
neck=None,
bbox_head=None,
train_cfg=None,
test_cfg=None,
pretrained=None):
super(SingleStage3DDetector, self).__init__()
self.backbone = build_backbone(backbone)
if neck is not None:
self.neck = build_neck(neck)
bbox_head.update(train_cfg=train_cfg)
bbox_head.update(test_cfg=test_cfg)
self.bbox_head = build_head(bbox_head)
self.train_cfg = train_cfg
self.test_cfg = test_cfg
self.init_weights(pretrained=pretrained)
def init_weights(self, pretrained=None):
"""Initialize weights of detector."""
super(SingleStage3DDetector, self).init_weights(pretrained)
self.backbone.init_weights(pretrained=pretrained)
if self.with_neck:
if isinstance(self.neck, nn.Sequential):
for m in self.neck:
m.init_weights()
else:
self.neck.init_weights()
self.bbox_head.init_weights()
def extract_feat(self, points, img_metas=None):
"""Directly extract features from the backbone+neck.
Args:
points (torch.Tensor): Input points.
"""
x = self.backbone(points)
if self.with_neck:
x = self.neck(x)
return x
def extract_feats(self, points, img_metas):
"""Extract features of multiple samples."""
return [
self.extract_feat(pts, img_meta)
for pts, img_meta in zip(points, img_metas)
]
================================================
FILE: mmdet3d/models/detectors/sparsefusion.py
================================================
import mmcv
import torch
from mmcv.parallel import DataContainer as DC
from mmcv.runner import force_fp32
from os import path as osp
from torch import nn as nn
from torch.nn import functional as F
import numpy as np
import time
from mmdet3d.core import (Box3DMode, Coord3DMode, bbox3d2result,
merge_aug_bboxes_3d, show_result)
from mmdet3d.ops import Voxelization
from mmdet.core import multi_apply
from mmdet.models import DETECTORS
from .. import builder
from .mvx_two_stage import MVXTwoStageDetector
@DETECTORS.register_module()
class SparseFusionDetector(MVXTwoStageDetector):
"""Base class of Multi-modality VoxelNet."""
def __init__(self, **kwargs):
super(SparseFusionDetector, self).__init__(**kwargs)
self.freeze_img = kwargs.get('freeze_img', True)
self.freeze_img_head = kwargs.get('freeze_img_head', False)
self.init_weights(pretrained=kwargs.get('pretrained', None))
def init_weights(self, pretrained=None):
"""Initialize model weights."""
super(SparseFusionDetector, self).init_weights(pretrained)
if self.freeze_img:
if self.with_img_backbone:
for param in self.img_backbone.parameters():
param.requires_grad = False
if self.with_img_neck:
for param in self.img_neck.parameters():
param.requires_grad = False
if self.freeze_img_head:
for param in self.pts_bbox_head.img_transformer.parameters():
param.requires_grad = False
for param in self.pts_bbox_head.shared_conv_img.parameters():
param.requires_grad = False
for param in self.pts_bbox_head.img_heatmap_head.parameters():
param.requires_grad = False
def extract_img_feat(self, img, img_metas):
"""Extract features of images."""
if self.with_img_backbone and img is not None:
input_shape = img.shape[-2:]
# update real input shape of each single img
for img_meta in img_metas:
img_meta.update(input_shape=input_shape)
if img.dim() == 5 and img.size(0) == 1:
img.squeeze_(0)
elif img.dim() == 5 and img.size(0) > 1:
B, N, C, H, W = img.size()
img = img.view(B * N, C, H, W)
img_feats = self.img_backbone(img.float())
else:
return None
if self.with_img_neck:
img_feats = self.img_neck(img_feats)
return img_feats
def extract_voxel_heights(self, voxels, coors):
batch_size = coors[-1, 0].item() + 1
grid_size = self.test_cfg['pts']['grid_size']
out_size_factor = self.test_cfg['pts']['out_size_factor']
height_num = grid_size[2]
x_num = grid_size[0] // out_size_factor
y_num = grid_size[1] // out_size_factor
voxels_ = voxels[:, :, 2].clone()
voxels_[voxels_==0] = 100
min_voxel = torch.min(voxels_, dim=-1)[0]
voxels_[voxels_==100] = -200
max_voxel = torch.max(voxels_, dim=-1)[0]
min_voxel_height = torch.zeros((batch_size, y_num, x_num, out_size_factor*out_size_factor)).to(voxels.device) + 100
max_voxel_height = torch.zeros((batch_size, y_num, x_num, out_size_factor*out_size_factor)).to(voxels.device) - 200
batch_ids = coors[:, 0].long()
height_ids = coors[:, 1].long()
y_ids = (coors[:, 2] // out_size_factor).long()
x_ids = (coors[:, 3] // out_size_factor).long()
y_offsets = (coors[:, 2] % out_size_factor).long()
x_offsets = (coors[:, 3] % out_size_factor).long()
for hid in range(height_num):
height_mask = height_ids == hid
batch_mask = batch_ids[height_mask]
y_ids_mask = y_ids[height_mask]
x_ids_mask = x_ids[height_mask]
y_offsets_mask = y_offsets[height_mask]
x_offsets_mask = x_offsets[height_mask]
min_voxel_height[batch_mask, y_ids_mask, x_ids_mask, y_offsets_mask * out_size_factor + x_offsets_mask] = torch.minimum(min_voxel_height[batch_mask, y_ids_mask, x_ids_mask, y_offsets_mask * out_size_factor + x_offsets_mask], min_voxel[height_mask])
max_voxel_height[batch_mask, y_ids_mask, x_ids_mask, y_offsets_mask * out_size_factor + x_offsets_mask] = torch.maximum(max_voxel_height[batch_mask, y_ids_mask, x_ids_mask, y_offsets_mask * out_size_factor + x_offsets_mask], max_voxel[height_mask])
min_voxel_height = torch.min(min_voxel_height, dim=-1)[0]
max_voxel_height = torch.max(max_voxel_height, dim=-1)[0]
return min_voxel_height, max_voxel_height
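# The method above keeps, for every output-resolution BEV cell, the lowest and
# highest point height among all voxels falling into that cell, using 100 /
# -200 as "empty" sentinels. A toy reduction over a flattened grid, with
# made-up heights and cell indices, looks like this:
import torch

voxel_min_z = torch.tensor([-1.2, -0.8, -1.5, -0.3])   # per-voxel lowest z
voxel_max_z = torch.tensor([0.4, 1.1, -0.2, 2.0])      # per-voxel highest z
cell_ids = torch.tensor([0, 0, 3, 1])                  # flattened y * x_num + x index

num_cells = 4
cell_min = torch.full((num_cells,), 100.0)             # same sentinels as above
cell_max = torch.full((num_cells,), -200.0)
for z_min, z_max, cid in zip(voxel_min_z, voxel_max_z, cell_ids):
    cell_min[cid] = torch.minimum(cell_min[cid], z_min)
    cell_max[cid] = torch.maximum(cell_max[cid], z_max)
# cell_min / cell_max hold per-cell height bounds; empty cells keep the sentinels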
def extract_pts_feat(self, pts, img_feats, img_metas):
"""Extract features of points."""
if not self.with_pts_bbox:
return None
voxels, num_points, coors, min_voxel_height, max_voxel_height = self.voxelize(pts)
voxel_features = self.pts_voxel_encoder(voxels, num_points, coors)
batch_size = coors[-1, 0] + 1
x = self.pts_middle_encoder(voxel_features, coors, batch_size)
x = self.pts_backbone(x)
if self.with_pts_neck:
x = self.pts_neck(x)
min_voxel_height = min_voxel_height[:, None]
max_voxel_height = max_voxel_height[:, None]
x[0] = torch.cat([x[0], min_voxel_height, max_voxel_height], dim=1)
return x
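# extract_pts_feat above appends the per-cell minimum and maximum heights as
# two extra channels of the first BEV feature map. With illustrative shapes:
import torch

B, C, H, W = 2, 256, 180, 180                        # toy BEV feature shape
bev = torch.randn(B, C, H, W)
min_h = torch.randn(B, H, W)
max_h = torch.randn(B, H, W)
bev_aug = torch.cat([bev, min_h[:, None], max_h[:, None]], dim=1)   # (B, C + 2, H, W)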
@torch.no_grad()
@force_fp32()
def voxelize(self, points):
"""Apply dynamic voxelization to points.
Args:
points (list[torch.Tensor]): Points of each sample.
Returns:
tuple[torch.Tensor]: Concatenated points, number of points
per voxel, and coordinates.
"""
voxels, coors, num_points = [], [], []
for res in points:
res_voxels, res_coors, res_num_points = self.pts_voxel_layer(res)
voxels.append(res_voxels)
coors.append(res_coors)
num_points.append(res_num_points)
voxels = torch.cat(voxels, dim=0)
num_points = torch.cat(num_points, dim=0)
coors_batch = []
for i, coor in enumerate(coors):
coor_pad = F.pad(coor, (1, 0), mode='constant', value=i)
coors_batch.append(coor_pad)
coors_batch = torch.cat(coors_batch, dim=0)
min_voxel_height, max_voxel_height = self.extract_voxel_heights(voxels, coors_batch)
return voxels, num_points, coors_batch, min_voxel_height, max_voxel_height
def forward_train(self,
points=None,
img_metas=None,
gt_bboxes_3d=None,
gt_labels_3d=None,
gt_labels=None,
gt_bboxes=None,
gt_pts_centers_view=None,
gt_img_centers_view=None,
gt_bboxes_cam_view=None,
img=None,
sparse_depth=None,
gt_visible_3d=None,
gt_bboxes_lidar_view=None,
proposals=None,
gt_bboxes_ignore=None):
"""Forward training function.
Args:
points (list[torch.Tensor], optional): Points of each sample.
Defaults to None.
img_metas (list[dict], optional): Meta information of each sample.
Defaults to None.
gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`], optional):
Ground truth 3D boxes. Defaults to None.
gt_labels_3d (list[torch.Tensor], optional): Ground truth labels
of 3D boxes. Defaults to None.
gt_labels (list[torch.Tensor], optional): Ground truth labels
of 2D boxes in images. Defaults to None.
gt_bboxes (list[torch.Tensor], optional): Ground truth 2D boxes in
images. Defaults to None.
img (torch.Tensor, optional): Images of each sample with shape
(N, C, H, W). Defaults to None.
proposals (list[torch.Tensor], optional): Predicted proposals
used for training Fast RCNN. Defaults to None.
gt_bboxes_ignore (list[torch.Tensor], optional): Ground truth
2D boxes in images to be ignored. Defaults to None.
Returns:
dict: Losses of different branches.
"""
img_feats, pts_feats = self.extract_feat(
points, img=img, img_metas=img_metas)
losses = dict()
if pts_feats:
losses_pts = self.forward_pts_train(
pts_feats, img_feats, gt_bboxes_3d, gt_labels_3d, gt_bboxes, gt_labels, gt_pts_centers_view, gt_img_centers_view, gt_bboxes_cam_view, img_metas, gt_bboxes_ignore, sparse_depth, gt_visible_3d, gt_bboxes_lidar_view
)
losses.update(losses_pts)
if img_feats:
losses_img = self.forward_img_train(
img_feats,
img_metas=img_metas,
gt_bboxes=gt_bboxes,
gt_labels=gt_labels,
gt_bboxes_ignore=gt_bboxes_ignore,
proposals=proposals)
losses.update(losses_img)
return losses
def forward_pts_train(self,
pts_feats,
img_feats,
gt_bboxes_3d,
gt_labels_3d,
gt_bboxes,
gt_labels,
gt_pts_centers_view,
gt_img_centers_view,
gt_bboxes_cam_view,
img_metas,
gt_bboxes_ignore=None,
sparse_depth=None,
gt_visible_3d=None,
gt_bboxes_lidar_view=None):
"""Forward function for point cloud branch.
Args:
pts_feats (list[torch.Tensor]): Features of point cloud branch
gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth
boxes for each sample.
gt_labels_3d (list[torch.Tensor]): Ground truth labels for
boxes of each sample.
img_metas (list[dict]): Meta information of samples.
gt_bboxes_ignore (list[torch.Tensor], optional): Ground truth
boxes to be ignored. Defaults to None.
Returns:
dict: Losses of each branch.
"""
outs = self.pts_bbox_head(pts_feats, img_feats, img_metas, sparse_depth)
loss_inputs = [gt_bboxes_3d, gt_labels_3d, gt_bboxes, gt_labels, gt_pts_centers_view, gt_img_centers_view, gt_bboxes_cam_view, gt_visible_3d, gt_bboxes_lidar_view, img_metas, outs]
losses = self.pts_bbox_head.loss(*loss_inputs)
return losses
def simple_test_pts(self, x, x_img, img_metas, rescale=False, sparse_depth=None):
"""Test function of point cloud branch."""
outs = self.pts_bbox_head(x, x_img, img_metas, sparse_depth)
bbox_list = self.pts_bbox_head.get_bboxes(
outs, img_metas, rescale=rescale)
bbox_results = [
bbox3d2result(bboxes, scores, labels)
for bboxes, scores, labels in bbox_list
]
return bbox_results
def simple_test(self, points, img_metas, img=None, sparse_depth=None, rescale=False):
"""Test function without augmentaiton."""
img_feats, pts_feats = self.extract_feat(
points, img=img, img_metas=img_metas)
bbox_list = [dict() for i in range(len(img_metas))]
if pts_feats and self.with_pts_bbox:
bbox_pts = self.simple_test_pts(
pts_feats, img_feats, img_metas, rescale=rescale, sparse_depth=sparse_depth)
for result_dict, pts_bbox in zip(bbox_list, bbox_pts):
result_dict['pts_bbox'] = pts_bbox
if img_feats and self.with_img_bbox:
bbox_img = self.simple_test_img(
img_feats, img_metas, rescale=rescale)
for result_dict, img_bbox in zip(bbox_list, bbox_img):
result_dict['img_bbox'] = img_bbox
return bbox_list
def forward_test(self, points, img_metas, img=None, sparse_depth=None, **kwargs):
"""
Args:
points (list[torch.Tensor]): the outer list indicates test-time
augmentations and inner torch.Tensor should have a shape NxC,
which contains all points in the batch.
img_metas (list[list[dict]]): the outer list indicates test-time
augs (multiscale, flip, etc.) and the inner list indicates
images in a batch
img (list[torch.Tensor], optional): the outer
list indicates test-time augmentations and inner
torch.Tensor should have a shape NxCxHxW, which contains
all images in the batch. Defaults to None.
"""
for var, name in [(points, 'points'), (img_metas, 'img_metas')]:
if not isinstance(var, list):
raise TypeError('{} must be a list, but got {}'.format(
name, type(var)))
num_augs = len(points)
if num_augs != len(img_metas):
raise ValueError(
'num of augmentations ({}) != num of image meta ({})'.format(
len(points), len(img_metas)))
if num_augs == 1:
img = [img] if img is None else img
return self.simple_test(points[0], img_metas[0], img[0], sparse_depth[0], **kwargs)
else:
return self.aug_test(points, img_metas, img, **kwargs)
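# A plausible nesting of the test-time inputs checked above: the outer list
# enumerates test-time augmentations and the inner list enumerates samples in
# the batch. With a single augmentation, the simple_test branch is taken.
# Shapes and values here are illustrative only.
import torch

points = [[torch.randn(1000, 4)]]        # 1 augmentation, 1 sample of N x C points
img_metas = [[dict()]]                   # matching nesting of meta dicts
assert isinstance(points, list) and isinstance(img_metas, list)
assert len(points) == len(img_metas)     # the num_augs check performed above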
================================================
FILE: mmdet3d/models/detectors/ssd3dnet.py
================================================
from mmdet.models import DETECTORS
from .votenet import VoteNet
@DETECTORS.register_module()
class SSD3DNet(VoteNet):
"""3DSSDNet model.
https://arxiv.org/abs/2002.10187
"""
def __init__(self,
backbone,
bbox_head=None,
train_cfg=None,
test_cfg=None,
pretrained=None):
super(SSD3DNet, self).__init__(
backbone=backbone,
bbox_head=bbox_head,
train_cfg=train_cfg,
test_cfg=test_cfg,
pretrained=pretrained)
================================================
FILE: mmdet3d/models/detectors/transfusion.py
================================================
import mmcv
import torch
import time
from mmcv.parallel import DataContainer as DC
from mmcv.runner import force_fp32
from os import path as osp
from torch import nn as nn
from torch.nn import functional as F
from mmdet3d.core import (Box3DMode, Coord3DMode, bbox3d2result,
merge_aug_bboxes_3d, show_result)
from mmdet3d.ops import Voxelization
from mmdet.core import multi_apply
from mmdet.models import DETECTORS
from .. import builder
from .mvx_two_stage import MVXTwoStageDetector
@DETECTORS.register_module()
class TransFusionDetector(MVXTwoStageDetector):
"""Base class of Multi-modality VoxelNet."""
def __init__(self, **kwargs):
super(TransFusionDetector, self).__init__(**kwargs)
self.freeze_img = kwargs.get('freeze_img', True)
self.init_weights(pretrained=kwargs.get('pretrained', None))
def init_weights(self, pretrained=None):
"""Initialize model weights."""
super(TransFusionDetector, self).init_weights(pretrained)
if self.freeze_img:
if self.with_img_backbone:
for param in self.img_backbone.parameters():
param.requires_grad = False
if self.with_img_neck:
for param in self.img_neck.parameters():
param.requires_grad = False
def extract_img_feat(self, img, img_metas):
"""Extract features of images."""
if self.with_img_backbone and img is not None:
input_shape = img.shape[-2:]
# update real input shape of each single img
for img_meta in img_metas:
img_meta.update(input_shape=input_shape)
if img.dim() == 5 and img.size(0) == 1:
img.squeeze_(0)
elif img.dim() == 5 and img.size(0) > 1:
B, N, C, H, W = img.size()
img = img.view(B * N, C, H, W)
img_feats = self.img_backbone(img.float())
else:
return None
if self.with_img_neck:
img_feats = self.img_neck(img_feats)
return img_feats
def extract_pts_feat(self, pts, img_feats, img_metas):
"""Extract features of points."""
if not self.with_pts_bbox:
return None
voxels, num_points, coors = self.voxelize(pts)
voxel_features = self.pts_voxel_encoder(voxels, num_points, coors)
batch_size = coors[-1, 0] + 1
x = self.pts_middle_encoder(voxel_features, coors, batch_size)
x = self.pts_backbone(x)
if self.with_pts_neck:
x = self.pts_neck(x)
return x
@torch.no_grad()
@force_fp32()
def voxelize(self, points):
"""Apply dynamic voxelization to points.
Args:
points (list[torch.Tensor]): Points of each sample.
Returns:
tuple[torch.Tensor]: Concatenated points, number of points
per voxel, and coordinates.
"""
voxels, coors, num_points = [], [], []
for res in points:
res_voxels, res_coors, res_num_points = self.pts_voxel_layer(res)
voxels.append(res_voxels)
coors.append(res_coors)
num_points.append(res_num_points)
voxels = torch.cat(voxels, dim=0)
num_points = torch.cat(num_points, dim=0)
coors_batch = []
for i, coor in enumerate(coors):
coor_pad = F.pad(coor, (1, 0), mode='constant', value=i)
coors_batch.append(coor_pad)
coors_batch = torch.cat(coors_batch, dim=0)
return voxels, num_points, coors_batch
def forward_train(self,
points=None,
img_metas=None,
gt_bboxes_3d=None,
gt_labels_3d=None,
gt_labels=None,
gt_bboxes=None,
img=None,
proposals=None,
gt_bboxes_ignore=None):
"""Forward training function.
Args:
points (list[torch.Tensor], optional): Points of each sample.
Defaults to None.
img_metas (list[dict], optional): Meta information of each sample.
Defaults to None.
gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`], optional):
Ground truth 3D boxes. Defaults to None.
gt_labels_3d (list[torch.Tensor], optional): Ground truth labels
of 3D boxes. Defaults to None.
gt_labels (list[torch.Tensor], optional): Ground truth labels
of 2D boxes in images. Defaults to None.
gt_bboxes (list[torch.Tensor], optional): Ground truth 2D boxes in
images. Defaults to None.
img (torch.Tensor, optional): Images of each sample with shape
(N, C, H, W). Defaults to None.
proposals (list[torch.Tensor], optional): Predicted proposals
used for training Fast RCNN. Defaults to None.
gt_bboxes_ignore (list[torch.Tensor], optional): Ground truth
2D boxes in images to be ignored. Defaults to None.
Returns:
dict: Losses of different branches.
"""
img_feats, pts_feats = self.extract_feat(
points, img=img, img_metas=img_metas)
losses = dict()
if pts_feats:
losses_pts = self.forward_pts_train(pts_feats, img_feats, gt_bboxes_3d,
gt_labels_3d, img_metas,
gt_bboxes_ignore)
losses.update(losses_pts)
if img_feats:
losses_img = self.forward_img_train(
img_feats,
img_metas=img_metas,
gt_bboxes=gt_bboxes,
gt_labels=gt_labels,
gt_bboxes_ignore=gt_bboxes_ignore,
proposals=proposals)
losses.update(losses_img)
return losses
def forward_pts_train(self,
pts_feats,
img_feats,
gt_bboxes_3d,
gt_labels_3d,
img_metas,
gt_bboxes_ignore=None):
"""Forward function for point cloud branch.
Args:
pts_feats (list[torch.Tensor]): Features of point cloud branch
gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth
boxes for each sample.
gt_labels_3d (list[torch.Tensor]): Ground truth labels for
boxes of each sample.
img_metas (list[dict]): Meta information of samples.
gt_bboxes_ignore (list[torch.Tensor], optional): Ground truth
boxes to be ignored. Defaults to None.
Returns:
dict: Losses of each branch.
"""
outs = self.pts_bbox_head(pts_feats, img_feats, img_metas)
loss_inputs = [gt_bboxes_3d, gt_labels_3d, outs]
losses = self.pts_bbox_head.loss(*loss_inputs)
return losses
def simple_test_pts(self, x, x_img, img_metas, rescale=False):
"""Test function of point cloud branch."""
outs = self.pts_bbox_head(x, x_img, img_metas)
bbox_list = self.pts_bbox_head.get_bboxes(
outs, img_metas, rescale=rescale)
bbox_results = [
bbox3d2result(bboxes, scores, labels)
for bboxes, scores, labels in bbox_list
]
return bbox_results
def simple_test(self, points, img_metas, img=None, rescale=False):
"""Test function without augmentaiton."""
img_feats, pts_feats = self.extract_feat(
points, img=img, img_metas=img_metas)
bbox_list = [dict() for i in range(len(img_metas))]
if pts_feats and self.with_pts_bbox:
bbox_pts = self.simple_test_pts(
pts_feats, img_feats, img_metas, rescale=rescale)
for result_dict, pts_bbox in zip(bbox_list, bbox_pts):
result_dict['pts_bbox'] = pts_bbox
if img_feats and self.with_img_bbox:
bbox_img = self.simple_test_img(
img_feats, img_metas, rescale=rescale)
for result_dict, img_bbox in zip(bbox_list, bbox_img):
result_dict['img_bbox'] = img_bbox
return bbox_list
================================================
FILE: mmdet3d/models/detectors/two_stage.py
================================================
from mmdet.models import DETECTORS, TwoStageDetector
from .base import Base3DDetector
@DETECTORS.register_module()
class TwoStage3DDetector(Base3DDetector, TwoStageDetector):
"""Base class of two-stage 3D detector.
It inherits original ``:class:TwoStageDetector`` and
``:class:Base3DDetector``. This class could serve as a base class for all
two-stage 3D detectors.
"""
def __init__(self, **kwargs):
super(TwoStage3DDetector, self).__init__(**kwargs)
================================================
FILE: mmdet3d/models/detectors/votenet.py
================================================
import torch
from mmdet3d.core import bbox3d2result, merge_aug_bboxes_3d
from mmdet.models import DETECTORS
from .single_stage import SingleStage3DDetector
@DETECTORS.register_module()
class VoteNet(SingleStage3DDetector):
r"""`VoteNet `_ for 3D detection."""
def __init__(self,
backbone,
bbox_head=None,
train_cfg=None,
test_cfg=None,
pretrained=None):
super(VoteNet, self).__init__(
backbone=backbone,
bbox_head=bbox_head,
train_cfg=train_cfg,
test_cfg=test_cfg,
pretrained=pretrained)
def forward_train(self,
points,
img_metas,
gt_bboxes_3d,
gt_labels_3d,
pts_semantic_mask=None,
pts_instance_mask=None,
gt_bboxes_ignore=None):
"""Forward of training.
Args:
points (list[torch.Tensor]): Points of each batch.
img_metas (list): Image metas.
gt_bboxes_3d (:obj:`BaseInstance3DBoxes`): gt bboxes of each batch.
gt_labels_3d (list[torch.Tensor]): gt class labels of each batch.
pts_semantic_mask (None | list[torch.Tensor]): point-wise semantic
label of each batch.
pts_instance_mask (None | list[torch.Tensor]): point-wise instance
label of each batch.
gt_bboxes_ignore (None | list[torch.Tensor]): Specify
which bounding boxes to ignore.
Returns:
dict: Losses.
"""
points_cat = torch.stack(points)
x = self.extract_feat(points_cat)
bbox_preds = self.bbox_head(x, self.train_cfg.sample_mod)
loss_inputs = (points, gt_bboxes_3d, gt_labels_3d, pts_semantic_mask,
pts_instance_mask, img_metas)
losses = self.bbox_head.loss(
bbox_preds, *loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore)
return losses
def simple_test(self, points, img_metas, imgs=None, rescale=False):
"""Forward of testing.
Args:
points (list[torch.Tensor]): Points of each sample.
img_metas (list): Image metas.
rescale (bool): Whether to rescale results.
Returns:
list: Predicted 3d boxes.
"""
points_cat = torch.stack(points)
x = self.extract_feat(points_cat)
bbox_preds = self.bbox_head(x, self.test_cfg.sample_mod)
bbox_list = self.bbox_head.get_bboxes(
points_cat, bbox_preds, img_metas, rescale=rescale)
bbox_results = [
bbox3d2result(bboxes, scores, labels)
for bboxes, scores, labels in bbox_list
]
return bbox_results
def aug_test(self, points, img_metas, imgs=None, rescale=False):
"""Test with augmentation."""
points_cat = [torch.stack(pts) for pts in points]
feats = self.extract_feats(points_cat, img_metas)
# only support aug_test for one sample
aug_bboxes = []
for x, pts_cat, img_meta in zip(feats, points_cat, img_metas):
bbox_preds = self.bbox_head(x, self.test_cfg.sample_mod)
bbox_list = self.bbox_head.get_bboxes(
pts_cat, bbox_preds, img_meta, rescale=rescale)
bbox_list = [
dict(boxes_3d=bboxes, scores_3d=scores, labels_3d=labels)
for bboxes, scores, labels in bbox_list
]
aug_bboxes.append(bbox_list[0])
# after merging, bboxes will be rescaled to the original image size
merged_bboxes = merge_aug_bboxes_3d(aug_bboxes, img_metas,
self.bbox_head.test_cfg)
return [merged_bboxes]
================================================
FILE: mmdet3d/models/detectors/voxelnet.py
================================================
import torch
from mmcv.runner import force_fp32
from torch.nn import functional as F
from mmdet3d.core import bbox3d2result, merge_aug_bboxes_3d
from mmdet3d.ops import Voxelization
from mmdet.models import DETECTORS
from .. import builder
from .single_stage import SingleStage3DDetector
@DETECTORS.register_module()
class VoxelNet(SingleStage3DDetector):
r"""`VoxelNet `_ for 3D detection."""
def __init__(self,
voxel_layer,
voxel_encoder,
middle_encoder,
backbone,
neck=None,
bbox_head=None,
train_cfg=None,
test_cfg=None,
pretrained=None):
super(VoxelNet, self).__init__(
backbone=backbone,
neck=neck,
bbox_head=bbox_head,
train_cfg=train_cfg,
test_cfg=test_cfg,
pretrained=pretrained,
)
self.voxel_layer = Voxelization(**voxel_layer)
self.voxel_encoder = builder.build_voxel_encoder(voxel_encoder)
self.middle_encoder = builder.build_middle_encoder(middle_encoder)
def extract_feat(self, points, img_metas):
"""Extract features from points."""
voxels, num_points, coors = self.voxelize(points)
voxel_features = self.voxel_encoder(voxels, num_points, coors)
batch_size = coors[-1, 0].item() + 1
x = self.middle_encoder(voxel_features, coors, batch_size)
x = self.backbone(x)
if self.with_neck:
x = self.neck(x)
return x
@torch.no_grad()
@force_fp32()
def voxelize(self, points):
"""Apply hard voxelization to points."""
voxels, coors, num_points = [], [], []
for res in points:
res_voxels, res_coors, res_num_points = self.voxel_layer(res)
voxels.append(res_voxels)
coors.append(res_coors)
num_points.append(res_num_points)
voxels = torch.cat(voxels, dim=0)
num_points = torch.cat(num_points, dim=0)
coors_batch = []
for i, coor in enumerate(coors):
coor_pad = F.pad(coor, (1, 0), mode='constant', value=i)
coors_batch.append(coor_pad)
coors_batch = torch.cat(coors_batch, dim=0)
return voxels, num_points, coors_batch
def forward_train(self,
points,
img_metas,
gt_bboxes_3d,
gt_labels_3d,
gt_bboxes_ignore=None):
"""Training forward function.
Args:
points (list[torch.Tensor]): Point cloud of each sample.
img_metas (list[dict]): Meta information of each sample
gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth
boxes for each sample.
gt_labels_3d (list[torch.Tensor]): Ground truth labels for
boxes of each sample.
gt_bboxes_ignore (list[torch.Tensor], optional): Ground truth
boxes to be ignored. Defaults to None.
Returns:
dict: Losses of each branch.
"""
x = self.extract_feat(points, img_metas)
outs = self.bbox_head(x)
loss_inputs = outs + (gt_bboxes_3d, gt_labels_3d, img_metas)
losses = self.bbox_head.loss(
*loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore)
return losses
def simple_test(self, points, img_metas, imgs=None, rescale=False):
"""Test function without augmentaiton."""
x = self.extract_feat(points, img_metas)
outs = self.bbox_head(x)
bbox_list = self.bbox_head.get_bboxes(
*outs, img_metas, rescale=rescale)
bbox_results = [
bbox3d2result(bboxes, scores, labels)
for bboxes, scores, labels in bbox_list
]
return bbox_results
def aug_test(self, points, img_metas, imgs=None, rescale=False):
"""Test function with augmentaiton."""
feats = self.extract_feats(points, img_metas)
# only support aug_test for one sample
aug_bboxes = []
for x, img_meta in zip(feats, img_metas):
outs = self.bbox_head(x)
bbox_list = self.bbox_head.get_bboxes(
*outs, img_meta, rescale=rescale)
bbox_list = [
dict(boxes_3d=bboxes, scores_3d=scores, labels_3d=labels)
for bboxes, scores, labels in bbox_list
]
aug_bboxes.append(bbox_list[0])
# after merging, bboxes will be rescaled to the original image size
merged_bboxes = merge_aug_bboxes_3d(aug_bboxes, img_metas,
self.bbox_head.test_cfg)
return [merged_bboxes]
================================================
FILE: mmdet3d/models/fusion_layers/__init__.py
================================================
from .coord_transform import (apply_3d_transformation, bbox_2d_transform,
coord_2d_transform)
from .point_fusion import PointFusion
from .vote_fusion import VoteFusion
__all__ = [
'PointFusion', 'VoteFusion', 'apply_3d_transformation',
'bbox_2d_transform', 'coord_2d_transform'
]
================================================
FILE: mmdet3d/models/fusion_layers/coord_transform.py
================================================
import torch
from functools import partial
from mmdet3d.core.points import get_points_type
def apply_3d_transformation(pcd, coords_type, img_meta, reverse=False):
"""Apply transformation to input point cloud.
Args:
pcd (torch.Tensor): The point cloud to be transformed.
coords_type (str): 'DEPTH' or 'CAMERA' or 'LIDAR'
img_meta(dict): Meta info regarding data transformation.
reverse (bool): Reversed transformation or not.
Note:
The elements in img_meta['transformation_3d_flow']:
"T" stands for translation;
"S" stands for scale;
"R" stands for rotation;
"HF" stands for horizontal flip;
"VF" stands for vertical flip.
Returns:
torch.Tensor: The transformed point cloud.
"""
dtype = pcd.dtype
device = pcd.device
pcd_rotate_mat = (
torch.tensor(img_meta['pcd_rotation'], dtype=dtype, device=device)
if 'pcd_rotation' in img_meta else torch.eye(
3, dtype=dtype, device=device))
pcd_scale_factor = (
img_meta['pcd_scale_factor'] if 'pcd_scale_factor' in img_meta else 1.)
pcd_trans_factor = (
torch.tensor(img_meta['pcd_trans'], dtype=dtype, device=device)
if 'pcd_trans' in img_meta else torch.zeros(
(3), dtype=dtype, device=device))
pcd_horizontal_flip = img_meta[
'pcd_horizontal_flip'] if 'pcd_horizontal_flip' in \
img_meta else False
pcd_vertical_flip = img_meta[
'pcd_vertical_flip'] if 'pcd_vertical_flip' in \
img_meta else False
flow = img_meta['transformation_3d_flow'] \
if 'transformation_3d_flow' in img_meta else []
pcd = pcd.clone() # prevent inplace modification
pcd = get_points_type(coords_type)(pcd)
horizontal_flip_func = partial(pcd.flip, bev_direction='horizontal') \
if pcd_horizontal_flip else lambda: None
vertical_flip_func = partial(pcd.flip, bev_direction='vertical') \
if pcd_vertical_flip else lambda: None
if reverse:
scale_func = partial(pcd.scale, scale_factor=1.0 / pcd_scale_factor)
translate_func = partial(pcd.translate, trans_vector=-pcd_trans_factor)
# pcd_rotate_mat @ pcd_rotate_mat.inverse() is not exactly an
# identity matrix, and rebuilding the inverse rotation from the
# angle is not exact either, so the matrix inverse is used here.
rotate_func = partial(pcd.rotate, rotation=pcd_rotate_mat.inverse())
# reverse the pipeline
flow = flow[::-1]
else:
scale_func = partial(pcd.scale, scale_factor=pcd_scale_factor)
translate_func = partial(pcd.translate, trans_vector=pcd_trans_factor)
rotate_func = partial(pcd.rotate, rotation=pcd_rotate_mat)
flow_mapping = {
'T': translate_func,
'S': scale_func,
'R': rotate_func,
'HF': horizontal_flip_func,
'VF': vertical_flip_func
}
for op in flow:
assert op in flow_mapping, f'This 3D data '\
f'transformation op ({op}) is not supported'
func = flow_mapping[op]
func()
return pcd.coord
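# The flow list above records the order in which augmentations were applied;
# reverse=True replays their inverses in the opposite order. A scalar analogue
# (with stand-in operations, not the real point-cloud transforms):
flow = ['R', 'S', 'T']                                 # rotate, then scale, then translate
forward = {'R': lambda x: x + 1, 'S': lambda x: x * 2, 'T': lambda x: x - 3}
inverse = {'R': lambda x: x - 1, 'S': lambda x: x / 2, 'T': lambda x: x + 3}
x = 5.0
for op in flow:                                        # apply the recorded pipeline
    x = forward[op](x)
for op in flow[::-1]:                                  # reverse it, last op first
    x = inverse[op](x)
assert x == 5.0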
def extract_2d_info(img_meta, tensor):
"""Extract image augmentation information from img_meta.
Args:
img_meta(dict): Meta info regarding data transformation.
tensor(torch.Tensor): Input tensor used to create new ones.
Returns:
(int, int, int, int, torch.Tensor, bool, torch.Tensor):
The extracted information.
"""
img_shape = img_meta['img_shape']
ori_shape = img_meta['ori_shape']
img_h, img_w, _ = img_shape
ori_h, ori_w, _ = ori_shape
img_scale_factor = (
tensor.new_tensor(img_meta['scale_factor'][:2])
if 'scale_factor' in img_meta else tensor.new_tensor([1.0, 1.0]))
img_flip = img_meta['flip'] if 'flip' in img_meta else False
img_crop_offset = (
tensor.new_tensor(img_meta['img_crop_offset'])
if 'img_crop_offset' in img_meta else tensor.new_tensor([0.0, 0.0]))
return (img_h, img_w, ori_h, ori_w, img_scale_factor, img_flip,
img_crop_offset)
def bbox_2d_transform(img_meta, bbox_2d, ori2new):
"""Transform 2d bbox according to img_meta.
Args:
img_meta(dict): Meta info regarding data transformation.
bbox_2d (torch.Tensor): Shape (..., >4)
The input 2d bboxes to transform.
        ori2new (bool): Whether to transform from the original image
            coordinate system to the new (augmented) one.
Returns:
torch.Tensor: The transformed 2d bboxes.
"""
img_h, img_w, ori_h, ori_w, img_scale_factor, img_flip, \
img_crop_offset = extract_2d_info(img_meta, bbox_2d)
bbox_2d_new = bbox_2d.clone()
if ori2new:
bbox_2d_new[:, 0] = bbox_2d_new[:, 0] * img_scale_factor[0]
bbox_2d_new[:, 2] = bbox_2d_new[:, 2] * img_scale_factor[0]
bbox_2d_new[:, 1] = bbox_2d_new[:, 1] * img_scale_factor[1]
bbox_2d_new[:, 3] = bbox_2d_new[:, 3] * img_scale_factor[1]
bbox_2d_new[:, 0] = bbox_2d_new[:, 0] + img_crop_offset[0]
bbox_2d_new[:, 2] = bbox_2d_new[:, 2] + img_crop_offset[0]
bbox_2d_new[:, 1] = bbox_2d_new[:, 1] + img_crop_offset[1]
bbox_2d_new[:, 3] = bbox_2d_new[:, 3] + img_crop_offset[1]
if img_flip:
bbox_2d_r = img_w - bbox_2d_new[:, 0]
bbox_2d_l = img_w - bbox_2d_new[:, 2]
bbox_2d_new[:, 0] = bbox_2d_l
bbox_2d_new[:, 2] = bbox_2d_r
else:
if img_flip:
bbox_2d_r = img_w - bbox_2d_new[:, 0]
bbox_2d_l = img_w - bbox_2d_new[:, 2]
bbox_2d_new[:, 0] = bbox_2d_l
bbox_2d_new[:, 2] = bbox_2d_r
bbox_2d_new[:, 0] = bbox_2d_new[:, 0] - img_crop_offset[0]
bbox_2d_new[:, 2] = bbox_2d_new[:, 2] - img_crop_offset[0]
bbox_2d_new[:, 1] = bbox_2d_new[:, 1] - img_crop_offset[1]
bbox_2d_new[:, 3] = bbox_2d_new[:, 3] - img_crop_offset[1]
bbox_2d_new[:, 0] = bbox_2d_new[:, 0] / img_scale_factor[0]
bbox_2d_new[:, 2] = bbox_2d_new[:, 2] / img_scale_factor[0]
bbox_2d_new[:, 1] = bbox_2d_new[:, 1] / img_scale_factor[1]
bbox_2d_new[:, 3] = bbox_2d_new[:, 3] / img_scale_factor[1]
return bbox_2d_new
def coord_2d_transform(img_meta, coord_2d, ori2new):
"""Transform 2d pixel coordinates according to img_meta.
Args:
img_meta(dict): Meta info regarding data transformation.
coord_2d (torch.Tensor): Shape (..., 2)
The input 2d coords to transform.
        ori2new (bool): Whether to transform from the original image
            coordinate system to the new (augmented) one.
Returns:
torch.Tensor: The transformed 2d coordinates.
"""
img_h, img_w, ori_h, ori_w, img_scale_factor, img_flip, \
img_crop_offset = extract_2d_info(img_meta, coord_2d)
coord_2d_new = coord_2d.clone()
if ori2new:
# TODO here we assume this order of transformation
coord_2d_new[..., 0] = coord_2d_new[..., 0] * img_scale_factor[0]
coord_2d_new[..., 1] = coord_2d_new[..., 1] * img_scale_factor[1]
coord_2d_new[..., 0] += img_crop_offset[0]
coord_2d_new[..., 1] += img_crop_offset[1]
# flip uv coordinates and bbox
if img_flip:
coord_2d_new[..., 0] = img_w - coord_2d_new[..., 0]
else:
if img_flip:
coord_2d_new[..., 0] = img_w - coord_2d_new[..., 0]
coord_2d_new[..., 0] -= img_crop_offset[0]
coord_2d_new[..., 1] -= img_crop_offset[1]
coord_2d_new[..., 0] = coord_2d_new[..., 0] / img_scale_factor[0]
coord_2d_new[..., 1] = coord_2d_new[..., 1] / img_scale_factor[1]
return coord_2d_new
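if __name__ == '__main__':
    # Minimal usage sketch (not part of the original module): applying
    # coord_2d_transform with ori2new=True and then ori2new=False should be a
    # round trip. The img_meta values below are made up purely for illustration.
    dummy_meta = dict(
        img_shape=(450, 800, 3),
        ori_shape=(900, 1600, 3),
        scale_factor=[0.5, 0.5, 0.5, 0.5],
        flip=True,
        img_crop_offset=[0.0, 0.0])
    coords = torch.rand(8, 2) * 100
    coords_aug = coord_2d_transform(dummy_meta, coords, True)
    coords_back = coord_2d_transform(dummy_meta, coords_aug, False)
    assert torch.allclose(coords, coords_back, atol=1e-5)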
================================================
FILE: mmdet3d/models/fusion_layers/point_fusion.py
================================================
import torch
from mmcv.cnn import ConvModule, xavier_init
from torch import nn as nn
from torch.nn import functional as F
from ..registry import FUSION_LAYERS
from . import apply_3d_transformation
def point_sample(
img_meta,
img_features,
points,
lidar2img_rt,
img_scale_factor,
img_crop_offset,
img_flip,
img_pad_shape,
img_shape,
aligned=True,
padding_mode='zeros',
align_corners=True,
):
"""Obtain image features using points.
Args:
img_meta (dict): Meta info.
img_features (torch.Tensor): 1 x C x H x W image features.
points (torch.Tensor): Nx3 point cloud in LiDAR coordinates.
lidar2img_rt (torch.Tensor): 4x4 transformation matrix.
img_scale_factor (torch.Tensor): Scale factor with shape of \
(w_scale, h_scale).
img_crop_offset (torch.Tensor): Crop offset used to crop \
image during data augmentation with shape of (w_offset, h_offset).
img_flip (bool): Whether the image is flipped.
img_pad_shape (tuple[int]): int tuple indicates the h & w after
padding, this is necessary to obtain features in feature map.
img_shape (tuple[int]): int tuple indicates the h & w before padding
after scaling, this is necessary for flipping coordinates.
aligned (bool, optional): Whether use bilinear interpolation when
sampling image features for each point. Defaults to True.
padding_mode (str, optional): Padding mode when padding values for
features of out-of-image points. Defaults to 'zeros'.
align_corners (bool, optional): Whether to align corners when
sampling image features for each point. Defaults to True.
Returns:
torch.Tensor: NxC image features sampled by point coordinates.
"""
# apply transformation based on info in img_meta
points = apply_3d_transformation(points, 'LIDAR', img_meta, reverse=True)
# project points from velo coordinate to camera coordinate
num_points = points.shape[0]
pts_4d = torch.cat([points, points.new_ones(size=(num_points, 1))], dim=-1)
pts_2d = pts_4d @ lidar2img_rt.t()
# cam_points is Tensor of Nx4 whose last column is 1
# transform camera coordinate to image coordinate
pts_2d[:, 2] = torch.clamp(pts_2d[:, 2], min=1e-5)
pts_2d[:, 0] /= pts_2d[:, 2]
pts_2d[:, 1] /= pts_2d[:, 2]
# img transformation: scale -> crop -> flip
# the image is resized by img_scale_factor
img_coors = pts_2d[:, 0:2] * img_scale_factor # Nx2
img_coors -= img_crop_offset
# grid sample, the valid grid range should be in [-1,1]
coor_x, coor_y = torch.split(img_coors, 1, dim=1) # each is Nx1
if img_flip:
# by default we take it as horizontal flip
# use img_shape before padding for flip
orig_h, orig_w = img_shape
coor_x = orig_w - coor_x
h, w = img_pad_shape
coor_y = coor_y / h * 2 - 1
coor_x = coor_x / w * 2 - 1
grid = torch.cat([coor_x, coor_y],
dim=1).unsqueeze(0).unsqueeze(0) # Nx2 -> 1x1xNx2
    # align_corners=True provides higher performance
mode = 'bilinear' if aligned else 'nearest'
point_features = F.grid_sample(
img_features,
grid,
mode=mode,
padding_mode=padding_mode,
align_corners=align_corners) # 1xCx1xN feats
return point_features.squeeze().t()
@FUSION_LAYERS.register_module()
class PointFusion(nn.Module):
"""Fuse image features from multi-scale features.
Args:
img_channels (list[int] | int): Channels of image features.
It could be a list if the input is multi-scale image features.
pts_channels (int): Channels of point features
mid_channels (int): Channels of middle layers
out_channels (int): Channels of output fused features
        img_levels (int | list[int], optional): Indices of the image feature
            levels to use. Defaults to 3.
conv_cfg (dict, optional): Dict config of conv layers of middle
layers. Defaults to None.
norm_cfg (dict, optional): Dict config of norm layers of middle
layers. Defaults to None.
        act_cfg (dict, optional): Dict config of activation layers.
Defaults to None.
activate_out (bool, optional): Whether to apply relu activation
to output features. Defaults to True.
fuse_out (bool, optional): Whether apply conv layer to the fused
features. Defaults to False.
dropout_ratio (int, float, optional): Dropout ratio of image
features to prevent overfitting. Defaults to 0.
aligned (bool, optional): Whether apply aligned feature fusion.
Defaults to True.
align_corners (bool, optional): Whether to align corner when
sampling features according to points. Defaults to True.
padding_mode (str, optional): Mode used to pad the features of
points that do not have corresponding image features.
Defaults to 'zeros'.
lateral_conv (bool, optional): Whether to apply lateral convs
to image features. Defaults to True.
"""
def __init__(self,
img_channels,
pts_channels,
mid_channels,
out_channels,
img_levels=3,
conv_cfg=None,
norm_cfg=None,
act_cfg=None,
activate_out=True,
fuse_out=False,
dropout_ratio=0,
aligned=True,
align_corners=True,
padding_mode='zeros',
lateral_conv=True):
super(PointFusion, self).__init__()
if isinstance(img_levels, int):
img_levels = [img_levels]
if isinstance(img_channels, int):
img_channels = [img_channels] * len(img_levels)
assert isinstance(img_levels, list)
assert isinstance(img_channels, list)
assert len(img_channels) == len(img_levels)
self.img_levels = img_levels
self.act_cfg = act_cfg
self.activate_out = activate_out
self.fuse_out = fuse_out
self.dropout_ratio = dropout_ratio
self.img_channels = img_channels
self.aligned = aligned
self.align_corners = align_corners
self.padding_mode = padding_mode
self.lateral_convs = None
if lateral_conv:
self.lateral_convs = nn.ModuleList()
for i in range(len(img_channels)):
l_conv = ConvModule(
img_channels[i],
mid_channels,
3,
padding=1,
conv_cfg=conv_cfg,
norm_cfg=norm_cfg,
act_cfg=self.act_cfg,
inplace=False)
self.lateral_convs.append(l_conv)
self.img_transform = nn.Sequential(
nn.Linear(mid_channels * len(img_channels), out_channels),
nn.BatchNorm1d(out_channels, eps=1e-3, momentum=0.01),
)
else:
self.img_transform = nn.Sequential(
nn.Linear(sum(img_channels), out_channels),
nn.BatchNorm1d(out_channels, eps=1e-3, momentum=0.01),
)
self.pts_transform = nn.Sequential(
nn.Linear(pts_channels, out_channels),
nn.BatchNorm1d(out_channels, eps=1e-3, momentum=0.01),
)
if self.fuse_out:
self.fuse_conv = nn.Sequential(
nn.Linear(mid_channels, out_channels),
# For pts the BN is initialized differently by default
# TODO: check whether this is necessary
nn.BatchNorm1d(out_channels, eps=1e-3, momentum=0.01),
nn.ReLU(inplace=False))
self.init_weights()
# default init_weights for conv(msra) and norm in ConvModule
def init_weights(self):
"""Initialize the weights of modules."""
for m in self.modules():
if isinstance(m, (nn.Conv2d, nn.Linear)):
xavier_init(m, distribution='uniform')
def forward(self, img_feats, pts, pts_feats, img_metas):
"""Forward function.
Args:
img_feats (list[torch.Tensor]): Image features.
            pts (list[torch.Tensor]): A batch of points with shape N x 3.
            pts_feats (torch.Tensor): A tensor consisting of the point features
                of the total batch.
img_metas (list[dict]): Meta information of images.
Returns:
torch.Tensor: Fused features of each point.
"""
img_pts = self.obtain_mlvl_feats(img_feats, pts, img_metas)
img_pre_fuse = self.img_transform(img_pts)
if self.training and self.dropout_ratio > 0:
img_pre_fuse = F.dropout(img_pre_fuse, self.dropout_ratio)
pts_pre_fuse = self.pts_transform(pts_feats)
fuse_out = img_pre_fuse + pts_pre_fuse
if self.activate_out:
fuse_out = F.relu(fuse_out)
if self.fuse_out:
fuse_out = self.fuse_conv(fuse_out)
return fuse_out
def obtain_mlvl_feats(self, img_feats, pts, img_metas):
"""Obtain multi-level features for each point.
Args:
img_feats (list(torch.Tensor)): Multi-scale image features produced
by image backbone in shape (N, C, H, W).
pts (list[torch.Tensor]): Points of each sample.
img_metas (list[dict]): Meta information for each sample.
Returns:
torch.Tensor: Corresponding image features of each point.
"""
if self.lateral_convs is not None:
img_ins = [
lateral_conv(img_feats[i])
for i, lateral_conv in zip(self.img_levels, self.lateral_convs)
]
else:
img_ins = img_feats
img_feats_per_point = []
# Sample multi-level features
for i in range(len(img_metas)):
mlvl_img_feats = []
for level in range(len(self.img_levels)):
mlvl_img_feats.append(
self.sample_single(img_ins[level][i:i + 1], pts[i][:, :3],
img_metas[i]))
mlvl_img_feats = torch.cat(mlvl_img_feats, dim=-1)
img_feats_per_point.append(mlvl_img_feats)
img_pts = torch.cat(img_feats_per_point, dim=0)
return img_pts
def sample_single(self, img_feats, pts, img_meta):
"""Sample features from single level image feature map.
Args:
img_feats (torch.Tensor): Image feature map in shape
(1, C, H, W).
pts (torch.Tensor): Points of a single sample.
img_meta (dict): Meta information of the single sample.
Returns:
torch.Tensor: Single level image features of each point.
"""
# TODO: image transformation also extracted
img_scale_factor = (
pts.new_tensor(img_meta['scale_factor'][:2])
if 'scale_factor' in img_meta.keys() else 1)
img_flip = img_meta['flip'] if 'flip' in img_meta.keys() else False
img_crop_offset = (
pts.new_tensor(img_meta['img_crop_offset'])
if 'img_crop_offset' in img_meta.keys() else 0)
img_pts = point_sample(
img_meta,
img_feats,
pts,
pts.new_tensor(img_meta['lidar2img']),
img_scale_factor,
img_crop_offset,
img_flip=img_flip,
img_pad_shape=img_meta['input_shape'][:2],
img_shape=img_meta['img_shape'][:2],
aligned=self.aligned,
padding_mode=self.padding_mode,
align_corners=self.align_corners,
)
return img_pts
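if __name__ == '__main__':
    # Minimal usage sketch (not part of the original module): sample features
    # from a random 1 x 16 x 32 x 32 feature map for 100 random points, using
    # an identity lidar2img matrix, no 3D augmentation (empty img_meta) and no
    # image augmentation. All values are made up for illustration only.
    feats = torch.rand(1, 16, 32, 32)
    pts = torch.rand(100, 3) * 10
    sampled = point_sample(
        img_meta=dict(),
        img_features=feats,
        points=pts,
        lidar2img_rt=torch.eye(4),
        img_scale_factor=pts.new_tensor([1.0, 1.0]),
        img_crop_offset=pts.new_tensor([0.0, 0.0]),
        img_flip=False,
        img_pad_shape=(32, 32),
        img_shape=(32, 32))
    print(sampled.shape)  # expected: torch.Size([100, 16])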
================================================
FILE: mmdet3d/models/fusion_layers/vote_fusion.py
================================================
import torch
from torch import nn as nn
from mmdet3d.core.bbox import Coord3DMode, points_cam2img
from ..registry import FUSION_LAYERS
from . import apply_3d_transformation, bbox_2d_transform, coord_2d_transform
EPS = 1e-6
@FUSION_LAYERS.register_module()
class VoteFusion(nn.Module):
"""Fuse 2d features from 3d seeds.
Args:
num_classes (int): number of classes.
max_imvote_per_pixel (int): max number of imvotes.
"""
def __init__(self, num_classes=10, max_imvote_per_pixel=3):
super(VoteFusion, self).__init__()
self.num_classes = num_classes
self.max_imvote_per_pixel = max_imvote_per_pixel
def forward(self, imgs, bboxes_2d_rescaled, seeds_3d_depth, img_metas,
calibs):
"""Forward function.
Args:
imgs (list[torch.Tensor]): Image features.
bboxes_2d_rescaled (list[torch.Tensor]): 2D bboxes.
seeds_3d_depth (torch.Tensor): 3D seeds.
img_metas (list[dict]): Meta information of images.
calibs: Camera calibration information of the images.
Returns:
torch.Tensor: Concatenated cues of each point.
torch.Tensor: Validity mask of each feature.
"""
img_features = []
masks = []
for i, data in enumerate(
zip(imgs, bboxes_2d_rescaled, seeds_3d_depth, img_metas)):
img, bbox_2d_rescaled, seed_3d_depth, img_meta = data
bbox_num = bbox_2d_rescaled.shape[0]
seed_num = seed_3d_depth.shape[0]
img_shape = img_meta['img_shape']
img_h, img_w, _ = img_shape
# first reverse the data transformations
xyz_depth = apply_3d_transformation(
seed_3d_depth, 'DEPTH', img_meta, reverse=True)
# then convert from depth coords to camera coords
xyz_cam = Coord3DMode.convert_point(
xyz_depth,
Coord3DMode.DEPTH,
Coord3DMode.CAM,
rt_mat=calibs['Rt'][i])
# project to 2d to get image coords (uv)
uv_origin = points_cam2img(xyz_cam, calibs['K'][i])
uv_origin = (uv_origin - 1).round()
# rescale 2d coordinates and bboxes
uv_rescaled = coord_2d_transform(img_meta, uv_origin, True)
bbox_2d_origin = bbox_2d_transform(img_meta, bbox_2d_rescaled,
False)
if bbox_num == 0:
imvote_num = seed_num * self.max_imvote_per_pixel
# use zero features
two_cues = torch.zeros((15, imvote_num),
device=seed_3d_depth.device)
mask_zero = torch.zeros(
imvote_num - seed_num, device=seed_3d_depth.device).bool()
mask_one = torch.ones(
seed_num, device=seed_3d_depth.device).bool()
mask = torch.cat([mask_one, mask_zero], dim=0)
else:
# expand bboxes and seeds
bbox_expanded = bbox_2d_origin.view(1, bbox_num, -1).expand(
seed_num, -1, -1)
seed_2d_expanded = uv_origin.view(seed_num, 1,
-1).expand(-1, bbox_num, -1)
seed_2d_expanded_x, seed_2d_expanded_y = \
seed_2d_expanded.split(1, dim=-1)
bbox_expanded_l, bbox_expanded_t, bbox_expanded_r, \
bbox_expanded_b, bbox_expanded_conf, bbox_expanded_cls = \
bbox_expanded.split(1, dim=-1)
bbox_expanded_midx = (bbox_expanded_l + bbox_expanded_r) / 2
bbox_expanded_midy = (bbox_expanded_t + bbox_expanded_b) / 2
seed_2d_in_bbox_x = (seed_2d_expanded_x > bbox_expanded_l) * \
(seed_2d_expanded_x < bbox_expanded_r)
seed_2d_in_bbox_y = (seed_2d_expanded_y > bbox_expanded_t) * \
(seed_2d_expanded_y < bbox_expanded_b)
seed_2d_in_bbox = seed_2d_in_bbox_x * seed_2d_in_bbox_y
# semantic cues, dim=class_num
sem_cue = torch.zeros_like(bbox_expanded_conf).expand(
-1, -1, self.num_classes)
sem_cue = sem_cue.scatter(-1, bbox_expanded_cls.long(),
bbox_expanded_conf)
# bbox center - uv
delta_u = bbox_expanded_midx - seed_2d_expanded_x
delta_v = bbox_expanded_midy - seed_2d_expanded_y
seed_3d_expanded = seed_3d_depth.view(seed_num, 1, -1).expand(
-1, bbox_num, -1)
z_cam = xyz_cam[..., 2:3].view(seed_num, 1,
1).expand(-1, bbox_num, -1)
delta_u = delta_u * z_cam / calibs['K'][i, 0, 0]
delta_v = delta_v * z_cam / calibs['K'][i, 0, 0]
imvote = torch.cat(
[delta_u, delta_v,
torch.zeros_like(delta_v)], dim=-1).view(-1, 3)
# convert from camera coords to depth coords
imvote = Coord3DMode.convert_point(
imvote.view((-1, 3)),
Coord3DMode.CAM,
Coord3DMode.DEPTH,
rt_mat=calibs['Rt'][i])
# apply transformation to lifted imvotes
imvote = apply_3d_transformation(
imvote, 'DEPTH', img_meta, reverse=False)
seed_3d_expanded = seed_3d_expanded.reshape(imvote.shape)
# ray angle
ray_angle = seed_3d_expanded + imvote
ray_angle /= torch.sqrt(torch.sum(ray_angle**2, -1) +
EPS).unsqueeze(-1)
# imvote lifted to 3d
xz = ray_angle[:, [0, 2]] / (ray_angle[:, [1]] + EPS) \
* seed_3d_expanded[:, [1]] - seed_3d_expanded[:, [0, 2]]
# geometric cues, dim=5
geo_cue = torch.cat([xz, ray_angle],
dim=-1).view(seed_num, -1, 5)
two_cues = torch.cat([geo_cue, sem_cue], dim=-1)
# mask to 0 if seed not in bbox
two_cues = two_cues * seed_2d_in_bbox.float()
feature_size = two_cues.shape[-1]
# if bbox number is too small, append zeros
if bbox_num < self.max_imvote_per_pixel:
append_num = self.max_imvote_per_pixel - bbox_num
append_zeros = torch.zeros(
(seed_num, append_num, 1),
device=seed_2d_in_bbox.device).bool()
seed_2d_in_bbox = torch.cat(
[seed_2d_in_bbox, append_zeros], dim=1)
append_zeros = torch.zeros(
(seed_num, append_num, feature_size),
device=two_cues.device)
two_cues = torch.cat([two_cues, append_zeros], dim=1)
append_zeros = torch.zeros((seed_num, append_num, 1),
device=two_cues.device)
bbox_expanded_conf = torch.cat(
[bbox_expanded_conf, append_zeros], dim=1)
# sort the valid seed-bbox pair according to confidence
pair_score = seed_2d_in_bbox.float() + bbox_expanded_conf
                # and find the largest ones
mask, indices = pair_score.topk(
self.max_imvote_per_pixel,
dim=1,
largest=True,
sorted=True)
indices_img = indices.expand(-1, -1, feature_size)
two_cues = two_cues.gather(dim=1, index=indices_img)
two_cues = two_cues.transpose(1, 0)
two_cues = two_cues.reshape(-1, feature_size).transpose(
1, 0).contiguous()
# since conf is ~ (0, 1), floor gives us validity
mask = mask.floor().int()
mask = mask.transpose(1, 0).reshape(-1).bool()
# clear the padding
img = img[:, :img_shape[0], :img_shape[1]]
img_flatten = img.reshape(3, -1).float()
img_flatten /= 255.
# take the normalized pixel value as texture cue
uv_flatten = uv_rescaled[:, 1].round() * \
img_shape[1] + uv_rescaled[:, 0].round()
uv_expanded = uv_flatten.unsqueeze(0).expand(3, -1).long()
txt_cue = torch.gather(img_flatten, dim=-1, index=uv_expanded)
txt_cue = txt_cue.unsqueeze(1).expand(-1,
self.max_imvote_per_pixel,
-1).reshape(3, -1)
# append texture cue
img_feature = torch.cat([two_cues, txt_cue], dim=0)
img_features.append(img_feature)
masks.append(mask)
return torch.stack(img_features, 0), torch.stack(masks, 0)
================================================
FILE: mmdet3d/models/losses/__init__.py
================================================
from mmdet.models.losses import FocalLoss, SmoothL1Loss, binary_cross_entropy
from .axis_aligned_iou_loss import AxisAlignedIoULoss, axis_aligned_iou_loss
from .chamfer_distance import ChamferDistance, chamfer_distance
from .uncertainty_loss import LaplaceL1Loss
__all__ = [
'FocalLoss', 'SmoothL1Loss', 'binary_cross_entropy', 'ChamferDistance',
'chamfer_distance', 'axis_aligned_iou_loss', 'AxisAlignedIoULoss',
'LaplaceL1Loss'
]
================================================
FILE: mmdet3d/models/losses/axis_aligned_iou_loss.py
================================================
import torch
from torch import nn as nn
from mmdet.models.builder import LOSSES
from mmdet.models.losses.utils import weighted_loss
from ...core.bbox import AxisAlignedBboxOverlaps3D
@weighted_loss
def axis_aligned_iou_loss(pred, target):
"""Calculate the IoU loss (1-IoU) of two set of axis aligned bounding
boxes. Note that predictions and targets are one-to-one corresponded.
Args:
pred (torch.Tensor): Bbox predictions with shape [..., 3].
target (torch.Tensor): Bbox targets (gt) with shape [..., 3].
Returns:
torch.Tensor: IoU loss between predictions and targets.
"""
axis_aligned_iou = AxisAlignedBboxOverlaps3D()(
pred, target, is_aligned=True)
iou_loss = 1 - axis_aligned_iou
return iou_loss
@LOSSES.register_module()
class AxisAlignedIoULoss(nn.Module):
"""Calculate the IoU loss (1-IoU) of axis aligned bounding boxes.
Args:
reduction (str): Method to reduce losses.
            The valid reduction methods are 'none', 'sum' or 'mean'.
loss_weight (float, optional): Weight of loss. Defaults to 1.0.
"""
def __init__(self, reduction='mean', loss_weight=1.0):
super(AxisAlignedIoULoss, self).__init__()
assert reduction in ['none', 'sum', 'mean']
self.reduction = reduction
self.loss_weight = loss_weight
def forward(self,
pred,
target,
weight=None,
avg_factor=None,
reduction_override=None,
**kwargs):
"""Forward function of loss calculation.
Args:
pred (torch.Tensor): Bbox predictions with shape [..., 3].
target (torch.Tensor): Bbox targets (gt) with shape [..., 3].
weight (torch.Tensor|float, optional): Weight of loss. \
Defaults to None.
avg_factor (int, optional): Average factor that is used to average
the loss. Defaults to None.
reduction_override (str, optional): Method to reduce losses.
                The valid reduction methods are 'none', 'sum' or 'mean'.
Defaults to None.
Returns:
torch.Tensor: IoU loss between predictions and targets.
"""
assert reduction_override in (None, 'none', 'mean', 'sum')
reduction = (
reduction_override if reduction_override else self.reduction)
if (weight is not None) and (not torch.any(weight > 0)) and (
reduction != 'none'):
return (pred * weight).sum()
return axis_aligned_iou_loss(
pred,
target,
weight=weight,
avg_factor=avg_factor,
reduction=reduction) * self.loss_weight
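if __name__ == '__main__':
    # Minimal usage sketch (not part of the original module): boxes are given
    # as (x1, y1, z1, x2, y2, z2). A perfectly matching pair contributes 0 to
    # the loss and a fully disjoint pair contributes 1, so the mean is 0.5.
    pred = torch.tensor([[0., 0., 0., 1., 1., 1.],
                         [0., 0., 0., 1., 1., 1.]])
    target = torch.tensor([[0., 0., 0., 1., 1., 1.],
                           [2., 2., 2., 3., 3., 3.]])
    print(AxisAlignedIoULoss(reduction='mean')(pred, target))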
================================================
FILE: mmdet3d/models/losses/chamfer_distance.py
================================================
import torch
from torch import nn as nn
from torch.nn.functional import l1_loss, mse_loss, smooth_l1_loss
from mmdet.models.builder import LOSSES
def chamfer_distance(src,
dst,
src_weight=1.0,
dst_weight=1.0,
criterion_mode='l2',
reduction='mean'):
"""Calculate Chamfer Distance of two sets.
Args:
src (torch.Tensor): Source set with shape [B, N, C] to
calculate Chamfer Distance.
dst (torch.Tensor): Destination set with shape [B, M, C] to
calculate Chamfer Distance.
src_weight (torch.Tensor or float): Weight of source loss.
dst_weight (torch.Tensor or float): Weight of destination loss.
criterion_mode (str): Criterion mode to calculate distance.
The valid modes are smooth_l1, l1 or l2.
reduction (str): Method to reduce losses.
            The valid reduction methods are 'none', 'sum' or 'mean'.
Returns:
tuple: Source and Destination loss with the corresponding indices.
- loss_src (torch.Tensor): The min distance \
from source to destination.
- loss_dst (torch.Tensor): The min distance \
from destination to source.
- indices1 (torch.Tensor): Index the min distance point \
for each point in source to destination.
- indices2 (torch.Tensor): Index the min distance point \
for each point in destination to source.
"""
if criterion_mode == 'smooth_l1':
criterion = smooth_l1_loss
elif criterion_mode == 'l1':
criterion = l1_loss
elif criterion_mode == 'l2':
criterion = mse_loss
else:
raise NotImplementedError
src_expand = src.unsqueeze(2).repeat(1, 1, dst.shape[1], 1)
dst_expand = dst.unsqueeze(1).repeat(1, src.shape[1], 1, 1)
distance = criterion(src_expand, dst_expand, reduction='none').sum(-1)
src2dst_distance, indices1 = torch.min(distance, dim=2) # (B,N)
dst2src_distance, indices2 = torch.min(distance, dim=1) # (B,M)
loss_src = (src2dst_distance * src_weight)
loss_dst = (dst2src_distance * dst_weight)
if reduction == 'sum':
loss_src = torch.sum(loss_src)
loss_dst = torch.sum(loss_dst)
elif reduction == 'mean':
loss_src = torch.mean(loss_src)
loss_dst = torch.mean(loss_dst)
elif reduction == 'none':
pass
else:
raise NotImplementedError
return loss_src, loss_dst, indices1, indices2
@LOSSES.register_module()
class ChamferDistance(nn.Module):
"""Calculate Chamfer Distance of two sets.
Args:
mode (str): Criterion mode to calculate distance.
The valid modes are smooth_l1, l1 or l2.
reduction (str): Method to reduce losses.
            The valid reduction methods are 'none', 'sum' or 'mean'.
loss_src_weight (float): Weight of loss_source.
loss_dst_weight (float): Weight of loss_target.
"""
def __init__(self,
mode='l2',
reduction='mean',
loss_src_weight=1.0,
loss_dst_weight=1.0):
super(ChamferDistance, self).__init__()
assert mode in ['smooth_l1', 'l1', 'l2']
assert reduction in ['none', 'sum', 'mean']
self.mode = mode
self.reduction = reduction
self.loss_src_weight = loss_src_weight
self.loss_dst_weight = loss_dst_weight
def forward(self,
source,
target,
src_weight=1.0,
dst_weight=1.0,
reduction_override=None,
return_indices=False,
**kwargs):
"""Forward function of loss calculation.
Args:
source (torch.Tensor): Source set with shape [B, N, C] to
calculate Chamfer Distance.
target (torch.Tensor): Destination set with shape [B, M, C] to
calculate Chamfer Distance.
src_weight (torch.Tensor | float, optional):
Weight of source loss. Defaults to 1.0.
dst_weight (torch.Tensor | float, optional):
Weight of destination loss. Defaults to 1.0.
reduction_override (str, optional): Method to reduce losses.
                The valid reduction methods are 'none', 'sum' or 'mean'.
Defaults to None.
return_indices (bool, optional): Whether to return indices.
Defaults to False.
Returns:
tuple[torch.Tensor]: If ``return_indices=True``, return losses of \
source and target with their corresponding indices in the \
order of ``(loss_source, loss_target, indices1, indices2)``. \
If ``return_indices=False``, return \
``(loss_source, loss_target)``.
"""
assert reduction_override in (None, 'none', 'mean', 'sum')
reduction = (
reduction_override if reduction_override else self.reduction)
loss_source, loss_target, indices1, indices2 = chamfer_distance(
source, target, src_weight, dst_weight, self.mode, reduction)
loss_source *= self.loss_src_weight
loss_target *= self.loss_dst_weight
if return_indices:
return loss_source, loss_target, indices1, indices2
else:
return loss_source, loss_target
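if __name__ == '__main__':
    # Minimal usage sketch (not part of the original module): the Chamfer
    # Distance between a point set and an identical copy of itself is zero in
    # both directions, independent of the criterion mode.
    src = torch.rand(2, 16, 3)
    loss_src, loss_dst = ChamferDistance(mode='l2')(src, src.clone())
    print(loss_src, loss_dst)  # both expected to be zero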
================================================
FILE: mmdet3d/models/losses/uncertainty_loss.py
================================================
import torch
from torch import nn as nn
from mmdet.models.builder import LOSSES
from mmdet.models.losses.utils import weighted_loss
@weighted_loss
def laplacian_aleatoric_uncertainty_loss(pred, target):
'''
References:
MonoPair: Monocular 3D Object Detection Using Pairwise Spatial Relationships, CVPR'20
Geometry and Uncertainty in Deep Learning for Computer Vision, University of Cambridge
'''
log_variance = pred[..., 1:]
pred = pred[..., :1]
if target.numel() == 0:
return pred.sum() * 0
assert pred.size() == target.size()
assert pred.size() == log_variance.size()
loss = 1.4142 * torch.exp(-log_variance) * torch.abs(pred - target) + log_variance
return loss
@LOSSES.register_module()
class LaplaceL1Loss(nn.Module):
"""L1 loss.
Args:
reduction (str, optional): The method to reduce the loss.
Options are "none", "mean" and "sum".
loss_weight (float, optional): The weight of loss.
"""
def __init__(self, reduction='mean', loss_weight=1.0):
super(LaplaceL1Loss, self).__init__()
self.reduction = reduction
self.loss_weight = loss_weight
def forward(self,
pred,
target,
weight=None,
avg_factor=None,
reduction_override=None):
"""Forward function.
Args:
pred (torch.Tensor): The prediction.
target (torch.Tensor): The learning target of the prediction.
weight (torch.Tensor, optional): The weight of loss for each
prediction. Defaults to None.
avg_factor (int, optional): Average factor that is used to average
the loss. Defaults to None.
reduction_override (str, optional): The reduction method used to
override the original reduction method of the loss.
Defaults to None.
"""
assert reduction_override in (None, 'none', 'mean', 'sum')
reduction = (
reduction_override if reduction_override else self.reduction)
loss = laplacian_aleatoric_uncertainty_loss(pred, target, weight=weight, reduction=reduction, avg_factor=avg_factor)
loss_bbox = self.loss_weight * loss
return loss_bbox
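if __name__ == '__main__':
    # Minimal usage sketch (not part of the original module): the last channel
    # of ``pred`` is the predicted log variance and the first channel is the
    # predicted value. With zero error and zero log variance the loss is zero.
    pred = torch.zeros(4, 2)   # (value, log_variance) pairs
    target = torch.zeros(4, 1)
    print(LaplaceL1Loss(reduction='mean')(pred, target))  # expected: tensor(0.)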
================================================
FILE: mmdet3d/models/middle_encoders/__init__.py
================================================
from .pillar_scatter import PointPillarsScatter
from .sparse_encoder import SparseEncoder
from .sparse_unet import SparseUNet
__all__ = ['PointPillarsScatter', 'SparseEncoder', 'SparseUNet']
================================================
FILE: mmdet3d/models/middle_encoders/pillar_scatter.py
================================================
import torch
from mmcv.runner import auto_fp16
from torch import nn
from ..registry import MIDDLE_ENCODERS
@MIDDLE_ENCODERS.register_module()
class PointPillarsScatter(nn.Module):
"""Point Pillar's Scatter.
Converts learned features from dense tensor to sparse pseudo image.
Args:
in_channels (int): Channels of input features.
output_shape (list[int]): Required output shape of features.
"""
def __init__(self, in_channels, output_shape):
super().__init__()
self.output_shape = output_shape
self.ny = output_shape[0]
self.nx = output_shape[1]
self.in_channels = in_channels
self.fp16_enabled = False
@auto_fp16(apply_to=('voxel_features', ))
def forward(self, voxel_features, coors, batch_size=None):
"""Foraward function to scatter features."""
# TODO: rewrite the function in a batch manner
# no need to deal with different batch cases
if batch_size is not None:
return self.forward_batch(voxel_features, coors, batch_size)
else:
return self.forward_single(voxel_features, coors)
def forward_single(self, voxel_features, coors):
"""Scatter features of single sample.
Args:
voxel_features (torch.Tensor): Voxel features in shape (N, M, C).
coors (torch.Tensor): Coordinates of each voxel.
The first column indicates the sample ID.
"""
# Create the canvas for this sample
canvas = torch.zeros(
self.in_channels,
self.nx * self.ny,
dtype=voxel_features.dtype,
device=voxel_features.device)
indices = coors[:, 1] * self.nx + coors[:, 2]
indices = indices.long()
voxels = voxel_features.t()
# Now scatter the blob back to the canvas.
canvas[:, indices] = voxels
# Undo the column stacking to final 4-dim tensor
canvas = canvas.view(1, self.in_channels, self.ny, self.nx)
return [canvas]
def forward_batch(self, voxel_features, coors, batch_size):
"""Scatter features of single sample.
Args:
voxel_features (torch.Tensor): Voxel features in shape (N, M, C).
coors (torch.Tensor): Coordinates of each voxel in shape (N, 4).
The first column indicates the sample ID.
batch_size (int): Number of samples in the current batch.
"""
# batch_canvas will be the final output.
batch_canvas = []
for batch_itt in range(batch_size):
# Create the canvas for this sample
canvas = torch.zeros(
self.in_channels,
self.nx * self.ny,
dtype=voxel_features.dtype,
device=voxel_features.device)
# Only include non-empty pillars
batch_mask = coors[:, 0] == batch_itt
this_coors = coors[batch_mask, :]
indices = this_coors[:, 2] * self.nx + this_coors[:, 3]
indices = indices.type(torch.long)
voxels = voxel_features[batch_mask, :]
voxels = voxels.t()
# Now scatter the blob back to the canvas.
canvas[:, indices] = voxels
# Append to a list for later stacking.
batch_canvas.append(canvas)
# Stack to 3-dim tensor (batch-size, in_channels, nrows*ncols)
batch_canvas = torch.stack(batch_canvas, 0)
# Undo the column stacking to final 4-dim tensor
batch_canvas = batch_canvas.view(batch_size, self.in_channels, self.ny,
self.nx)
return batch_canvas
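if __name__ == '__main__':
    # Minimal usage sketch (not part of the original module): scatter 10
    # random pillar features onto a tiny 4 x 8 BEV canvas. ``coors`` follows
    # the (batch_idx, z_idx, y_idx, x_idx) convention used by forward_batch.
    scatter = PointPillarsScatter(in_channels=64, output_shape=[4, 8])
    voxel_features = torch.rand(10, 64)
    coors = torch.zeros(10, 4, dtype=torch.long)
    coors[:, 2] = torch.randint(0, 4, (10, ))
    coors[:, 3] = torch.randint(0, 8, (10, ))
    canvas = scatter(voxel_features, coors, batch_size=1)
    print(canvas.shape)  # expected: torch.Size([1, 64, 4, 8])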
================================================
FILE: mmdet3d/models/middle_encoders/sparse_encoder.py
================================================
from mmcv.runner import auto_fp16
from torch import nn as nn
from mmdet3d.ops import SparseBasicBlock, make_sparse_convmodule
from mmdet3d.ops import spconv as spconv
from ..registry import MIDDLE_ENCODERS
@MIDDLE_ENCODERS.register_module()
class SparseEncoder(nn.Module):
r"""Sparse encoder for SECOND and Part-A2.
Args:
in_channels (int): The number of input channels.
sparse_shape (list[int]): The sparse shape of input tensor.
order (list[str]): Order of conv module. Defaults to ('conv',
'norm', 'act').
norm_cfg (dict): Config of normalization layer. Defaults to
dict(type='BN1d', eps=1e-3, momentum=0.01).
base_channels (int): Out channels for conv_input layer.
Defaults to 16.
output_channels (int): Out channels for conv_out layer.
Defaults to 128.
encoder_channels (tuple[tuple[int]]):
Convolutional channels of each encode block.
encoder_paddings (tuple[tuple[int]]): Paddings of each encode block.
Defaults to ((16, ), (32, 32, 32), (64, 64, 64), (64, 64, 64)).
block_type (str): Type of the block to use. Defaults to 'conv_module'.
"""
def __init__(self,
in_channels,
sparse_shape,
order=('conv', 'norm', 'act'),
norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.01),
base_channels=16,
output_channels=128,
encoder_channels=((16, ), (32, 32, 32), (64, 64, 64), (64, 64,
64)),
encoder_paddings=((1, ), (1, 1, 1), (1, 1, 1), ((0, 1, 1), 1,
1)),
block_type='conv_module'):
super().__init__()
assert block_type in ['conv_module', 'basicblock']
self.sparse_shape = sparse_shape
self.in_channels = in_channels
self.order = order
self.base_channels = base_channels
self.output_channels = output_channels
self.encoder_channels = encoder_channels
self.encoder_paddings = encoder_paddings
self.stage_num = len(self.encoder_channels)
self.fp16_enabled = False
# Spconv init all weight on its own
assert isinstance(order, tuple) and len(order) == 3
assert set(order) == {'conv', 'norm', 'act'}
if self.order[0] != 'conv': # pre activate
self.conv_input = make_sparse_convmodule(
in_channels,
self.base_channels,
3,
norm_cfg=norm_cfg,
padding=1,
indice_key='subm1',
conv_type='SubMConv3d',
order=('conv', ))
else: # post activate
self.conv_input = make_sparse_convmodule(
in_channels,
self.base_channels,
3,
norm_cfg=norm_cfg,
padding=1,
indice_key='subm1',
conv_type='SubMConv3d')
encoder_out_channels = self.make_encoder_layers(
make_sparse_convmodule,
norm_cfg,
self.base_channels,
block_type=block_type)
self.conv_out = make_sparse_convmodule(
encoder_out_channels,
self.output_channels,
kernel_size=(3, 1, 1),
stride=(2, 1, 1),
norm_cfg=norm_cfg,
padding=0,
indice_key='spconv_down2',
conv_type='SparseConv3d')
@auto_fp16(apply_to=('voxel_features', ))
def forward(self, voxel_features, coors, batch_size):
"""Forward of SparseEncoder.
Args:
voxel_features (torch.float32): Voxel features in shape (N, C).
coors (torch.int32): Coordinates in shape (N, 4), \
the columns in the order of (batch_idx, z_idx, y_idx, x_idx).
batch_size (int): Batch size.
Returns:
dict: Backbone features.
"""
coors = coors.int()
input_sp_tensor = spconv.SparseConvTensor(voxel_features, coors,
self.sparse_shape,
batch_size)
x = self.conv_input(input_sp_tensor)
encode_features = []
for encoder_layer in self.encoder_layers:
x = encoder_layer(x)
encode_features.append(x)
# for detection head
# [200, 176, 5] -> [200, 176, 2]
out = self.conv_out(encode_features[-1])
spatial_features = out.dense()
N, C, D, H, W = spatial_features.shape
spatial_features = spatial_features.view(N, C * D, H, W)
return spatial_features
def make_encoder_layers(self,
make_block,
norm_cfg,
in_channels,
block_type='conv_module',
conv_cfg=dict(type='SubMConv3d')):
"""make encoder layers using sparse convs.
Args:
make_block (method): A bounded function to build blocks.
norm_cfg (dict[str]): Config of normalization layer.
in_channels (int): The number of encoder input channels.
block_type (str): Type of the block to use. Defaults to
'conv_module'.
conv_cfg (dict): Config of conv layer. Defaults to
dict(type='SubMConv3d').
Returns:
int: The number of encoder output channels.
"""
assert block_type in ['conv_module', 'basicblock']
self.encoder_layers = spconv.SparseSequential()
for i, blocks in enumerate(self.encoder_channels):
blocks_list = []
for j, out_channels in enumerate(tuple(blocks)):
padding = tuple(self.encoder_paddings[i])[j]
# each stage started with a spconv layer
# except the first stage
if i != 0 and j == 0 and block_type == 'conv_module':
blocks_list.append(
make_block(
in_channels,
out_channels,
3,
norm_cfg=norm_cfg,
stride=2,
padding=padding,
indice_key=f'spconv{i + 1}',
conv_type='SparseConv3d'))
elif block_type == 'basicblock':
if j == len(blocks) - 1 and i != len(
self.encoder_channels) - 1:
blocks_list.append(
make_block(
in_channels,
out_channels,
3,
norm_cfg=norm_cfg,
stride=2,
padding=padding,
indice_key=f'spconv{i + 1}',
conv_type='SparseConv3d'))
else:
blocks_list.append(
SparseBasicBlock(
out_channels,
out_channels,
norm_cfg=norm_cfg,
conv_cfg=conv_cfg))
else:
blocks_list.append(
make_block(
in_channels,
out_channels,
3,
norm_cfg=norm_cfg,
padding=padding,
indice_key=f'subm{i + 1}',
conv_type='SubMConv3d'))
in_channels = out_channels
stage_name = f'encoder_layer{i + 1}'
stage_layers = spconv.SparseSequential(*blocks_list)
self.encoder_layers.add_module(stage_name, stage_layers)
return out_channels
================================================
FILE: mmdet3d/models/middle_encoders/sparse_unet.py
================================================
import torch
from mmcv.runner import auto_fp16
from torch import nn as nn
from mmdet3d.ops import SparseBasicBlock, make_sparse_convmodule
from mmdet3d.ops import spconv as spconv
from ..registry import MIDDLE_ENCODERS
@MIDDLE_ENCODERS.register_module()
class SparseUNet(nn.Module):
r"""SparseUNet for PartA^2.
    See the paper for more details.
Args:
in_channels (int): The number of input channels.
sparse_shape (list[int]): The sparse shape of input tensor.
norm_cfg (dict): Config of normalization layer.
base_channels (int): Out channels for conv_input layer.
output_channels (int): Out channels for conv_out layer.
encoder_channels (tuple[tuple[int]]):
Convolutional channels of each encode block.
encoder_paddings (tuple[tuple[int]]): Paddings of each encode block.
decoder_channels (tuple[tuple[int]]):
Convolutional channels of each decode block.
decoder_paddings (tuple[tuple[int]]): Paddings of each decode block.
"""
def __init__(self,
in_channels,
sparse_shape,
order=('conv', 'norm', 'act'),
norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.01),
base_channels=16,
output_channels=128,
encoder_channels=((16, ), (32, 32, 32), (64, 64, 64), (64, 64,
64)),
encoder_paddings=((1, ), (1, 1, 1), (1, 1, 1), ((0, 1, 1), 1,
1)),
decoder_channels=((64, 64, 64), (64, 64, 32), (32, 32, 16),
(16, 16, 16)),
decoder_paddings=((1, 0), (1, 0), (0, 0), (0, 1))):
super().__init__()
self.sparse_shape = sparse_shape
self.in_channels = in_channels
self.order = order
self.base_channels = base_channels
self.output_channels = output_channels
self.encoder_channels = encoder_channels
self.encoder_paddings = encoder_paddings
self.decoder_channels = decoder_channels
self.decoder_paddings = decoder_paddings
self.stage_num = len(self.encoder_channels)
self.fp16_enabled = False
# Spconv init all weight on its own
assert isinstance(order, tuple) and len(order) == 3
assert set(order) == {'conv', 'norm', 'act'}
if self.order[0] != 'conv': # pre activate
self.conv_input = make_sparse_convmodule(
in_channels,
self.base_channels,
3,
norm_cfg=norm_cfg,
padding=1,
indice_key='subm1',
conv_type='SubMConv3d',
order=('conv', ))
else: # post activate
self.conv_input = make_sparse_convmodule(
in_channels,
self.base_channels,
3,
norm_cfg=norm_cfg,
padding=1,
indice_key='subm1',
conv_type='SubMConv3d')
encoder_out_channels = self.make_encoder_layers(
make_sparse_convmodule, norm_cfg, self.base_channels)
self.make_decoder_layers(make_sparse_convmodule, norm_cfg,
encoder_out_channels)
self.conv_out = make_sparse_convmodule(
encoder_out_channels,
self.output_channels,
kernel_size=(3, 1, 1),
stride=(2, 1, 1),
norm_cfg=norm_cfg,
padding=0,
indice_key='spconv_down2',
conv_type='SparseConv3d')
@auto_fp16(apply_to=('voxel_features', ))
def forward(self, voxel_features, coors, batch_size):
"""Forward of SparseUNet.
Args:
voxel_features (torch.float32): Voxel features in shape [N, C].
coors (torch.int32): Coordinates in shape [N, 4],
the columns in the order of (batch_idx, z_idx, y_idx, x_idx).
batch_size (int): Batch size.
Returns:
dict[str, torch.Tensor]: Backbone features.
"""
coors = coors.int()
input_sp_tensor = spconv.SparseConvTensor(voxel_features, coors,
self.sparse_shape,
batch_size)
x = self.conv_input(input_sp_tensor)
encode_features = []
for encoder_layer in self.encoder_layers:
x = encoder_layer(x)
encode_features.append(x)
# for detection head
# [200, 176, 5] -> [200, 176, 2]
out = self.conv_out(encode_features[-1])
spatial_features = out.dense()
N, C, D, H, W = spatial_features.shape
spatial_features = spatial_features.view(N, C * D, H, W)
# for segmentation head, with output shape:
# [400, 352, 11] <- [200, 176, 5]
# [800, 704, 21] <- [400, 352, 11]
# [1600, 1408, 41] <- [800, 704, 21]
# [1600, 1408, 41] <- [1600, 1408, 41]
decode_features = []
x = encode_features[-1]
for i in range(self.stage_num, 0, -1):
x = self.decoder_layer_forward(encode_features[i - 1], x,
getattr(self, f'lateral_layer{i}'),
getattr(self, f'merge_layer{i}'),
getattr(self, f'upsample_layer{i}'))
decode_features.append(x)
seg_features = decode_features[-1].features
ret = dict(
spatial_features=spatial_features, seg_features=seg_features)
return ret
def decoder_layer_forward(self, x_lateral, x_bottom, lateral_layer,
merge_layer, upsample_layer):
"""Forward of upsample and residual block.
Args:
x_lateral (:obj:`SparseConvTensor`): Lateral tensor.
x_bottom (:obj:`SparseConvTensor`): Feature from bottom layer.
lateral_layer (SparseBasicBlock): Convolution for lateral tensor.
merge_layer (SparseSequential): Convolution for merging features.
upsample_layer (SparseSequential): Convolution for upsampling.
Returns:
:obj:`SparseConvTensor`: Upsampled feature.
"""
x = lateral_layer(x_lateral)
x.features = torch.cat((x_bottom.features, x.features), dim=1)
x_merge = merge_layer(x)
x = self.reduce_channel(x, x_merge.features.shape[1])
x.features = x_merge.features + x.features
x = upsample_layer(x)
return x
@staticmethod
def reduce_channel(x, out_channels):
"""reduce channel for element-wise addition.
Args:
x (:obj:`SparseConvTensor`): Sparse tensor, ``x.features``
are in shape (N, C1).
out_channels (int): The number of channel after reduction.
Returns:
:obj:`SparseConvTensor`: Channel reduced feature.
"""
features = x.features
n, in_channels = features.shape
assert (in_channels % out_channels
== 0) and (in_channels >= out_channels)
x.features = features.view(n, out_channels, -1).sum(dim=2)
return x
def make_encoder_layers(self, make_block, norm_cfg, in_channels):
"""make encoder layers using sparse convs.
Args:
make_block (method): A bounded function to build blocks.
norm_cfg (dict[str]): Config of normalization layer.
in_channels (int): The number of encoder input channels.
Returns:
int: The number of encoder output channels.
"""
self.encoder_layers = spconv.SparseSequential()
for i, blocks in enumerate(self.encoder_channels):
blocks_list = []
for j, out_channels in enumerate(tuple(blocks)):
padding = tuple(self.encoder_paddings[i])[j]
# each stage started with a spconv layer
# except the first stage
if i != 0 and j == 0:
blocks_list.append(
make_block(
in_channels,
out_channels,
3,
norm_cfg=norm_cfg,
stride=2,
padding=padding,
indice_key=f'spconv{i + 1}',
conv_type='SparseConv3d'))
else:
blocks_list.append(
make_block(
in_channels,
out_channels,
3,
norm_cfg=norm_cfg,
padding=padding,
indice_key=f'subm{i + 1}',
conv_type='SubMConv3d'))
in_channels = out_channels
stage_name = f'encoder_layer{i + 1}'
stage_layers = spconv.SparseSequential(*blocks_list)
self.encoder_layers.add_module(stage_name, stage_layers)
return out_channels
def make_decoder_layers(self, make_block, norm_cfg, in_channels):
"""make decoder layers using sparse convs.
Args:
make_block (method): A bounded function to build blocks.
norm_cfg (dict[str]): Config of normalization layer.
            in_channels (int): The number of input channels of the decoder,
                i.e. the number of encoder output channels.
"""
block_num = len(self.decoder_channels)
for i, block_channels in enumerate(self.decoder_channels):
paddings = self.decoder_paddings[i]
setattr(
self, f'lateral_layer{block_num - i}',
SparseBasicBlock(
in_channels,
block_channels[0],
conv_cfg=dict(
type='SubMConv3d', indice_key=f'subm{block_num - i}'),
norm_cfg=norm_cfg))
setattr(
self, f'merge_layer{block_num - i}',
make_block(
in_channels * 2,
block_channels[1],
3,
norm_cfg=norm_cfg,
padding=paddings[0],
indice_key=f'subm{block_num - i}',
conv_type='SubMConv3d'))
if block_num - i != 1:
setattr(
self, f'upsample_layer{block_num - i}',
make_block(
in_channels,
block_channels[2],
3,
norm_cfg=norm_cfg,
indice_key=f'spconv{block_num - i}',
conv_type='SparseInverseConv3d'))
else:
# use submanifold conv instead of inverse conv
# in the last block
setattr(
self, f'upsample_layer{block_num - i}',
make_block(
in_channels,
block_channels[2],
3,
norm_cfg=norm_cfg,
padding=paddings[1],
indice_key='subm1',
conv_type='SubMConv3d'))
in_channels = block_channels[2]
================================================
FILE: mmdet3d/models/model_utils/__init__.py
================================================
from .vote_module import VoteModule
__all__ = ['VoteModule']
================================================
FILE: mmdet3d/models/model_utils/vote_module.py
================================================
import torch
from mmcv import is_tuple_of
from mmcv.cnn import ConvModule
from torch import nn as nn
from mmdet3d.models.builder import build_loss
class VoteModule(nn.Module):
"""Vote module.
Generate votes from seed point features.
Args:
in_channels (int): Number of channels of seed point features.
vote_per_seed (int): Number of votes generated from each seed point.
gt_per_seed (int): Number of ground truth votes generated
from each seed point.
num_points (int): Number of points to be used for voting.
conv_channels (tuple[int]): Out channels of vote
generating convolution.
conv_cfg (dict): Config of convolution.
Default: dict(type='Conv1d').
norm_cfg (dict): Config of normalization.
Default: dict(type='BN1d').
norm_feats (bool): Whether to normalize features.
Default: True.
with_res_feat (bool): Whether to predict residual features.
Default: True.
vote_xyz_range (list[float], None): The range of points translation.
vote_loss (dict): Config of vote loss.
"""
def __init__(self,
in_channels,
vote_per_seed=1,
gt_per_seed=3,
num_points=-1,
conv_channels=(16, 16),
conv_cfg=dict(type='Conv1d'),
norm_cfg=dict(type='BN1d'),
act_cfg=dict(type='ReLU'),
norm_feats=True,
with_res_feat=True,
vote_xyz_range=None,
vote_loss=None):
super().__init__()
self.in_channels = in_channels
self.vote_per_seed = vote_per_seed
self.gt_per_seed = gt_per_seed
self.num_points = num_points
self.norm_feats = norm_feats
self.with_res_feat = with_res_feat
assert vote_xyz_range is None or is_tuple_of(vote_xyz_range, float)
self.vote_xyz_range = vote_xyz_range
if vote_loss is not None:
self.vote_loss = build_loss(vote_loss)
prev_channels = in_channels
vote_conv_list = list()
for k in range(len(conv_channels)):
vote_conv_list.append(
ConvModule(
prev_channels,
conv_channels[k],
1,
padding=0,
conv_cfg=conv_cfg,
norm_cfg=norm_cfg,
act_cfg=act_cfg,
bias=True,
inplace=True))
prev_channels = conv_channels[k]
self.vote_conv = nn.Sequential(*vote_conv_list)
# conv_out predicts coordinate and residual features
if with_res_feat:
out_channel = (3 + in_channels) * self.vote_per_seed
else:
out_channel = 3 * self.vote_per_seed
self.conv_out = nn.Conv1d(prev_channels, out_channel, 1)
def forward(self, seed_points, seed_feats):
"""forward.
Args:
seed_points (torch.Tensor): Coordinate of the seed
points in shape (B, N, 3).
seed_feats (torch.Tensor): Features of the seed points in shape
(B, C, N).
Returns:
tuple[torch.Tensor]:
- vote_points: Voted xyz based on the seed points \
with shape (B, M, 3), ``M=num_seed*vote_per_seed``.
                - vote_features: Voted features based on the seed points with \
                    shape (B, C, M) where ``M=num_seed*vote_per_seed``, \
                    ``C=vote_feature_dim``.
                - offset: Offsets of the votes relative to the seed points \
                    with shape (B, 3, M).
        """
if self.num_points != -1:
assert self.num_points < seed_points.shape[1], \
f'Number of vote points ({self.num_points}) should be '\
f'smaller than seed points size ({seed_points.shape[1]})'
seed_points = seed_points[:, :self.num_points]
seed_feats = seed_feats[..., :self.num_points]
batch_size, feat_channels, num_seed = seed_feats.shape
num_vote = num_seed * self.vote_per_seed
x = self.vote_conv(seed_feats)
# (batch_size, (3+out_dim)*vote_per_seed, num_seed)
votes = self.conv_out(x)
votes = votes.transpose(2, 1).view(batch_size, num_seed,
self.vote_per_seed, -1)
offset = votes[:, :, :, 0:3]
if self.vote_xyz_range is not None:
limited_offset_list = []
for axis in range(len(self.vote_xyz_range)):
limited_offset_list.append(offset[..., axis].clamp(
min=-self.vote_xyz_range[axis],
max=self.vote_xyz_range[axis]))
limited_offset = torch.stack(limited_offset_list, -1)
vote_points = (seed_points.unsqueeze(2) +
limited_offset).contiguous()
else:
vote_points = (seed_points.unsqueeze(2) + offset).contiguous()
vote_points = vote_points.view(batch_size, num_vote, 3)
offset = offset.reshape(batch_size, num_vote, 3).transpose(2, 1)
if self.with_res_feat:
res_feats = votes[:, :, :, 3:]
vote_feats = (seed_feats.transpose(2, 1).unsqueeze(2) +
res_feats).contiguous()
vote_feats = vote_feats.view(batch_size,
num_vote, feat_channels).transpose(
2, 1).contiguous()
if self.norm_feats:
features_norm = torch.norm(vote_feats, p=2, dim=1)
vote_feats = vote_feats.div(features_norm.unsqueeze(1))
else:
vote_feats = seed_feats
return vote_points, vote_feats, offset
def get_loss(self, seed_points, vote_points, seed_indices,
vote_targets_mask, vote_targets):
"""Calculate loss of voting module.
Args:
seed_points (torch.Tensor): Coordinate of the seed points.
vote_points (torch.Tensor): Coordinate of the vote points.
seed_indices (torch.Tensor): Indices of seed points in raw points.
vote_targets_mask (torch.Tensor): Mask of valid vote targets.
vote_targets (torch.Tensor): Targets of votes.
Returns:
torch.Tensor: Weighted vote loss.
"""
batch_size, num_seed = seed_points.shape[:2]
seed_gt_votes_mask = torch.gather(vote_targets_mask, 1,
seed_indices).float()
seed_indices_expand = seed_indices.unsqueeze(-1).repeat(
1, 1, 3 * self.gt_per_seed)
seed_gt_votes = torch.gather(vote_targets, 1, seed_indices_expand)
seed_gt_votes += seed_points.repeat(1, 1, self.gt_per_seed)
weight = seed_gt_votes_mask / (torch.sum(seed_gt_votes_mask) + 1e-6)
distance = self.vote_loss(
vote_points.view(batch_size * num_seed, -1, 3),
seed_gt_votes.view(batch_size * num_seed, -1, 3),
dst_weight=weight.view(batch_size * num_seed, 1))[1]
vote_loss = torch.sum(torch.min(distance, dim=1)[0])
return vote_loss
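if __name__ == '__main__':
    # Minimal usage sketch (not part of the original module): generate one vote
    # per seed from 64 random seed points with 128-dim features; the output
    # shapes follow the docstring of ``forward``.
    vote_module = VoteModule(in_channels=128, conv_channels=(128, 128))
    seed_points = torch.rand(2, 64, 3)
    seed_feats = torch.rand(2, 128, 64)
    vote_points, vote_feats, offset = vote_module(seed_points, seed_feats)
    print(vote_points.shape, vote_feats.shape, offset.shape)
    # expected: torch.Size([2, 64, 3]) torch.Size([2, 128, 64]) torch.Size([2, 3, 64])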
================================================
FILE: mmdet3d/models/necks/__init__.py
================================================
from mmdet.models.necks.fpn import FPN
from .second_fpn import SECONDFPN
__all__ = ['FPN', 'SECONDFPN']
================================================
FILE: mmdet3d/models/necks/second_fpn.py
================================================
import numpy as np
import torch
from mmcv.cnn import (build_conv_layer, build_norm_layer, build_upsample_layer,
constant_init, is_norm, kaiming_init)
from mmcv.runner import auto_fp16
from torch import nn as nn
from mmdet.models import NECKS
@NECKS.register_module()
class SECONDFPN(nn.Module):
"""FPN used in SECOND/PointPillars/PartA2/MVXNet.
Args:
in_channels (list[int]): Input channels of multi-scale feature maps.
out_channels (list[int]): Output channels of feature maps.
upsample_strides (list[int]): Strides used to upsample the
feature maps.
norm_cfg (dict): Config dict of normalization layers.
upsample_cfg (dict): Config dict of upsample layers.
conv_cfg (dict): Config dict of conv layers.
use_conv_for_no_stride (bool): Whether to use conv when stride is 1.
"""
def __init__(self,
in_channels=[128, 128, 256],
out_channels=[256, 256, 256],
upsample_strides=[1, 2, 4],
norm_cfg=dict(type='BN', eps=1e-3, momentum=0.01),
upsample_cfg=dict(type='deconv', bias=False),
conv_cfg=dict(type='Conv2d', bias=False),
use_conv_for_no_stride=False):
# if for GroupNorm,
# cfg is dict(type='GN', num_groups=num_groups, eps=1e-3, affine=True)
super(SECONDFPN, self).__init__()
assert len(out_channels) == len(upsample_strides) == len(in_channels)
self.in_channels = in_channels
self.out_channels = out_channels
self.fp16_enabled = False
deblocks = []
for i, out_channel in enumerate(out_channels):
stride = upsample_strides[i]
if stride > 1 or (stride == 1 and not use_conv_for_no_stride):
upsample_layer = build_upsample_layer(
upsample_cfg,
in_channels=in_channels[i],
out_channels=out_channel,
kernel_size=upsample_strides[i],
stride=upsample_strides[i])
else:
stride = np.round(1 / stride).astype(np.int64)
upsample_layer = build_conv_layer(
conv_cfg,
in_channels=in_channels[i],
out_channels=out_channel,
kernel_size=stride,
stride=stride)
deblock = nn.Sequential(upsample_layer,
build_norm_layer(norm_cfg, out_channel)[1],
nn.ReLU(inplace=True))
deblocks.append(deblock)
self.deblocks = nn.ModuleList(deblocks)
def init_weights(self):
"""Initialize weights of FPN."""
for m in self.modules():
if isinstance(m, nn.Conv2d):
kaiming_init(m)
elif is_norm(m):
constant_init(m, 1)
@auto_fp16()
def forward(self, x):
"""Forward function.
Args:
x (torch.Tensor): 4D Tensor in (N, C, H, W) shape.
Returns:
list[torch.Tensor]: Multi-level feature maps.
"""
assert len(x) == len(self.in_channels)
ups = [deblock(x[i]) for i, deblock in enumerate(self.deblocks)]
if len(ups) > 1:
out = torch.cat(ups, dim=1)
else:
out = ups[0]
return [out]
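if __name__ == '__main__':
    # Minimal usage sketch (not part of the original module): with the default
    # in_channels=[128, 128, 256] and upsample_strides=[1, 2, 4], three feature
    # maps are upsampled to a common resolution and concatenated channel-wise.
    neck = SECONDFPN()
    x = [
        torch.rand(1, 128, 64, 64),
        torch.rand(1, 128, 32, 32),
        torch.rand(1, 256, 16, 16),
    ]
    print(neck(x)[0].shape)  # expected: torch.Size([1, 768, 64, 64])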
================================================
FILE: mmdet3d/models/registry.py
================================================
from mmcv.utils import Registry
VOXEL_ENCODERS = Registry('voxel_encoder')
MIDDLE_ENCODERS = Registry('middle_encoder')
FUSION_LAYERS = Registry('fusion_layer')
# ACTIVATION_LAYERS = Registry('activation layer')
DROPOUT_LAYERS = Registry('drop out layers')
POSITIONAL_ENCODING = Registry('position encoding')
ATTENTION = Registry('attention')
FEEDFORWARD_NETWORK = Registry('feed-forward Network')
TRANSFORMER_LAYER = Registry('transformerLayer')
TRANSFORMER_LAYER_SEQUENCE = Registry('transformer-layers sequence')
================================================
FILE: mmdet3d/models/roi_heads/__init__.py
================================================
from .base_3droi_head import Base3DRoIHead
from .bbox_heads import PartA2BboxHead
from .h3d_roi_head import H3DRoIHead
from .mask_heads import PointwiseSemanticHead, PrimitiveHead
from .part_aggregation_roi_head import PartAggregationROIHead
from .roi_extractors import Single3DRoIAwareExtractor, SingleRoIExtractor
__all__ = [
'Base3DRoIHead', 'PartAggregationROIHead', 'PointwiseSemanticHead',
'Single3DRoIAwareExtractor', 'PartA2BboxHead', 'SingleRoIExtractor',
'H3DRoIHead', 'PrimitiveHead'
]
================================================
FILE: mmdet3d/models/roi_heads/base_3droi_head.py
================================================
from abc import ABCMeta, abstractmethod
from torch import nn as nn
class Base3DRoIHead(nn.Module, metaclass=ABCMeta):
"""Base class for 3d RoIHeads."""
def __init__(self,
bbox_head=None,
mask_roi_extractor=None,
mask_head=None,
train_cfg=None,
test_cfg=None):
super(Base3DRoIHead, self).__init__()
self.train_cfg = train_cfg
self.test_cfg = test_cfg
if bbox_head is not None:
self.init_bbox_head(bbox_head)
if mask_head is not None:
self.init_mask_head(mask_roi_extractor, mask_head)
self.init_assigner_sampler()
@property
def with_bbox(self):
"""bool: whether the RoIHead has box head"""
return hasattr(self, 'bbox_head') and self.bbox_head is not None
@property
def with_mask(self):
"""bool: whether the RoIHead has mask head"""
return hasattr(self, 'mask_head') and self.mask_head is not None
@abstractmethod
def init_weights(self, pretrained):
"""Initialize the module with pre-trained weights."""
pass
@abstractmethod
def init_bbox_head(self):
"""Initialize the box head."""
pass
@abstractmethod
def init_mask_head(self):
"""Initialize maek head."""
pass
@abstractmethod
def init_assigner_sampler(self):
"""Initialize assigner and sampler."""
pass
@abstractmethod
def forward_train(self,
x,
img_metas,
proposal_list,
gt_bboxes,
gt_labels,
gt_bboxes_ignore=None,
**kwargs):
"""Forward function during training.
Args:
x (dict): Contains features from the first stage.
img_metas (list[dict]): Meta info of each image.
proposal_list (list[dict]): Proposal information from rpn.
gt_bboxes (list[:obj:`BaseInstance3DBoxes`]):
GT bboxes of each sample. The bboxes are encapsulated
by 3D box structures.
gt_labels (list[torch.LongTensor]): GT labels of each sample.
gt_bboxes_ignore (list[torch.Tensor], optional):
Ground truth boxes to be ignored.
Returns:
dict[str, torch.Tensor]: Losses from each head.
"""
pass
def simple_test(self,
x,
proposal_list,
img_metas,
proposals=None,
rescale=False,
**kwargs):
"""Test without augmentation."""
pass
def aug_test(self, x, proposal_list, img_metas, rescale=False, **kwargs):
"""Test with augmentations.
If rescale is False, then returned bboxes and masks will fit the scale
of imgs[0].
"""
pass
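# --- Illustrative subclass sketch (not part of the upstream file) ---
# ``Base3DRoIHead`` only fixes the interface; a concrete head has to
# implement the abstract hooks below. ``ToyRoIHead`` is a hypothetical,
# do-nothing example showing the minimum needed to instantiate the class.
# Note that ``__init__`` passes the ``bbox_head`` config into
# ``init_bbox_head``, so the override must accept that argument.
if __name__ == '__main__':

    class ToyRoIHead(Base3DRoIHead):

        def init_weights(self, pretrained=None):
            pass

        def init_bbox_head(self, bbox_head):
            self.bbox_head = bbox_head  # normally built via build_head()

        def init_mask_head(self, mask_roi_extractor=None, mask_head=None):
            pass

        def init_assigner_sampler(self):
            pass

        def forward_train(self, x, img_metas, proposal_list, gt_bboxes,
                          gt_labels, gt_bboxes_ignore=None, **kwargs):
            return dict()

    head = ToyRoIHead(bbox_head=dict(type='ToyBboxHead'))
    print(head.with_bbox, head.with_mask)  # True False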
================================================
FILE: mmdet3d/models/roi_heads/bbox_heads/__init__.py
================================================
from mmdet.models.roi_heads.bbox_heads import (BBoxHead, ConvFCBBoxHead,
DoubleConvFCBBoxHead,
Shared2FCBBoxHead,
Shared4Conv1FCBBoxHead)
from .h3d_bbox_head import H3DBboxHead
from .parta2_bbox_head import PartA2BboxHead
__all__ = [
'BBoxHead', 'ConvFCBBoxHead', 'Shared2FCBBoxHead',
'Shared4Conv1FCBBoxHead', 'DoubleConvFCBBoxHead', 'PartA2BboxHead',
'H3DBboxHead'
]
================================================
FILE: mmdet3d/models/roi_heads/bbox_heads/h3d_bbox_head.py
================================================
import torch
from mmcv.cnn import ConvModule
from torch import nn as nn
from torch.nn import functional as F
from mmdet3d.core.bbox import DepthInstance3DBoxes
from mmdet3d.core.post_processing import aligned_3d_nms
from mmdet3d.models.builder import build_loss
from mmdet3d.models.losses import chamfer_distance
from mmdet3d.ops import build_sa_module
from mmdet.core import build_bbox_coder, multi_apply
from mmdet.models import HEADS
@HEADS.register_module()
class H3DBboxHead(nn.Module):
r"""Bbox head of `H3DNet `_.
Args:
num_classes (int): The number of classes.
suface_matching_cfg (dict): Config for surface primitive matching.
line_matching_cfg (dict): Config for line primitive matching.
bbox_coder (:obj:`BaseBBoxCoder`): Bbox coder for encoding and
decoding boxes.
train_cfg (dict): Config for training.
test_cfg (dict): Config for testing.
gt_per_seed (int): Number of ground truth votes generated
from each seed point.
num_proposal (int): Number of proposal votes generated.
feat_channels (tuple[int]): Convolution channels of
prediction layer.
primitive_feat_refine_streams (int): The number of mlps to
refine primitive feature.
primitive_refine_channels (tuple[int]): Convolution channels of
prediction layer.
upper_thresh (float): Threshold for line matching.
surface_thresh (float): Threshold for surface matching.
line_thresh (float): Threshold for line matching.
conv_cfg (dict): Config of convolution in prediction layer.
norm_cfg (dict): Config of BN in prediction layer.
objectness_loss (dict): Config of objectness loss.
center_loss (dict): Config of center loss.
dir_class_loss (dict): Config of direction classification loss.
dir_res_loss (dict): Config of direction residual regression loss.
size_class_loss (dict): Config of size classification loss.
size_res_loss (dict): Config of size residual regression loss.
semantic_loss (dict): Config of point-wise semantic segmentation loss.
cues_objectness_loss (dict): Config of cues objectness loss.
cues_semantic_loss (dict): Config of cues semantic loss.
proposal_objectness_loss (dict): Config of proposal objectness
loss.
primitive_center_loss (dict): Config of primitive center regression
loss.
"""
def __init__(self,
num_classes,
suface_matching_cfg,
line_matching_cfg,
bbox_coder,
train_cfg=None,
test_cfg=None,
gt_per_seed=1,
num_proposal=256,
feat_channels=(128, 128),
primitive_feat_refine_streams=2,
primitive_refine_channels=[128, 128, 128],
upper_thresh=100.0,
surface_thresh=0.5,
line_thresh=0.5,
conv_cfg=dict(type='Conv1d'),
norm_cfg=dict(type='BN1d'),
objectness_loss=None,
center_loss=None,
dir_class_loss=None,
dir_res_loss=None,
size_class_loss=None,
size_res_loss=None,
semantic_loss=None,
cues_objectness_loss=None,
cues_semantic_loss=None,
proposal_objectness_loss=None,
primitive_center_loss=None):
super(H3DBboxHead, self).__init__()
self.num_classes = num_classes
self.train_cfg = train_cfg
self.test_cfg = test_cfg
self.gt_per_seed = gt_per_seed
self.num_proposal = num_proposal
self.with_angle = bbox_coder['with_rot']
self.upper_thresh = upper_thresh
self.surface_thresh = surface_thresh
self.line_thresh = line_thresh
self.objectness_loss = build_loss(objectness_loss)
self.center_loss = build_loss(center_loss)
self.dir_class_loss = build_loss(dir_class_loss)
self.dir_res_loss = build_loss(dir_res_loss)
self.size_class_loss = build_loss(size_class_loss)
self.size_res_loss = build_loss(size_res_loss)
self.semantic_loss = build_loss(semantic_loss)
self.bbox_coder = build_bbox_coder(bbox_coder)
self.num_sizes = self.bbox_coder.num_sizes
self.num_dir_bins = self.bbox_coder.num_dir_bins
self.cues_objectness_loss = build_loss(cues_objectness_loss)
self.cues_semantic_loss = build_loss(cues_semantic_loss)
self.proposal_objectness_loss = build_loss(proposal_objectness_loss)
self.primitive_center_loss = build_loss(primitive_center_loss)
assert suface_matching_cfg['mlp_channels'][-1] == \
line_matching_cfg['mlp_channels'][-1]
# surface center matching
self.surface_center_matcher = build_sa_module(suface_matching_cfg)
# line center matching
self.line_center_matcher = build_sa_module(line_matching_cfg)
# Compute the matching scores
matching_feat_dims = suface_matching_cfg['mlp_channels'][-1]
self.matching_conv = ConvModule(
matching_feat_dims,
matching_feat_dims,
1,
padding=0,
conv_cfg=conv_cfg,
norm_cfg=norm_cfg,
bias=True,
inplace=True)
self.matching_pred = nn.Conv1d(matching_feat_dims, 2, 1)
# Compute the semantic matching scores
self.semantic_matching_conv = ConvModule(
matching_feat_dims,
matching_feat_dims,
1,
padding=0,
conv_cfg=conv_cfg,
norm_cfg=norm_cfg,
bias=True,
inplace=True)
self.semantic_matching_pred = nn.Conv1d(matching_feat_dims, 2, 1)
# Surface feature aggregation
self.surface_feats_aggregation = list()
for k in range(primitive_feat_refine_streams):
self.surface_feats_aggregation.append(
ConvModule(
matching_feat_dims,
matching_feat_dims,
1,
padding=0,
conv_cfg=conv_cfg,
norm_cfg=norm_cfg,
bias=True,
inplace=True))
self.surface_feats_aggregation = nn.Sequential(
*self.surface_feats_aggregation)
# Line feature aggregation
self.line_feats_aggregation = list()
for k in range(primitive_feat_refine_streams):
self.line_feats_aggregation.append(
ConvModule(
matching_feat_dims,
matching_feat_dims,
1,
padding=0,
conv_cfg=conv_cfg,
norm_cfg=norm_cfg,
bias=True,
inplace=True))
self.line_feats_aggregation = nn.Sequential(
*self.line_feats_aggregation)
# surface center(6) + line center(12)
prev_channel = 18 * matching_feat_dims
self.bbox_pred = nn.ModuleList()
for k in range(len(primitive_refine_channels)):
self.bbox_pred.append(
ConvModule(
prev_channel,
primitive_refine_channels[k],
1,
padding=0,
conv_cfg=conv_cfg,
norm_cfg=norm_cfg,
bias=True,
inplace=False))
prev_channel = primitive_refine_channels[k]
# Final object detection
# Objectness scores (2), center residual (3),
# heading class+residual (num_heading_bin*2), size class +
# residual(num_size_cluster*4)
conv_out_channel = (2 + 3 + bbox_coder['num_dir_bins'] * 2 +
bbox_coder['num_sizes'] * 4 + self.num_classes)
self.bbox_pred.append(nn.Conv1d(prev_channel, conv_out_channel, 1))
def init_weights(self, pretrained=None):
"""Initialize the weights in detector.
Args:
pretrained (str, optional): Path to pre-trained weights.
Defaults to None.
"""
pass
def forward(self, feats_dict, sample_mod):
"""Forward pass.
Args:
feats_dict (dict): Feature dict from backbone.
sample_mod (str): Sample mode for vote aggregation layer.
valid modes are "vote", "seed" and "random".
Returns:
dict: Predictions of vote head.
"""
ret_dict = {}
aggregated_points = feats_dict['aggregated_points']
original_feature = feats_dict['aggregated_features']
batch_size = original_feature.shape[0]
object_proposal = original_feature.shape[2]
# Extract surface center, features and semantic predictions
z_center = feats_dict['pred_z_center']
xy_center = feats_dict['pred_xy_center']
z_semantic = feats_dict['sem_cls_scores_z']
xy_semantic = feats_dict['sem_cls_scores_xy']
z_feature = feats_dict['aggregated_features_z']
xy_feature = feats_dict['aggregated_features_xy']
# Extract line points and features
line_center = feats_dict['pred_line_center']
line_feature = feats_dict['aggregated_features_line']
surface_center_pred = torch.cat((z_center, xy_center), dim=1)
ret_dict['surface_center_pred'] = surface_center_pred
ret_dict['surface_sem_pred'] = torch.cat((z_semantic, xy_semantic),
dim=1)
# Extract the surface and line centers of rpn proposals
rpn_proposals = feats_dict['proposal_list']
rpn_proposals_bbox = DepthInstance3DBoxes(
rpn_proposals.reshape(-1, 7).clone(),
box_dim=rpn_proposals.shape[-1],
with_yaw=self.with_angle,
origin=(0.5, 0.5, 0.5))
obj_surface_center, obj_line_center = \
rpn_proposals_bbox.get_surface_line_center()
obj_surface_center = obj_surface_center.reshape(
batch_size, -1, 6, 3).transpose(1, 2).reshape(batch_size, -1, 3)
obj_line_center = obj_line_center.reshape(batch_size, -1, 12,
3).transpose(1, 2).reshape(
batch_size, -1, 3)
ret_dict['surface_center_object'] = obj_surface_center
ret_dict['line_center_object'] = obj_line_center
# aggregate primitive z and xy features to rpn proposals
surface_center_feature_pred = torch.cat((z_feature, xy_feature), dim=2)
surface_center_feature_pred = torch.cat(
(surface_center_feature_pred.new_zeros(
(batch_size, 6, surface_center_feature_pred.shape[2])),
surface_center_feature_pred),
dim=1)
surface_xyz, surface_features, _ = self.surface_center_matcher(
surface_center_pred,
surface_center_feature_pred,
target_xyz=obj_surface_center)
# aggregate primitive line features to rpn proposals
line_feature = torch.cat((line_feature.new_zeros(
(batch_size, 12, line_feature.shape[2])), line_feature),
dim=1)
line_xyz, line_features, _ = self.line_center_matcher(
line_center, line_feature, target_xyz=obj_line_center)
# combine the surface and line features
combine_features = torch.cat((surface_features, line_features), dim=2)
matching_features = self.matching_conv(combine_features)
matching_score = self.matching_pred(matching_features)
ret_dict['matching_score'] = matching_score.transpose(2, 1)
semantic_matching_features = self.semantic_matching_conv(
combine_features)
semantic_matching_score = self.semantic_matching_pred(
semantic_matching_features)
ret_dict['semantic_matching_score'] = \
semantic_matching_score.transpose(2, 1)
surface_features = self.surface_feats_aggregation(surface_features)
line_features = self.line_feats_aggregation(line_features)
# Combine all surface and line features
surface_features = surface_features.view(batch_size, -1,
object_proposal)
line_features = line_features.view(batch_size, -1, object_proposal)
combine_feature = torch.cat((surface_features, line_features), dim=1)
# Final bbox predictions
bbox_predictions = self.bbox_pred[0](combine_feature)
bbox_predictions += original_feature
for conv_module in self.bbox_pred[1:]:
bbox_predictions = conv_module(bbox_predictions)
refine_decode_res = self.bbox_coder.split_pred(
bbox_predictions[:, :self.num_classes + 2],
bbox_predictions[:, self.num_classes + 2:], aggregated_points)
for key in refine_decode_res.keys():
ret_dict[key + '_optimized'] = refine_decode_res[key]
return ret_dict
def loss(self,
bbox_preds,
points,
gt_bboxes_3d,
gt_labels_3d,
pts_semantic_mask=None,
pts_instance_mask=None,
img_metas=None,
rpn_targets=None,
gt_bboxes_ignore=None):
"""Compute loss.
Args:
bbox_preds (dict): Predictions from forward of h3d bbox head.
points (list[torch.Tensor]): Input points.
gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth \
bboxes of each sample.
gt_labels_3d (list[torch.Tensor]): Labels of each sample.
pts_semantic_mask (None | list[torch.Tensor]): Point-wise
semantic mask.
pts_instance_mask (None | list[torch.Tensor]): Point-wise
instance mask.
img_metas (list[dict]): Contain pcd and img's meta info.
rpn_targets (tuple): Targets generated by the rpn head.
gt_bboxes_ignore (None | list[torch.Tensor]): Specify
which bounding boxes to ignore.
Returns:
dict: Losses of H3dnet.
"""
(vote_targets, vote_target_masks, size_class_targets, size_res_targets,
dir_class_targets, dir_res_targets, center_targets, mask_targets,
valid_gt_masks, objectness_targets, objectness_weights,
box_loss_weights, valid_gt_weights) = rpn_targets
losses = {}
# calculate refined proposal loss
refined_proposal_loss = self.get_proposal_stage_loss(
bbox_preds,
size_class_targets,
size_res_targets,
dir_class_targets,
dir_res_targets,
center_targets,
mask_targets,
objectness_targets,
objectness_weights,
box_loss_weights,
valid_gt_weights,
suffix='_optimized')
for key in refined_proposal_loss.keys():
losses[key + '_optimized'] = refined_proposal_loss[key]
bbox3d_optimized = self.bbox_coder.decode(
bbox_preds, suffix='_optimized')
targets = self.get_targets(points, gt_bboxes_3d, gt_labels_3d,
pts_semantic_mask, pts_instance_mask,
bbox_preds)
(cues_objectness_label, cues_sem_label, proposal_objectness_label,
cues_mask, cues_match_mask, proposal_objectness_mask,
cues_matching_label, obj_surface_line_center) = targets
# match scores for each geometric primitive
objectness_scores = bbox_preds['matching_score']
# match scores for the semantics of primitives
objectness_scores_sem = bbox_preds['semantic_matching_score']
primitive_objectness_loss = self.cues_objectness_loss(
objectness_scores.transpose(2, 1),
cues_objectness_label,
weight=cues_mask,
avg_factor=cues_mask.sum() + 1e-6)
primitive_sem_loss = self.cues_semantic_loss(
objectness_scores_sem.transpose(2, 1),
cues_sem_label,
weight=cues_mask,
avg_factor=cues_mask.sum() + 1e-6)
objectness_scores = bbox_preds['obj_scores_optimized']
objectness_loss_refine = self.proposal_objectness_loss(
objectness_scores.transpose(2, 1), proposal_objectness_label)
primitive_matching_loss = (objectness_loss_refine *
cues_match_mask).sum() / (
cues_match_mask.sum() + 1e-6) * 0.5
primitive_sem_matching_loss = (
objectness_loss_refine * proposal_objectness_mask).sum() / (
proposal_objectness_mask.sum() + 1e-6) * 0.5
# Get the object surface center here
batch_size, object_proposal = bbox3d_optimized.shape[:2]
refined_bbox = DepthInstance3DBoxes(
bbox3d_optimized.reshape(-1, 7).clone(),
box_dim=bbox3d_optimized.shape[-1],
with_yaw=self.with_angle,
origin=(0.5, 0.5, 0.5))
pred_obj_surface_center, pred_obj_line_center = \
refined_bbox.get_surface_line_center()
pred_obj_surface_center = pred_obj_surface_center.reshape(
batch_size, -1, 6, 3).transpose(1, 2).reshape(batch_size, -1, 3)
pred_obj_line_center = pred_obj_line_center.reshape(
batch_size, -1, 12, 3).transpose(1, 2).reshape(batch_size, -1, 3)
pred_surface_line_center = torch.cat(
(pred_obj_surface_center, pred_obj_line_center), 1)
square_dist = self.primitive_center_loss(pred_surface_line_center,
obj_surface_line_center)
match_dist = torch.sqrt(square_dist.sum(dim=-1) + 1e-6)
primitive_centroid_reg_loss = torch.sum(
match_dist * cues_matching_label) / (
cues_matching_label.sum() + 1e-6)
refined_loss = dict(
primitive_objectness_loss=primitive_objectness_loss,
primitive_sem_loss=primitive_sem_loss,
primitive_matching_loss=primitive_matching_loss,
primitive_sem_matching_loss=primitive_sem_matching_loss,
primitive_centroid_reg_loss=primitive_centroid_reg_loss)
losses.update(refined_loss)
return losses
def get_bboxes(self,
points,
bbox_preds,
input_metas,
rescale=False,
suffix=''):
"""Generate bboxes from vote head predictions.
Args:
points (torch.Tensor): Input points.
bbox_preds (dict): Predictions from vote head.
input_metas (list[dict]): Point cloud and image's meta info.
rescale (bool): Whether to rescale bboxes.
Returns:
list[tuple[torch.Tensor]]: Bounding boxes, scores and labels.
"""
# decode boxes
obj_scores = F.softmax(
bbox_preds['obj_scores' + suffix], dim=-1)[..., -1]
sem_scores = F.softmax(bbox_preds['sem_scores'], dim=-1)
prediction_collection = {}
prediction_collection['center'] = bbox_preds['center' + suffix]
prediction_collection['dir_class'] = bbox_preds['dir_class']
prediction_collection['dir_res'] = bbox_preds['dir_res' + suffix]
prediction_collection['size_class'] = bbox_preds['size_class']
prediction_collection['size_res'] = bbox_preds['size_res' + suffix]
bbox3d = self.bbox_coder.decode(prediction_collection)
batch_size = bbox3d.shape[0]
results = list()
for b in range(batch_size):
bbox_selected, score_selected, labels = self.multiclass_nms_single(
obj_scores[b], sem_scores[b], bbox3d[b], points[b, ..., :3],
input_metas[b])
bbox = input_metas[b]['box_type_3d'](
bbox_selected,
box_dim=bbox_selected.shape[-1],
with_yaw=self.bbox_coder.with_rot)
results.append((bbox, score_selected, labels))
return results
def multiclass_nms_single(self, obj_scores, sem_scores, bbox, points,
input_meta):
"""Multi-class nms in single batch.
Args:
obj_scores (torch.Tensor): Objectness score of bounding boxes.
sem_scores (torch.Tensor): semantic class score of bounding boxes.
bbox (torch.Tensor): Predicted bounding boxes.
points (torch.Tensor): Input points.
input_meta (dict): Point cloud and image's meta info.
Returns:
tuple[torch.Tensor]: Bounding boxes, scores and labels.
"""
bbox = input_meta['box_type_3d'](
bbox,
box_dim=bbox.shape[-1],
with_yaw=self.bbox_coder.with_rot,
origin=(0.5, 0.5, 0.5))
box_indices = bbox.points_in_boxes(points)
corner3d = bbox.corners
minmax_box3d = corner3d.new(torch.Size((corner3d.shape[0], 6)))
minmax_box3d[:, :3] = torch.min(corner3d, dim=1)[0]
minmax_box3d[:, 3:] = torch.max(corner3d, dim=1)[0]
nonempty_box_mask = box_indices.T.sum(1) > 5
bbox_classes = torch.argmax(sem_scores, -1)
nms_selected = aligned_3d_nms(minmax_box3d[nonempty_box_mask],
obj_scores[nonempty_box_mask],
bbox_classes[nonempty_box_mask],
self.test_cfg.nms_thr)
# filter empty boxes and boxes with low score
scores_mask = (obj_scores > self.test_cfg.score_thr)
nonempty_box_inds = torch.nonzero(
nonempty_box_mask, as_tuple=False).flatten()
nonempty_mask = torch.zeros_like(bbox_classes).scatter(
0, nonempty_box_inds[nms_selected], 1)
selected = (nonempty_mask.bool() & scores_mask.bool())
if self.test_cfg.per_class_proposal:
bbox_selected, score_selected, labels = [], [], []
for k in range(sem_scores.shape[-1]):
bbox_selected.append(bbox[selected].tensor)
score_selected.append(obj_scores[selected] *
sem_scores[selected][:, k])
labels.append(
torch.zeros_like(bbox_classes[selected]).fill_(k))
bbox_selected = torch.cat(bbox_selected, 0)
score_selected = torch.cat(score_selected, 0)
labels = torch.cat(labels, 0)
else:
bbox_selected = bbox[selected].tensor
score_selected = obj_scores[selected]
labels = bbox_classes[selected]
return bbox_selected, score_selected, labels
def get_proposal_stage_loss(self,
bbox_preds,
size_class_targets,
size_res_targets,
dir_class_targets,
dir_res_targets,
center_targets,
mask_targets,
objectness_targets,
objectness_weights,
box_loss_weights,
valid_gt_weights,
suffix=''):
"""Compute loss for the aggregation module.
Args:
bbox_preds (dict): Predictions from forward of vote head.
size_class_targets (torch.Tensor): Ground truth \
size class of each prediction bounding box.
size_res_targets (torch.Tensor): Ground truth \
size residual of each prediction bounding box.
dir_class_targets (torch.Tensor): Ground truth \
direction class of each prediction bounding box.
dir_res_targets (torch.Tensor): Ground truth \
direction residual of each prediction bounding box.
center_targets (torch.Tensor): Ground truth center \
of each prediction bounding box.
mask_targets (torch.Tensor): Validation of each \
prediction bounding box.
objectness_targets (torch.Tensor): Ground truth \
objectness label of each prediction bounding box.
objectness_weights (torch.Tensor): Weights of objectness \
loss for each prediction bounding box.
box_loss_weights (torch.Tensor): Weights of regression \
loss for each prediction bounding box.
valid_gt_weights (torch.Tensor): Validation of each \
ground truth bounding box.
Returns:
dict: Losses of aggregation module.
"""
# calculate objectness loss
objectness_loss = self.objectness_loss(
bbox_preds['obj_scores' + suffix].transpose(2, 1),
objectness_targets,
weight=objectness_weights)
# calculate center loss
source2target_loss, target2source_loss = self.center_loss(
bbox_preds['center' + suffix],
center_targets,
src_weight=box_loss_weights,
dst_weight=valid_gt_weights)
center_loss = source2target_loss + target2source_loss
# calculate direction class loss
dir_class_loss = self.dir_class_loss(
bbox_preds['dir_class' + suffix].transpose(2, 1),
dir_class_targets,
weight=box_loss_weights)
# calculate direction residual loss
batch_size, proposal_num = size_class_targets.shape[:2]
heading_label_one_hot = dir_class_targets.new_zeros(
(batch_size, proposal_num, self.num_dir_bins))
heading_label_one_hot.scatter_(2, dir_class_targets.unsqueeze(-1), 1)
dir_res_norm = (bbox_preds['dir_res_norm' + suffix] *
heading_label_one_hot).sum(dim=-1)
dir_res_loss = self.dir_res_loss(
dir_res_norm, dir_res_targets, weight=box_loss_weights)
# calculate size class loss
size_class_loss = self.size_class_loss(
bbox_preds['size_class' + suffix].transpose(2, 1),
size_class_targets,
weight=box_loss_weights)
# calculate size residual loss
one_hot_size_targets = box_loss_weights.new_zeros(
(batch_size, proposal_num, self.num_sizes))
one_hot_size_targets.scatter_(2, size_class_targets.unsqueeze(-1), 1)
one_hot_size_targets_expand = one_hot_size_targets.unsqueeze(
-1).repeat(1, 1, 1, 3)
size_residual_norm = (bbox_preds['size_res_norm' + suffix] *
one_hot_size_targets_expand).sum(dim=2)
box_loss_weights_expand = box_loss_weights.unsqueeze(-1).repeat(
1, 1, 3)
size_res_loss = self.size_res_loss(
size_residual_norm,
size_res_targets,
weight=box_loss_weights_expand)
# calculate semantic loss
semantic_loss = self.semantic_loss(
bbox_preds['sem_scores' + suffix].transpose(2, 1),
mask_targets,
weight=box_loss_weights)
losses = dict(
objectness_loss=objectness_loss,
semantic_loss=semantic_loss,
center_loss=center_loss,
dir_class_loss=dir_class_loss,
dir_res_loss=dir_res_loss,
size_class_loss=size_class_loss,
size_res_loss=size_res_loss)
return losses
def get_targets(self,
points,
gt_bboxes_3d,
gt_labels_3d,
pts_semantic_mask=None,
pts_instance_mask=None,
bbox_preds=None):
"""Generate targets of proposal module.
Args:
points (list[torch.Tensor]): Points of each batch.
gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth \
bboxes of each batch.
gt_labels_3d (list[torch.Tensor]): Labels of each batch.
pts_semantic_mask (None | list[torch.Tensor]): Point-wise semantic
label of each batch.
pts_instance_mask (None | list[torch.Tensor]): Point-wise instance
label of each batch.
bbox_preds (torch.Tensor): Bounding box predictions of vote head.
Returns:
tuple[torch.Tensor]: Targets of proposal module.
"""
# find empty example
valid_gt_masks = list()
gt_num = list()
for index in range(len(gt_labels_3d)):
if len(gt_labels_3d[index]) == 0:
fake_box = gt_bboxes_3d[index].tensor.new_zeros(
1, gt_bboxes_3d[index].tensor.shape[-1])
gt_bboxes_3d[index] = gt_bboxes_3d[index].new_box(fake_box)
gt_labels_3d[index] = gt_labels_3d[index].new_zeros(1)
valid_gt_masks.append(gt_labels_3d[index].new_zeros(1))
gt_num.append(1)
else:
valid_gt_masks.append(gt_labels_3d[index].new_ones(
gt_labels_3d[index].shape))
gt_num.append(gt_labels_3d[index].shape[0])
if pts_semantic_mask is None:
pts_semantic_mask = [None for i in range(len(gt_labels_3d))]
pts_instance_mask = [None for i in range(len(gt_labels_3d))]
aggregated_points = [
bbox_preds['aggregated_points'][i]
for i in range(len(gt_labels_3d))
]
surface_center_pred = [
bbox_preds['surface_center_pred'][i]
for i in range(len(gt_labels_3d))
]
line_center_pred = [
bbox_preds['pred_line_center'][i]
for i in range(len(gt_labels_3d))
]
surface_center_object = [
bbox_preds['surface_center_object'][i]
for i in range(len(gt_labels_3d))
]
line_center_object = [
bbox_preds['line_center_object'][i]
for i in range(len(gt_labels_3d))
]
surface_sem_pred = [
bbox_preds['surface_sem_pred'][i]
for i in range(len(gt_labels_3d))
]
line_sem_pred = [
bbox_preds['sem_cls_scores_line'][i]
for i in range(len(gt_labels_3d))
]
(cues_objectness_label, cues_sem_label, proposal_objectness_label,
cues_mask, cues_match_mask, proposal_objectness_mask,
cues_matching_label, obj_surface_line_center) = multi_apply(
self.get_targets_single, points, gt_bboxes_3d, gt_labels_3d,
pts_semantic_mask, pts_instance_mask, aggregated_points,
surface_center_pred, line_center_pred, surface_center_object,
line_center_object, surface_sem_pred, line_sem_pred)
cues_objectness_label = torch.stack(cues_objectness_label)
cues_sem_label = torch.stack(cues_sem_label)
proposal_objectness_label = torch.stack(proposal_objectness_label)
cues_mask = torch.stack(cues_mask)
cues_match_mask = torch.stack(cues_match_mask)
proposal_objectness_mask = torch.stack(proposal_objectness_mask)
cues_matching_label = torch.stack(cues_matching_label)
obj_surface_line_center = torch.stack(obj_surface_line_center)
return (cues_objectness_label, cues_sem_label,
proposal_objectness_label, cues_mask, cues_match_mask,
proposal_objectness_mask, cues_matching_label,
obj_surface_line_center)
def get_targets_single(self,
points,
gt_bboxes_3d,
gt_labels_3d,
pts_semantic_mask=None,
pts_instance_mask=None,
aggregated_points=None,
pred_surface_center=None,
pred_line_center=None,
pred_obj_surface_center=None,
pred_obj_line_center=None,
pred_surface_sem=None,
pred_line_sem=None):
"""Generate targets for primitive cues for single batch.
Args:
points (torch.Tensor): Points of each batch.
gt_bboxes_3d (:obj:`BaseInstance3DBoxes`): Ground truth \
boxes of each batch.
gt_labels_3d (torch.Tensor): Labels of each batch.
pts_semantic_mask (None | torch.Tensor): Point-wise semantic
label of each batch.
pts_instance_mask (None | torch.Tensor): Point-wise instance
label of each batch.
aggregated_points (torch.Tensor): Aggregated points from
vote aggregation layer.
pred_surface_center (torch.Tensor): Prediction of surface center.
pred_line_center (torch.Tensor): Prediction of line center.
pred_obj_surface_center (torch.Tensor): Objectness prediction \
of surface center.
pred_obj_line_center (torch.Tensor): Objectness prediction of \
line center.
pred_surface_sem (torch.Tensor): Semantic prediction of \
surface center.
pred_line_sem (torch.Tensor): Semantic prediction of line center.
Returns:
tuple[torch.Tensor]: Targets for primitive cues.
"""
device = points.device
gt_bboxes_3d = gt_bboxes_3d.to(device)
num_proposals = aggregated_points.shape[0]
gt_center = gt_bboxes_3d.gravity_center
dist1, dist2, ind1, _ = chamfer_distance(
aggregated_points.unsqueeze(0),
gt_center.unsqueeze(0),
reduction='none')
# Set assignment
object_assignment = ind1.squeeze(0)
# Generate objectness label and mask
# objectness_label: 1 if pred object center is within
# self.train_cfg['near_threshold'] of any GT object
# objectness_mask: 0 if pred object center is in gray
# zone (DONOTCARE), 1 otherwise
euclidean_dist1 = torch.sqrt(dist1.squeeze(0) + 1e-6)
proposal_objectness_label = euclidean_dist1.new_zeros(
num_proposals, dtype=torch.long)
proposal_objectness_mask = euclidean_dist1.new_zeros(num_proposals)
gt_sem = gt_labels_3d[object_assignment]
obj_surface_center, obj_line_center = \
gt_bboxes_3d.get_surface_line_center()
obj_surface_center = obj_surface_center.reshape(-1, 6,
3).transpose(0, 1)
obj_line_center = obj_line_center.reshape(-1, 12, 3).transpose(0, 1)
obj_surface_center = obj_surface_center[:, object_assignment].reshape(
1, -1, 3)
obj_line_center = obj_line_center[:,
object_assignment].reshape(1, -1, 3)
surface_sem = torch.argmax(pred_surface_sem, dim=1).float()
line_sem = torch.argmax(pred_line_sem, dim=1).float()
dist_surface, _, surface_ind, _ = chamfer_distance(
obj_surface_center,
pred_surface_center.unsqueeze(0),
reduction='none')
dist_line, _, line_ind, _ = chamfer_distance(
obj_line_center, pred_line_center.unsqueeze(0), reduction='none')
surface_sel = pred_surface_center[surface_ind.squeeze(0)]
line_sel = pred_line_center[line_ind.squeeze(0)]
surface_sel_sem = surface_sem[surface_ind.squeeze(0)]
line_sel_sem = line_sem[line_ind.squeeze(0)]
surface_sel_sem_gt = gt_sem.repeat(6).float()
line_sel_sem_gt = gt_sem.repeat(12).float()
euclidean_dist_surface = torch.sqrt(dist_surface.squeeze(0) + 1e-6)
euclidean_dist_line = torch.sqrt(dist_line.squeeze(0) + 1e-6)
objectness_label_surface = euclidean_dist_line.new_zeros(
num_proposals * 6, dtype=torch.long)
objectness_mask_surface = euclidean_dist_line.new_zeros(num_proposals *
6)
objectness_label_line = euclidean_dist_line.new_zeros(
num_proposals * 12, dtype=torch.long)
objectness_mask_line = euclidean_dist_line.new_zeros(num_proposals *
12)
objectness_label_surface_sem = euclidean_dist_line.new_zeros(
num_proposals * 6, dtype=torch.long)
objectness_label_line_sem = euclidean_dist_line.new_zeros(
num_proposals * 12, dtype=torch.long)
euclidean_dist_obj_surface = torch.sqrt((
(pred_obj_surface_center - surface_sel)**2).sum(dim=-1) + 1e-6)
euclidean_dist_obj_line = torch.sqrt(
torch.sum((pred_obj_line_center - line_sel)**2, dim=-1) + 1e-6)
# Objectness score just with centers
proposal_objectness_label[
euclidean_dist1 < self.train_cfg['near_threshold']] = 1
proposal_objectness_mask[
euclidean_dist1 < self.train_cfg['near_threshold']] = 1
proposal_objectness_mask[
euclidean_dist1 > self.train_cfg['far_threshold']] = 1
objectness_label_surface[
(euclidean_dist_obj_surface <
self.train_cfg['label_surface_threshold']) *
(euclidean_dist_surface <
self.train_cfg['mask_surface_threshold'])] = 1
objectness_label_surface_sem[
(euclidean_dist_obj_surface <
self.train_cfg['label_surface_threshold']) *
(euclidean_dist_surface < self.train_cfg['mask_surface_threshold'])
* (surface_sel_sem == surface_sel_sem_gt)] = 1
objectness_label_line[
(euclidean_dist_obj_line < self.train_cfg['label_line_threshold'])
*
(euclidean_dist_line < self.train_cfg['mask_line_threshold'])] = 1
objectness_label_line_sem[
(euclidean_dist_obj_line < self.train_cfg['label_line_threshold'])
* (euclidean_dist_line < self.train_cfg['mask_line_threshold']) *
(line_sel_sem == line_sel_sem_gt)] = 1
objectness_label_surface_obj = proposal_objectness_label.repeat(6)
objectness_mask_surface_obj = proposal_objectness_mask.repeat(6)
objectness_label_line_obj = proposal_objectness_label.repeat(12)
objectness_mask_line_obj = proposal_objectness_mask.repeat(12)
objectness_mask_surface = objectness_mask_surface_obj
objectness_mask_line = objectness_mask_line_obj
cues_objectness_label = torch.cat(
(objectness_label_surface, objectness_label_line), 0)
cues_sem_label = torch.cat(
(objectness_label_surface_sem, objectness_label_line_sem), 0)
cues_mask = torch.cat((objectness_mask_surface, objectness_mask_line),
0)
objectness_label_surface *= objectness_label_surface_obj
objectness_label_line *= objectness_label_line_obj
cues_matching_label = torch.cat(
(objectness_label_surface, objectness_label_line), 0)
objectness_label_surface_sem *= objectness_label_surface_obj
objectness_label_line_sem *= objectness_label_line_obj
cues_match_mask = (torch.sum(
cues_objectness_label.view(18, num_proposals), dim=0) >=
1).float()
obj_surface_line_center = torch.cat(
(obj_surface_center, obj_line_center), 1).squeeze(0)
return (cues_objectness_label, cues_sem_label,
proposal_objectness_label, cues_mask, cues_match_mask,
proposal_objectness_mask, cues_matching_label,
obj_surface_line_center)
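# --- Illustrative channel bookkeeping (not part of the upstream file) ---
# Each proposal contributes 6 surface centers and 12 line centers, so the
# refinement convs above see 18 * matching_feat_dims input channels, and the
# final prediction conv emits objectness (2) + center residual (3) +
# direction class/residual (num_dir_bins * 2) + size class/residual
# (num_sizes * 4) + per-class semantic scores. The numbers below are
# example values only.
if __name__ == '__main__':
    matching_feat_dims, num_classes, num_dir_bins, num_sizes = 32, 18, 24, 18
    refine_in_channels = 18 * matching_feat_dims
    conv_out_channel = 2 + 3 + num_dir_bins * 2 + num_sizes * 4 + num_classes
    print(refine_in_channels, conv_out_channel)  # 576 143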
================================================
FILE: mmdet3d/models/roi_heads/bbox_heads/parta2_bbox_head.py
================================================
import numpy as np
import torch
from mmcv.cnn import ConvModule, normal_init, xavier_init
from torch import nn as nn
from mmdet3d.core.bbox.structures import (LiDARInstance3DBoxes,
rotation_3d_in_axis, xywhr2xyxyr)
from mmdet3d.models.builder import build_loss
from mmdet3d.ops import make_sparse_convmodule
from mmdet3d.ops import spconv as spconv
from mmdet3d.ops.iou3d.iou3d_utils import nms_gpu, nms_normal_gpu
from mmdet.core import build_bbox_coder, multi_apply
from mmdet.models import HEADS
@HEADS.register_module()
class PartA2BboxHead(nn.Module):
"""PartA2 RoI head.
Args:
num_classes (int): The number of classes to predict.
seg_in_channels (int): Input channels of segmentation
convolution layer.
part_in_channels (int): Input channels of part convolution layer.
seg_conv_channels (list(int)): Out channels of each
segmentation convolution layer.
part_conv_channels (list(int)): Out channels of each
part convolution layer.
merge_conv_channels (list(int)): Out channels of each
feature merged convolution layer.
down_conv_channels (list(int)): Out channels of each
downsampled convolution layer.
shared_fc_channels (list(int)): Out channels of each shared fc layer.
cls_channels (list(int)): Out channels of each classification layer.
reg_channels (list(int)): Out channels of each regression layer.
dropout_ratio (float): Dropout ratio of classification and
regression layers.
roi_feat_size (int): The size of pooled roi features.
with_corner_loss (bool): Whether to use corner loss or not.
bbox_coder (:obj:`BaseBBoxCoder`): Bbox coder for box head.
conv_cfg (dict): Config dict of convolutional layers.
norm_cfg (dict): Config dict of normalization layers.
loss_bbox (dict): Config dict of box regression loss.
loss_cls (dict): Config dict of classification loss.
"""
def __init__(self,
num_classes,
seg_in_channels,
part_in_channels,
seg_conv_channels=None,
part_conv_channels=None,
merge_conv_channels=None,
down_conv_channels=None,
shared_fc_channels=None,
cls_channels=None,
reg_channels=None,
dropout_ratio=0.1,
roi_feat_size=14,
with_corner_loss=True,
bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder'),
conv_cfg=dict(type='Conv1d'),
norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.01),
loss_bbox=dict(
type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=2.0),
loss_cls=dict(
type='CrossEntropyLoss',
use_sigmoid=True,
reduction='none',
loss_weight=1.0)):
super(PartA2BboxHead, self).__init__()
self.num_classes = num_classes
self.with_corner_loss = with_corner_loss
self.bbox_coder = build_bbox_coder(bbox_coder)
self.loss_bbox = build_loss(loss_bbox)
self.loss_cls = build_loss(loss_cls)
self.use_sigmoid_cls = loss_cls.get('use_sigmoid', False)
assert down_conv_channels[-1] == shared_fc_channels[0]
# init layers
part_channel_last = part_in_channels
part_conv = []
for i, channel in enumerate(part_conv_channels):
part_conv.append(
make_sparse_convmodule(
part_channel_last,
channel,
3,
padding=1,
norm_cfg=norm_cfg,
indice_key=f'rcnn_part{i}',
conv_type='SubMConv3d'))
part_channel_last = channel
self.part_conv = spconv.SparseSequential(*part_conv)
seg_channel_last = seg_in_channels
seg_conv = []
for i, channel in enumerate(seg_conv_channels):
seg_conv.append(
make_sparse_convmodule(
seg_channel_last,
channel,
3,
padding=1,
norm_cfg=norm_cfg,
indice_key=f'rcnn_seg{i}',
conv_type='SubMConv3d'))
seg_channel_last = channel
self.seg_conv = spconv.SparseSequential(*seg_conv)
self.conv_down = spconv.SparseSequential()
merge_conv_channel_last = part_channel_last + seg_channel_last
merge_conv = []
for i, channel in enumerate(merge_conv_channels):
merge_conv.append(
make_sparse_convmodule(
merge_conv_channel_last,
channel,
3,
padding=1,
norm_cfg=norm_cfg,
indice_key='rcnn_down0'))
merge_conv_channel_last = channel
down_conv_channel_last = merge_conv_channel_last
conv_down = []
for i, channel in enumerate(down_conv_channels):
conv_down.append(
make_sparse_convmodule(
down_conv_channel_last,
channel,
3,
padding=1,
norm_cfg=norm_cfg,
indice_key='rcnn_down1'))
down_conv_channel_last = channel
self.conv_down.add_module('merge_conv',
spconv.SparseSequential(*merge_conv))
self.conv_down.add_module(
'max_pool3d', spconv.SparseMaxPool3d(kernel_size=2, stride=2))
self.conv_down.add_module('down_conv',
spconv.SparseSequential(*conv_down))
shared_fc_list = []
pool_size = roi_feat_size // 2
pre_channel = shared_fc_channels[0] * pool_size**3
for k in range(1, len(shared_fc_channels)):
shared_fc_list.append(
ConvModule(
pre_channel,
shared_fc_channels[k],
1,
padding=0,
conv_cfg=conv_cfg,
norm_cfg=norm_cfg,
inplace=True))
pre_channel = shared_fc_channels[k]
if k != len(shared_fc_channels) - 1 and dropout_ratio > 0:
shared_fc_list.append(nn.Dropout(dropout_ratio))
self.shared_fc = nn.Sequential(*shared_fc_list)
# Classification layer
channel_in = shared_fc_channels[-1]
cls_channel = 1
cls_layers = []
pre_channel = channel_in
for k in range(0, len(cls_channels)):
cls_layers.append(
ConvModule(
pre_channel,
cls_channels[k],
1,
padding=0,
conv_cfg=conv_cfg,
norm_cfg=norm_cfg,
inplace=True))
pre_channel = cls_channels[k]
cls_layers.append(
ConvModule(
pre_channel,
cls_channel,
1,
padding=0,
conv_cfg=conv_cfg,
act_cfg=None))
if dropout_ratio >= 0:
cls_layers.insert(1, nn.Dropout(dropout_ratio))
self.conv_cls = nn.Sequential(*cls_layers)
# Regression layer
reg_layers = []
pre_channel = channel_in
for k in range(0, len(reg_channels)):
reg_layers.append(
ConvModule(
pre_channel,
reg_channels[k],
1,
padding=0,
conv_cfg=conv_cfg,
norm_cfg=norm_cfg,
inplace=True))
pre_channel = reg_channels[k]
reg_layers.append(
ConvModule(
pre_channel,
self.bbox_coder.code_size,
1,
padding=0,
conv_cfg=conv_cfg,
act_cfg=None))
if dropout_ratio >= 0:
reg_layers.insert(1, nn.Dropout(dropout_ratio))
self.conv_reg = nn.Sequential(*reg_layers)
self.init_weights()
def init_weights(self):
"""Initialize weights of the bbox head."""
for m in self.modules():
if isinstance(m, (nn.Conv2d, nn.Conv1d)):
xavier_init(m, distribution='uniform')
normal_init(self.conv_reg[-1].conv, mean=0, std=0.001)
def forward(self, seg_feats, part_feats):
"""Forward pass.
Args:
seg_feats (torch.Tensor): Point-wise semantic features.
part_feats (torch.Tensor): Point-wise part prediction features.
Returns:
tuple[torch.Tensor]: Score of class and bbox predictions.
"""
# (B * N, out_x, out_y, out_z, 4)
rcnn_batch_size = part_feats.shape[0]
# transform to sparse tensors
sparse_shape = part_feats.shape[1:4]
# (non_empty_num, 4) ==> [bs_idx, x_idx, y_idx, z_idx]
sparse_idx = part_feats.sum(dim=-1).nonzero(as_tuple=False)
part_features = part_feats[sparse_idx[:, 0], sparse_idx[:, 1],
sparse_idx[:, 2], sparse_idx[:, 3]]
seg_features = seg_feats[sparse_idx[:, 0], sparse_idx[:, 1],
sparse_idx[:, 2], sparse_idx[:, 3]]
coords = sparse_idx.int()
part_features = spconv.SparseConvTensor(part_features, coords,
sparse_shape, rcnn_batch_size)
seg_features = spconv.SparseConvTensor(seg_features, coords,
sparse_shape, rcnn_batch_size)
# forward rcnn network
x_part = self.part_conv(part_features)
x_rpn = self.seg_conv(seg_features)
merged_feature = torch.cat((x_rpn.features, x_part.features),
dim=1) # (N, C)
shared_feature = spconv.SparseConvTensor(merged_feature, coords,
sparse_shape, rcnn_batch_size)
x = self.conv_down(shared_feature)
shared_feature = x.dense().view(rcnn_batch_size, -1, 1)
shared_feature = self.shared_fc(shared_feature)
cls_score = self.conv_cls(shared_feature).transpose(
1, 2).contiguous().squeeze(dim=1) # (B, 1)
bbox_pred = self.conv_reg(shared_feature).transpose(
1, 2).contiguous().squeeze(dim=1) # (B, C)
return cls_score, bbox_pred
def loss(self, cls_score, bbox_pred, rois, labels, bbox_targets,
pos_gt_bboxes, reg_mask, label_weights, bbox_weights):
"""Coumputing losses.
Args:
cls_score (torch.Tensor): Scores of each roi.
bbox_pred (torch.Tensor): Predictions of bboxes.
rois (torch.Tensor): Roi bboxes.
labels (torch.Tensor): Labels of class.
bbox_targets (torch.Tensor): Target of positive bboxes.
pos_gt_bboxes (torch.Tensor): Ground truths of positive bboxes.
reg_mask (torch.Tensor): Mask for positive bboxes.
label_weights (torch.Tensor): Weights of class loss.
bbox_weights (torch.Tensor): Weights of bbox loss.
Returns:
dict: Computed losses.
- loss_cls (torch.Tensor): Loss of classes.
- loss_bbox (torch.Tensor): Loss of bboxes.
- loss_corner (torch.Tensor): Loss of corners.
"""
losses = dict()
rcnn_batch_size = cls_score.shape[0]
# calculate class loss
cls_flat = cls_score.view(-1)
loss_cls = self.loss_cls(cls_flat, labels, label_weights)
losses['loss_cls'] = loss_cls
# calculate regression loss
code_size = self.bbox_coder.code_size
pos_inds = (reg_mask > 0)
if pos_inds.any() == 0:
# fake a part loss
losses['loss_bbox'] = loss_cls.new_tensor(0)
if self.with_corner_loss:
losses['loss_corner'] = loss_cls.new_tensor(0)
else:
pos_bbox_pred = bbox_pred.view(rcnn_batch_size, -1)[pos_inds]
bbox_weights_flat = bbox_weights[pos_inds].view(-1, 1).repeat(
1, pos_bbox_pred.shape[-1])
loss_bbox = self.loss_bbox(
pos_bbox_pred.unsqueeze(dim=0), bbox_targets.unsqueeze(dim=0),
bbox_weights_flat.unsqueeze(dim=0))
losses['loss_bbox'] = loss_bbox
if self.with_corner_loss:
pos_roi_boxes3d = rois[..., 1:].view(-1, code_size)[pos_inds]
pos_roi_boxes3d = pos_roi_boxes3d.view(-1, code_size)
batch_anchors = pos_roi_boxes3d.clone().detach()
pos_rois_rotation = pos_roi_boxes3d[..., 6].view(-1)
roi_xyz = pos_roi_boxes3d[..., 0:3].view(-1, 3)
batch_anchors[..., 0:3] = 0
# decode boxes
pred_boxes3d = self.bbox_coder.decode(
batch_anchors,
pos_bbox_pred.view(-1, code_size)).view(-1, code_size)
pred_boxes3d[..., 0:3] = rotation_3d_in_axis(
pred_boxes3d[..., 0:3].unsqueeze(1),
(pos_rois_rotation + np.pi / 2),
axis=2).squeeze(1)
pred_boxes3d[:, 0:3] += roi_xyz
# calculate corner loss
loss_corner = self.get_corner_loss_lidar(
pred_boxes3d, pos_gt_bboxes)
losses['loss_corner'] = loss_corner
return losses
def get_targets(self, sampling_results, rcnn_train_cfg, concat=True):
"""Generate targets.
Args:
sampling_results (list[:obj:`SamplingResult`]):
Sampled results from rois.
rcnn_train_cfg (:obj:`ConfigDict`): Training config of rcnn.
concat (bool): Whether to concatenate targets between batches.
Returns:
tuple[torch.Tensor]: Targets of boxes and class prediction.
"""
pos_bboxes_list = [res.pos_bboxes for res in sampling_results]
pos_gt_bboxes_list = [res.pos_gt_bboxes for res in sampling_results]
iou_list = [res.iou for res in sampling_results]
targets = multi_apply(
self._get_target_single,
pos_bboxes_list,
pos_gt_bboxes_list,
iou_list,
cfg=rcnn_train_cfg)
(label, bbox_targets, pos_gt_bboxes, reg_mask, label_weights,
bbox_weights) = targets
if concat:
label = torch.cat(label, 0)
bbox_targets = torch.cat(bbox_targets, 0)
pos_gt_bboxes = torch.cat(pos_gt_bboxes, 0)
reg_mask = torch.cat(reg_mask, 0)
label_weights = torch.cat(label_weights, 0)
label_weights /= torch.clamp(label_weights.sum(), min=1.0)
bbox_weights = torch.cat(bbox_weights, 0)
bbox_weights /= torch.clamp(bbox_weights.sum(), min=1.0)
return (label, bbox_targets, pos_gt_bboxes, reg_mask, label_weights,
bbox_weights)
def _get_target_single(self, pos_bboxes, pos_gt_bboxes, ious, cfg):
"""Generate training targets for a single sample.
Args:
pos_bboxes (torch.Tensor): Positive boxes with shape
(N, 7).
pos_gt_bboxes (torch.Tensor): Ground truth boxes with shape
(M, 7).
ious (torch.Tensor): IoU between `pos_bboxes` and `pos_gt_bboxes`
in shape (N, M).
cfg (dict): Training configs.
Returns:
tuple[torch.Tensor]: Target for positive boxes.
(label, bbox_targets, pos_gt_bboxes, reg_mask, label_weights,
bbox_weights)
"""
cls_pos_mask = ious > cfg.cls_pos_thr
cls_neg_mask = ious < cfg.cls_neg_thr
interval_mask = (cls_pos_mask == 0) & (cls_neg_mask == 0)
# iou regression target
label = (cls_pos_mask > 0).float()
label[interval_mask] = ious[interval_mask] * 2 - 0.5
# label weights
label_weights = (label >= 0).float()
# box regression target
reg_mask = pos_bboxes.new_zeros(ious.size(0)).long()
reg_mask[0:pos_gt_bboxes.size(0)] = 1
bbox_weights = (reg_mask > 0).float()
if reg_mask.bool().any():
pos_gt_bboxes_ct = pos_gt_bboxes.clone().detach()
roi_center = pos_bboxes[..., 0:3]
roi_ry = pos_bboxes[..., 6] % (2 * np.pi)
# canonical transformation
pos_gt_bboxes_ct[..., 0:3] -= roi_center
pos_gt_bboxes_ct[..., 6] -= roi_ry
pos_gt_bboxes_ct[..., 0:3] = rotation_3d_in_axis(
pos_gt_bboxes_ct[..., 0:3].unsqueeze(1),
-(roi_ry + np.pi / 2),
axis=2).squeeze(1)
# flip orientation if rois have opposite orientation
ry_label = pos_gt_bboxes_ct[..., 6] % (2 * np.pi) # 0 ~ 2pi
opposite_flag = (ry_label > np.pi * 0.5) & (ry_label < np.pi * 1.5)
ry_label[opposite_flag] = (ry_label[opposite_flag] + np.pi) % (
2 * np.pi) # (0 ~ pi/2, 3pi/2 ~ 2pi)
flag = ry_label > np.pi
ry_label[flag] = ry_label[flag] - np.pi * 2 # (-pi/2, pi/2)
ry_label = torch.clamp(ry_label, min=-np.pi / 2, max=np.pi / 2)
pos_gt_bboxes_ct[..., 6] = ry_label
rois_anchor = pos_bboxes.clone().detach()
rois_anchor[:, 0:3] = 0
rois_anchor[:, 6] = 0
bbox_targets = self.bbox_coder.encode(rois_anchor,
pos_gt_bboxes_ct)
else:
# no fg bbox
bbox_targets = pos_gt_bboxes.new_empty((0, 7))
return (label, bbox_targets, pos_gt_bboxes, reg_mask, label_weights,
bbox_weights)
def get_corner_loss_lidar(self, pred_bbox3d, gt_bbox3d, delta=1):
"""Calculate corner loss of given boxes.
Args:
pred_bbox3d (torch.FloatTensor): Predicted boxes in shape (N, 7).
gt_bbox3d (torch.FloatTensor): Ground truth boxes in shape (N, 7).
delta (float): Huber loss threshold. Defaults to 1.
Returns:
torch.FloatTensor: Calculated corner loss in shape (N).
"""
assert pred_bbox3d.shape[0] == gt_bbox3d.shape[0]
# This is a bit of a hack because we assume the boxes for
# Part-A2 are in LiDAR coordinates
gt_boxes_structure = LiDARInstance3DBoxes(gt_bbox3d)
pred_box_corners = LiDARInstance3DBoxes(pred_bbox3d).corners
gt_box_corners = gt_boxes_structure.corners
# This flip only changes the heading direction of GT boxes
gt_bbox3d_flip = gt_boxes_structure.clone()
gt_bbox3d_flip.tensor[:, 6] += np.pi
gt_box_corners_flip = gt_bbox3d_flip.corners
corner_dist = torch.min(
torch.norm(pred_box_corners - gt_box_corners, dim=2),
torch.norm(pred_box_corners - gt_box_corners_flip,
dim=2)) # (N, 8)
# huber loss
abs_error = torch.abs(corner_dist)
quadratic = torch.clamp(abs_error, max=delta)
linear = (abs_error - quadratic)
corner_loss = 0.5 * quadratic**2 + delta * linear
return corner_loss.mean(dim=1)
def get_bboxes(self,
rois,
cls_score,
bbox_pred,
class_labels,
class_pred,
img_metas,
cfg=None):
"""Generate bboxes from bbox head predictions.
Args:
rois (torch.Tensor): Roi bounding boxes.
cls_score (torch.Tensor): Scores of bounding boxes.
bbox_pred (torch.Tensor): Bounding boxes predictions
class_labels (torch.Tensor): Label of classes
class_pred (torch.Tensor): Score for nms.
img_metas (list[dict]): Point cloud and image's meta info.
cfg (:obj:`ConfigDict`): Testing config.
Returns:
list[tuple]: Decoded bbox, scores and labels after nms.
"""
roi_batch_id = rois[..., 0]
roi_boxes = rois[..., 1:] # boxes without batch id
batch_size = int(roi_batch_id.max().item() + 1)
# decode boxes
roi_ry = roi_boxes[..., 6].view(-1)
roi_xyz = roi_boxes[..., 0:3].view(-1, 3)
local_roi_boxes = roi_boxes.clone().detach()
local_roi_boxes[..., 0:3] = 0
rcnn_boxes3d = self.bbox_coder.decode(local_roi_boxes, bbox_pred)
rcnn_boxes3d[..., 0:3] = rotation_3d_in_axis(
rcnn_boxes3d[..., 0:3].unsqueeze(1), (roi_ry + np.pi / 2),
axis=2).squeeze(1)
rcnn_boxes3d[:, 0:3] += roi_xyz
# post processing
result_list = []
for batch_id in range(batch_size):
cur_class_labels = class_labels[batch_id]
cur_cls_score = cls_score[roi_batch_id == batch_id].view(-1)
cur_box_prob = class_pred[batch_id]
cur_rcnn_boxes3d = rcnn_boxes3d[roi_batch_id == batch_id]
selected = self.multi_class_nms(cur_box_prob, cur_rcnn_boxes3d,
cfg.score_thr, cfg.nms_thr,
img_metas[batch_id],
cfg.use_rotate_nms)
selected_bboxes = cur_rcnn_boxes3d[selected]
selected_label_preds = cur_class_labels[selected]
selected_scores = cur_cls_score[selected]
result_list.append(
(img_metas[batch_id]['box_type_3d'](selected_bboxes,
self.bbox_coder.code_size),
selected_scores, selected_label_preds))
return result_list
def multi_class_nms(self,
box_probs,
box_preds,
score_thr,
nms_thr,
input_meta,
use_rotate_nms=True):
"""Multi-class NMS for box head.
Note:
This function has large overlap with the `box3d_multiclass_nms`
implemented in `mmdet3d.core.post_processing`. We are considering
merging these two functions in the future.
Args:
box_probs (torch.Tensor): Predicted box probabilities in
shape (N, num_classes).
box_preds (torch.Tensor): Predicted boxes in shape (N, 7+C).
score_thr (float): Threshold of scores.
nms_thr (float): Threshold for NMS.
input_meta (dict): Meta information of the current sample.
use_rotate_nms (bool, optional): Whether to use rotated nms.
Defaults to True.
Returns:
torch.Tensor: Selected indices.
"""
if use_rotate_nms:
nms_func = nms_gpu
else:
nms_func = nms_normal_gpu
assert box_probs.shape[
1] == self.num_classes, f'box_probs shape: {str(box_probs.shape)}'
selected_list = []
selected_labels = []
boxes_for_nms = xywhr2xyxyr(input_meta['box_type_3d'](
box_preds, self.bbox_coder.code_size).bev)
score_thresh = score_thr if isinstance(
score_thr, list) else [score_thr for x in range(self.num_classes)]
nms_thresh = nms_thr if isinstance(
nms_thr, list) else [nms_thr for x in range(self.num_classes)]
for k in range(0, self.num_classes):
class_scores_keep = box_probs[:, k] >= score_thresh[k]
if class_scores_keep.int().sum() > 0:
original_idxs = class_scores_keep.nonzero(
as_tuple=False).view(-1)
cur_boxes_for_nms = boxes_for_nms[class_scores_keep]
cur_rank_scores = box_probs[class_scores_keep, k]
cur_selected = nms_func(cur_boxes_for_nms, cur_rank_scores,
nms_thresh[k])
if cur_selected.shape[0] == 0:
continue
selected_list.append(original_idxs[cur_selected])
selected_labels.append(
torch.full([cur_selected.shape[0]],
k + 1,
dtype=torch.int64,
device=box_preds.device))
selected = torch.cat(
selected_list, dim=0) if len(selected_list) > 0 else []
return selected
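# --- Illustrative corner-loss sketch (not part of the upstream file) ---
# ``get_corner_loss_lidar`` above measures the per-corner distance between
# predicted and ground-truth boxes (also comparing against the GT box with
# its heading flipped by pi), applies a Huber penalty and averages over the
# 8 corners. The sketch below reproduces only the distance -> Huber -> mean
# step on random (N, 8, 3) corner tensors instead of real box structures.
if __name__ == '__main__':
    delta = 1.0
    pred_corners = torch.rand(4, 8, 3)
    gt_corners = torch.rand(4, 8, 3)
    gt_corners_flip = torch.rand(4, 8, 3)  # stand-in for heading-flipped GT
    corner_dist = torch.min(
        torch.norm(pred_corners - gt_corners, dim=2),
        torch.norm(pred_corners - gt_corners_flip, dim=2))  # (N, 8)
    abs_error = corner_dist.abs()
    quadratic = torch.clamp(abs_error, max=delta)
    linear = abs_error - quadratic
    corner_loss = 0.5 * quadratic**2 + delta * linear
    print(corner_loss.mean(dim=1).shape)  # torch.Size([4])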
================================================
FILE: mmdet3d/models/roi_heads/h3d_roi_head.py
================================================
from mmdet3d.core.bbox import bbox3d2result
from mmdet.models import HEADS
from ..builder import build_head
from .base_3droi_head import Base3DRoIHead
@HEADS.register_module()
class H3DRoIHead(Base3DRoIHead):
"""H3D roi head for H3DNet.
Args:
primitive_list (List): Configs of primitive heads.
bbox_head (ConfigDict): Config of bbox_head.
train_cfg (ConfigDict): Training config.
test_cfg (ConfigDict): Testing config.
"""
def __init__(self,
primitive_list,
bbox_head=None,
train_cfg=None,
test_cfg=None):
super(H3DRoIHead, self).__init__(
bbox_head=bbox_head, train_cfg=train_cfg, test_cfg=test_cfg)
# Primitive module
assert len(primitive_list) == 3
self.primitive_z = build_head(primitive_list[0])
self.primitive_xy = build_head(primitive_list[1])
self.primitive_line = build_head(primitive_list[2])
def init_weights(self, pretrained):
"""Initialize weights, skip since ``H3DROIHead`` does not need to
initialize weights."""
pass
def init_mask_head(self):
"""Initialize mask head, skip since ``H3DROIHead`` does not have
one."""
pass
def init_bbox_head(self, bbox_head):
"""Initialize box head."""
bbox_head['train_cfg'] = self.train_cfg
bbox_head['test_cfg'] = self.test_cfg
self.bbox_head = build_head(bbox_head)
def init_assigner_sampler(self):
"""Initialize assigner and sampler."""
pass
def forward_train(self,
feats_dict,
img_metas,
points,
gt_bboxes_3d,
gt_labels_3d,
pts_semantic_mask,
pts_instance_mask,
gt_bboxes_ignore=None):
"""Training forward function of PartAggregationROIHead.
Args:
feats_dict (dict): Contains features from the first stage.
img_metas (list[dict]): Contain pcd and img's meta info.
points (list[torch.Tensor]): Input points.
gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth \
bboxes of each sample.
gt_labels_3d (list[torch.Tensor]): Labels of each sample.
pts_semantic_mask (None | list[torch.Tensor]): Point-wise
semantic mask.
pts_instance_mask (None | list[torch.Tensor]): Point-wise
instance mask.
gt_bboxes_ignore (None | list[torch.Tensor]): Specify
which bounding boxes to ignore.
Returns:
dict: losses from each head.
"""
losses = dict()
sample_mod = self.train_cfg.sample_mod
assert sample_mod in ['vote', 'seed', 'random']
result_z = self.primitive_z(feats_dict, sample_mod)
feats_dict.update(result_z)
result_xy = self.primitive_xy(feats_dict, sample_mod)
feats_dict.update(result_xy)
result_line = self.primitive_line(feats_dict, sample_mod)
feats_dict.update(result_line)
primitive_loss_inputs = (feats_dict, points, gt_bboxes_3d,
gt_labels_3d, pts_semantic_mask,
pts_instance_mask, img_metas,
gt_bboxes_ignore)
loss_z = self.primitive_z.loss(*primitive_loss_inputs)
losses.update(loss_z)
loss_xy = self.primitive_xy.loss(*primitive_loss_inputs)
losses.update(loss_xy)
loss_line = self.primitive_line.loss(*primitive_loss_inputs)
losses.update(loss_line)
targets = feats_dict.pop('targets')
bbox_results = self.bbox_head(feats_dict, sample_mod)
feats_dict.update(bbox_results)
bbox_loss = self.bbox_head.loss(feats_dict, points, gt_bboxes_3d,
gt_labels_3d, pts_semantic_mask,
pts_instance_mask, img_metas, targets,
gt_bboxes_ignore)
losses.update(bbox_loss)
return losses
def simple_test(self, feats_dict, img_metas, points, rescale=False):
"""Simple testing forward function of PartAggregationROIHead.
Note:
This function assumes that the batch size is 1
Args:
feats_dict (dict): Contains features from the first stage.
img_metas (list[dict]): Contain pcd and img's meta info.
points (torch.Tensor): Input points.
rescale (bool): Whether to rescale results.
Returns:
dict: Bbox results of one frame.
"""
sample_mod = self.test_cfg.sample_mod
assert sample_mod in ['vote', 'seed', 'random']
result_z = self.primitive_z(feats_dict, sample_mod)
feats_dict.update(result_z)
result_xy = self.primitive_xy(feats_dict, sample_mod)
feats_dict.update(result_xy)
result_line = self.primitive_line(feats_dict, sample_mod)
feats_dict.update(result_line)
bbox_preds = self.bbox_head(feats_dict, sample_mod)
feats_dict.update(bbox_preds)
bbox_list = self.bbox_head.get_bboxes(
points,
feats_dict,
img_metas,
rescale=rescale,
suffix='_optimized')
bbox_results = [
bbox3d2result(bboxes, scores, labels)
for bboxes, scores, labels in bbox_list
]
return bbox_results
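# --- Pipeline note (illustrative comment, not part of the upstream file) ---
# Both forward_train and simple_test run the same cascade: the z, xy and
# line primitive heads are applied in turn, each updating ``feats_dict``
# with its predictions, and the refined ``H3DBboxHead`` then consumes the
# accumulated dict. At test time the boxes are decoded with the
# ``'_optimized'`` suffix, i.e. from the refinement stage rather than the
# initial RPN proposals.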
================================================
FILE: mmdet3d/models/roi_heads/mask_heads/__init__.py
================================================
from .pointwise_semantic_head import PointwiseSemanticHead
from .primitive_head import PrimitiveHead
__all__ = ['PointwiseSemanticHead', 'PrimitiveHead']
================================================
FILE: mmdet3d/models/roi_heads/mask_heads/pointwise_semantic_head.py
================================================
import torch
from torch import nn as nn
from torch.nn import functional as F
from mmdet3d.core.bbox.structures import rotation_3d_in_axis
from mmdet3d.models.builder import build_loss
from mmdet.core import multi_apply
from mmdet.models import HEADS
@HEADS.register_module()
class PointwiseSemanticHead(nn.Module):
"""Semantic segmentation head for point-wise segmentation.
Predict point-wise segmentation and part regression results for PartA2.
See `paper `_ for more details.
Args:
in_channels (int): The number of input channel.
num_classes (int): The number of class.
extra_width (float): Boxes enlarge width.
loss_seg (dict): Config of segmentation loss.
loss_part (dict): Config of part prediction loss.
"""
def __init__(self,
in_channels,
num_classes=3,
extra_width=0.2,
seg_score_thr=0.3,
loss_seg=dict(
type='FocalLoss',
use_sigmoid=True,
reduction='sum',
gamma=2.0,
alpha=0.25,
loss_weight=1.0),
loss_part=dict(
type='CrossEntropyLoss',
use_sigmoid=True,
loss_weight=1.0)):
super(PointwiseSemanticHead, self).__init__()
self.extra_width = extra_width
self.num_classes = num_classes
self.seg_score_thr = seg_score_thr
self.seg_cls_layer = nn.Linear(in_channels, 1, bias=True)
self.seg_reg_layer = nn.Linear(in_channels, 3, bias=True)
self.loss_seg = build_loss(loss_seg)
self.loss_part = build_loss(loss_part)
def forward(self, x):
"""Forward pass.
Args:
x (torch.Tensor): Features from the first stage.
Returns:
dict: Part features, segmentation and part predictions.
- seg_preds (torch.Tensor): Segment predictions.
- part_preds (torch.Tensor): Part predictions.
- part_feats (torch.Tensor): Feature predictions.
"""
seg_preds = self.seg_cls_layer(x) # (N, 1)
part_preds = self.seg_reg_layer(x) # (N, 3)
seg_scores = torch.sigmoid(seg_preds).detach()
seg_mask = (seg_scores > self.seg_score_thr)
part_offsets = torch.sigmoid(part_preds).clone().detach()
part_offsets[seg_mask.view(-1) == 0] = 0
part_feats = torch.cat((part_offsets, seg_scores),
dim=-1) # shape (npoints, 4)
return dict(
seg_preds=seg_preds, part_preds=part_preds, part_feats=part_feats)
def get_targets_single(self, voxel_centers, gt_bboxes_3d, gt_labels_3d):
"""generate segmentation and part prediction targets for a single
sample.
Args:
voxel_centers (torch.Tensor): The center of voxels in shape \
(voxel_num, 3).
gt_bboxes_3d (:obj:`BaseInstance3DBoxes`): Ground truth boxes in \
shape (box_num, 7).
gt_labels_3d (torch.Tensor): Class labels of ground truths in \
shape (box_num).
Returns:
tuple[torch.Tensor]: Segmentation targets with shape [voxel_num] and \
part prediction targets with shape [voxel_num, 3].
"""
gt_bboxes_3d = gt_bboxes_3d.to(voxel_centers.device)
enlarged_gt_boxes = gt_bboxes_3d.enlarged_box(self.extra_width)
part_targets = voxel_centers.new_zeros((voxel_centers.shape[0], 3),
dtype=torch.float32)
box_idx = gt_bboxes_3d.points_in_boxes(voxel_centers)
enlarge_box_idx = enlarged_gt_boxes.points_in_boxes(
voxel_centers).long()
gt_labels_pad = F.pad(
gt_labels_3d, (1, 0), mode='constant', value=self.num_classes)
seg_targets = gt_labels_pad[(box_idx.long() + 1)]
fg_pt_flag = box_idx > -1
ignore_flag = fg_pt_flag ^ (enlarge_box_idx > -1)
seg_targets[ignore_flag] = -1
for k in range(len(gt_bboxes_3d)):
k_box_flag = box_idx == k
# no point in current box (caused by velodyne reduce)
if not k_box_flag.any():
continue
fg_voxels = voxel_centers[k_box_flag]
transformed_voxels = fg_voxels - gt_bboxes_3d.bottom_center[k]
transformed_voxels = rotation_3d_in_axis(
transformed_voxels.unsqueeze(0),
-gt_bboxes_3d.yaw[k].view(1),
axis=2)
part_targets[k_box_flag] = transformed_voxels / gt_bboxes_3d.dims[
k] + voxel_centers.new_tensor([0.5, 0.5, 0])
part_targets = torch.clamp(part_targets, min=0)
return seg_targets, part_targets
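# Note on the encoding above: foreground voxel centers are shifted to the box's
# bottom center, rotated by -yaw into the canonical box frame, normalized by the
# box dimensions and offset by (0.5, 0.5, 0), so the part-location targets of
# in-box points lie in [0, 1] along each axis.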
def get_targets(self, voxels_dict, gt_bboxes_3d, gt_labels_3d):
"""generate segmentation and part prediction targets.
Args:
voxels_dict (dict): Contains voxel centers ('voxel_centers') and \
voxel coordinates ('coors').
gt_bboxes_3d (:obj:`BaseInstance3DBoxes`): Ground truth boxes in \
shape (box_num, 7).
gt_labels_3d (torch.Tensor): Class labels of ground truths in \
shape (box_num).
Returns:
dict: Prediction targets
- seg_targets (torch.Tensor): Segmentation targets \
with shape [voxel_num].
- part_targets (torch.Tensor): Part prediction targets \
with shape [voxel_num, 3].
"""
batch_size = len(gt_labels_3d)
voxel_center_list = []
for idx in range(batch_size):
coords_idx = voxels_dict['coors'][:, 0] == idx
voxel_center_list.append(voxels_dict['voxel_centers'][coords_idx])
seg_targets, part_targets = multi_apply(self.get_targets_single,
voxel_center_list,
gt_bboxes_3d, gt_labels_3d)
seg_targets = torch.cat(seg_targets, dim=0)
part_targets = torch.cat(part_targets, dim=0)
return dict(seg_targets=seg_targets, part_targets=part_targets)
def loss(self, semantic_results, semantic_targets):
"""Calculate point-wise segmentation and part prediction losses.
Args:
semantic_results (dict): Results from semantic head.
- seg_preds: Segmentation predictions.
- part_preds: Part predictions.
semantic_targets (dict): Targets of semantic results.
- seg_targets: Segmentation targets.
- part_targets: Part targets.
Returns:
dict: Loss of segmentation and part prediction.
- loss_seg (torch.Tensor): Segmentation prediction loss.
- loss_part (torch.Tensor): Part prediction loss.
"""
seg_preds = semantic_results['seg_preds']
part_preds = semantic_results['part_preds']
seg_targets = semantic_targets['seg_targets']
part_targets = semantic_targets['part_targets']
pos_mask = (seg_targets > -1) & (seg_targets < self.num_classes)
binary_seg_target = pos_mask.long()
pos = pos_mask.float()
neg = (seg_targets == self.num_classes).float()
seg_weights = pos + neg
pos_normalizer = pos.sum()
seg_weights = seg_weights / torch.clamp(pos_normalizer, min=1.0)
loss_seg = self.loss_seg(seg_preds, binary_seg_target, seg_weights)
if pos_normalizer > 0:
loss_part = self.loss_part(part_preds[pos_mask],
part_targets[pos_mask])
else:
# fake a part loss
loss_part = loss_seg.new_tensor(0)
return dict(loss_seg=loss_seg, loss_part=loss_part)
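# Illustrative shape check (a minimal sketch, assuming mmdet/mmdet3d are installed so
# that the default FocalLoss/CrossEntropyLoss configs can be built; this is not part
# of the library itself).
if __name__ == '__main__':
    head = PointwiseSemanticHead(in_channels=16)
    point_feats = torch.rand(100, 16)
    out = head(point_feats)
    # Expected shapes: seg_preds (100, 1), part_preds (100, 3), part_feats (100, 4).
    print({k: tuple(v.shape) for k, v in out.items()})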
================================================
FILE: mmdet3d/models/roi_heads/mask_heads/primitive_head.py
================================================
import torch
from mmcv.cnn import ConvModule
from torch import nn as nn
from torch.nn import functional as F
from mmdet3d.models.builder import build_loss
from mmdet3d.models.model_utils import VoteModule
from mmdet3d.ops import build_sa_module, furthest_point_sample
from mmdet.core import multi_apply
from mmdet.models import HEADS
@HEADS.register_module()
class PrimitiveHead(nn.Module):
r"""Primitive head of `H3DNet `_.
Args:
num_dims (int): The dimension of primitive semantic information.
num_classes (int): The number of classes.
primitive_mode (str): The mode of the primitive module;
available modes are ['z', 'xy', 'line'].
bbox_coder (:obj:`BaseBBoxCoder`): Bbox coder for encoding and
decoding boxes.
train_cfg (dict): Config for training.
test_cfg (dict): Config for testing.
vote_module_cfg (dict): Config of VoteModule for point-wise votes.
vote_aggregation_cfg (dict): Config of vote aggregation layer.
feat_channels (tuple[int]): Convolution channels of
prediction layer.
upper_thresh (float): Threshold for line matching.
surface_thresh (float): Threshold for surface matching.
conv_cfg (dict): Config of convolution in prediction layer.
norm_cfg (dict): Config of BN in prediction layer.
objectness_loss (dict): Config of objectness loss.
center_loss (dict): Config of center loss.
semantic_reg_loss (dict): Config of the semantic size regression loss.
semantic_cls_loss (dict): Config of the semantic classification loss.
"""
def __init__(self,
num_dims,
num_classes,
primitive_mode,
train_cfg=None,
test_cfg=None,
vote_module_cfg=None,
vote_aggregation_cfg=None,
feat_channels=(128, 128),
upper_thresh=100.0,
surface_thresh=0.5,
conv_cfg=dict(type='Conv1d'),
norm_cfg=dict(type='BN1d'),
objectness_loss=None,
center_loss=None,
semantic_reg_loss=None,
semantic_cls_loss=None):
super(PrimitiveHead, self).__init__()
assert primitive_mode in ['z', 'xy', 'line']
# The dimension of primitive semantic information.
self.num_dims = num_dims
self.num_classes = num_classes
self.primitive_mode = primitive_mode
self.train_cfg = train_cfg
self.test_cfg = test_cfg
self.gt_per_seed = vote_module_cfg['gt_per_seed']
self.num_proposal = vote_aggregation_cfg['num_point']
self.upper_thresh = upper_thresh
self.surface_thresh = surface_thresh
self.objectness_loss = build_loss(objectness_loss)
self.center_loss = build_loss(center_loss)
self.semantic_reg_loss = build_loss(semantic_reg_loss)
self.semantic_cls_loss = build_loss(semantic_cls_loss)
assert vote_aggregation_cfg['mlp_channels'][0] == vote_module_cfg[
'in_channels']
# Primitive existence flag prediction
self.flag_conv = ConvModule(
vote_module_cfg['conv_channels'][-1],
vote_module_cfg['conv_channels'][-1] // 2,
1,
padding=0,
conv_cfg=conv_cfg,
norm_cfg=norm_cfg,
bias=True,
inplace=True)
self.flag_pred = torch.nn.Conv1d(
vote_module_cfg['conv_channels'][-1] // 2, 2, 1)
self.vote_module = VoteModule(**vote_module_cfg)
self.vote_aggregation = build_sa_module(vote_aggregation_cfg)
prev_channel = vote_aggregation_cfg['mlp_channels'][-1]
conv_pred_list = list()
for k in range(len(feat_channels)):
conv_pred_list.append(
ConvModule(
prev_channel,
feat_channels[k],
1,
padding=0,
conv_cfg=conv_cfg,
norm_cfg=norm_cfg,
bias=True,
inplace=True))
prev_channel = feat_channels[k]
self.conv_pred = nn.Sequential(*conv_pred_list)
conv_out_channel = 3 + num_dims + num_classes
self.conv_pred.add_module('conv_out',
nn.Conv1d(prev_channel, conv_out_channel, 1))
def init_weights(self):
"""Initialize weights of VoteHead."""
pass
def forward(self, feats_dict, sample_mod):
"""Forward pass.
Args:
feats_dict (dict): Feature dict from backbone.
sample_mod (str): Sample mode for vote aggregation layer.
valid modes are "vote", "seed" and "random".
Returns:
dict: Predictions of primitive head.
"""
assert sample_mod in ['vote', 'seed', 'random']
seed_points = feats_dict['fp_xyz_net0'][-1]
seed_features = feats_dict['hd_feature']
results = {}
primitive_flag = self.flag_conv(seed_features)
primitive_flag = self.flag_pred(primitive_flag)
results['pred_flag_' + self.primitive_mode] = primitive_flag
# 1. generate vote_points from seed_points
vote_points, vote_features, _ = self.vote_module(
seed_points, seed_features)
results['vote_' + self.primitive_mode] = vote_points
results['vote_features_' + self.primitive_mode] = vote_features
# 2. aggregate vote_points
if sample_mod == 'vote':
# use fps in vote_aggregation
sample_indices = None
elif sample_mod == 'seed':
# FPS on seed and choose the votes corresponding to the seeds
sample_indices = furthest_point_sample(seed_points,
self.num_proposal)
elif sample_mod == 'random':
# Random sampling from the votes
batch_size, num_seed = seed_points.shape[:2]
sample_indices = torch.randint(
0,
num_seed, (batch_size, self.num_proposal),
dtype=torch.int32,
device=seed_points.device)
else:
raise NotImplementedError('Unsupported sample mode!')
vote_aggregation_ret = self.vote_aggregation(vote_points,
vote_features,
sample_indices)
aggregated_points, features, aggregated_indices = vote_aggregation_ret
results['aggregated_points_' + self.primitive_mode] = aggregated_points
results['aggregated_features_' + self.primitive_mode] = features
results['aggregated_indices_' +
self.primitive_mode] = aggregated_indices
# 3. predict primitive offsets and semantic information
predictions = self.conv_pred(features)
# 4. decode predictions
decode_ret = self.primitive_decode_scores(predictions,
aggregated_points)
results.update(decode_ret)
center, pred_ind = self.get_primitive_center(
primitive_flag, decode_ret['center_' + self.primitive_mode])
results['pred_' + self.primitive_mode + '_ind'] = pred_ind
results['pred_' + self.primitive_mode + '_center'] = center
return results
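# For example, with primitive_mode='z' the returned dict holds 'pred_flag_z',
# 'vote_z', 'vote_features_z', 'aggregated_points_z', 'aggregated_features_z',
# 'aggregated_indices_z', 'center_z', 'size_residuals_z', 'sem_cls_scores_z',
# 'pred_z_ind' and 'pred_z_center'.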
def loss(self,
bbox_preds,
points,
gt_bboxes_3d,
gt_labels_3d,
pts_semantic_mask=None,
pts_instance_mask=None,
img_metas=None,
gt_bboxes_ignore=None):
"""Compute loss.
Args:
bbox_preds (dict): Predictions from forward of primitive head.
points (list[torch.Tensor]): Input points.
gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth \
bboxes of each sample.
gt_labels_3d (list[torch.Tensor]): Labels of each sample.
pts_semantic_mask (None | list[torch.Tensor]): Point-wise
semantic mask.
pts_instance_mask (None | list[torch.Tensor]): Point-wise
instance mask.
img_metas (list[dict]): Contains meta info of the point cloud and images.
gt_bboxes_ignore (None | list[torch.Tensor]): Specify
which bounding boxes to ignore.
Returns:
dict: Losses of Primitive Head.
"""
targets = self.get_targets(points, gt_bboxes_3d, gt_labels_3d,
pts_semantic_mask, pts_instance_mask,
bbox_preds)
(point_mask, point_offset, gt_primitive_center, gt_primitive_semantic,
gt_sem_cls_label, gt_primitive_mask) = targets
losses = {}
# Compute the loss of primitive existence flag
pred_flag = bbox_preds['pred_flag_' + self.primitive_mode]
flag_loss = self.objectness_loss(pred_flag, gt_primitive_mask.long())
losses['flag_loss_' + self.primitive_mode] = flag_loss
# calculate vote loss
vote_loss = self.vote_module.get_loss(
bbox_preds['seed_points'],
bbox_preds['vote_' + self.primitive_mode],
bbox_preds['seed_indices'], point_mask, point_offset)
losses['vote_loss_' + self.primitive_mode] = vote_loss
num_proposal = bbox_preds['aggregated_points_' +
self.primitive_mode].shape[1]
primitive_center = bbox_preds['center_' + self.primitive_mode]
if self.primitive_mode != 'line':
primitive_semantic = bbox_preds['size_residuals_' +
self.primitive_mode].contiguous()
else:
primitive_semantic = None
semantic_scores = bbox_preds['sem_cls_scores_' +
self.primitive_mode].transpose(2, 1)
gt_primitive_mask = gt_primitive_mask / \
(gt_primitive_mask.sum() + 1e-6)
center_loss, size_loss, sem_cls_loss = self.compute_primitive_loss(
primitive_center, primitive_semantic, semantic_scores,
num_proposal, gt_primitive_center, gt_primitive_semantic,
gt_sem_cls_label, gt_primitive_mask)
losses['center_loss_' + self.primitive_mode] = center_loss
losses['size_loss_' + self.primitive_mode] = size_loss
losses['sem_loss_' + self.primitive_mode] = sem_cls_loss
return losses
def get_targets(self,
points,
gt_bboxes_3d,
gt_labels_3d,
pts_semantic_mask=None,
pts_instance_mask=None,
bbox_preds=None):
"""Generate targets of primitive head.
Args:
points (list[torch.Tensor]): Points of each batch.
gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth \
bboxes of each batch.
gt_labels_3d (list[torch.Tensor]): Labels of each batch.
pts_semantic_mask (None | list[torch.Tensor]): Point-wise semantic
label of each batch.
pts_instance_mask (None | list[torch.Tensor]): Point-wise instance
label of each batch.
bbox_preds (dict): Predictions from forward of primitive head.
Returns:
tuple[torch.Tensor]: Targets of primitive head.
"""
for index in range(len(gt_labels_3d)):
if len(gt_labels_3d[index]) == 0:
fake_box = gt_bboxes_3d[index].tensor.new_zeros(
1, gt_bboxes_3d[index].tensor.shape[-1])
gt_bboxes_3d[index] = gt_bboxes_3d[index].new_box(fake_box)
gt_labels_3d[index] = gt_labels_3d[index].new_zeros(1)
if pts_semantic_mask is None:
pts_semantic_mask = [None for i in range(len(gt_labels_3d))]
pts_instance_mask = [None for i in range(len(gt_labels_3d))]
(point_mask, point_sem,
point_offset) = multi_apply(self.get_targets_single, points,
gt_bboxes_3d, gt_labels_3d,
pts_semantic_mask, pts_instance_mask)
point_mask = torch.stack(point_mask)
point_sem = torch.stack(point_sem)
point_offset = torch.stack(point_offset)
batch_size = point_mask.shape[0]
num_proposal = bbox_preds['aggregated_points_' +
self.primitive_mode].shape[1]
num_seed = bbox_preds['seed_points'].shape[1]
seed_inds = bbox_preds['seed_indices'].long()
seed_inds_expand = seed_inds.view(batch_size, num_seed,
1).repeat(1, 1, 3)
seed_gt_votes = torch.gather(point_offset, 1, seed_inds_expand)
seed_gt_votes += bbox_preds['seed_points']
gt_primitive_center = seed_gt_votes.view(batch_size * num_proposal, 1,
3)
seed_inds_expand_sem = seed_inds.view(batch_size, num_seed, 1).repeat(
1, 1, 4 + self.num_dims)
seed_gt_sem = torch.gather(point_sem, 1, seed_inds_expand_sem)
gt_primitive_semantic = seed_gt_sem[:, :, 3:3 + self.num_dims].view(
batch_size * num_proposal, 1, self.num_dims).contiguous()
gt_sem_cls_label = seed_gt_sem[:, :, -1].long()
gt_votes_mask = torch.gather(point_mask, 1, seed_inds)
return (point_mask, point_offset, gt_primitive_center,
gt_primitive_semantic, gt_sem_cls_label, gt_votes_mask)
def get_targets_single(self,
points,
gt_bboxes_3d,
gt_labels_3d,
pts_semantic_mask=None,
pts_instance_mask=None):
"""Generate targets of primitive head for single batch.
Args:
points (torch.Tensor): Points of each batch.
gt_bboxes_3d (:obj:`BaseInstance3DBoxes`): Ground truth \
boxes of each batch.
gt_labels_3d (torch.Tensor): Labels of each batch.
pts_semantic_mask (None | torch.Tensor): Point-wise semantic
label of each batch.
pts_instance_mask (None | torch.Tensor): Point-wise instance
label of each batch.
Returns:
tuple[torch.Tensor]: Targets of primitive head.
"""
gt_bboxes_3d = gt_bboxes_3d.to(points.device)
num_points = points.shape[0]
point_mask = points.new_zeros(num_points)
# Offset to the primitive center
point_offset = points.new_zeros([num_points, 3])
# Semantic information of primitive center
point_sem = points.new_zeros([num_points, 3 + self.num_dims + 1])
# Generate pts_semantic_mask and pts_instance_mask when they are None
if pts_semantic_mask is None or pts_instance_mask is None:
points2box_mask = gt_bboxes_3d.points_in_boxes(points)
assignment = points2box_mask.argmax(1)
background_mask = points2box_mask.max(1)[0] == 0
if pts_semantic_mask is None:
pts_semantic_mask = gt_labels_3d[assignment]
pts_semantic_mask[background_mask] = self.num_classes
if pts_instance_mask is None:
pts_instance_mask = assignment
pts_instance_mask[background_mask] = gt_labels_3d.shape[0]
instance_flag = torch.nonzero(
pts_semantic_mask != self.num_classes, as_tuple=False).squeeze(1)
instance_labels = pts_instance_mask[instance_flag].unique()
with_yaw = gt_bboxes_3d.with_yaw
for i, i_instance in enumerate(instance_labels):
indices = instance_flag[pts_instance_mask[instance_flag] ==
i_instance]
coords = points[indices, :3]
cur_cls_label = pts_semantic_mask[indices][0]
# Bbox Corners
cur_corners = gt_bboxes_3d.corners[i]
plane_lower_temp = points.new_tensor(
[0, 0, 1, -cur_corners[7, -1]])
upper_points = cur_corners[[1, 2, 5, 6]]
refined_distance = (upper_points * plane_lower_temp[:3]).sum(dim=1)
if self.check_horizon(upper_points) and \
plane_lower_temp[0] + plane_lower_temp[1] < \
self.train_cfg['lower_thresh']:
plane_lower = points.new_tensor(
[0, 0, 1, plane_lower_temp[-1]])
plane_upper = points.new_tensor(
[0, 0, 1, -torch.mean(refined_distance)])
else:
raise NotImplementedError('Only horizontal planes are supported!')
if self.check_dist(plane_upper, upper_points) is False:
raise NotImplementedError(
'Mean distance to plane should be lower than thresh!')
# Get the boundary points here
point2plane_dist, selected = self.match_point2plane(
plane_lower, coords)
# Get bottom four lines
if self.primitive_mode == 'line':
point2line_matching = self.match_point2line(
coords[selected], cur_corners, with_yaw, mode='bottom')
point_mask, point_offset, point_sem = \
self._assign_primitive_line_targets(point_mask,
point_offset,
point_sem,
coords[selected],
indices[selected],
cur_cls_label,
point2line_matching,
cur_corners,
[1, 1, 0, 0],
with_yaw,
mode='bottom')
# Set the surface labels here
if self.primitive_mode == 'z' and \
selected.sum() > self.train_cfg['num_point'] and \
point2plane_dist[selected].var() < \
self.train_cfg['var_thresh']:
point_mask, point_offset, point_sem = \
self._assign_primitive_surface_targets(point_mask,
point_offset,
point_sem,
coords[selected],
indices[selected],
cur_cls_label,
cur_corners,
with_yaw,
mode='bottom')
# Get the boundary points here
point2plane_dist, selected = self.match_point2plane(
plane_upper, coords)
# Get top four lines
if self.primitive_mode == 'line':
point2line_matching = self.match_point2line(
coords[selected], cur_corners, with_yaw, mode='top')
point_mask, point_offset, point_sem = \
self._assign_primitive_line_targets(point_mask,
point_offset,
point_sem,
coords[selected],
indices[selected],
cur_cls_label,
point2line_matching,
cur_corners,
[1, 1, 0, 0],
with_yaw,
mode='top')
if self.primitive_mode == 'z' and \
selected.sum() > self.train_cfg['num_point'] and \
point2plane_dist[selected].var() < \
self.train_cfg['var_thresh']:
point_mask, point_offset, point_sem = \
self._assign_primitive_surface_targets(point_mask,
point_offset,
point_sem,
coords[selected],
indices[selected],
cur_cls_label,
cur_corners,
with_yaw,
mode='top')
# Get left two lines
plane_left_temp = self._get_plane_fomulation(
cur_corners[2] - cur_corners[3],
cur_corners[3] - cur_corners[0], cur_corners[0])
right_points = cur_corners[[4, 5, 7, 6]]
plane_left_temp /= torch.norm(plane_left_temp[:3])
refined_distance = (right_points * plane_left_temp[:3]).sum(dim=1)
if plane_left_temp[2] < self.train_cfg['lower_thresh']:
plane_left = plane_left_temp
plane_right = points.new_tensor([
plane_left_temp[0], plane_left_temp[1], plane_left_temp[2],
-refined_distance.mean()
])
else:
raise NotImplementedError(
'Normal vector of the plane should be horizontal!')
# Get the boundary points here
point2plane_dist, selected = self.match_point2plane(
plane_left, coords)
# Get left four lines
if self.primitive_mode == 'line':
point2line_matching = self.match_point2line(
coords[selected], cur_corners, with_yaw, mode='left')
point_mask, point_offset, point_sem = \
self._assign_primitive_line_targets(
point_mask, point_offset, point_sem,
coords[selected], indices[selected], cur_cls_label,
point2line_matching[2:], cur_corners, [2, 2],
with_yaw, mode='left')
if self.primitive_mode == 'xy' and \
selected.sum() > self.train_cfg['num_point'] and \
point2plane_dist[selected].var() < \
self.train_cfg['var_thresh']:
point_mask, point_offset, point_sem = \
self._assign_primitive_surface_targets(
point_mask, point_offset, point_sem,
coords[selected], indices[selected], cur_cls_label,
cur_corners, with_yaw, mode='left')
# Get the boundary points here
point2plane_dist, selected = self.match_point2plane(
plane_right, coords)
# Get right four lines
if self.primitive_mode == 'line':
point2line_matching = self.match_point2line(
coords[selected], cur_corners, with_yaw, mode='right')
point_mask, point_offset, point_sem = \
self._assign_primitive_line_targets(
point_mask, point_offset, point_sem,
coords[selected], indices[selected], cur_cls_label,
point2line_matching[2:], cur_corners, [2, 2],
with_yaw, mode='right')
if self.primitive_mode == 'xy' and \
selected.sum() > self.train_cfg['num_point'] and \
point2plane_dist[selected].var() < \
self.train_cfg['var_thresh']:
point_mask, point_offset, point_sem = \
self._assign_primitive_surface_targets(
point_mask, point_offset, point_sem,
coords[selected], indices[selected], cur_cls_label,
cur_corners, with_yaw, mode='right')
plane_front_temp = self._get_plane_fomulation(
cur_corners[0] - cur_corners[4],
cur_corners[4] - cur_corners[5], cur_corners[5])
back_points = cur_corners[[3, 2, 7, 6]]
plane_front_temp /= torch.norm(plane_front_temp[:3])
refined_distance = (back_points * plane_front_temp[:3]).sum(dim=1)
if plane_front_temp[2] < self.train_cfg['lower_thresh']:
plane_front = plane_front_temp
plane_back = points.new_tensor([
plane_front_temp[0], plane_front_temp[1],
plane_front_temp[2], -torch.mean(refined_distance)
])
else:
raise NotImplementedError(
'Normal vector of the plane should be horizontal!')
# Get the boundary points here
point2plane_dist, selected = self.match_point2plane(
plane_front, coords)
if self.primitive_mode == 'xy' and \
selected.sum() > self.train_cfg['num_point'] and \
(point2plane_dist[selected]).var() < \
self.train_cfg['var_thresh']:
point_mask, point_offset, point_sem = \
self._assign_primitive_surface_targets(
point_mask, point_offset, point_sem,
coords[selected], indices[selected], cur_cls_label,
cur_corners, with_yaw, mode='front')
# Get the boundary points here
point2plane_dist, selected = self.match_point2plane(
plane_back, coords)
if self.primitive_mode == 'xy' and \
selected.sum() > self.train_cfg['num_point'] and \
point2plane_dist[selected].var() < \
self.train_cfg['var_thresh']:
point_mask, point_offset, point_sem = \
self._assign_primitive_surface_targets(
point_mask, point_offset, point_sem,
coords[selected], indices[selected], cur_cls_label,
cur_corners, with_yaw, mode='back')
return (point_mask, point_sem, point_offset)
def primitive_decode_scores(self, predictions, aggregated_points):
"""Decode predicted parts to primitive head.
Args:
predictions (torch.Tensor): Primitive predictions of each batch.
aggregated_points (torch.Tensor): The aggregated points
of vote stage.
Returns:
Dict: Predictions of primitive head, including center,
semantic size and semantic scores.
"""
ret_dict = {}
pred_transposed = predictions.transpose(2, 1)
center = aggregated_points + pred_transposed[:, :, 0:3]
ret_dict['center_' + self.primitive_mode] = center
if self.primitive_mode in ['z', 'xy']:
ret_dict['size_residuals_' + self.primitive_mode] = \
pred_transposed[:, :, 3:3 + self.num_dims]
ret_dict['sem_cls_scores_' + self.primitive_mode] = \
pred_transposed[:, :, 3 + self.num_dims:]
return ret_dict
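# Channel layout of `predictions` after the transpose: the first 3 channels are
# offsets added to the aggregated points to form the primitive centers, the next
# `num_dims` channels (only for 'z'/'xy') are size residuals, and the remaining
# `num_classes` channels are per-class semantic scores.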
def check_horizon(self, points):
"""Check whether is a horizontal plane.
Args:
points (torch.Tensor): Points of input.
Returns:
bool: Whether the plane is horizontal.
"""
return (points[0][-1] == points[1][-1]) and \
(points[1][-1] == points[2][-1]) and \
(points[2][-1] == points[3][-1])
def check_dist(self, plane_equ, points):
"""Whether the mean of points to plane distance is lower than thresh.
Args:
plane_equ (torch.Tensor): Plane to be checked.
points (torch.Tensor): Points to be checked.
Returns:
bool: Whether the mean distance is below the threshold.
"""
return (points[:, 2] +
plane_equ[-1]).sum() / 4.0 < self.train_cfg['lower_thresh']
def point2line_dist(self, points, pts_a, pts_b):
"""Calculate the distance from point to line.
Args:
points (torch.Tensor): Points of input.
pts_a (torch.Tensor): Point on the specific line.
pts_b (torch.Tensor): Point on the specific line.
Returns:
torch.Tensor: Distance between each point to line.
"""
line_a2b = pts_b - pts_a
line_a2pts = points - pts_a
length = (line_a2pts * line_a2b.view(1, 3)).sum(1) / \
line_a2b.norm()
dist = (line_a2pts.norm(dim=1)**2 - length**2).sqrt()
return dist
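# In other words, with a = pts_a, b = pts_b and p a query point, `length` is the
# scalar projection of (p - a) onto the unit direction (b - a) / |b - a|, so the
# perpendicular distance follows from Pythagoras: sqrt(|p - a|^2 - length^2).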
def match_point2line(self, points, corners, with_yaw, mode='bottom'):
"""Match points to corresponding line.
Args:
points (torch.Tensor): Points of input.
corners (torch.Tensor): Eight corners of a bounding box.
with_yaw (bool): Whether the bounding box has a yaw rotation.
mode (str, optional): Specify which line should be matched,
available mode are ('bottom', 'top', 'left', 'right').
Defaults to 'bottom'.
Returns:
list[torch.Tensor]: Boolean masks of the points matched to each line.
"""
if with_yaw:
corners_pair = {
'bottom': [[0, 3], [4, 7], [0, 4], [3, 7]],
'top': [[1, 2], [5, 6], [1, 5], [2, 6]],
'left': [[0, 1], [3, 2], [0, 1], [3, 2]],
'right': [[4, 5], [7, 6], [4, 5], [7, 6]]
}
selected_list = []
for pair_index in corners_pair[mode]:
selected = self.point2line_dist(
points, corners[pair_index[0]], corners[pair_index[1]]) \
< self.train_cfg['line_thresh']
selected_list.append(selected)
else:
xmin, ymin, _ = corners.min(0)[0]
xmax, ymax, _ = corners.max(0)[0]
sel1 = torch.abs(points[:, 0] -
xmin) < self.train_cfg['line_thresh']
sel2 = torch.abs(points[:, 0] -
xmax) < self.train_cfg['line_thresh']
sel3 = torch.abs(points[:, 1] -
ymin) < self.train_cfg['line_thresh']
sel4 = torch.abs(points[:, 1] -
ymax) < self.train_cfg['line_thresh']
selected_list = [sel1, sel2, sel3, sel4]
return selected_list
def match_point2plane(self, plane, points):
"""Match points to plane.
Args:
plane (torch.Tensor): Equation of the plane.
points (torch.Tensor): Points of input.
Returns:
Tuple: Distance of each point to the plane and
flag of matching correspondence.
"""
point2plane_dist = torch.abs((points * plane[:3]).sum(dim=1) +
plane[-1])
min_dist = point2plane_dist.min()
selected = torch.abs(point2plane_dist -
min_dist) < self.train_cfg['dist_thresh']
return point2plane_dist, selected
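# Points whose plane distance is within `dist_thresh` of the smallest distance in
# the instance are treated as lying on this face and used as primitive supervision.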
def compute_primitive_loss(self, primitive_center, primitive_semantic,
semantic_scores, num_proposal,
gt_primitive_center, gt_primitive_semantic,
gt_sem_cls_label, gt_primitive_mask):
"""Compute loss of primitive module.
Args:
primitive_center (torch.Tensor): Predictions of primitive center.
primitive_semantic (torch.Tensor): Predictions of primitive
semantic.
semantic_scores (torch.Tensor): Predictions of primitive
semantic scores.
num_proposal (int): The number of primitive proposal.
gt_primitive_center (torch.Tensor): Ground truth of
primitive center.
gt_primitive_semantic (torch.Tensor): Ground truth of primitive semantic.
gt_sem_cls_label (torch.Tensor): Ground truth of primitive
semantic class.
gt_primitive_mask (torch.Tensor): Ground truth of primitive mask.
Returns:
Tuple: Loss of primitive module.
"""
batch_size = primitive_center.shape[0]
vote_xyz_reshape = primitive_center.view(batch_size * num_proposal, -1,
3)
center_loss = self.center_loss(
vote_xyz_reshape,
gt_primitive_center,
dst_weight=gt_primitive_mask.view(batch_size * num_proposal, 1))[1]
if self.primitive_mode != 'line':
size_xyz_reshape = primitive_semantic.view(
batch_size * num_proposal, -1, self.num_dims).contiguous()
size_loss = self.semantic_reg_loss(
size_xyz_reshape,
gt_primitive_semantic,
dst_weight=gt_primitive_mask.view(batch_size * num_proposal,
1))[1]
else:
size_loss = center_loss.new_tensor(0.0)
# Semantic cls loss
sem_cls_loss = self.semantic_cls_loss(
semantic_scores, gt_sem_cls_label, weight=gt_primitive_mask)
return center_loss, size_loss, sem_cls_loss
def get_primitive_center(self, pred_flag, center):
"""Generate primitive center from predictions.
Args:
pred_flag (torch.Tensor): Scores of primitive center.
center (torch.Tensor): Predictions of primitive center.
Returns:
Tuple: Primitive center and the prediction indices.
"""
ind_normal = F.softmax(pred_flag, dim=1)
pred_indices = (ind_normal[:, 1, :] >
self.surface_thresh).detach().float()
selected = (ind_normal[:, 1, :] <=
self.surface_thresh).detach().float()
offset = torch.ones_like(center) * self.upper_thresh
center = center + offset * selected.unsqueeze(-1)
return center, pred_indices
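# Centers whose flag score is at most `surface_thresh` are pushed away by
# `upper_thresh` along every axis, so they are effectively discarded by the
# later primitive matching in the refinement stage.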
def _assign_primitive_line_targets(self,
point_mask,
point_offset,
point_sem,
coords,
indices,
cls_label,
point2line_matching,
corners,
center_axises,
with_yaw,
mode='bottom'):
"""Generate targets of line primitive.
Args:
point_mask (torch.Tensor): Tensor to store the ground
truth of mask.
point_offset (torch.Tensor): Tensor to store the ground
truth of offset.
point_sem (torch.Tensor): Tensor to store the ground
truth of semantic.
coords (torch.Tensor): The selected points.
indices (torch.Tensor): Indices of the selected points.
cls_label (int): Class label of the ground truth bounding box.
point2line_matching (list[torch.Tensor]): Boolean masks indicating
which line each point is matched to.
corners (torch.Tensor): Corners of the ground truth bounding box.
center_axises (list[int]): Indicate in which axis the line center
should be refined.
with_yaw (bool): Whether the bounding box has a yaw rotation.
mode (str, optional): Specify which line should be matched,
available mode are ('bottom', 'top', 'left', 'right').
Defaults to 'bottom'.
Returns:
Tuple: Targets of the line primitive.
"""
corners_pair = {
'bottom': [[0, 3], [4, 7], [0, 4], [3, 7]],
'top': [[1, 2], [5, 6], [1, 5], [2, 6]],
'left': [[0, 1], [3, 2]],
'right': [[4, 5], [7, 6]]
}
corners_pair = corners_pair[mode]
assert len(corners_pair) == len(point2line_matching) == len(
center_axises)
for line_select, center_axis, pair_index in zip(
point2line_matching, center_axises, corners_pair):
if line_select.sum() > self.train_cfg['num_point_line']:
point_mask[indices[line_select]] = 1.0
if with_yaw:
line_center = (corners[pair_index[0]] +
corners[pair_index[1]]) / 2
else:
line_center = coords[line_select].mean(dim=0)
line_center[center_axis] = corners[:, center_axis].mean()
point_offset[indices[line_select]] = \
line_center - coords[line_select]
point_sem[indices[line_select]] = \
point_sem.new_tensor([line_center[0], line_center[1],
line_center[2], cls_label])
return point_mask, point_offset, point_sem
def _assign_primitive_surface_targets(self,
point_mask,
point_offset,
point_sem,
coords,
indices,
cls_label,
corners,
with_yaw,
mode='bottom'):
"""Generate targets for primitive z and primitive xy.
Args:
point_mask (torch.Tensor): Tensor to store the ground
truth of mask.
point_offset (torch.Tensor): Tensor to store the ground
truth of offset.
point_sem (torch.Tensor): Tensor to store the ground
truth of semantic.
coords (torch.Tensor): The selected points.
indices (torch.Tensor): Indices of the selected points.
cls_label (int): Class label of the ground truth bounding box.
corners (torch.Tensor): Corners of the ground truth bounding box.
with_yaw (bool): Whether the bounding box has a yaw rotation.
mode (str, optional): Specify which surface should be matched,
available mode are ('bottom', 'top', 'left', 'right',
'front', 'back').
Defaults to 'bottom'.
Returns:
Tuple: Targets of the center primitive.
"""
point_mask[indices] = 1.0
corners_pair = {
'bottom': [0, 7],
'top': [1, 6],
'left': [0, 1],
'right': [4, 5],
'front': [0, 1],
'back': [3, 2]
}
pair_index = corners_pair[mode]
if self.primitive_mode == 'z':
if with_yaw:
center = (corners[pair_index[0]] +
corners[pair_index[1]]) / 2.0
center[2] = coords[:, 2].mean()
point_sem[indices] = point_sem.new_tensor([
center[0], center[1],
center[2], (corners[4] - corners[0]).norm(),
(corners[3] - corners[0]).norm(), cls_label
])
else:
center = point_mask.new_tensor([
corners[:, 0].mean(), corners[:, 1].mean(),
coords[:, 2].mean()
])
point_sem[indices] = point_sem.new_tensor([
center[0], center[1], center[2],
corners[:, 0].max() - corners[:, 0].min(),
corners[:, 1].max() - corners[:, 1].min(), cls_label
])
elif self.primitive_mode == 'xy':
if with_yaw:
center = coords.mean(0)
center[2] = (corners[pair_index[0], 2] +
corners[pair_index[1], 2]) / 2.0
point_sem[indices] = point_sem.new_tensor([
center[0], center[1], center[2],
corners[pair_index[1], 2] - corners[pair_index[0], 2],
cls_label
])
else:
center = point_mask.new_tensor([
coords[:, 0].mean(), coords[:, 1].mean(),
corners[:, 2].mean()
])
point_sem[indices] = point_sem.new_tensor([
center[0], center[1], center[2],
corners[:, 2].max() - corners[:, 2].min(), cls_label
])
point_offset[indices] = center - coords
return point_mask, point_offset, point_sem
def _get_plane_fomulation(self, vector1, vector2, point):
"""Compute the equation of the plane.
Args:
vector1 (torch.Tensor): Parallel vector of the plane.
vector2 (torch.Tensor): Parallel vector of the plane.
point (torch.Tensor): Point on the plane.
Returns:
torch.Tensor: Equation of the plane.
"""
surface_norm = torch.cross(vector1, vector2)
surface_dis = -torch.dot(surface_norm, point)
plane = point.new_tensor(
[surface_norm[0], surface_norm[1], surface_norm[2], surface_dis])
return plane
================================================
FILE: mmdet3d/models/roi_heads/part_aggregation_roi_head.py
================================================
from torch.nn import functional as F
from mmdet3d.core import AssignResult
from mmdet3d.core.bbox import bbox3d2result, bbox3d2roi
from mmdet.core import build_assigner, build_sampler
from mmdet.models import HEADS
from ..builder import build_head, build_roi_extractor
from .base_3droi_head import Base3DRoIHead
@HEADS.register_module()
class PartAggregationROIHead(Base3DRoIHead):
"""Part aggregation roi head for PartA2.
Args:
semantic_head (ConfigDict): Config of semantic head.
num_classes (int): The number of classes.
seg_roi_extractor (ConfigDict): Config of seg_roi_extractor.
part_roi_extractor (ConfigDict): Config of part_roi_extractor.
bbox_head (ConfigDict): Config of bbox_head.
train_cfg (ConfigDict): Training config.
test_cfg (ConfigDict): Testing config.
"""
def __init__(self,
semantic_head,
num_classes=3,
seg_roi_extractor=None,
part_roi_extractor=None,
bbox_head=None,
train_cfg=None,
test_cfg=None):
super(PartAggregationROIHead, self).__init__(
bbox_head=bbox_head, train_cfg=train_cfg, test_cfg=test_cfg)
self.num_classes = num_classes
assert semantic_head is not None
self.semantic_head = build_head(semantic_head)
if seg_roi_extractor is not None:
self.seg_roi_extractor = build_roi_extractor(seg_roi_extractor)
if part_roi_extractor is not None:
self.part_roi_extractor = build_roi_extractor(part_roi_extractor)
self.init_assigner_sampler()
def init_weights(self, pretrained):
"""Initialize weights, skip since ``PartAggregationROIHead`` does not
need to initialize weights."""
pass
def init_mask_head(self):
"""Initialize mask head, skip since ``PartAggregationROIHead`` does not
have one."""
pass
def init_bbox_head(self, bbox_head):
"""Initialize box head."""
self.bbox_head = build_head(bbox_head)
def init_assigner_sampler(self):
"""Initialize assigner and sampler."""
self.bbox_assigner = None
self.bbox_sampler = None
if self.train_cfg:
if isinstance(self.train_cfg.assigner, dict):
self.bbox_assigner = build_assigner(self.train_cfg.assigner)
elif isinstance(self.train_cfg.assigner, list):
self.bbox_assigner = [
build_assigner(res) for res in self.train_cfg.assigner
]
self.bbox_sampler = build_sampler(self.train_cfg.sampler)
@property
def with_semantic(self):
"""bool: whether the head has semantic branch"""
return hasattr(self,
'semantic_head') and self.semantic_head is not None
def forward_train(self, feats_dict, voxels_dict, img_metas, proposal_list,
gt_bboxes_3d, gt_labels_3d):
"""Training forward function of PartAggregationROIHead.
Args:
feats_dict (dict): Contains features from the first stage.
voxels_dict (dict): Contains information of voxels.
img_metas (list[dict]): Meta info of each image.
proposal_list (list[dict]): Proposal information from rpn.
The dictionary should contain the following keys:
- boxes_3d (:obj:`BaseInstance3DBoxes`): Proposal bboxes
- labels_3d (torch.Tensor): Labels of proposals
- cls_preds (torch.Tensor): Original scores of proposals
gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]):
GT bboxes of each sample. The bboxes are encapsulated
by 3D box structures.
gt_labels_3d (list[LongTensor]): GT labels of each sample.
Returns:
dict: losses from each head.
- loss_semantic (torch.Tensor): loss of semantic head
- loss_bbox (torch.Tensor): loss of bboxes
"""
losses = dict()
if self.with_semantic:
semantic_results = self._semantic_forward_train(
feats_dict['seg_features'], voxels_dict, gt_bboxes_3d,
gt_labels_3d)
losses.update(semantic_results['loss_semantic'])
sample_results = self._assign_and_sample(proposal_list, gt_bboxes_3d,
gt_labels_3d)
if self.with_bbox:
bbox_results = self._bbox_forward_train(
feats_dict['seg_features'], semantic_results['part_feats'],
voxels_dict, sample_results)
losses.update(bbox_results['loss_bbox'])
return losses
def simple_test(self, feats_dict, voxels_dict, img_metas, proposal_list,
**kwargs):
"""Simple testing forward function of PartAggregationROIHead.
Note:
This function assumes that the batch size is 1
Args:
feats_dict (dict): Contains features from the first stage.
voxels_dict (dict): Contains information of voxels.
img_metas (list[dict]): Meta info of each image.
proposal_list (list[dict]): Proposal information from rpn.
Returns:
dict: Bbox results of one frame.
"""
assert self.with_bbox, 'Bbox head must be implemented.'
assert self.with_semantic
semantic_results = self.semantic_head(feats_dict['seg_features'])
rois = bbox3d2roi([res['boxes_3d'].tensor for res in proposal_list])
labels_3d = [res['labels_3d'] for res in proposal_list]
cls_preds = [res['cls_preds'] for res in proposal_list]
bbox_results = self._bbox_forward(feats_dict['seg_features'],
semantic_results['part_feats'],
voxels_dict, rois)
bbox_list = self.bbox_head.get_bboxes(
rois,
bbox_results['cls_score'],
bbox_results['bbox_pred'],
labels_3d,
cls_preds,
img_metas,
cfg=self.test_cfg)
bbox_results = [
bbox3d2result(bboxes, scores, labels)
for bboxes, scores, labels in bbox_list
]
return bbox_results
def _bbox_forward_train(self, seg_feats, part_feats, voxels_dict,
sampling_results):
"""Forward training function of roi_extractor and bbox_head.
Args:
seg_feats (torch.Tensor): Point-wise semantic features.
part_feats (torch.Tensor): Point-wise part prediction features.
voxels_dict (dict): Contains information of voxels.
sampling_results (:obj:`SamplingResult`): Sampled results used
for training.
Returns:
dict: Forward results including losses and predictions.
"""
rois = bbox3d2roi([res.bboxes for res in sampling_results])
bbox_results = self._bbox_forward(seg_feats, part_feats, voxels_dict,
rois)
bbox_targets = self.bbox_head.get_targets(sampling_results,
self.train_cfg)
loss_bbox = self.bbox_head.loss(bbox_results['cls_score'],
bbox_results['bbox_pred'], rois,
*bbox_targets)
bbox_results.update(loss_bbox=loss_bbox)
return bbox_results
def _bbox_forward(self, seg_feats, part_feats, voxels_dict, rois):
"""Forward function of roi_extractor and bbox_head used in both
training and testing.
Args:
seg_feats (torch.Tensor): Point-wise semantic features.
part_feats (torch.Tensor): Point-wise part prediction features.
voxels_dict (dict): Contains information of voxels.
rois (Tensor): Roi boxes.
Returns:
dict: Contains predictions of bbox_head and
features of roi_extractor.
"""
pooled_seg_feats = self.seg_roi_extractor(seg_feats,
voxels_dict['voxel_centers'],
voxels_dict['coors'][..., 0],
rois)
pooled_part_feats = self.part_roi_extractor(
part_feats, voxels_dict['voxel_centers'],
voxels_dict['coors'][..., 0], rois)
cls_score, bbox_pred = self.bbox_head(pooled_seg_feats,
pooled_part_feats)
bbox_results = dict(
cls_score=cls_score,
bbox_pred=bbox_pred,
pooled_seg_feats=pooled_seg_feats,
pooled_part_feats=pooled_part_feats)
return bbox_results
def _assign_and_sample(self, proposal_list, gt_bboxes_3d, gt_labels_3d):
"""Assign and sample proposals for training.
Args:
proposal_list (list[dict]): Proposals produced by RPN.
gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth
boxes.
gt_labels_3d (list[torch.Tensor]): Ground truth labels
Returns:
list[:obj:`SamplingResult`]: Sampled results of each training
sample.
"""
sampling_results = []
# bbox assign
for batch_idx in range(len(proposal_list)):
cur_proposal_list = proposal_list[batch_idx]
cur_boxes = cur_proposal_list['boxes_3d']
cur_labels_3d = cur_proposal_list['labels_3d']
cur_gt_bboxes = gt_bboxes_3d[batch_idx].to(cur_boxes.device)
cur_gt_labels = gt_labels_3d[batch_idx]
batch_num_gts = 0
# 0 is bg
batch_gt_indis = cur_gt_labels.new_full((len(cur_boxes), ), 0)
batch_max_overlaps = cur_boxes.tensor.new_zeros(len(cur_boxes))
# -1 is bg
batch_gt_labels = cur_gt_labels.new_full((len(cur_boxes), ), -1)
# each class may have its own assigner
if isinstance(self.bbox_assigner, list):
for i, assigner in enumerate(self.bbox_assigner):
gt_per_cls = (cur_gt_labels == i)
pred_per_cls = (cur_labels_3d == i)
cur_assign_res = assigner.assign(
cur_boxes.tensor[pred_per_cls],
cur_gt_bboxes.tensor[gt_per_cls],
gt_labels=cur_gt_labels[gt_per_cls])
# gather assign_results in different class into one result
batch_num_gts += cur_assign_res.num_gts
# gt inds (1-based)
gt_inds_arange_pad = gt_per_cls.nonzero(
as_tuple=False).view(-1) + 1
# pad 0 for indice unassigned
gt_inds_arange_pad = F.pad(
gt_inds_arange_pad, (1, 0), mode='constant', value=0)
# pad -1 for indice ignore
gt_inds_arange_pad = F.pad(
gt_inds_arange_pad, (1, 0), mode='constant', value=-1)
# convert to 0~gt_num+2 for indices
gt_inds_arange_pad += 1
# now 0 is bg, >1 is fg in batch_gt_indis
batch_gt_indis[pred_per_cls] = gt_inds_arange_pad[
cur_assign_res.gt_inds + 1] - 1
batch_max_overlaps[
pred_per_cls] = cur_assign_res.max_overlaps
batch_gt_labels[pred_per_cls] = cur_assign_res.labels
assign_result = AssignResult(batch_num_gts, batch_gt_indis,
batch_max_overlaps,
batch_gt_labels)
else: # for single class
assign_result = self.bbox_assigner.assign(
cur_boxes.tensor,
cur_gt_bboxes.tensor,
gt_labels=cur_gt_labels)
# sample boxes
sampling_result = self.bbox_sampler.sample(assign_result,
cur_boxes.tensor,
cur_gt_bboxes.tensor,
cur_gt_labels)
sampling_results.append(sampling_result)
return sampling_results
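# When a per-class assigner list is used, each class's locally assigned gt indices
# are remapped to global 1-based gt indices (keeping 0 for background and -1 for
# ignore) before the per-class results are merged into a single AssignResult.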
def _semantic_forward_train(self, x, voxels_dict, gt_bboxes_3d,
gt_labels_3d):
"""Train semantic head.
Args:
x (torch.Tensor): Point-wise semantic features for segmentation
voxels_dict (dict): Contains information of voxels.
gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth
boxes.
gt_labels_3d (list[torch.Tensor]): Ground truth labels
Returns:
dict: Segmentation results including losses
"""
semantic_results = self.semantic_head(x)
semantic_targets = self.semantic_head.get_targets(
voxels_dict, gt_bboxes_3d, gt_labels_3d)
loss_semantic = self.semantic_head.loss(semantic_results,
semantic_targets)
semantic_results.update(loss_semantic=loss_semantic)
return semantic_results
================================================
FILE: mmdet3d/models/roi_heads/roi_extractors/__init__.py
================================================
from mmdet.models.roi_heads.roi_extractors import SingleRoIExtractor
from .single_roiaware_extractor import Single3DRoIAwareExtractor
__all__ = ['SingleRoIExtractor', 'Single3DRoIAwareExtractor']
================================================
FILE: mmdet3d/models/roi_heads/roi_extractors/single_roiaware_extractor.py
================================================
import torch
from torch import nn as nn
from mmdet3d import ops
from mmdet.models.builder import ROI_EXTRACTORS
@ROI_EXTRACTORS.register_module()
class Single3DRoIAwareExtractor(nn.Module):
"""Point-wise roi-aware Extractor.
Extract Point-wise roi features.
Args:
roi_layer (dict): The config of roi layer.
"""
def __init__(self, roi_layer=None):
super(Single3DRoIAwareExtractor, self).__init__()
self.roi_layer = self.build_roi_layers(roi_layer)
def build_roi_layers(self, layer_cfg):
"""Build roi layers using `layer_cfg`"""
cfg = layer_cfg.copy()
layer_type = cfg.pop('type')
assert hasattr(ops, layer_type)
layer_cls = getattr(ops, layer_type)
roi_layers = layer_cls(**cfg)
return roi_layers
def forward(self, feats, coordinate, batch_inds, rois):
"""Extract point-wise roi features.
Args:
feats (torch.FloatTensor): Point-wise features with
shape (batch, npoints, channels) for pooling.
coordinate (torch.FloatTensor): Coordinate of each point.
batch_inds (torch.LongTensor): Indicate the batch of each point.
rois (torch.FloatTensor): Roi boxes with batch indices.
Returns:
torch.FloatTensor: Pooled features
"""
pooled_roi_feats = []
for batch_idx in range(int(batch_inds.max()) + 1):
roi_inds = (rois[..., 0].int() == batch_idx)
coors_inds = (batch_inds.int() == batch_idx)
pooled_roi_feat = self.roi_layer(rois[..., 1:][roi_inds],
coordinate[coors_inds],
feats[coors_inds])
pooled_roi_feats.append(pooled_roi_feat)
pooled_roi_feats = torch.cat(pooled_roi_feats, 0)
return pooled_roi_feats
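# Shape sketch (illustrative): with `rois` of shape (num_rois, 8) whose first column
# is the batch index, points are grouped per sample and pooled by `self.roi_layer`
# (e.g. RoIAwarePool3d), yielding one pooled feature volume per RoI; the per-sample
# results are concatenated along dim 0.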
================================================
FILE: mmdet3d/models/utils/__init__.py
================================================
from .clip_sigmoid import clip_sigmoid
from .inverse_sigmoid import inverse_sigmoid
from .mlp import MLP
from .transformerdecoder import PositionEmbeddingLearned, TransformerDecoderLayer, MultiheadAttention, PositionEmbeddingLearnedwoNorm
from .ffn import FFN, FFNLN
from .projection import ProjectionLayerNorm
from .sparsefusion_models import PointTransformer2D_3D, FusionTransformer2D_3D_Self, ImageTransformer_Cam_3D_MS, ViewTransformer
from .drop import Dropout, DropPath, build_dropout
from .deformable_decoder import DeformableTransformerDecoderLayer
from .depth_encoder import DepthEncoderResNet
from .network_modules import LayerNorm, ConvLN, denormalize_pos, normalize_pos
__all__ = ['clip_sigmoid', "MLP", 'PositionEmbeddingLearned', 'TransformerDecoderLayer', 'MultiheadAttention',
'FFN', 'inverse_sigmoid', 'PointTransformer2D_3D', 'FFNLN', 'PositionEmbeddingLearnedwoNorm',
'ProjectionLayerNorm', 'FusionTransformer2D_3D_Self',
'Dropout', 'DropPath', 'build_dropout',
'DeformableTransformerDecoderLayer', 'ImageTransformer_Cam_3D_MS',
'ViewTransformer', 'DepthEncoderResNet',
'LayerNorm', 'ConvLN', "normalize_pos", "denormalize_pos"
]
================================================
FILE: mmdet3d/models/utils/clip_sigmoid.py
================================================
import torch
def clip_sigmoid(x, eps=1e-4):
"""Sigmoid function for input feature.
Args:
x (torch.Tensor): Input feature map with the shape of [B, N, H, W].
eps (float): Lower bound of the range to be clamped to. Defaults
to 1e-4.
Returns:
torch.Tensor: Feature map after sigmoid.
"""
y = torch.clamp(x.sigmoid_(), min=eps, max=1 - eps)
return y
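# Minimal usage sketch (assumes this file is run as a standalone script; not part of
# the library itself):
if __name__ == '__main__':
    heatmap = torch.randn(2, 10, 128, 128)
    out = clip_sigmoid(heatmap)
    # Values are clamped away from exact 0 and 1 so that log() in focal-style
    # heatmap losses stays finite.
    assert out.min().item() >= 1e-4 and out.max().item() <= 1 - 1e-4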
================================================
FILE: mmdet3d/models/utils/deformable_decoder.py
================================================
import copy
import numpy as np
import torch
from torch import nn
import torch.nn.functional as F
from torch.nn import Linear
import math
import warnings
from typing import Optional, no_type_check
from torch.autograd.function import Function, once_differentiable
from mmdet3d.models.utils import MultiheadAttention
from mmcv.runner import BaseModule
from mmcv import deprecated_api_warning
from mmcv.cnn import constant_init, xavier_init
from mmdet3d.models.utils.ops.modules import MSDeformAttn
class DeformableTransformerDecoderLayer(nn.Module):
def __init__(self, d_model, nhead, level_num=4, dim_feedforward=2048, dropout=0.1, activation="relu",
self_posembed=None, cross_posembed=None, cross_only=False, n_points=4):
super().__init__()
self.cross_only = cross_only
if not self.cross_only:
self.self_attn = MultiheadAttention(d_model, nhead, dropout=dropout)
self.multihead_attn = MSDeformAttn(d_model, level_num, nhead, n_points)
# Implementation of Feedforward model
self.linear1 = nn.Linear(d_model, dim_feedforward)
self.dropout = nn.Dropout(dropout)
self.linear2 = nn.Linear(dim_feedforward, d_model)
self.norm1 = nn.LayerNorm(d_model)
self.norm2 = nn.LayerNorm(d_model)
self.norm3 = nn.LayerNorm(d_model)
self.dropout1 = nn.Dropout(dropout)
self.dropout2 = nn.Dropout(dropout)
self.dropout3 = nn.Dropout(dropout)
def _get_activation_fn(activation):
"""Return an activation function given a string"""
if activation == "relu":
return F.relu
if activation == "gelu":
return F.gelu
if activation == "glu":
return F.glu
raise RuntimeError(F"activation should be relu/gelu, not {activation}.")
self.activation = _get_activation_fn(activation)
self.self_posembed = self_posembed
self.cross_posembed = cross_posembed
def with_pos_embed(self, tensor, pos_embed):
return tensor if pos_embed is None else tensor + pos_embed
def forward(self, query, key, query_pos, key_pos, reference_points, level_start_index, spatial_shapes, query_padding_mask=None, input_padding_mask=None):
"""
:param query: B C Pq
:param key: B C Pk
:param query_pos: B Pq 3/6
:param key_pos: B Pk 3/6
:param reference_points: normalized reference points for the deformable attention
:return: updated query features of shape B C Pq
"""
# NxCxP to PxNxC
if self.self_posembed is not None:
query_pos_embed = self.self_posembed(query_pos).permute(2, 0, 1)
else:
query_pos_embed = None
if self.cross_posembed is not None:
key_pos_embed = self.cross_posembed(key_pos).permute(2, 0, 1)
else:
key_pos_embed = None
query = query.permute(2, 0, 1)
key = key.permute(2, 0, 1)
if not self.cross_only:
q = k = v = self.with_pos_embed(query, query_pos_embed)
query2 = self.self_attn(q, k, value=v, key_padding_mask=query_padding_mask)[0]
query = query + self.dropout1(query2)
query = self.norm1(query)
query_d = self.with_pos_embed(query, query_pos_embed)
input_flatten_d = self.with_pos_embed(key, key_pos_embed)
query2 = self.multihead_attn(query=query_d.permute(1, 0, 2),
input_flatten=input_flatten_d.permute(1, 0, 2), reference_points=reference_points,
input_spatial_shapes=spatial_shapes, input_level_start_index=level_start_index,
input_padding_mask=input_padding_mask
)
query2 = query2.permute(1, 0, 2)
query = query + self.dropout2(query2)
query = self.norm2(query)
query2 = self.linear2(self.dropout(self.activation(self.linear1(query))))
query = query + self.dropout3(query2)
query = self.norm3(query)
# PxNxC back to NxCxP
query = query.permute(1, 2, 0)
return query
================================================
FILE: mmdet3d/models/utils/depth_encoder.py
================================================
import torch
import torch.nn as nn
from mmdet.models.backbones.resnet import BasicBlock
from mmdet3d.models.utils.network_modules import LayerNorm
from mmcv.cnn import ConvModule
class DepthEncoderResNet(nn.Module):
def __init__(self, input_channel, input_channel_img, hidden_channel, depth_layers):
super().__init__()
self.depth_layers = depth_layers
self.conv_depth = nn.Sequential(
nn.Conv2d(input_channel, hidden_channel, kernel_size=3, padding=1, bias=True),
nn.BatchNorm2d(hidden_channel),
nn.ReLU(inplace=True)
)
self.inplanes = hidden_channel
self._norm_layer = nn.BatchNorm2d
self.layers = nn.ModuleList()
self.fuse_layers = nn.ModuleList()
self.output_layers = nn.ModuleList()
for i in range(len(depth_layers)):
if i == 0:
stride = 1
else:
stride = 2
self.layers.append(self._make_layer(BasicBlock, hidden_channel, depth_layers[i], stride=stride))
self.fuse_layers.append(nn.Conv2d(input_channel_img+hidden_channel, hidden_channel, kernel_size=3, padding=1))
def _make_layer(self, block, planes, blocks, stride=1):
norm_layer = self._norm_layer
downsample = None
if stride != 1 or self.inplanes != planes * block.expansion:
downsample = nn.Sequential(
nn.Conv2d(self.inplanes, planes * block.expansion, kernel_size=1, stride=stride),
norm_layer(planes * block.expansion),
)
layers = []
layers.append(block(self.inplanes, planes, stride=stride, downsample=downsample))
self.inplanes = planes * block.expansion
for _ in range(1, blocks):
layers.append(block(self.inplanes, planes))
return nn.Sequential(*layers)
def forward(self, sparse_depth, img_inputs):
depth = self.conv_depth(sparse_depth)
img_outputs = []
for i in range(len(img_inputs)):
depth = self.layers[i](depth)
depth = torch.cat([depth, img_inputs[i]], dim=1)
depth = self.fuse_layers[i](depth)
img_outputs.append(depth.clone())
return img_outputs
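# Minimal shape sketch (an illustration only; assumes mmdet/mmcv are installed so
# that BasicBlock/ConvModule can be imported, and uses made-up feature sizes):
if __name__ == '__main__':
    encoder = DepthEncoderResNet(input_channel=1, input_channel_img=64,
                                 hidden_channel=64, depth_layers=[2, 2])
    sparse_depth = torch.rand(1, 1, 64, 176)
    img_feats = [torch.rand(1, 64, 64, 176), torch.rand(1, 64, 32, 88)]
    fused = encoder(sparse_depth, img_feats)
    # Each output matches the spatial size of the corresponding image feature level.
    print([tuple(f.shape) for f in fused])  # [(1, 64, 64, 176), (1, 64, 32, 88)]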
================================================
FILE: mmdet3d/models/utils/drop.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
from typing import Any, Dict, Optional
import torch
import torch.nn as nn
from mmcv import build_from_cfg
from mmdet3d.models.registry import DROPOUT_LAYERS
def drop_path(x: torch.Tensor,
drop_prob: float = 0.,
training: bool = False) -> torch.Tensor:
"""Drop paths (Stochastic Depth) per sample (when applied in main path of
residual blocks).
We follow the implementation
https://github.com/rwightman/pytorch-image-models/blob/a2727c1bf78ba0d7b5727f5f95e37fb7f8866b1f/timm/models/layers/drop.py # noqa: E501
"""
if drop_prob == 0. or not training:
return x
keep_prob = 1 - drop_prob
# handle tensors with different dimensions, not just 4D tensors.
shape = (x.shape[0], ) + (1, ) * (x.ndim - 1)
random_tensor = keep_prob + torch.rand(
shape, dtype=x.dtype, device=x.device)
output = x.div(keep_prob) * random_tensor.floor()
return output
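# Example: with drop_prob=0.2 during training, roughly 20% of the samples in a batch
# have this residual branch zeroed, while the surviving samples are rescaled by
# 1 / keep_prob so the expected activation is unchanged; at eval time the input is
# returned untouched.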
@DROPOUT_LAYERS.register_module()
class DropPath(nn.Module):
"""Drop paths (Stochastic Depth) per sample (when applied in main path of
residual blocks).
We follow the implementation
https://github.com/rwightman/pytorch-image-models/blob/a2727c1bf78ba0d7b5727f5f95e37fb7f8866b1f/timm/models/layers/drop.py # noqa: E501
Args:
drop_prob (float): Probability of the path to be zeroed. Default: 0.1
"""
def __init__(self, drop_prob: float = 0.1):
super().__init__()
self.drop_prob = drop_prob
def forward(self, x: torch.Tensor) -> torch.Tensor:
return drop_path(x, self.drop_prob, self.training)
@DROPOUT_LAYERS.register_module()
class Dropout(nn.Dropout):
"""A wrapper for ``torch.nn.Dropout``, We rename the ``p`` of
``torch.nn.Dropout`` to ``drop_prob`` so as to be consistent with
``DropPath``
Args:
drop_prob (float): Probability of the elements to be
zeroed. Default: 0.5.
inplace (bool): Do the operation inplace or not. Default: False.
"""
def __init__(self, drop_prob: float = 0.5, inplace: bool = False):
super().__init__(p=drop_prob, inplace=inplace)
def build_dropout(cfg: Dict, default_args: Optional[Dict] = None) -> Any:
"""Builder for drop out layers."""
return build_from_cfg(cfg, DROPOUT_LAYERS, default_args)
================================================
FILE: mmdet3d/models/utils/ffn.py
================================================
import copy
import numpy as np
import torch
from mmcv.cnn import ConvModule, build_conv_layer, kaiming_init
from mmcv.runner import force_fp32
from torch import nn
import torch.nn.functional as F
from torch.nn.parameter import Parameter
from torch.nn import Linear
from torch.nn.init import xavier_uniform_, constant_
class FFN(nn.Module):
def __init__(self,
in_channels,
heads,
head_conv=64,
final_kernel=1,
init_bias=-2.19,
conv_cfg=dict(type='Conv1d'),
norm_cfg=dict(type='BN1d'),
bias='auto',
**kwargs):
super(FFN, self).__init__()
self.heads = heads
self.init_bias = init_bias
for head in self.heads:
if len(self.heads[head]) == 2:
classes, num_conv = self.heads[head]
need_bn = True
else:
classes, num_conv, need_bn = self.heads[head]
conv_layers = []
c_in = in_channels
for i in range(num_conv - 1):
if need_bn:
conv_layers.append(
ConvModule(
c_in,
head_conv,
kernel_size=final_kernel,
stride=1,
padding=final_kernel // 2,
bias=bias,
conv_cfg=conv_cfg,
norm_cfg=norm_cfg))
else:
conv_layers.append(
ConvModule(
c_in,
head_conv,
kernel_size=final_kernel,
stride=1,
padding=final_kernel // 2,
bias=bias,
conv_cfg=conv_cfg,
norm_cfg=None))
c_in = head_conv
conv_layers.append(
build_conv_layer(
conv_cfg,
head_conv,
classes,
kernel_size=final_kernel,
stride=1,
padding=final_kernel // 2,
bias=True))
conv_layers = nn.Sequential(*conv_layers)
self.__setattr__(head, conv_layers)
def init_weights(self):
"""Initialize weights."""
for head in self.heads:
if 'heatmap' in head or 'cls' in head:
self.__getattr__(head)[-1].bias.data.fill_(self.init_bias)
else:
for m in self.__getattr__(head).modules():
if isinstance(m, nn.Conv2d):
kaiming_init(m)
def forward(self, x):
"""Forward function for SepHead.
Args:
x (torch.Tensor): Input feature map with the shape of
[B, 512, 128, 128].
Returns:
dict[str: torch.Tensor]: contains the following keys:
-reg (torch.Tensor): 2D regression value with the \
shape of [B, 2, H, W].
-height (torch.Tensor): Height value with the \
shape of [B, 1, H, W].
-dim (torch.Tensor): Size value with the shape \
of [B, 3, H, W].
-rot (torch.Tensor): Rotation value with the \
shape of [B, 1, H, W].
-vel (torch.Tensor): Velocity value with the \
shape of [B, 2, H, W].
-heatmap (torch.Tensor): Heatmap with the shape of \
[B, N, H, W].
"""
ret_dict = dict()
for head in self.heads:
ret_dict[head] = self.__getattr__(head)(x)
return ret_dict
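# Illustrative head specification (values are made up): heads=dict(center=(2, 2),
# height=(1, 2), dim=(3, 2), rot=(2, 2), heatmap=(10, 2)) builds one branch per key
# with `num_conv - 1` ConvModule layers of width `head_conv`, followed by a final
# conv that outputs `classes` channels; an optional third tuple entry toggles the
# norm layers.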
class FFNLN(nn.Module):
def __init__(self,
in_channels,
heads,
head_conv=64,
init_bias=-2.19,
**kwargs):
super(FFNLN, self).__init__()
self.heads = heads
self.init_bias = init_bias
for head in self.heads:
if len(self.heads[head]) == 2:
classes, num_conv = self.heads[head]
need_norm = True
else:
classes, num_conv, need_norm = self.heads[head]
conv_layers = []
c_in = in_channels
for i in range(num_conv - 1):
if need_norm:
conv_layers.append(
nn.Linear(
c_in,
head_conv,
bias=False,
)
)
conv_layers.append(nn.LayerNorm(head_conv))
else:
conv_layers.append(
nn.Linear(
c_in,
head_conv,
bias=True,
)
)
conv_layers.append(nn.ReLU(inplace=True))
c_in = head_conv
conv_layers.append(
nn.Linear(
head_conv,
classes,
bias=True,
)
)
conv_layers = nn.Sequential(*conv_layers)
self.__setattr__(head, conv_layers)
def init_weights(self):
"""Initialize weights."""
for head in self.heads:
if 'heatmap' in head or 'cls' in head:
self.__getattr__(head)[-1].bias.data.fill_(self.init_bias)
else:
for m in self.__getattr__(head).modules():
if isinstance(m, nn.Linear):
kaiming_init(m)
def forward(self, x):
"""Forward function for SepHead.
Args:
x (torch.Tensor): Input feature map with the shape of
[B, 512, 128, 128].
Returns:
dict[str: torch.Tensor]: contains the following keys:
-reg (torch.Tensor): 2D regression value with the \
shape of [B, 2, H, W].
-height (torch.Tensor): Height value with the \
shape of [B, 1, H, W].
-dim (torch.Tensor): Size value with the shape \
of [B, 3, H, W].
-rot (torch.Tensor): Rotation value with the \
shape of [B, 1, H, W].
-vel (torch.Tensor): Velocity value with the \
shape of [B, 2, H, W].
-heatmap (torch.Tensor): Heatmap with the shape of \
[B, N, H, W].
"""
ret_dict = dict()
x = x.permute(0, 2, 1).contiguous()
for head in self.heads:
ret_dict[head] = self.__getattr__(head)(x)
ret_dict[head] = ret_dict[head].permute(0, 2, 1).contiguous()
return ret_dict
class FFNReg(nn.Module):
def __init__(self,
in_channels,
heads,
head_conv=64,
init_bias=-2.19,
**kwargs):
super(FFNReg, self).__init__()
self.heads = heads
self.init_bias = init_bias
for head in self.heads:
classes, num_conv = self.heads[head]
conv_layers = []
c_in = in_channels
for i in range(num_conv - 1):
conv_layers.append(
nn.Linear(
c_in,
head_conv,
bias=False,
)
)
if head == "heatmap" or head == "cls":
conv_layers.append(nn.LayerNorm(head_conv))
conv_layers.append(nn.ReLU(inplace=True))
c_in = head_conv
conv_layers.append(
nn.Linear(
head_conv,
classes,
bias=True,
)
)
conv_layers = nn.Sequential(*conv_layers)
self.__setattr__(head, conv_layers)
def init_weights(self):
"""Initialize weights."""
for head in self.heads:
if head == 'heatmap' or head == 'cls':
self.__getattr__(head)[-1].bias.data.fill_(self.init_bias)
else:
for m in self.__getattr__(head).modules():
if isinstance(m, nn.Linear):
kaiming_init(m)
def forward(self, x):
"""Forward function for SepHead.
Args:
x (torch.Tensor): Input feature map with the shape of
[B, 512, 128, 128].
Returns:
dict[str: torch.Tensor]: contains the following keys:
-reg (torch.Tensor): 2D regression value with the \
shape of [B, 2, H, W].
-height (torch.Tensor): Height value with the \
shape of [B, 1, H, W].
-dim (torch.Tensor): Size value with the shape \
of [B, 3, H, W].
-rot (torch.Tensor): Rotation value with the \
shape of [B, 1, H, W].
-vel (torch.Tensor): Velocity value with the \
shape of [B, 2, H, W].
-heatmap (torch.Tensor): Heatmap with the shape of \
[B, N, H, W].
"""
ret_dict = dict()
x = x.permute(0, 2, 1).contiguous()
for head in self.heads:
ret_dict[head] = self.__getattr__(head)(x)
ret_dict[head] = ret_dict[head].permute(0, 2, 1).contiguous()
if 'bbox_3d' in ret_dict:
ret_dict['center'] = ret_dict['bbox_3d'][:, 0:2]
ret_dict['dim'] = ret_dict['bbox_3d'][:, 2:5]
ret_dict['height'] = ret_dict['bbox_3d'][:, 5:6]
ret_dict['rot'] = ret_dict['bbox_3d'][:, 6:8]
ret_dict['vel'] = ret_dict['bbox_3d'][:, 8:10]
del ret_dict['bbox_3d']
return ret_dict
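Usage sketch (illustrative; the channel sizes and head names are assumptions): ``FFN`` builds one small Conv1d branch per entry of ``heads``, where each value is ``(out_channels, num_conv)`` with an optional third flag that disables normalization, and returns a dict with one prediction tensor per head.
import torch
heads = dict(heatmap=(10, 2), center=(2, 2), height=(1, 2), dim=(3, 2), rot=(2, 2))
ffn = FFN(in_channels=128, heads=heads, head_conv=64)
ffn.init_weights()
x = torch.randn(2, 128, 200)                 # [B, C, N] query features (Conv1d layout)
out = ffn(x)
assert out['heatmap'].shape == (2, 10, 200)
assert out['center'].shape == (2, 2, 200)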
================================================
FILE: mmdet3d/models/utils/inverse_sigmoid.py
================================================
import torch
def inverse_sigmoid(x, eps=1e-5):
x = x.clamp(min=0, max=1)
x1 = x.clamp(min=eps)
x2 = (1 - x).clamp(min=eps)
return torch.log(x1/x2)
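Usage sketch (illustrative): ``inverse_sigmoid`` is the clamped logit function, so it inverts ``torch.sigmoid`` for inputs that stay inside ``(eps, 1 - eps)``.
import torch
p = torch.tensor([0.1, 0.5, 0.9])
logits = inverse_sigmoid(p)                   # log(p / (1 - p))
assert torch.allclose(torch.sigmoid(logits), p, atol=1e-6)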
================================================
FILE: mmdet3d/models/utils/mlp.py
================================================
from mmcv.cnn import ConvModule
from torch import nn as nn
class MLP(nn.Module):
"""A simple MLP module.
Pass features (B, C, N) through an MLP.
Args:
in_channel (int): Number of channels of input features.
Default: 18.
conv_channels (tuple[int]): Out channels of the convolution.
Default: (256, 256).
conv_cfg (dict): Config of convolution.
Default: dict(type='Conv1d').
norm_cfg (dict): Config of normalization.
Default: dict(type='BN1d').
act_cfg (dict): Config of activation.
Default: dict(type='ReLU').
"""
def __init__(self,
in_channel=18,
conv_channels=(256, 256),
conv_cfg=dict(type='Conv1d'),
norm_cfg=dict(type='BN1d'),
act_cfg=dict(type='ReLU')):
super().__init__()
self.mlp = nn.Sequential()
prev_channels = in_channel
for i, conv_channel in enumerate(conv_channels):
self.mlp.add_module(
f'layer{i}',
ConvModule(
prev_channels,
conv_channels[i],
1,
padding=0,
conv_cfg=conv_cfg,
norm_cfg=norm_cfg,
act_cfg=act_cfg,
bias=True,
inplace=True))
prev_channels = conv_channels[i]
def forward(self, img_features):
return self.mlp(img_features)
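Usage sketch (illustrative; the sizes are assumptions): ``MLP`` applies shared kernel-size-1 Conv1d layers, so it maps per-point features of shape (B, C, N) to (B, conv_channels[-1], N).
import torch
mlp = MLP(in_channel=18, conv_channels=(256, 256))
feats = torch.randn(2, 18, 1024)              # B=2, C=18, N=1024 points
out = mlp(feats)
assert out.shape == (2, 256, 1024)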
================================================
FILE: mmdet3d/models/utils/network_modules.py
================================================
import torch
import torch.nn as nn
import torch.nn.functional as F
from .inverse_sigmoid import inverse_sigmoid
def denormalize_pos(normal_pos, x_max, y_max, sigmoid=True):
max_xy = torch.Tensor([x_max, y_max]).to(normal_pos.device).view(1, 1, 2)
if sigmoid:
pos = normal_pos.sigmoid() * max_xy
else:
pos = normal_pos * max_xy
return pos
def normalize_pos(pos, x_max, y_max):
max_xy = torch.Tensor([x_max, y_max]).to(pos.device).view(1, 1, 2)
normal_pos = pos / max_xy
return inverse_sigmoid(normal_pos)
class LayerNorm(nn.Module):
r""" LayerNorm that supports two data formats: channels_last (default) or channels_first.
The ordering of the dimensions in the inputs. channels_last corresponds to inputs with
shape (batch_size, height, width, channels) while channels_first corresponds to inputs
with shape (batch_size, channels, height, width).
"""
def __init__(self, normalized_shape, eps=1e-6, data_format="channels_last"):
super().__init__()
self.weight = nn.Parameter(torch.ones(normalized_shape))
self.bias = nn.Parameter(torch.zeros(normalized_shape))
self.eps = eps
self.data_format = data_format
if self.data_format not in ["channels_last", "channels_first"]:
raise NotImplementedError
self.normalized_shape = (normalized_shape,)
def forward(self, x):
if self.data_format == "channels_last":
return F.layer_norm(x, self.normalized_shape, self.weight, self.bias, self.eps)
elif self.data_format == "channels_first":
u = x.mean(1, keepdim=True)
s = (x - u).pow(2).mean(1, keepdim=True)
x = (x - u) / torch.sqrt(s + self.eps)
x = self.weight[:, None, None] * x + self.bias[:, None, None]
return x
class ConvLN(nn.Module):
def __init__(self, input_channel, hidden_channel, kernel_size=3, stride=1, padding=1, require_act=True):
super().__init__()
if require_act:
self.module = nn.Sequential(
nn.Conv2d(input_channel, hidden_channel, kernel_size=kernel_size, stride=stride, padding=padding),
LayerNorm(hidden_channel, data_format="channels_first"),
nn.ReLU()
)
else:
self.module = nn.Sequential(
nn.Conv2d(input_channel, hidden_channel, kernel_size=kernel_size, stride=stride, padding=padding),
LayerNorm(hidden_channel, data_format="channels_first"),
)
def forward(self, x):
# [bs, C, H, W]
x = self.module(x)
return x
class SE_Block(nn.Module):
def __init__(self, c):
super().__init__()
self.att = nn.Sequential(
nn.AdaptiveAvgPool2d(1),
nn.Conv2d(c, c, kernel_size=1, stride=1),
nn.Sigmoid()
)
def forward(self, x):
return x * self.att(x)
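Usage sketch (illustrative; the image size is an assumption): ``normalize_pos`` maps pixel coordinates into logit space and ``denormalize_pos`` (with ``sigmoid=True``) maps them back, so the two are inverse up to the eps clamping inside ``inverse_sigmoid``.
import torch
pos = torch.tensor([[[160.0, 120.0], [320.0, 240.0]]])    # [B, N, 2] pixel coordinates
normal = normalize_pos(pos, x_max=640, y_max=480)          # logit-space coordinates
recon = denormalize_pos(normal, x_max=640, y_max=480)      # back to pixel coordinates
assert torch.allclose(recon, pos, atol=1e-2)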
================================================
FILE: mmdet3d/models/utils/ops/functions/__init__.py
================================================
# ------------------------------------------------------------------------------------------------
# Deformable DETR
# Copyright (c) 2020 SenseTime. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
# ------------------------------------------------------------------------------------------------
# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
# ------------------------------------------------------------------------------------------------
from .ms_deform_attn_func import MSDeformAttnFunction
__all__ = ['MSDeformAttnFunction']
================================================
FILE: mmdet3d/models/utils/ops/functions/ms_deform_attn_func.py
================================================
# ------------------------------------------------------------------------------------------------
# Deformable DETR
# Copyright (c) 2020 SenseTime. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
# ------------------------------------------------------------------------------------------------
# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
# ------------------------------------------------------------------------------------------------
from __future__ import absolute_import
from __future__ import print_function
from __future__ import division
import torch
import torch.nn.functional as F
from torch.autograd import Function
from torch.autograd.function import once_differentiable
import MultiScaleDeformableAttention as MSDA
class MSDeformAttnFunction(Function):
@staticmethod
def forward(ctx, value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, im2col_step):
ctx.im2col_step = im2col_step
output = MSDA.ms_deform_attn_forward(
value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, ctx.im2col_step)
ctx.save_for_backward(value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights)
return output
@staticmethod
@once_differentiable
def backward(ctx, grad_output):
value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights = ctx.saved_tensors
grad_value, grad_sampling_loc, grad_attn_weight = \
MSDA.ms_deform_attn_backward(
value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, grad_output, ctx.im2col_step)
return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None
def ms_deform_attn_core_pytorch(value, value_spatial_shapes, sampling_locations, attention_weights):
# for debug and test only,
# need to use cuda version instead
N_, S_, M_, D_ = value.shape
_, Lq_, M_, L_, P_, _ = sampling_locations.shape
value_list = value.split([H_ * W_ for H_, W_ in value_spatial_shapes], dim=1)
sampling_grids = 2 * sampling_locations - 1
sampling_value_list = []
for lid_, (H_, W_) in enumerate(value_spatial_shapes):
# N_, H_*W_, M_, D_ -> N_, H_*W_, M_*D_ -> N_, M_*D_, H_*W_ -> N_*M_, D_, H_, W_
value_l_ = value_list[lid_].flatten(2).transpose(1, 2).reshape(N_*M_, D_, H_, W_)
# N_, Lq_, M_, P_, 2 -> N_, M_, Lq_, P_, 2 -> N_*M_, Lq_, P_, 2
sampling_grid_l_ = sampling_grids[:, :, :, lid_].transpose(1, 2).flatten(0, 1)
# N_*M_, D_, Lq_, P_
sampling_value_l_ = F.grid_sample(value_l_, sampling_grid_l_,
mode='bilinear', padding_mode='zeros', align_corners=False)
sampling_value_list.append(sampling_value_l_)
# (N_, Lq_, M_, L_, P_) -> (N_, M_, Lq_, L_, P_) -> (N_, M_, 1, Lq_, L_*P_)
attention_weights = attention_weights.transpose(1, 2).reshape(N_*M_, 1, Lq_, L_*P_)
output = (torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights).sum(-1).view(N_, M_*D_, Lq_)
return output.transpose(1, 2).contiguous()
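Usage sketch (illustrative; all sizes are assumptions): ``ms_deform_attn_core_pytorch`` is the pure-PyTorch reference used for debugging. ``value`` is flattened over all levels, and the attention weights are normalized over (levels x points).
import torch
N, M, D = 2, 8, 32                                          # batch, heads, channels per head
Lq, L, P = 100, 2, 4                                        # queries, levels, points per level
shapes = torch.as_tensor([[16, 16], [8, 8]], dtype=torch.long)
S = int((shapes[:, 0] * shapes[:, 1]).sum())                # flattened spatial length
value = torch.randn(N, S, M, D)
sampling_locations = torch.rand(N, Lq, M, L, P, 2)
attention_weights = torch.softmax(torch.randn(N, Lq, M, L * P), -1).view(N, Lq, M, L, P)
out = ms_deform_attn_core_pytorch(value, shapes, sampling_locations, attention_weights)
assert out.shape == (N, Lq, M * D)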
================================================
FILE: mmdet3d/models/utils/ops/make.sh
================================================
#!/usr/bin/env bash
# ------------------------------------------------------------------------------------------------
# Deformable DETR
# Copyright (c) 2020 SenseTime. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
# ------------------------------------------------------------------------------------------------
# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
# ------------------------------------------------------------------------------------------------
python setup.py build install
================================================
FILE: mmdet3d/models/utils/ops/modules/__init__.py
================================================
# ------------------------------------------------------------------------------------------------
# Deformable DETR
# Copyright (c) 2020 SenseTime. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
# ------------------------------------------------------------------------------------------------
# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
# ------------------------------------------------------------------------------------------------
from .ms_deform_attn import MSDeformAttn
================================================
FILE: mmdet3d/models/utils/ops/modules/ms_deform_attn.py
================================================
# ------------------------------------------------------------------------------------------------
# Deformable DETR
# Copyright (c) 2020 SenseTime. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
# ------------------------------------------------------------------------------------------------
# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
# ------------------------------------------------------------------------------------------------
from __future__ import absolute_import
from __future__ import print_function
from __future__ import division
import warnings
import math
import torch
from torch import nn
import torch.nn.functional as F
from torch.nn.init import xavier_uniform_, constant_
from ..functions import MSDeformAttnFunction
def _is_power_of_2(n):
if (not isinstance(n, int)) or (n < 0):
raise ValueError("invalid input for _is_power_of_2: {} (type: {})".format(n, type(n)))
return (n & (n-1) == 0) and n != 0
class MSDeformAttn(nn.Module):
def __init__(self, d_model=256, n_levels=4, n_heads=8, n_points=4):
"""
Multi-Scale Deformable Attention Module
:param d_model hidden dimension
:param n_levels number of feature levels
:param n_heads number of attention heads
:param n_points number of sampling points per attention head per feature level
"""
super().__init__()
if d_model % n_heads != 0:
raise ValueError('d_model must be divisible by n_heads, but got {} and {}'.format(d_model, n_heads))
_d_per_head = d_model // n_heads
# you'd better set _d_per_head to a power of 2 which is more efficient in our CUDA implementation
if not _is_power_of_2(_d_per_head):
warnings.warn("You'd better set d_model in MSDeformAttn to make the dimension of each attention head a power of 2 "
"which is more efficient in our CUDA implementation.")
self.im2col_step = 64
self.d_model = d_model
self.n_levels = n_levels
self.n_heads = n_heads
self.n_points = n_points
self.sampling_offsets = nn.Linear(d_model, n_heads * n_levels * n_points * 2)
self.attention_weights = nn.Linear(d_model, n_heads * n_levels * n_points)
self.value_proj = nn.Linear(d_model, d_model)
self.output_proj = nn.Linear(d_model, d_model)
self._reset_parameters()
def _reset_parameters(self):
constant_(self.sampling_offsets.weight.data, 0.)
thetas = torch.arange(self.n_heads, dtype=torch.float32) * (2.0 * math.pi / self.n_heads)
grid_init = torch.stack([thetas.cos(), thetas.sin()], -1)
grid_init = (grid_init / grid_init.abs().max(-1, keepdim=True)[0]).view(self.n_heads, 1, 1, 2).repeat(1, self.n_levels, self.n_points, 1)
for i in range(self.n_points):
grid_init[:, :, i, :] *= i + 1
with torch.no_grad():
self.sampling_offsets.bias = nn.Parameter(grid_init.view(-1))
constant_(self.attention_weights.weight.data, 0.)
constant_(self.attention_weights.bias.data, 0.)
xavier_uniform_(self.value_proj.weight.data)
constant_(self.value_proj.bias.data, 0.)
xavier_uniform_(self.output_proj.weight.data)
constant_(self.output_proj.bias.data, 0.)
def forward(self, query, reference_points, input_flatten, input_spatial_shapes, input_level_start_index, input_padding_mask=None):
"""
:param query (N, Length_{query}, C)
:param reference_points (N, Length_{query}, n_levels, 2), range in [0, 1], top-left (0,0), bottom-right (1, 1), including padding area
or (N, Length_{query}, n_levels, 4), add additional (w, h) to form reference boxes
:param input_flatten (N, \sum_{l=0}^{L-1} H_l \cdot W_l, C)
:param input_spatial_shapes (n_levels, 2), [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})]
:param input_level_start_index (n_levels, ), [0, H_0*W_0, H_0*W_0+H_1*W_1, H_0*W_0+H_1*W_1+H_2*W_2, ..., H_0*W_0+H_1*W_1+...+H_{L-1}*W_{L-1}]
:param input_padding_mask (N, \sum_{l=0}^{L-1} H_l \cdot W_l), True for padding elements, False for non-padding elements
:return output (N, Length_{query}, C)
"""
N, Len_q, _ = query.shape
N, Len_in, _ = input_flatten.shape
assert (input_spatial_shapes[:, 0] * input_spatial_shapes[:, 1]).sum() == Len_in
value = self.value_proj(input_flatten)
if input_padding_mask is not None:
value = value.masked_fill(input_padding_mask[..., None], float(0))
value = value.view(N, Len_in, self.n_heads, self.d_model // self.n_heads)
sampling_offsets = self.sampling_offsets(query).view(N, Len_q, self.n_heads, self.n_levels, self.n_points, 2)
attention_weights = self.attention_weights(query).view(N, Len_q, self.n_heads, self.n_levels * self.n_points)
attention_weights = F.softmax(attention_weights, -1).view(N, Len_q, self.n_heads, self.n_levels, self.n_points)
# N, Len_q, n_heads, n_levels, n_points, 2
if reference_points.shape[-1] == 2:
offset_normalizer = torch.stack([input_spatial_shapes[..., 1], input_spatial_shapes[..., 0]], -1)
sampling_locations = reference_points[:, :, None, :, None, :] \
+ sampling_offsets / offset_normalizer[None, None, None, :, None, :]
elif reference_points.shape[-1] == 4:
sampling_locations = reference_points[:, :, None, :, None, :2] \
+ sampling_offsets / self.n_points * reference_points[:, :, None, :, None, 2:] * 0.5
else:
raise ValueError(
'Last dim of reference_points must be 2 or 4, but get {} instead.'.format(reference_points.shape[-1]))
output = MSDeformAttnFunction.apply(
value, input_spatial_shapes, input_level_start_index, sampling_locations, attention_weights, self.im2col_step)
output = self.output_proj(output)
return output
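Usage sketch (illustrative; the sizes are assumptions, and the compiled MultiScaleDeformableAttention CUDA extension plus a GPU are required): the module attends from Len_q queries into image features flattened over all levels.
import torch
attn = MSDeformAttn(d_model=256, n_levels=2, n_heads=8, n_points=4).cuda()
shapes = torch.as_tensor([[32, 32], [16, 16]], dtype=torch.long, device='cuda')
start_index = torch.cat([shapes.new_zeros(1), (shapes[:, 0] * shapes[:, 1]).cumsum(0)[:-1]])
len_in = int((shapes[:, 0] * shapes[:, 1]).sum())
query = torch.randn(2, 300, 256, device='cuda')
reference_points = torch.rand(2, 300, 2, 2, device='cuda')  # (N, Len_q, n_levels, 2) in [0, 1]
src = torch.randn(2, len_in, 256, device='cuda')
out = attn(query, reference_points, src, shapes, start_index)
assert out.shape == (2, 300, 256)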
================================================
FILE: mmdet3d/models/utils/ops/setup.py
================================================
# ------------------------------------------------------------------------------------------------
# Deformable DETR
# Copyright (c) 2020 SenseTime. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
# ------------------------------------------------------------------------------------------------
# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
# ------------------------------------------------------------------------------------------------
import os
import glob
import torch
from torch.utils.cpp_extension import CUDA_HOME
from torch.utils.cpp_extension import CppExtension
from torch.utils.cpp_extension import CUDAExtension
from setuptools import find_packages
from setuptools import setup
requirements = ["torch", "torchvision"]
def get_extensions():
this_dir = os.path.dirname(os.path.abspath(__file__))
extensions_dir = os.path.join(this_dir, "src")
main_file = glob.glob(os.path.join(extensions_dir, "*.cpp"))
source_cpu = glob.glob(os.path.join(extensions_dir, "cpu", "*.cpp"))
source_cuda = glob.glob(os.path.join(extensions_dir, "cuda", "*.cu"))
sources = main_file + source_cpu
extension = CppExtension
extra_compile_args = {"cxx": []}
define_macros = []
if torch.cuda.is_available() and CUDA_HOME is not None:
extension = CUDAExtension
sources += source_cuda
define_macros += [("WITH_CUDA", None)]
extra_compile_args["nvcc"] = [
"-DCUDA_HAS_FP16=1",
"-D__CUDA_NO_HALF_OPERATORS__",
"-D__CUDA_NO_HALF_CONVERSIONS__",
"-D__CUDA_NO_HALF2_OPERATORS__",
]
else:
raise NotImplementedError('CUDA is not available')
sources = [os.path.join(extensions_dir, s) for s in sources]
include_dirs = [extensions_dir]
ext_modules = [
extension(
"MultiScaleDeformableAttention",
sources,
include_dirs=include_dirs,
define_macros=define_macros,
extra_compile_args=extra_compile_args,
)
]
return ext_modules
setup(
name="MultiScaleDeformableAttention",
version="1.0",
author="Weijie Su",
url="https://github.com/fundamentalvision/Deformable-DETR",
description="PyTorch Wrapper for CUDA Functions of Multi-Scale Deformable Attention",
packages=find_packages(exclude=("configs", "tests",)),
ext_modules=get_extensions(),
cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension},
)
================================================
FILE: mmdet3d/models/utils/ops/src/cpu/ms_deform_attn_cpu.cpp
================================================
/*!
**************************************************************************************************
* Deformable DETR
* Copyright (c) 2020 SenseTime. All Rights Reserved.
* Licensed under the Apache License, Version 2.0 [see LICENSE for details]
**************************************************************************************************
* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
**************************************************************************************************
*/
#include <vector>
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
at::Tensor
ms_deform_attn_cpu_forward(
const at::Tensor &value,
const at::Tensor &spatial_shapes,
const at::Tensor &level_start_index,
const at::Tensor &sampling_loc,
const at::Tensor &attn_weight,
const int im2col_step)
{
AT_ERROR("Not implement on cpu");
}
std::vector<at::Tensor>
ms_deform_attn_cpu_backward(
const at::Tensor &value,
const at::Tensor &spatial_shapes,
const at::Tensor &level_start_index,
const at::Tensor &sampling_loc,
const at::Tensor &attn_weight,
const at::Tensor &grad_output,
const int im2col_step)
{
AT_ERROR("Not implement on cpu");
}
================================================
FILE: mmdet3d/models/utils/ops/src/cpu/ms_deform_attn_cpu.h
================================================
/*!
**************************************************************************************************
* Deformable DETR
* Copyright (c) 2020 SenseTime. All Rights Reserved.
* Licensed under the Apache License, Version 2.0 [see LICENSE for details]
**************************************************************************************************
* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
**************************************************************************************************
*/
#pragma once
#include <torch/extension.h>
at::Tensor
ms_deform_attn_cpu_forward(
const at::Tensor &value,
const at::Tensor &spatial_shapes,
const at::Tensor &level_start_index,
const at::Tensor &sampling_loc,
const at::Tensor &attn_weight,
const int im2col_step);
std::vector<at::Tensor>
ms_deform_attn_cpu_backward(
const at::Tensor &value,
const at::Tensor &spatial_shapes,
const at::Tensor &level_start_index,
const at::Tensor &sampling_loc,
const at::Tensor &attn_weight,
const at::Tensor &grad_output,
const int im2col_step);
================================================
FILE: mmdet3d/models/utils/ops/src/cuda/ms_deform_attn_cuda.cu
================================================
/*!
**************************************************************************************************
* Deformable DETR
* Copyright (c) 2020 SenseTime. All Rights Reserved.
* Licensed under the Apache License, Version 2.0 [see LICENSE for details]
**************************************************************************************************
* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
**************************************************************************************************
*/
#include <vector>
#include "cuda/ms_deform_im2col_cuda.cuh"
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <cuda.h>
#include <cuda_runtime.h>
at::Tensor ms_deform_attn_cuda_forward(
const at::Tensor &value,
const at::Tensor &spatial_shapes,
const at::Tensor &level_start_index,
const at::Tensor &sampling_loc,
const at::Tensor &attn_weight,
const int im2col_step)
{
AT_ASSERTM(value.is_contiguous(), "value tensor has to be contiguous");
AT_ASSERTM(spatial_shapes.is_contiguous(), "spatial_shapes tensor has to be contiguous");
AT_ASSERTM(level_start_index.is_contiguous(), "level_start_index tensor has to be contiguous");
AT_ASSERTM(sampling_loc.is_contiguous(), "sampling_loc tensor has to be contiguous");
AT_ASSERTM(attn_weight.is_contiguous(), "attn_weight tensor has to be contiguous");
AT_ASSERTM(value.type().is_cuda(), "value must be a CUDA tensor");
AT_ASSERTM(spatial_shapes.type().is_cuda(), "spatial_shapes must be a CUDA tensor");
AT_ASSERTM(level_start_index.type().is_cuda(), "level_start_index must be a CUDA tensor");
AT_ASSERTM(sampling_loc.type().is_cuda(), "sampling_loc must be a CUDA tensor");
AT_ASSERTM(attn_weight.type().is_cuda(), "attn_weight must be a CUDA tensor");
const int batch = value.size(0);
const int spatial_size = value.size(1);
const int num_heads = value.size(2);
const int channels = value.size(3);
const int num_levels = spatial_shapes.size(0);
const int num_query = sampling_loc.size(1);
const int num_point = sampling_loc.size(4);
const int im2col_step_ = std::min(batch, im2col_step);
AT_ASSERTM(batch % im2col_step_ == 0, "batch(%d) must divide im2col_step(%d)", batch, im2col_step_);
auto output = at::zeros({batch, num_query, num_heads, channels}, value.options());
const int batch_n = im2col_step_;
auto output_n = output.view({batch/im2col_step_, batch_n, num_query, num_heads, channels});
auto per_value_size = spatial_size * num_heads * channels;
auto per_sample_loc_size = num_query * num_heads * num_levels * num_point * 2;
auto per_attn_weight_size = num_query * num_heads * num_levels * num_point;
for (int n = 0; n < batch/im2col_step_; ++n)
{
auto columns = output_n.select(0, n);
AT_DISPATCH_FLOATING_TYPES(value.type(), "ms_deform_attn_forward_cuda", ([&] {
ms_deformable_im2col_cuda(at::cuda::getCurrentCUDAStream(),
value.data<scalar_t>() + n * im2col_step_ * per_value_size,
spatial_shapes.data<int64_t>(),
level_start_index.data<int64_t>(),
sampling_loc.data<scalar_t>() + n * im2col_step_ * per_sample_loc_size,
attn_weight.data<scalar_t>() + n * im2col_step_ * per_attn_weight_size,
batch_n, spatial_size, num_heads, channels, num_levels, num_query, num_point,
columns.data<scalar_t>());
}));
}
output = output.view({batch, num_query, num_heads*channels});
return output;
}
std::vector<at::Tensor> ms_deform_attn_cuda_backward(
const at::Tensor &value,
const at::Tensor &spatial_shapes,
const at::Tensor &level_start_index,
const at::Tensor &sampling_loc,
const at::Tensor &attn_weight,
const at::Tensor &grad_output,
const int im2col_step)
{
AT_ASSERTM(value.is_contiguous(), "value tensor has to be contiguous");
AT_ASSERTM(spatial_shapes.is_contiguous(), "spatial_shapes tensor has to be contiguous");
AT_ASSERTM(level_start_index.is_contiguous(), "level_start_index tensor has to be contiguous");
AT_ASSERTM(sampling_loc.is_contiguous(), "sampling_loc tensor has to be contiguous");
AT_ASSERTM(attn_weight.is_contiguous(), "attn_weight tensor has to be contiguous");
AT_ASSERTM(grad_output.is_contiguous(), "grad_output tensor has to be contiguous");
AT_ASSERTM(value.type().is_cuda(), "value must be a CUDA tensor");
AT_ASSERTM(spatial_shapes.type().is_cuda(), "spatial_shapes must be a CUDA tensor");
AT_ASSERTM(level_start_index.type().is_cuda(), "level_start_index must be a CUDA tensor");
AT_ASSERTM(sampling_loc.type().is_cuda(), "sampling_loc must be a CUDA tensor");
AT_ASSERTM(attn_weight.type().is_cuda(), "attn_weight must be a CUDA tensor");
AT_ASSERTM(grad_output.type().is_cuda(), "grad_output must be a CUDA tensor");
const int batch = value.size(0);
const int spatial_size = value.size(1);
const int num_heads = value.size(2);
const int channels = value.size(3);
const int num_levels = spatial_shapes.size(0);
const int num_query = sampling_loc.size(1);
const int num_point = sampling_loc.size(4);
const int im2col_step_ = std::min(batch, im2col_step);
AT_ASSERTM(batch % im2col_step_ == 0, "batch(%d) must divide im2col_step(%d)", batch, im2col_step_);
auto grad_value = at::zeros_like(value);
auto grad_sampling_loc = at::zeros_like(sampling_loc);
auto grad_attn_weight = at::zeros_like(attn_weight);
const int batch_n = im2col_step_;
auto per_value_size = spatial_size * num_heads * channels;
auto per_sample_loc_size = num_query * num_heads * num_levels * num_point * 2;
auto per_attn_weight_size = num_query * num_heads * num_levels * num_point;
auto grad_output_n = grad_output.view({batch/im2col_step_, batch_n, num_query, num_heads, channels});
for (int n = 0; n < batch/im2col_step_; ++n)
{
auto grad_output_g = grad_output_n.select(0, n);
AT_DISPATCH_FLOATING_TYPES(value.type(), "ms_deform_attn_backward_cuda", ([&] {
ms_deformable_col2im_cuda(at::cuda::getCurrentCUDAStream(),
grad_output_g.data<scalar_t>(),
value.data<scalar_t>() + n * im2col_step_ * per_value_size,
spatial_shapes.data<int64_t>(),
level_start_index.data<int64_t>(),
sampling_loc.data<scalar_t>() + n * im2col_step_ * per_sample_loc_size,
attn_weight.data<scalar_t>() + n * im2col_step_ * per_attn_weight_size,
batch_n, spatial_size, num_heads, channels, num_levels, num_query, num_point,
grad_value.data<scalar_t>() + n * im2col_step_ * per_value_size,
grad_sampling_loc.data<scalar_t>() + n * im2col_step_ * per_sample_loc_size,
grad_attn_weight.data<scalar_t>() + n * im2col_step_ * per_attn_weight_size);
}));
}
return {
grad_value, grad_sampling_loc, grad_attn_weight
};
}
================================================
FILE: mmdet3d/models/utils/ops/src/cuda/ms_deform_attn_cuda.h
================================================
/*!
**************************************************************************************************
* Deformable DETR
* Copyright (c) 2020 SenseTime. All Rights Reserved.
* Licensed under the Apache License, Version 2.0 [see LICENSE for details]
**************************************************************************************************
* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
**************************************************************************************************
*/
#pragma once
#include <torch/extension.h>
at::Tensor ms_deform_attn_cuda_forward(
const at::Tensor &value,
const at::Tensor &spatial_shapes,
const at::Tensor &level_start_index,
const at::Tensor &sampling_loc,
const at::Tensor &attn_weight,
const int im2col_step);
std::vector<at::Tensor> ms_deform_attn_cuda_backward(
const at::Tensor &value,
const at::Tensor &spatial_shapes,
const at::Tensor &level_start_index,
const at::Tensor &sampling_loc,
const at::Tensor &attn_weight,
const at::Tensor &grad_output,
const int im2col_step);
================================================
FILE: mmdet3d/models/utils/ops/src/cuda/ms_deform_im2col_cuda.cuh
================================================
/*!
**************************************************************************
* Deformable DETR
* Copyright (c) 2020 SenseTime. All Rights Reserved.
* Licensed under the Apache License, Version 2.0 [see LICENSE for details]
**************************************************************************
* Modified from DCN (https://github.com/msracver/Deformable-ConvNets)
* Copyright (c) 2018 Microsoft
**************************************************************************
*/
#include <cstdio>
#include <algorithm>
#include <cstring>
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <THC/THCAtomics.cuh>
#define CUDA_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; \
i < (n); \
i += blockDim.x * gridDim.x)
const int CUDA_NUM_THREADS = 1024;
inline int GET_BLOCKS(const int N, const int num_threads)
{
return (N + num_threads - 1) / num_threads;
}
template <typename scalar_t>
__device__ scalar_t ms_deform_attn_im2col_bilinear(const scalar_t* &bottom_data,
const int &height, const int &width, const int &nheads, const int &channels,
const scalar_t &h, const scalar_t &w, const int &m, const int &c)
{
const int h_low = floor(h);
const int w_low = floor(w);
const int h_high = h_low + 1;
const int w_high = w_low + 1;
const scalar_t lh = h - h_low;
const scalar_t lw = w - w_low;
const scalar_t hh = 1 - lh, hw = 1 - lw;
const int w_stride = nheads * channels;
const int h_stride = width * w_stride;
const int h_low_ptr_offset = h_low * h_stride;
const int h_high_ptr_offset = h_low_ptr_offset + h_stride;
const int w_low_ptr_offset = w_low * w_stride;
const int w_high_ptr_offset = w_low_ptr_offset + w_stride;
const int base_ptr = m * channels + c;
scalar_t v1 = 0;
if (h_low >= 0 && w_low >= 0)
{
const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr;
v1 = bottom_data[ptr1];
}
scalar_t v2 = 0;
if (h_low >= 0 && w_high <= width - 1)
{
const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr;
v2 = bottom_data[ptr2];
}
scalar_t v3 = 0;
if (h_high <= height - 1 && w_low >= 0)
{
const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr;
v3 = bottom_data[ptr3];
}
scalar_t v4 = 0;
if (h_high <= height - 1 && w_high <= width - 1)
{
const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr;
v4 = bottom_data[ptr4];
}
const scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
const scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
return val;
}
template <typename scalar_t>
__device__ void ms_deform_attn_col2im_bilinear(const scalar_t* &bottom_data,
const int &height, const int &width, const int &nheads, const int &channels,
const scalar_t &h, const scalar_t &w, const int &m, const int &c,
const scalar_t &top_grad,
const scalar_t &attn_weight,
scalar_t* &grad_value,
scalar_t* grad_sampling_loc,
scalar_t* grad_attn_weight)
{
const int h_low = floor(h);
const int w_low = floor(w);
const int h_high = h_low + 1;
const int w_high = w_low + 1;
const scalar_t lh = h - h_low;
const scalar_t lw = w - w_low;
const scalar_t hh = 1 - lh, hw = 1 - lw;
const int w_stride = nheads * channels;
const int h_stride = width * w_stride;
const int h_low_ptr_offset = h_low * h_stride;
const int h_high_ptr_offset = h_low_ptr_offset + h_stride;
const int w_low_ptr_offset = w_low * w_stride;
const int w_high_ptr_offset = w_low_ptr_offset + w_stride;
const int base_ptr = m * channels + c;
const scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
const scalar_t top_grad_value = top_grad * attn_weight;
scalar_t grad_h_weight = 0, grad_w_weight = 0;
scalar_t v1 = 0;
if (h_low >= 0 && w_low >= 0)
{
const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr;
v1 = bottom_data[ptr1];
grad_h_weight -= hw * v1;
grad_w_weight -= hh * v1;
atomicAdd(grad_value+ptr1, w1*top_grad_value);
}
scalar_t v2 = 0;
if (h_low >= 0 && w_high <= width - 1)
{
const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr;
v2 = bottom_data[ptr2];
grad_h_weight -= lw * v2;
grad_w_weight += hh * v2;
atomicAdd(grad_value+ptr2, w2*top_grad_value);
}
scalar_t v3 = 0;
if (h_high <= height - 1 && w_low >= 0)
{
const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr;
v3 = bottom_data[ptr3];
grad_h_weight += hw * v3;
grad_w_weight -= lh * v3;
atomicAdd(grad_value+ptr3, w3*top_grad_value);
}
scalar_t v4 = 0;
if (h_high <= height - 1 && w_high <= width - 1)
{
const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr;
v4 = bottom_data[ptr4];
grad_h_weight += lw * v4;
grad_w_weight += lh * v4;
atomicAdd(grad_value+ptr4, w4*top_grad_value);
}
const scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
*grad_attn_weight = top_grad * val;
*grad_sampling_loc = width * grad_w_weight * top_grad_value;
*(grad_sampling_loc + 1) = height * grad_h_weight * top_grad_value;
}
template <typename scalar_t>
__device__ void ms_deform_attn_col2im_bilinear_gm(const scalar_t* &bottom_data,
const int &height, const int &width, const int &nheads, const int &channels,
const scalar_t &h, const scalar_t &w, const int &m, const int &c,
const scalar_t &top_grad,
const scalar_t &attn_weight,
scalar_t* &grad_value,
scalar_t* grad_sampling_loc,
scalar_t* grad_attn_weight)
{
const int h_low = floor(h);
const int w_low = floor(w);
const int h_high = h_low + 1;
const int w_high = w_low + 1;
const scalar_t lh = h - h_low;
const scalar_t lw = w - w_low;
const scalar_t hh = 1 - lh, hw = 1 - lw;
const int w_stride = nheads * channels;
const int h_stride = width * w_stride;
const int h_low_ptr_offset = h_low * h_stride;
const int h_high_ptr_offset = h_low_ptr_offset + h_stride;
const int w_low_ptr_offset = w_low * w_stride;
const int w_high_ptr_offset = w_low_ptr_offset + w_stride;
const int base_ptr = m * channels + c;
const scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
const scalar_t top_grad_value = top_grad * attn_weight;
scalar_t grad_h_weight = 0, grad_w_weight = 0;
scalar_t v1 = 0;
if (h_low >= 0 && w_low >= 0)
{
const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr;
v1 = bottom_data[ptr1];
grad_h_weight -= hw * v1;
grad_w_weight -= hh * v1;
atomicAdd(grad_value+ptr1, w1*top_grad_value);
}
scalar_t v2 = 0;
if (h_low >= 0 && w_high <= width - 1)
{
const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr;
v2 = bottom_data[ptr2];
grad_h_weight -= lw * v2;
grad_w_weight += hh * v2;
atomicAdd(grad_value+ptr2, w2*top_grad_value);
}
scalar_t v3 = 0;
if (h_high <= height - 1 && w_low >= 0)
{
const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr;
v3 = bottom_data[ptr3];
grad_h_weight += hw * v3;
grad_w_weight -= lh * v3;
atomicAdd(grad_value+ptr3, w3*top_grad_value);
}
scalar_t v4 = 0;
if (h_high <= height - 1 && w_high <= width - 1)
{
const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr;
v4 = bottom_data[ptr4];
grad_h_weight += lw * v4;
grad_w_weight += lh * v4;
atomicAdd(grad_value+ptr4, w4*top_grad_value);
}
const scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
atomicAdd(grad_attn_weight, top_grad * val);
atomicAdd(grad_sampling_loc, width * grad_w_weight * top_grad_value);
atomicAdd(grad_sampling_loc + 1, height * grad_h_weight * top_grad_value);
}
template <typename scalar_t>
__global__ void ms_deformable_im2col_gpu_kernel(const int n,
const scalar_t *data_value,
const int64_t *data_spatial_shapes,
const int64_t *data_level_start_index,
const scalar_t *data_sampling_loc,
const scalar_t *data_attn_weight,
const int batch_size,
const int spatial_size,
const int num_heads,
const int channels,
const int num_levels,
const int num_query,
const int num_point,
scalar_t *data_col)
{
CUDA_KERNEL_LOOP(index, n)
{
int _temp = index;
const int c_col = _temp % channels;
_temp /= channels;
const int sampling_index = _temp;
const int m_col = _temp % num_heads;
_temp /= num_heads;
const int q_col = _temp % num_query;
_temp /= num_query;
const int b_col = _temp;
scalar_t *data_col_ptr = data_col + index;
int data_weight_ptr = sampling_index * num_levels * num_point;
int data_loc_w_ptr = data_weight_ptr << 1;
const int qid_stride = num_heads * channels;
const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;
scalar_t col = 0;
for (int l_col=0; l_col < num_levels; ++l_col)
{
const int level_start_id = data_level_start_index[l_col];
const int spatial_h_ptr = l_col << 1;
const int spatial_h = data_spatial_shapes[spatial_h_ptr];
const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
const scalar_t *data_value_ptr = data_value + (data_value_ptr_init_offset + level_start_id * qid_stride);
for (int p_col=0; p_col < num_point; ++p_col)
{
const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
const scalar_t weight = data_attn_weight[data_weight_ptr];
const scalar_t h_im = loc_h * spatial_h - 0.5;
const scalar_t w_im = loc_w * spatial_w - 0.5;
if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w)
{
col += ms_deform_attn_im2col_bilinear(data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col) * weight;
}
data_weight_ptr += 1;
data_loc_w_ptr += 2;
}
}
*data_col_ptr = col;
}
}
template <typename scalar_t, unsigned int blockSize>
__global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1(const int n,
const scalar_t *grad_col,
const scalar_t *data_value,
const int64_t *data_spatial_shapes,
const int64_t *data_level_start_index,
const scalar_t *data_sampling_loc,
const scalar_t *data_attn_weight,
const int batch_size,
const int spatial_size,
const int num_heads,
const int channels,
const int num_levels,
const int num_query,
const int num_point,
scalar_t *grad_value,
scalar_t *grad_sampling_loc,
scalar_t *grad_attn_weight)
{
CUDA_KERNEL_LOOP(index, n)
{
__shared__ scalar_t cache_grad_sampling_loc[blockSize * 2];
__shared__ scalar_t cache_grad_attn_weight[blockSize];
unsigned int tid = threadIdx.x;
int _temp = index;
const int c_col = _temp % channels;
_temp /= channels;
const int sampling_index = _temp;
const int m_col = _temp % num_heads;
_temp /= num_heads;
const int q_col = _temp % num_query;
_temp /= num_query;
const int b_col = _temp;
const scalar_t top_grad = grad_col[index];
int data_weight_ptr = sampling_index * num_levels * num_point;
int data_loc_w_ptr = data_weight_ptr << 1;
const int grad_sampling_ptr = data_weight_ptr;
grad_sampling_loc += grad_sampling_ptr << 1;
grad_attn_weight += grad_sampling_ptr;
const int grad_weight_stride = 1;
const int grad_loc_stride = 2;
const int qid_stride = num_heads * channels;
const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;
for (int l_col=0; l_col < num_levels; ++l_col)
{
const int level_start_id = data_level_start_index[l_col];
const int spatial_h_ptr = l_col << 1;
const int spatial_h = data_spatial_shapes[spatial_h_ptr];
const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride;
const scalar_t *data_value_ptr = data_value + value_ptr_offset;
scalar_t *grad_value_ptr = grad_value + value_ptr_offset;
for (int p_col=0; p_col < num_point; ++p_col)
{
const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
const scalar_t weight = data_attn_weight[data_weight_ptr];
const scalar_t h_im = loc_h * spatial_h - 0.5;
const scalar_t w_im = loc_w * spatial_w - 0.5;
*(cache_grad_sampling_loc+(threadIdx.x << 1)) = 0;
*(cache_grad_sampling_loc+((threadIdx.x << 1) + 1)) = 0;
*(cache_grad_attn_weight+threadIdx.x)=0;
if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w)
{
ms_deform_attn_col2im_bilinear(
data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col,
top_grad, weight, grad_value_ptr,
cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x);
}
__syncthreads();
if (tid == 0)
{
scalar_t _grad_w=cache_grad_sampling_loc[0], _grad_h=cache_grad_sampling_loc[1], _grad_a=cache_grad_attn_weight[0];
int sid=2;
for (unsigned int tid = 1; tid < blockSize; ++tid)
{
_grad_w += cache_grad_sampling_loc[sid];
_grad_h += cache_grad_sampling_loc[sid + 1];
_grad_a += cache_grad_attn_weight[tid];
sid += 2;
}
*grad_sampling_loc = _grad_w;
*(grad_sampling_loc + 1) = _grad_h;
*grad_attn_weight = _grad_a;
}
__syncthreads();
data_weight_ptr += 1;
data_loc_w_ptr += 2;
grad_attn_weight += grad_weight_stride;
grad_sampling_loc += grad_loc_stride;
}
}
}
}
template <typename scalar_t, unsigned int blockSize>
__global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2(const int n,
const scalar_t *grad_col,
const scalar_t *data_value,
const int64_t *data_spatial_shapes,
const int64_t *data_level_start_index,
const scalar_t *data_sampling_loc,
const scalar_t *data_attn_weight,
const int batch_size,
const int spatial_size,
const int num_heads,
const int channels,
const int num_levels,
const int num_query,
const int num_point,
scalar_t *grad_value,
scalar_t *grad_sampling_loc,
scalar_t *grad_attn_weight)
{
CUDA_KERNEL_LOOP(index, n)
{
__shared__ scalar_t cache_grad_sampling_loc[blockSize * 2];
__shared__ scalar_t cache_grad_attn_weight[blockSize];
unsigned int tid = threadIdx.x;
int _temp = index;
const int c_col = _temp % channels;
_temp /= channels;
const int sampling_index = _temp;
const int m_col = _temp % num_heads;
_temp /= num_heads;
const int q_col = _temp % num_query;
_temp /= num_query;
const int b_col = _temp;
const scalar_t top_grad = grad_col[index];
int data_weight_ptr = sampling_index * num_levels * num_point;
int data_loc_w_ptr = data_weight_ptr << 1;
const int grad_sampling_ptr = data_weight_ptr;
grad_sampling_loc += grad_sampling_ptr << 1;
grad_attn_weight += grad_sampling_ptr;
const int grad_weight_stride = 1;
const int grad_loc_stride = 2;
const int qid_stride = num_heads * channels;
const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;
for (int l_col=0; l_col < num_levels; ++l_col)
{
const int level_start_id = data_level_start_index[l_col];
const int spatial_h_ptr = l_col << 1;
const int spatial_h = data_spatial_shapes[spatial_h_ptr];
const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride;
const scalar_t *data_value_ptr = data_value + value_ptr_offset;
scalar_t *grad_value_ptr = grad_value + value_ptr_offset;
for (int p_col=0; p_col < num_point; ++p_col)
{
const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
const scalar_t weight = data_attn_weight[data_weight_ptr];
const scalar_t h_im = loc_h * spatial_h - 0.5;
const scalar_t w_im = loc_w * spatial_w - 0.5;
*(cache_grad_sampling_loc+(threadIdx.x << 1)) = 0;
*(cache_grad_sampling_loc+((threadIdx.x << 1) + 1)) = 0;
*(cache_grad_attn_weight+threadIdx.x)=0;
if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w)
{
ms_deform_attn_col2im_bilinear(
data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col,
top_grad, weight, grad_value_ptr,
cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x);
}
__syncthreads();
for (unsigned int s=blockSize/2; s>0; s>>=1)
{
if (tid < s) {
const unsigned int xid1 = tid << 1;
const unsigned int xid2 = (tid + s) << 1;
cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s];
cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2];
cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1];
}
__syncthreads();
}
if (tid == 0)
{
*grad_sampling_loc = cache_grad_sampling_loc[0];
*(grad_sampling_loc + 1) = cache_grad_sampling_loc[1];
*grad_attn_weight = cache_grad_attn_weight[0];
}
__syncthreads();
data_weight_ptr += 1;
data_loc_w_ptr += 2;
grad_attn_weight += grad_weight_stride;
grad_sampling_loc += grad_loc_stride;
}
}
}
}
template <typename scalar_t>
__global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v1(const int n,
const scalar_t *grad_col,
const scalar_t *data_value,
const int64_t *data_spatial_shapes,
const int64_t *data_level_start_index,
const scalar_t *data_sampling_loc,
const scalar_t *data_attn_weight,
const int batch_size,
const int spatial_size,
const int num_heads,
const int channels,
const int num_levels,
const int num_query,
const int num_point,
scalar_t *grad_value,
scalar_t *grad_sampling_loc,
scalar_t *grad_attn_weight)
{
CUDA_KERNEL_LOOP(index, n)
{
extern __shared__ int _s[];
scalar_t* cache_grad_sampling_loc = (scalar_t*)_s;
scalar_t* cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x;
unsigned int tid = threadIdx.x;
int _temp = index;
const int c_col = _temp % channels;
_temp /= channels;
const int sampling_index = _temp;
const int m_col = _temp % num_heads;
_temp /= num_heads;
const int q_col = _temp % num_query;
_temp /= num_query;
const int b_col = _temp;
const scalar_t top_grad = grad_col[index];
int data_weight_ptr = sampling_index * num_levels * num_point;
int data_loc_w_ptr = data_weight_ptr << 1;
const int grad_sampling_ptr = data_weight_ptr;
grad_sampling_loc += grad_sampling_ptr << 1;
grad_attn_weight += grad_sampling_ptr;
const int grad_weight_stride = 1;
const int grad_loc_stride = 2;
const int qid_stride = num_heads * channels;
const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;
for (int l_col=0; l_col < num_levels; ++l_col)
{
const int level_start_id = data_level_start_index[l_col];
const int spatial_h_ptr = l_col << 1;
const int spatial_h = data_spatial_shapes[spatial_h_ptr];
const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride;
const scalar_t *data_value_ptr = data_value + value_ptr_offset;
scalar_t *grad_value_ptr = grad_value + value_ptr_offset;
for (int p_col=0; p_col < num_point; ++p_col)
{
const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
const scalar_t weight = data_attn_weight[data_weight_ptr];
const scalar_t h_im = loc_h * spatial_h - 0.5;
const scalar_t w_im = loc_w * spatial_w - 0.5;
*(cache_grad_sampling_loc+(threadIdx.x << 1)) = 0;
*(cache_grad_sampling_loc+((threadIdx.x << 1) + 1)) = 0;
*(cache_grad_attn_weight+threadIdx.x)=0;
if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w)
{
ms_deform_attn_col2im_bilinear(
data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col,
top_grad, weight, grad_value_ptr,
cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x);
}
__syncthreads();
if (tid == 0)
{
scalar_t _grad_w=cache_grad_sampling_loc[0], _grad_h=cache_grad_sampling_loc[1], _grad_a=cache_grad_attn_weight[0];
int sid=2;
for (unsigned int tid = 1; tid < blockDim.x; ++tid)
{
_grad_w += cache_grad_sampling_loc[sid];
_grad_h += cache_grad_sampling_loc[sid + 1];
_grad_a += cache_grad_attn_weight[tid];
sid += 2;
}
*grad_sampling_loc = _grad_w;
*(grad_sampling_loc + 1) = _grad_h;
*grad_attn_weight = _grad_a;
}
__syncthreads();
data_weight_ptr += 1;
data_loc_w_ptr += 2;
grad_attn_weight += grad_weight_stride;
grad_sampling_loc += grad_loc_stride;
}
}
}
}
template <typename scalar_t>
__global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2(const int n,
const scalar_t *grad_col,
const scalar_t *data_value,
const int64_t *data_spatial_shapes,
const int64_t *data_level_start_index,
const scalar_t *data_sampling_loc,
const scalar_t *data_attn_weight,
const int batch_size,
const int spatial_size,
const int num_heads,
const int channels,
const int num_levels,
const int num_query,
const int num_point,
scalar_t *grad_value,
scalar_t *grad_sampling_loc,
scalar_t *grad_attn_weight)
{
CUDA_KERNEL_LOOP(index, n)
{
extern __shared__ int _s[];
scalar_t* cache_grad_sampling_loc = (scalar_t*)_s;
scalar_t* cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x;
unsigned int tid = threadIdx.x;
int _temp = index;
const int c_col = _temp % channels;
_temp /= channels;
const int sampling_index = _temp;
const int m_col = _temp % num_heads;
_temp /= num_heads;
const int q_col = _temp % num_query;
_temp /= num_query;
const int b_col = _temp;
const scalar_t top_grad = grad_col[index];
int data_weight_ptr = sampling_index * num_levels * num_point;
int data_loc_w_ptr = data_weight_ptr << 1;
const int grad_sampling_ptr = data_weight_ptr;
grad_sampling_loc += grad_sampling_ptr << 1;
grad_attn_weight += grad_sampling_ptr;
const int grad_weight_stride = 1;
const int grad_loc_stride = 2;
const int qid_stride = num_heads * channels;
const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;
for (int l_col=0; l_col < num_levels; ++l_col)
{
const int level_start_id = data_level_start_index[l_col];
const int spatial_h_ptr = l_col << 1;
const int spatial_h = data_spatial_shapes[spatial_h_ptr];
const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride;
const scalar_t *data_value_ptr = data_value + value_ptr_offset;
scalar_t *grad_value_ptr = grad_value + value_ptr_offset;
for (int p_col=0; p_col < num_point; ++p_col)
{
const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
const scalar_t weight = data_attn_weight[data_weight_ptr];
const scalar_t h_im = loc_h * spatial_h - 0.5;
const scalar_t w_im = loc_w * spatial_w - 0.5;
*(cache_grad_sampling_loc+(threadIdx.x << 1)) = 0;
*(cache_grad_sampling_loc+((threadIdx.x << 1) + 1)) = 0;
*(cache_grad_attn_weight+threadIdx.x)=0;
if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w)
{
ms_deform_attn_col2im_bilinear(
data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col,
top_grad, weight, grad_value_ptr,
cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x);
}
__syncthreads();
for (unsigned int s=blockDim.x/2, spre=blockDim.x; s>0; s>>=1, spre>>=1)
{
if (tid < s) {
const unsigned int xid1 = tid << 1;
const unsigned int xid2 = (tid + s) << 1;
cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s];
cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2];
cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1];
if (tid + (s << 1) < spre)
{
cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + (s << 1)];
cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2 + (s << 1)];
cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1 + (s << 1)];
}
}
__syncthreads();
}
if (tid == 0)
{
*grad_sampling_loc = cache_grad_sampling_loc[0];
*(grad_sampling_loc + 1) = cache_grad_sampling_loc[1];
*grad_attn_weight = cache_grad_attn_weight[0];
}
__syncthreads();
data_weight_ptr += 1;
data_loc_w_ptr += 2;
grad_attn_weight += grad_weight_stride;
grad_sampling_loc += grad_loc_stride;
}
}
}
}
template <typename scalar_t>
__global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2_multi_blocks(const int n,
const scalar_t *grad_col,
const scalar_t *data_value,
const int64_t *data_spatial_shapes,
const int64_t *data_level_start_index,
const scalar_t *data_sampling_loc,
const scalar_t *data_attn_weight,
const int batch_size,
const int spatial_size,
const int num_heads,
const int channels,
const int num_levels,
const int num_query,
const int num_point,
scalar_t *grad_value,
scalar_t *grad_sampling_loc,
scalar_t *grad_attn_weight)
{
CUDA_KERNEL_LOOP(index, n)
{
extern __shared__ int _s[];
scalar_t* cache_grad_sampling_loc = (scalar_t*)_s;
scalar_t* cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x;
unsigned int tid = threadIdx.x;
int _temp = index;
const int c_col = _temp % channels;
_temp /= channels;
const int sampling_index = _temp;
const int m_col = _temp % num_heads;
_temp /= num_heads;
const int q_col = _temp % num_query;
_temp /= num_query;
const int b_col = _temp;
const scalar_t top_grad = grad_col[index];
int data_weight_ptr = sampling_index * num_levels * num_point;
int data_loc_w_ptr = data_weight_ptr << 1;
const int grad_sampling_ptr = data_weight_ptr;
grad_sampling_loc += grad_sampling_ptr << 1;
grad_attn_weight += grad_sampling_ptr;
const int grad_weight_stride = 1;
const int grad_loc_stride = 2;
const int qid_stride = num_heads * channels;
const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;
for (int l_col=0; l_col < num_levels; ++l_col)
{
const int level_start_id = data_level_start_index[l_col];
const int spatial_h_ptr = l_col << 1;
const int spatial_h = data_spatial_shapes[spatial_h_ptr];
const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride;
const scalar_t *data_value_ptr = data_value + value_ptr_offset;
scalar_t *grad_value_ptr = grad_value + value_ptr_offset;
for (int p_col=0; p_col < num_point; ++p_col)
{
const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
const scalar_t weight = data_attn_weight[data_weight_ptr];
const scalar_t h_im = loc_h * spatial_h - 0.5;
const scalar_t w_im = loc_w * spatial_w - 0.5;
*(cache_grad_sampling_loc+(threadIdx.x << 1)) = 0;
*(cache_grad_sampling_loc+((threadIdx.x << 1) + 1)) = 0;
*(cache_grad_attn_weight+threadIdx.x)=0;
if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w)
{
ms_deform_attn_col2im_bilinear(
data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col,
top_grad, weight, grad_value_ptr,
cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x);
}
__syncthreads();
for (unsigned int s=blockDim.x/2, spre=blockDim.x; s>0; s>>=1, spre>>=1)
{
if (tid < s) {
const unsigned int xid1 = tid << 1;
const unsigned int xid2 = (tid + s) << 1;
cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s];
cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2];
cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1];
if (tid + (s << 1) < spre)
{
cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + (s << 1)];
cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2 + (s << 1)];
cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1 + (s << 1)];
}
}
__syncthreads();
}
if (tid == 0)
{
atomicAdd(grad_sampling_loc, cache_grad_sampling_loc[0]);
atomicAdd(grad_sampling_loc + 1, cache_grad_sampling_loc[1]);
atomicAdd(grad_attn_weight, cache_grad_attn_weight[0]);
}
__syncthreads();
data_weight_ptr += 1;
data_loc_w_ptr += 2;
grad_attn_weight += grad_weight_stride;
grad_sampling_loc += grad_loc_stride;
}
}
}
}
template <typename scalar_t>
__global__ void ms_deformable_col2im_gpu_kernel_gm(const int n,
const scalar_t *grad_col,
const scalar_t *data_value,
const int64_t *data_spatial_shapes,
const int64_t *data_level_start_index,
const scalar_t *data_sampling_loc,
const scalar_t *data_attn_weight,
const int batch_size,
const int spatial_size,
const int num_heads,
const int channels,
const int num_levels,
const int num_query,
const int num_point,
scalar_t *grad_value,
scalar_t *grad_sampling_loc,
scalar_t *grad_attn_weight)
{
CUDA_KERNEL_LOOP(index, n)
{
int _temp = index;
const int c_col = _temp % channels;
_temp /= channels;
const int sampling_index = _temp;
const int m_col = _temp % num_heads;
_temp /= num_heads;
const int q_col = _temp % num_query;
_temp /= num_query;
const int b_col = _temp;
const scalar_t top_grad = grad_col[index];
int data_weight_ptr = sampling_index * num_levels * num_point;
int data_loc_w_ptr = data_weight_ptr << 1;
const int grad_sampling_ptr = data_weight_ptr;
grad_sampling_loc += grad_sampling_ptr << 1;
grad_attn_weight += grad_sampling_ptr;
const int grad_weight_stride = 1;
const int grad_loc_stride = 2;
const int qid_stride = num_heads * channels;
const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride;
for (int l_col=0; l_col < num_levels; ++l_col)
{
const int level_start_id = data_level_start_index[l_col];
const int spatial_h_ptr = l_col << 1;
const int spatial_h = data_spatial_shapes[spatial_h_ptr];
const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1];
const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride;
const scalar_t *data_value_ptr = data_value + value_ptr_offset;
scalar_t *grad_value_ptr = grad_value + value_ptr_offset;
for (int p_col=0; p_col < num_point; ++p_col)
{
const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr];
const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1];
const scalar_t weight = data_attn_weight[data_weight_ptr];
const scalar_t h_im = loc_h * spatial_h - 0.5;
const scalar_t w_im = loc_w * spatial_w - 0.5;
if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w)
{
ms_deform_attn_col2im_bilinear_gm(
data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col,
top_grad, weight, grad_value_ptr,
grad_sampling_loc, grad_attn_weight);
}
data_weight_ptr += 1;
data_loc_w_ptr += 2;
grad_attn_weight += grad_weight_stride;
grad_sampling_loc += grad_loc_stride;
}
}
}
}
template <typename scalar_t>
void ms_deformable_im2col_cuda(cudaStream_t stream,
const scalar_t* data_value,
const int64_t* data_spatial_shapes,
const int64_t* data_level_start_index,
const scalar_t* data_sampling_loc,
const scalar_t* data_attn_weight,
const int batch_size,
const int spatial_size,
const int num_heads,
const int channels,
const int num_levels,
const int num_query,
const int num_point,
scalar_t* data_col)
{
const int num_kernels = batch_size * num_query * num_heads * channels;
const int num_actual_kernels = batch_size * num_query * num_heads * channels;
const int num_threads = CUDA_NUM_THREADS;
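  // One thread handles one element of the output column buffer, i.e.
  // batch_size * num_query * num_heads * channels threads in total.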
  ms_deformable_im2col_gpu_kernel<scalar_t>
      <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
          0, stream>>>(
num_kernels, data_value, data_spatial_shapes, data_level_start_index, data_sampling_loc, data_attn_weight,
batch_size, spatial_size, num_heads, channels, num_levels, num_query, num_point, data_col);
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess)
{
printf("error in ms_deformable_im2col_cuda: %s\n", cudaGetErrorString(err));
}
}
template <typename scalar_t>
void ms_deformable_col2im_cuda(cudaStream_t stream,
const scalar_t* grad_col,
const scalar_t* data_value,
const int64_t * data_spatial_shapes,
const int64_t * data_level_start_index,
const scalar_t * data_sampling_loc,
const scalar_t * data_attn_weight,
const int batch_size,
const int spatial_size,
const int num_heads,
const int channels,
const int num_levels,
const int num_query,
const int num_point,
scalar_t* grad_value,
scalar_t* grad_sampling_loc,
scalar_t* grad_attn_weight)
{
const int num_threads = (channels > CUDA_NUM_THREADS)?CUDA_NUM_THREADS:channels;
const int num_kernels = batch_size * num_query * num_heads * channels;
const int num_actual_kernels = batch_size * num_query * num_heads * channels;
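  // Backward dispatch: channel counts matching one of the listed powers of two
  // (1..1024) use the block-size-aware shared-memory reduction kernels; other
  // counts up to 1024 use the generic shared-memory reductions (v1 below 64
  // channels, v2 otherwise); counts above 1024 use the multi-block shared-memory
  // kernel when divisible by 1024 and the global-memory atomicAdd kernel otherwise.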
if (channels > 1024)
{
if ((channels & 1023) == 0)
{
      ms_deformable_col2im_gpu_kernel_shm_reduce_v2_multi_blocks<scalar_t>
          <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
              num_threads*3*sizeof(scalar_t), stream>>>(
num_kernels,
grad_col,
data_value,
data_spatial_shapes,
data_level_start_index,
data_sampling_loc,
data_attn_weight,
batch_size,
spatial_size,
num_heads,
channels,
num_levels,
num_query,
num_point,
grad_value,
grad_sampling_loc,
grad_attn_weight);
}
else
{
      ms_deformable_col2im_gpu_kernel_gm<scalar_t>
          <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
              0, stream>>>(
num_kernels,
grad_col,
data_value,
data_spatial_shapes,
data_level_start_index,
data_sampling_loc,
data_attn_weight,
batch_size,
spatial_size,
num_heads,
channels,
num_levels,
num_query,
num_point,
grad_value,
grad_sampling_loc,
grad_attn_weight);
}
}
else{
switch(channels)
{
case 1:
        ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 1>
            <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
                0, stream>>>(
num_kernels,
grad_col,
data_value,
data_spatial_shapes,
data_level_start_index,
data_sampling_loc,
data_attn_weight,
batch_size,
spatial_size,
num_heads,
channels,
num_levels,
num_query,
num_point,
grad_value,
grad_sampling_loc,
grad_attn_weight);
break;
case 2:
        ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 2>
            <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
                0, stream>>>(
num_kernels,
grad_col,
data_value,
data_spatial_shapes,
data_level_start_index,
data_sampling_loc,
data_attn_weight,
batch_size,
spatial_size,
num_heads,
channels,
num_levels,
num_query,
num_point,
grad_value,
grad_sampling_loc,
grad_attn_weight);
break;
case 4:
        ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 4>
            <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
                0, stream>>>(
num_kernels,
grad_col,
data_value,
data_spatial_shapes,
data_level_start_index,
data_sampling_loc,
data_attn_weight,
batch_size,
spatial_size,
num_heads,
channels,
num_levels,
num_query,
num_point,
grad_value,
grad_sampling_loc,
grad_attn_weight);
break;
case 8:
        ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 8>
            <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
                0, stream>>>(
num_kernels,
grad_col,
data_value,
data_spatial_shapes,
data_level_start_index,
data_sampling_loc,
data_attn_weight,
batch_size,
spatial_size,
num_heads,
channels,
num_levels,
num_query,
num_point,
grad_value,
grad_sampling_loc,
grad_attn_weight);
break;
case 16:
        ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 16>
            <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
                0, stream>>>(
num_kernels,
grad_col,
data_value,
data_spatial_shapes,
data_level_start_index,
data_sampling_loc,
data_attn_weight,
batch_size,
spatial_size,
num_heads,
channels,
num_levels,
num_query,
num_point,
grad_value,
grad_sampling_loc,
grad_attn_weight);
break;
case 32:
        ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1<scalar_t, 32>
            <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
                0, stream>>>(
num_kernels,
grad_col,
data_value,
data_spatial_shapes,
data_level_start_index,
data_sampling_loc,
data_attn_weight,
batch_size,
spatial_size,
num_heads,
channels,
num_levels,
num_query,
num_point,
grad_value,
grad_sampling_loc,
grad_attn_weight);
break;
case 64:
        ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2<scalar_t, 64>
            <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
                0, stream>>>(
num_kernels,
grad_col,
data_value,
data_spatial_shapes,
data_level_start_index,
data_sampling_loc,
data_attn_weight,
batch_size,
spatial_size,
num_heads,
channels,
num_levels,
num_query,
num_point,
grad_value,
grad_sampling_loc,
grad_attn_weight);
break;
case 128:
        ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2<scalar_t, 128>
            <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
                0, stream>>>(
num_kernels,
grad_col,
data_value,
data_spatial_shapes,
data_level_start_index,
data_sampling_loc,
data_attn_weight,
batch_size,
spatial_size,
num_heads,
channels,
num_levels,
num_query,
num_point,
grad_value,
grad_sampling_loc,
grad_attn_weight);
break;
case 256:
        ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2<scalar_t, 256>
            <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
                0, stream>>>(
num_kernels,
grad_col,
data_value,
data_spatial_shapes,
data_level_start_index,
data_sampling_loc,
data_attn_weight,
batch_size,
spatial_size,
num_heads,
channels,
num_levels,
num_query,
num_point,
grad_value,
grad_sampling_loc,
grad_attn_weight);
break;
case 512:
        ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2<scalar_t, 512>
            <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
                0, stream>>>(
num_kernels,
grad_col,
data_value,
data_spatial_shapes,
data_level_start_index,
data_sampling_loc,
data_attn_weight,
batch_size,
spatial_size,
num_heads,
channels,
num_levels,
num_query,
num_point,
grad_value,
grad_sampling_loc,
grad_attn_weight);
break;
case 1024:
        ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2<scalar_t, 1024>
            <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
                0, stream>>>(
num_kernels,
grad_col,
data_value,
data_spatial_shapes,
data_level_start_index,
data_sampling_loc,
data_attn_weight,
batch_size,
spatial_size,
num_heads,
channels,
num_levels,
num_query,
num_point,
grad_value,
grad_sampling_loc,
grad_attn_weight);
break;
default:
if (channels < 64)
{
        ms_deformable_col2im_gpu_kernel_shm_reduce_v1<scalar_t>
            <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
                num_threads*3*sizeof(scalar_t), stream>>>(
num_kernels,
grad_col,
data_value,
data_spatial_shapes,
data_level_start_index,
data_sampling_loc,
data_attn_weight,
batch_size,
spatial_size,
num_heads,
channels,
num_levels,
num_query,
num_point,
grad_value,
grad_sampling_loc,
grad_attn_weight);
}
else
{
        ms_deformable_col2im_gpu_kernel_shm_reduce_v2<scalar_t>
            <<<GET_BLOCKS(num_actual_kernels, num_threads), num_threads,
                num_threads*3*sizeof(scalar_t), stream>>>(
num_kernels,
grad_col,
data_value,
data_spatial_shapes,
data_level_start_index,
data_sampling_loc,
data_attn_weight,
batch_size,
spatial_size,
num_heads,
channels,
num_levels,
num_query,
num_point,
grad_value,
grad_sampling_loc,
grad_attn_weight);
}
}
}
cudaError_t err = cudaGetLastError();
if (err != cudaSuccess)
{
printf("error in ms_deformable_col2im_cuda: %s\n", cudaGetErrorString(err));
}
}
================================================
FILE: mmdet3d/models/utils/ops/src/ms_deform_attn.h
================================================
/*!
**************************************************************************************************
* Deformable DETR
* Copyright (c) 2020 SenseTime. All Rights Reserved.
* Licensed under the Apache License, Version 2.0 [see LICENSE for details]
**************************************************************************************************
* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
**************************************************************************************************
*/
#pragma once
#include "cpu/ms_deform_attn_cpu.h"
#ifdef WITH_CUDA
#include "cuda/ms_deform_attn_cuda.h"
#endif
at::Tensor
ms_deform_attn_forward(
const at::Tensor &value,
const at::Tensor &spatial_shapes,
const at::Tensor &level_start_index,
const at::Tensor &sampling_loc,
const at::Tensor &attn_weight,
const int im2col_step)
{
if (value.type().is_cuda())
{
#ifdef WITH_CUDA
return ms_deform_attn_cuda_forward(
value, spatial_shapes, level_start_index, sampling_loc, attn_weight, im2col_step);
#else
AT_ERROR("Not compiled with GPU support");
#endif
}
AT_ERROR("Not implemented on the CPU");
}
std::vector<at::Tensor>
ms_deform_attn_backward(
const at::Tensor &value,
const at::Tensor &spatial_shapes,
const at::Tensor &level_start_index,
const at::Tensor &sampling_loc,
const at::Tensor &attn_weight,
const at::Tensor &grad_output,
const int im2col_step)
{
if (value.type().is_cuda())
{
#ifdef WITH_CUDA
return ms_deform_attn_cuda_backward(
value, spatial_shapes, level_start_index, sampling_loc, attn_weight, grad_output, im2col_step);
#else
AT_ERROR("Not compiled with GPU support");
#endif
}
AT_ERROR("Not implemented on the CPU");
}
================================================
FILE: mmdet3d/models/utils/ops/src/vision.cpp
================================================
/*!
**************************************************************************************************
* Deformable DETR
* Copyright (c) 2020 SenseTime. All Rights Reserved.
* Licensed under the Apache License, Version 2.0 [see LICENSE for details]
**************************************************************************************************
* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
**************************************************************************************************
*/
#include "ms_deform_attn.h"
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("ms_deform_attn_forward", &ms_deform_attn_forward, "ms_deform_attn_forward");
m.def("ms_deform_attn_backward", &ms_deform_attn_backward, "ms_deform_attn_backward");
}
================================================
FILE: mmdet3d/models/utils/ops/test.py
================================================
# ------------------------------------------------------------------------------------------------
# Deformable DETR
# Copyright (c) 2020 SenseTime. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
# ------------------------------------------------------------------------------------------------
# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
# ------------------------------------------------------------------------------------------------
from __future__ import absolute_import
from __future__ import print_function
from __future__ import division
import time
import torch
import torch.nn as nn
from torch.autograd import gradcheck
from functions.ms_deform_attn_func import MSDeformAttnFunction, ms_deform_attn_core_pytorch
N, M, D = 1, 2, 2
Lq, L, P = 2, 2, 2
shapes = torch.as_tensor([(6, 4), (3, 2)], dtype=torch.long).cuda()
level_start_index = torch.cat((shapes.new_zeros((1, )), shapes.prod(1).cumsum(0)[:-1]))
S = sum([(H*W).item() for H, W in shapes])
torch.manual_seed(3)
@torch.no_grad()
def check_forward_equal_with_pytorch_double():
value = torch.rand(N, S, M, D).cuda() * 0.01
sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda()
attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5
attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True)
im2col_step = 2
output_pytorch = ms_deform_attn_core_pytorch(value.double(), shapes, sampling_locations.double(), attention_weights.double()).detach().cpu()
output_cuda = MSDeformAttnFunction.apply(value.double(), shapes, level_start_index, sampling_locations.double(), attention_weights.double(), im2col_step).detach().cpu()
fwdok = torch.allclose(output_cuda, output_pytorch)
max_abs_err = (output_cuda - output_pytorch).abs().max()
max_rel_err = ((output_cuda - output_pytorch).abs() / output_pytorch.abs()).max()
print(f'* {fwdok} check_forward_equal_with_pytorch_double: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}')
@torch.no_grad()
def check_forward_equal_with_pytorch_float():
value = torch.rand(N, S, M, D).cuda() * 0.01
sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda()
attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5
attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True)
im2col_step = 2
output_pytorch = ms_deform_attn_core_pytorch(value, shapes, sampling_locations, attention_weights).detach().cpu()
output_cuda = MSDeformAttnFunction.apply(value, shapes, level_start_index, sampling_locations, attention_weights, im2col_step).detach().cpu()
fwdok = torch.allclose(output_cuda, output_pytorch, rtol=1e-2, atol=1e-3)
max_abs_err = (output_cuda - output_pytorch).abs().max()
max_rel_err = ((output_cuda - output_pytorch).abs() / output_pytorch.abs()).max()
print(f'* {fwdok} check_forward_equal_with_pytorch_float: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}')
def check_gradient_numerical(channels=4, grad_value=True, grad_sampling_loc=True, grad_attn_weight=True):
value = torch.rand(N, S, M, channels).cuda() * 0.01
sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda()
attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5
attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True)
im2col_step = 2
func = MSDeformAttnFunction.apply
value.requires_grad = grad_value
sampling_locations.requires_grad = grad_sampling_loc
attention_weights.requires_grad = grad_attn_weight
gradok = gradcheck(func, (value.double(), shapes, level_start_index, sampling_locations.double(), attention_weights.double(), im2col_step))
print(f'* {gradok} check_gradient_numerical(D={channels})')
if __name__ == '__main__':
check_forward_equal_with_pytorch_double()
check_forward_equal_with_pytorch_float()
for channels in [30, 32, 64, 71, 1025, 2048, 3096]:
check_gradient_numerical(channels, True, True, True)
================================================
FILE: mmdet3d/models/utils/projection.py
================================================
import torch
import torch.nn as nn
from mmdet3d.models.utils import PositionEmbeddingLearned
class PointProjection(nn.Module):
def __init__(self, pos_channel, hidden_channel):
super(PointProjection, self).__init__()
self.feat_proj = nn.Conv1d(hidden_channel, hidden_channel, kernel_size=1)
self.pos_embed = nn.Sequential(
nn.Conv1d(pos_channel, hidden_channel*4, kernel_size=1),
nn.ReLU(inplace=True),
nn.Conv1d(hidden_channel*4, hidden_channel, kernel_size=1)
)
self.fuse_proj = nn.Sequential(
nn.Conv1d(hidden_channel, hidden_channel, kernel_size=1),
nn.ReLU(inplace=True),
nn.Conv1d(hidden_channel, hidden_channel, kernel_size=1)
)
def forward(self, query_feat, query_pos):
pos_embed = self.pos_embed(query_pos.permute(0, 2, 1))
feat_embed = self.feat_proj(query_feat)
proj_embed = self.fuse_proj(feat_embed + pos_embed)
return proj_embed
class ImageProjection(nn.Module):
def __init__(self, pos_channel, hidden_channel):
super(ImageProjection, self).__init__()
self.feat_proj = nn.Conv1d(hidden_channel, hidden_channel, kernel_size=1)
self.pos_proj = nn.Sequential(
nn.Conv1d(pos_channel, hidden_channel*4, kernel_size=1),
nn.ReLU(inplace=True),
nn.Conv1d(hidden_channel*4, hidden_channel, kernel_size=1),
)
self.fuse_proj = nn.Sequential(
nn.Conv1d(hidden_channel, hidden_channel, kernel_size=1),
nn.ReLU(inplace=True),
nn.Conv1d(hidden_channel, hidden_channel, kernel_size=1)
)
def forward(self, query_feat, query_pos):
feat_embed = self.feat_proj(query_feat)
pos_embed = self.pos_proj(query_pos.permute(0, 2, 1))
proj_embed = self.fuse_proj(feat_embed + pos_embed)
return proj_embed
class ProjectionL2Norm(nn.Module):
def __init__(self, hidden_channel):
super(ProjectionL2Norm, self).__init__()
self.hidden_channel = hidden_channel
self.feat_proj = nn.Conv1d(hidden_channel, hidden_channel, kernel_size=1)
def forward(self, query_feat):
query_feat = self.feat_proj(query_feat)
assert query_feat.shape[1] == self.hidden_channel
query_feat = query_feat / torch.norm(query_feat, p=2, keepdim=True, dim=1)
return query_feat
class ProjectionLayerNorm(nn.Module):
def __init__(self, hidden_channel, norm=True, input_channel=None):
super(ProjectionLayerNorm, self).__init__()
if input_channel is None:
input_channel = hidden_channel
self.hidden_channel = hidden_channel
self.feat_proj = nn.Linear(input_channel, hidden_channel)
self.norm = norm
if norm:
self.norm = nn.LayerNorm(hidden_channel)
def forward(self, query_feat):
query_feat = query_feat.transpose(2, 1)
query_feat = self.feat_proj(query_feat)
if self.norm:
query_feat = self.norm(query_feat)
query_feat = query_feat.transpose(2, 1)
return query_feat
class Projection_wPos(nn.Module):
def __init__(self, hidden_channel, pos_embed):
super(Projection_wPos, self).__init__()
self.hidden_channel = hidden_channel
self.pos_proj = pos_embed
self.feat_proj = ProjectionLayerNorm(hidden_channel)
def forward(self, query_feat, query_pos):
feat_embed = self.feat_proj(query_feat)
pos_embed = self.pos_proj(query_pos)
return feat_embed + pos_embed
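# Usage sketch (illustrative shapes, not part of the original module): with
# hidden_channel=128 and N proposals, PointProjection/ImageProjection expect
# query_feat of shape [B, 128, N] and query_pos of shape [B, N, pos_channel],
# and return a fused embedding of shape [B, 128, N]; ProjectionL2Norm and
# ProjectionLayerNorm keep the [B, 128, N] layout unchanged.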
================================================
FILE: mmdet3d/models/utils/sparsefusion_models.py
================================================
import copy
import numpy as np
import torch
from torch import nn
import torch.nn.functional as F
from mmdet3d.models.fusion_layers import apply_3d_transformation
from mmdet3d.models.utils import TransformerDecoderLayer, inverse_sigmoid
from mmdet3d.models.utils.deformable_decoder import DeformableTransformerDecoderLayer
from mmdet3d.models.utils.network_modules import LayerNorm, denormalize_pos, normalize_pos
class PointTransformer2D_3D(nn.Module):
def __init__(self, hidden_channel, num_heads, num_decoder_layers, prediction_heads, ffn_channel, dropout, activation, test_cfg, query_pos, key_pos):
super(PointTransformer2D_3D, self).__init__()
self.hidden_channel = hidden_channel
self.num_heads = num_heads
self.num_decoder_layers = num_decoder_layers
self.prediction_heads = prediction_heads
self.test_cfg = test_cfg
self.decoder = nn.ModuleList()
for i in range(self.num_decoder_layers):
self.decoder.append(
TransformerDecoderLayer(
hidden_channel, num_heads, ffn_channel, dropout, activation,
self_posembed=query_pos[i],
cross_posembed=key_pos[i],
)
)
def forward(self, pts_query_feat, pts_query_pos, lidar_feat_flatten, bev_pos):
ret_dicts = []
res_layer = self.prediction_heads(pts_query_feat)
res_layer['center'] = pts_query_pos.permute(0, 2, 1) # [BS, 2, num_proposals]
for i in range(self.num_decoder_layers):
# Transformer Decoder Layer
# :param query: B C Pq :param query_pos: B Pq 3/6
pts_query_feat = self.decoder[i](pts_query_feat, lidar_feat_flatten, pts_query_pos, bev_pos)
# Prediction
res_layer = self.prediction_heads(pts_query_feat)
res_layer['center'] = res_layer['center'] + pts_query_pos.permute(0, 2, 1)
ret_dicts.append(res_layer)
# for next level positional embedding
pts_query_pos = res_layer['center'].detach().clone().permute(0, 2, 1)
return pts_query_feat, pts_query_pos, ret_dicts
class CameraSE(nn.Module):
def __init__(self, cam_dim, hidden_channel):
super(CameraSE, self).__init__()
self.bn = nn.BatchNorm1d(cam_dim)
self.hidden_channel = hidden_channel
self.mlp_depth = nn.Sequential(
nn.Conv1d(cam_dim, hidden_channel, kernel_size=1),
nn.ReLU(inplace=True),
nn.Conv1d(hidden_channel, hidden_channel, kernel_size=1),
nn.ReLU(inplace=True),
nn.Conv1d(hidden_channel, hidden_channel, kernel_size=1),
)
def forward(self, feat, cam_info):
cam_info_bn = self.bn(cam_info)
pred = feat * self.mlp_depth(cam_info_bn).sigmoid()
return pred
class ImageTransformer_Cam_3D_MS(nn.Module):
def __init__(self, num_views, hidden_channel, num_heads, num_decoder_layers, prediction_heads, out_size_factor_img,
ffn_channel, dropout, activation, test_cfg, query_pos, key_pos):
super(ImageTransformer_Cam_3D_MS, self).__init__()
self.hidden_channel = hidden_channel
self.num_heads = num_heads
self.num_decoder_layers = num_decoder_layers
self.prediction_heads = prediction_heads
self.num_views = num_views
self.out_size_factor_img = out_size_factor_img
self.test_cfg = test_cfg
# self.use_camera = use_camera
self.decoder = nn.ModuleList()
for i in range(self.num_decoder_layers):
self.decoder.append(
DeformableTransformerDecoderLayer(
hidden_channel, num_heads, dim_feedforward=ffn_channel, dropout=dropout, activation=activation,
self_posembed=query_pos[i], cross_posembed=key_pos[i],
)
)
camera_dim = 16
# if use_camera == 'se':
# self.camera_net = CameraSE(camera_dim, hidden_channel)
def forward(self, img_query_feat, normal_img_query_pos, img_query_view, img_feats, normal_img_feats_pos_stack, lidar2cam_rt, cam_intrinsic, img_metas, input_padding_mask=None):
num_img_proposals = img_query_feat.shape[-1]
level_num = len(img_feats)
batch_size = img_query_feat.shape[0]
img_feats_flatten = []
level_start_index = [0]
spatial_shapes = []
for lvl in range(level_num):
img_feat = img_feats[lvl]
h, w = img_feat.shape[-2], img_feat.shape[-1]
img_feat_flatten = img_feat.view(batch_size, self.num_views, self.hidden_channel, h*w) # [bs, num_view, C, h*w]
img_feats_flatten.append(img_feat_flatten)
level_start_index.append(level_start_index[-1] + h*w)
spatial_shapes.append([h, w])
level_start_index = level_start_index[:-1]
level_start_index = torch.LongTensor(level_start_index).to(img_query_feat.device)
spatial_shapes = torch.LongTensor(spatial_shapes).to(img_query_feat.device)
img_feats_stack = torch.cat(img_feats_flatten, dim=3) # [bs, num_view, C, h*w (sum)]
reference_points = normal_img_query_pos.sigmoid() # [bs, num_img_proposal, 2]
reference_points = reference_points[:, :, None].repeat(1, 1, level_num, 1)
camera_info = torch.zeros([batch_size, 16, num_img_proposals]).to(img_query_feat.device)
camera_info[:, :9] = lidar2cam_rt[:, :, :3, :3].permute(0, 2, 3, 1).reshape(batch_size, 9, num_img_proposals)
camera_info[:, 9:12] = lidar2cam_rt[:, :, :3, 3].permute(0, 2, 1)
camera_info[:, 12] = cam_intrinsic[:, :, 0, 0]
camera_info[:, 13] = cam_intrinsic[:, :, 1, 1]
camera_info[:, 14:16] = cam_intrinsic[:, :, :2, 2].permute(0, 2, 1)
ret_dicts = []
for i in range(self.num_decoder_layers):
img_prev_query_feat = img_query_feat.clone() # [BS, C, num_proposals]
img_query_feat = torch.zeros_like(img_query_feat) # create new container for img query feature
for sample_idx in range(batch_size):
bincount = torch.bincount(img_query_view[sample_idx], minlength=self.num_views)
view_mask = bincount > 1
max_len = torch.max(bincount)
sample_query_feats = torch.zeros([self.num_views, self.hidden_channel, max_len]).type_as(camera_info)
samples_normal_query_pos = torch.zeros([self.num_views, max_len, 2]).type_as(camera_info)
sample_reference_points = torch.zeros([self.num_views, max_len, level_num, 2]).type_as(camera_info)
sample_padding_mask = torch.zeros([self.num_views, max_len], dtype=torch.bool, device=camera_info.device)
for view_idx in range(self.num_views):
on_the_image = img_query_view[sample_idx] == view_idx # [num_on_the_image, ]
view_count = bincount[view_idx]
if torch.sum(on_the_image) <= 1:
continue
sample_query_feats[view_idx, :, :view_count] = img_prev_query_feat[sample_idx, :, on_the_image]
samples_normal_query_pos[view_idx, :view_count] = normal_img_query_pos[sample_idx, on_the_image]
sample_reference_points[view_idx, :view_count] = reference_points[sample_idx, on_the_image]
sample_padding_mask[view_idx, view_count:] = True
if input_padding_mask is None:
sample_query_feats[view_mask] = self.decoder[i](
sample_query_feats[view_mask], img_feats_stack[sample_idx, view_mask], samples_normal_query_pos[view_mask],
normal_img_feats_pos_stack.repeat(view_mask.sum(), 1, 1), reference_points=sample_reference_points[view_mask],
level_start_index=level_start_index, spatial_shapes=spatial_shapes,
query_padding_mask=sample_padding_mask[view_mask]
)
else:
sample_query_feats[view_mask] = self.decoder[i](
sample_query_feats[view_mask], img_feats_stack[sample_idx, view_mask], samples_normal_query_pos[view_mask],
normal_img_feats_pos_stack.repeat(view_mask.sum(), 1, 1), reference_points=sample_reference_points[view_mask],
level_start_index=level_start_index, spatial_shapes=spatial_shapes,
query_padding_mask=sample_padding_mask[view_mask], input_padding_mask=input_padding_mask[sample_idx,view_mask]
)
for view_idx in range(self.num_views):
on_the_image = img_query_view[sample_idx] == view_idx # [num_on_the_image, ]
if torch.sum(on_the_image) <= 1:
continue
view_count = bincount[view_idx]
img_query_feat[sample_idx, :, on_the_image] = sample_query_feats[view_idx, :, :view_count]
res_layer = self.prediction_heads(img_query_feat)
if 'center_img' in res_layer:
res_layer['center_img'] = res_layer['center_img'] + normal_img_query_pos.permute(0, 2, 1)
res_layer['center_img'] = res_layer['center_img'].sigmoid()
res_layer['dim_img'] = res_layer['dim_img'].sigmoid()
res_layer['center_2d'] = res_layer['center_2d'] + normal_img_query_pos.permute(0, 2, 1)
normal_img_query_pos = res_layer['center_2d'].detach().clone().permute(0, 2, 1)
res_layer['center_2d'] = res_layer['center_2d'].sigmoid()
if batch_size > 1 or i == self.num_decoder_layers-1: # only when training
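                # Back-project the predicted 2D center to a camera-frame 3D point:
                # with pixel coordinates (u, v) and depth d, the location is
                # [x, y, z]_cam = K^{-1} @ [u*d, v*d, d]^T, where K is
                # cam_intrinsic[:, :, :3, :3].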
center_2d = res_layer['center_2d'].clone().permute(0, 2, 1) # [bs, num_proposals, 2]
depth = res_layer['depth_2d'].clone().permute(0, 2, 1)[..., :1] # [bs, num_proposals, 1]
h, w = img_metas[0]['input_shape'][:2]
center_pos = denormalize_pos(center_2d, w, h, sigmoid=False) # [bs, num_proposals, 2]
center_pos = center_pos * depth
camera_coords = torch.cat([center_pos, depth], dim=2) # [bs, num_proposals, 3]
loc_cam_3d = torch.matmul(torch.inverse(cam_intrinsic[:, :, :3, :3]), camera_coords.unsqueeze(-1)).squeeze(-1) # [bs, num_proposals, 3]
res_layer['loc_cam_3d'] = loc_cam_3d.permute(0, 2, 1)
ret_dicts.append(res_layer)
# img_query_feat = self.camera_net(img_query_feat, camera_info.clone())
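        # Lift the last layer's camera-frame prediction back to the LiDAR/BEV frame:
        # with x_cam = R_lidar2cam @ x_lidar + t_lidar2cam, we recover
        # x_lidar = R^{-1} @ (x_cam - t), then normalize x/y by the point-cloud
        # range and scale to BEV grid cells.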
loc_cam_3d = copy.deepcopy(ret_dicts[-1]['loc_cam_3d'].detach()).permute(0, 2, 1)[..., None]
lidar2cam_r = camera_info[:, :9, :].permute(0, 2, 1)
lidar2cam_r = lidar2cam_r.reshape(batch_size, num_img_proposals, 3, 3)
lidar2cam_t = camera_info[:, 9:12, :].permute(0, 2, 1)[..., None]
bev_coords = torch.matmul(torch.inverse(lidar2cam_r), loc_cam_3d - lidar2cam_t)
bev_coords = bev_coords.squeeze(-1)
bev_coords[..., 0:1] = (bev_coords[..., 0:1] - self.test_cfg['pc_range'][0]) / (
self.test_cfg['pc_range'][3] - self.test_cfg['pc_range'][0])
bev_coords[..., 1:2] = (bev_coords[..., 1:2] - self.test_cfg['pc_range'][1]) / (
self.test_cfg['pc_range'][4] - self.test_cfg['pc_range'][1])
bev_coords[..., 0:1] = bev_coords[..., 0:1] * (self.test_cfg['grid_size'][0] // self.test_cfg['out_size_factor'])
bev_coords[..., 1:2] = bev_coords[..., 1:2] * (self.test_cfg['grid_size'][1] // self.test_cfg['out_size_factor'])
dims, rots, vels = self.transform_bbox(ret_dicts[-1], camera_info, w, img_metas)
bev_coords = torch.cat([bev_coords, rots, vels, dims], dim=2)
return img_query_feat, normal_img_query_pos, bev_coords, camera_info, ret_dicts
def transform_bbox(self, ret_dict, camera_info, width, img_metas):
bs = camera_info.shape[0]
num_proposal = camera_info.shape[2]
lidar2cam_rs = camera_info[:, :9]
lidar2cam_rs = lidar2cam_rs.reshape(bs, 3, 3, num_proposal)
lidar2cam_rs = lidar2cam_rs.permute(0, 3, 1, 2) # [bs, num_proposals, 3, 3]
cam2lidar_rs = torch.inverse(lidar2cam_rs)
cam_dims = ret_dict['dim_2d'].detach().clone() # [bs, 3, num_proposals]
cam_rots = ret_dict['rot_2d'].detach().clone() # [bs, 2, num_proposals]
cam_vels = ret_dict['vel_2d'].detach().clone() # [bs, 2, num_proposals]
dims = cam_dims[:, [2, 0, 1]]
dims = dims.permute(0, 2, 1)
sin_rots = -cam_rots[:, 0:1]
cos_rots = cam_rots[:, 1:2]
rot_dirs = torch.cat([cos_rots, torch.zeros_like(sin_rots), sin_rots], dim=1) # [bs, 3, num_proposals]
rot_dirs = rot_dirs.permute(0, 2, 1).unsqueeze(-1) # [bs, num_proposals, 3, 1]
rot_dirs = torch.matmul(cam2lidar_rs, rot_dirs) # [bs, num_proposals, 3, 1]
lidar_rots = -rot_dirs[:, :, [0, 1], 0] # [bs, num_proposals, 2]
cam_vels_x = cam_vels[:, 0:1, :]
cam_vels_z = cam_vels[:, 1:2, :]
vels = torch.cat([cam_vels_x, torch.zeros_like(cam_vels_x), cam_vels_z], dim=1) # [bs, 3, num_proposals]
vels = vels.permute(0, 2, 1).unsqueeze(-1) # [bs, num_proposals, 3, 1]
vels = torch.matmul(cam2lidar_rs, vels) # [bs, num_proposals, 3, 1]
lidar_vels = vels[:, :, [0, 1], 0]
return dims, lidar_rots, lidar_vels
class ViewTransformer(nn.Module):
def __init__(self, hidden_channel, num_heads, prediction_heads, ffn_channel, dropout, activation, test_cfg,
query_pos, key_pos, view_projection, use_camera):
super(ViewTransformer, self).__init__()
self.hidden_channel = hidden_channel
self.num_heads = num_heads
self.prediction_heads = prediction_heads
self.test_cfg = test_cfg
self.grid_x_size = test_cfg['grid_size'][0] // test_cfg['out_size_factor']
self.grid_y_size = test_cfg['grid_size'][1] // test_cfg['out_size_factor']
self.view_projection = view_projection
self.use_camera = use_camera
if use_camera is not None:
assert use_camera == "se"
self.camera_net = CameraSE(16, hidden_channel)
self.decoder = TransformerDecoderLayer(
hidden_channel, num_heads, ffn_channel, activation=activation, dropout=dropout,
self_posembed=query_pos, cross_posembed=key_pos,
cross_only=True
)
def forward(self, img_query_feat, img_query_pos_bev, normal_img_query_pos, img_ret_dicts, camera_info):
bs = img_query_feat.shape[0]
num_proposals = img_query_feat.shape[-1]
center_3d = img_ret_dicts[-1]['loc_cam_3d'].detach().clone().permute(0, 2, 1) # [bs, num_proposal, 3]
center_3d = center_3d[:, -num_proposals:]
if self.use_camera is not None:
img_query_feat = self.camera_net(img_query_feat, camera_info)
camera_info = camera_info.permute(0, 2, 1) # [bs, num_proposal, 16]
img_query_feat = self.view_projection(img_query_feat)
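        # Recover each camera's optical center in the LiDAR frame: since
        # x_cam = R @ x_lidar + t, the center (x_cam = 0) sits at x_lidar = -R^T @ t;
        # it is normalized to BEV grid coordinates below.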
camera_R = camera_info[:, :, :9].reshape(bs, num_proposals, 3, 3)
camera_t = camera_info[:, :, 9:12].reshape(bs, num_proposals, 3, 1)
camera_t = -torch.matmul(camera_R.permute(0, 1, 3, 2), camera_t).squeeze(-1)
camera_t[..., 0:1] = (camera_t[..., 0:1] - self.test_cfg['pc_range'][0]) / (
self.test_cfg['pc_range'][3] - self.test_cfg['pc_range'][0])
camera_t[..., 1:2] = (camera_t[..., 1:2] - self.test_cfg['pc_range'][1]) / (
self.test_cfg['pc_range'][4] - self.test_cfg['pc_range'][1])
camera_t[..., 0:1] = camera_t[..., 0:1] * (self.test_cfg['grid_size'][0] // self.test_cfg['out_size_factor'])
camera_t[..., 1:2] = camera_t[..., 1:2] * (self.test_cfg['grid_size'][0] // self.test_cfg['out_size_factor'])
img_query_pos = copy.deepcopy(img_query_pos_bev[..., :7])
img_query_pos[..., :2] = inverse_sigmoid((img_query_pos[..., :2] + 12) / 204)
img_query_pos[..., 2] = inverse_sigmoid((img_query_pos[..., 2] + 10) / 20)
img_query_pos[..., 3:5] = inverse_sigmoid((img_query_pos[..., 3:5] + 1) / 2)
img_query_pos = torch.cat([img_query_pos, normal_img_query_pos], dim=2)
img_query_feat = self.decoder(img_query_feat, img_query_feat, img_query_pos, img_query_pos)
# Prediction
res_layer = self.prediction_heads(img_query_feat)
res_layer['center_mono'] = img_query_pos_bev[..., 0:2].permute(0, 2, 1)
res_layer['height_mono'] = img_query_pos_bev[..., 2:3].permute(0, 2, 1)
res_layer['rot_mono'] = img_query_pos_bev[..., 3:5].permute(0, 2, 1)
res_layer['vel_mono'] = img_query_pos_bev[..., 5:7].permute(0, 2, 1)
res_layer['dim_mono'] = img_query_pos_bev[..., 7:10].permute(0, 2, 1)
res_layer['center_view'] = res_layer['center_view'] + img_query_pos_bev[..., 0:2].permute(0, 2, 1)
img_query_pos_bev = res_layer['center_view'].detach().clone().permute(0, 2, 1)
return img_query_feat, img_query_pos_bev, [res_layer]
class FusionTransformer2D_3D_Self(nn.Module):
def __init__(self, hidden_channel, num_heads, num_decoder_layers, prediction_heads, ffn_channel, dropout, activation, test_cfg,
query_pos, key_pos, pts_projection, img_projection, num_proposals):
super(FusionTransformer2D_3D_Self, self).__init__()
self.hidden_channel = hidden_channel
self.num_heads = num_heads
self.num_decoder_layers = num_decoder_layers
self.prediction_heads = prediction_heads
self.test_cfg = test_cfg
self.grid_x_size = test_cfg['grid_size'][0] // test_cfg['out_size_factor']
self.grid_y_size = test_cfg['grid_size'][1] // test_cfg['out_size_factor']
self.pts_projection = pts_projection
self.img_projection = img_projection
self.num_proposals = num_proposals
self.decoder = nn.ModuleList()
for i in range(self.num_decoder_layers):
self.decoder.append(
TransformerDecoderLayer(
hidden_channel, num_heads, ffn_channel, dropout, activation,
self_posembed=query_pos[i], cross_posembed=key_pos[i], cross_only=True
)
)
def forward(self, pts_query_feat, pts_query_pos, img_query_feat, img_query_pos, need_weights=False):
ret_dicts = []
pts_query_feat = self.pts_projection(pts_query_feat)
img_query_feat = self.img_projection(img_query_feat)
all_query_feat = torch.cat([pts_query_feat, img_query_feat], dim=2)
all_query_pos = torch.cat([pts_query_pos, img_query_pos], dim=1)
for i in range(self.num_decoder_layers):
# Transformer Decoder Layer
# :param query: B C Pq :param query_pos: B Pq 3/6
all_query_feat_raw = all_query_feat.clone()
if need_weights:
all_query_feat, attn_weights = self.decoder[i](all_query_feat, all_query_feat, all_query_pos, all_query_pos, need_weights=True)
else:
all_query_feat = self.decoder[i](all_query_feat, all_query_feat, all_query_pos, all_query_pos)
all_query_feat_pred = all_query_feat
# Prediction
res_layer = self.prediction_heads(all_query_feat_pred)
res_layer['center'] = res_layer['center'] + all_query_pos.permute(0, 2, 1)
ret_dicts.append(res_layer)
all_query_pos = res_layer['center'].detach().clone().permute(0, 2, 1)
# return all_query_feat, all_query_pos, ret_dicts
if need_weights:
return all_query_feat, all_query_pos, ret_dicts, attn_weights
else:
return all_query_feat, all_query_pos, ret_dicts
class ImageTransformer2D_3D_MS(nn.Module):
def __init__(self, num_views, hidden_channel, num_heads, num_decoder_layers, prediction_heads, out_size_factor_img,
ffn_channel, dropout, activation, test_cfg, query_pos, key_pos, supervision2d):
super(ImageTransformer2D_3D_MS, self).__init__()
self.hidden_channel = hidden_channel
self.num_heads = num_heads
self.num_decoder_layers = num_decoder_layers
self.prediction_heads = prediction_heads
self.num_views = num_views
self.out_size_factor_img = out_size_factor_img
self.test_cfg = test_cfg
self.supervision2d = supervision2d
self.decoder = nn.ModuleList()
for i in range(self.num_decoder_layers):
self.decoder.append(
DeformableTransformerDecoderLayer(
hidden_channel, num_heads, dim_feedforward=ffn_channel, dropout=dropout, activation=activation,
self_posembed=query_pos[i], cross_posembed=key_pos[i],
)
)
def forward(self, img_query_feat, normal_img_query_pos, img_query_view, img_feats, normal_img_feats_pos_stack, img_metas):
level_num = len(img_feats)
batch_size = img_query_feat.shape[0]
img_feats_flatten = []
level_start_index = [0]
spatial_shapes = []
for lvl in range(level_num):
img_feat = img_feats[lvl]
h, w = img_feat.shape[-2], img_feat.shape[-1]
img_feat_flatten = img_feat.view(batch_size, self.num_views, self.hidden_channel, h*w) # [bs, num_view, C, h*w]
img_feats_flatten.append(img_feat_flatten)
level_start_index.append(level_start_index[-1] + h*w)
spatial_shapes.append([h, w])
level_start_index = level_start_index[:-1]
level_start_index = torch.LongTensor(level_start_index).to(img_query_feat.device)
spatial_shapes = torch.LongTensor(spatial_shapes).to(img_query_feat.device)
img_feats_stack = torch.cat(img_feats_flatten, dim=3) # [bs, num_view, C, h*w (sum)]
reference_points = normal_img_query_pos.sigmoid() # [bs, num_img_proposal, 2]
reference_points = reference_points[:, :, None].repeat(1, 1, level_num, 1)
ret_dicts = []
for i in range(self.num_decoder_layers):
img_prev_query_feat = img_query_feat.clone() # [BS, C, num_proposals]
img_query_feat = torch.zeros_like(img_query_feat) # create new container for img query feature
for sample_idx in range(batch_size):
for view_idx in range(self.num_views):
on_the_image = img_query_view[sample_idx] == view_idx # [num_on_the_image, ]
if torch.sum(on_the_image) <= 1:
continue
img_query_feat_view = img_prev_query_feat[sample_idx, :, on_the_image] # [C, num_on_the_image]
img_query_feat_view = self.decoder[i](
img_query_feat_view[None], img_feats_stack[sample_idx:sample_idx + 1, view_idx],
normal_img_query_pos[sample_idx:sample_idx + 1, on_the_image], normal_img_feats_pos_stack,
reference_points=reference_points[sample_idx:sample_idx+1, on_the_image],
level_start_index=level_start_index, spatial_shapes=spatial_shapes
)
img_query_feat[sample_idx, :, on_the_image] = img_query_feat_view.clone()
res_layer = self.prediction_heads(img_query_feat)
if 'center_offset' in res_layer:
assert 'center_2d' not in res_layer and 'offset' not in res_layer
res_layer['center_2d'] = res_layer['center_offset'][:, :2]
res_layer['offset'] = res_layer['center_offset'][:, 2:]
res_layer['center_2d'] = res_layer['center_2d'] + normal_img_query_pos.permute(0, 2, 1)
if self.supervision2d:
normal_img_query_pos = res_layer['center_2d'].detach().clone().permute(0, 2, 1)
res_layer['center_2d'] = res_layer['center_2d'].sigmoid()
res_layer['offset'] = res_layer['offset'].sigmoid()
bbox_width = res_layer['offset'][:, 0] + res_layer['offset'][:, 2]
bbox_height = res_layer['offset'][:, 1] + res_layer['offset'][:, 3]
bbox_cx = (res_layer['center_2d'][:, 0] - res_layer['offset'][:, 0] + res_layer['center_2d'][:, 0] + res_layer['offset'][:, 2]) / 2
bbox_cy = (res_layer['center_2d'][:, 1] - res_layer['offset'][:, 1] + res_layer['center_2d'][:, 1] + res_layer['offset'][:, 3]) / 2
res_layer['bbox_2d'] = torch.stack([bbox_cx, bbox_cy, bbox_width, bbox_height], dim=1).detach().clone()
ret_dicts.append(res_layer)
return img_query_feat, normal_img_query_pos, ret_dicts
def camera2lidar(self, camera_coords, lidar2img, img_meta, batch_size):
# img_pos: [W*H, 2]
coords = torch.cat([camera_coords, torch.ones_like(camera_coords[..., :1])], dim=1) # [N, 4]
img2lidars = torch.inverse(lidar2img)
coords3d = torch.matmul(img2lidars, coords.unsqueeze(-1)).squeeze(-1)[..., :3] # [N, 3]
if batch_size > 1:
coords3d = apply_3d_transformation(coords3d, 'LIDAR', img_meta, reverse=False).detach()
coords3d[..., 0:1] = (coords3d[..., 0:1] - self.test_cfg['pc_range'][0]) / (
self.test_cfg['pc_range'][3] - self.test_cfg['pc_range'][0])
coords3d[..., 1:2] = (coords3d[..., 1:2] - self.test_cfg['pc_range'][1]) / (
self.test_cfg['pc_range'][4] - self.test_cfg['pc_range'][1])
coords3d[..., 0:1] = coords3d[..., 0:1] * (self.test_cfg['grid_size'][0] // self.test_cfg['out_size_factor'])
coords3d[..., 1:2] = coords3d[..., 1:2] * (self.test_cfg['grid_size'][1] // self.test_cfg['out_size_factor'])
if not self.pos_3d:
coords3d = coords3d[..., :2] # [N, 3]
if self.pos_3d:
coords3d = coords3d.contiguous().view(coords3d.size(0), 3)
else:
coords3d = coords3d.contiguous().view(coords3d.size(0), 2)
return coords3d
================================================
FILE: mmdet3d/models/utils/transformer.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import copy
import math
import warnings
import collections
from typing import Sequence, Iterable, Optional
from itertools import repeat
import torch
import torch.nn as nn
import torch.nn.functional as F
from mmcv.cnn import (Linear, build_activation_layer, build_conv_layer, build_norm_layer)
from mmcv.runner.base_module import BaseModule
from mmcv.utils import (ConfigDict, build_from_cfg, deprecated_api_warning, TORCH_VERSION, digit_version)
from .drop import build_dropout
from mmdet3d.models.registry import (ATTENTION, FEEDFORWARD_NETWORK, POSITIONAL_ENCODING,
TRANSFORMER_LAYER, TRANSFORMER_LAYER_SEQUENCE)
from mmcv.cnn.bricks.registry import ACTIVATION_LAYERS
# From PyTorch internals
def _ntuple(n):
def parse(x):
if isinstance(x, collections.abc.Iterable):
return x
return tuple(repeat(x, n))
return parse
to_2tuple = _ntuple(2)
class GELU(nn.Module):
r"""Applies the Gaussian Error Linear Units function:
.. math::
\text{GELU}(x) = x * \Phi(x)
where :math:`\Phi(x)` is the Cumulative Distribution Function for
Gaussian Distribution.
Shape:
        - Input: :math:`(N, *)` where `*` means any number of additional
dimensions
- Output: :math:`(N, *)`, same shape as the input
.. image:: scripts/activation_images/GELU.png
Examples::
>>> m = nn.GELU()
>>> input = torch.randn(2)
>>> output = m(input)
"""
def forward(self, input: torch.Tensor) -> torch.Tensor:
return F.gelu(input)
if (TORCH_VERSION == 'parrots' or digit_version(TORCH_VERSION) < digit_version('1.4')):
ACTIVATION_LAYERS.register_module(module=GELU)
else:
ACTIVATION_LAYERS.register_module(module=nn.GELU)
class ModuleList(BaseModule, nn.ModuleList):
"""ModuleList in openmmlab.
Args:
modules (iterable, optional): an iterable of modules to add.
init_cfg (dict, optional): Initialization config dict.
"""
def __init__(self,
modules: Optional[Iterable] = None,
init_cfg: Optional[dict] = None):
BaseModule.__init__(self, init_cfg)
nn.ModuleList.__init__(self, modules)
class Sequential(BaseModule, nn.Sequential):
"""Sequential module in openmmlab.
Args:
init_cfg (dict, optional): Initialization config dict.
"""
def __init__(self, *args, init_cfg: Optional[dict] = None):
BaseModule.__init__(self, init_cfg)
nn.Sequential.__init__(self, *args)
def build_positional_encoding(cfg, default_args=None):
"""Builder for Position Encoding."""
return build_from_cfg(cfg, POSITIONAL_ENCODING, default_args)
def build_attention(cfg, default_args=None):
"""Builder for attention."""
return build_from_cfg(cfg, ATTENTION, default_args)
def build_feedforward_network(cfg, default_args=None):
"""Builder for feed-forward network (FFN)."""
return build_from_cfg(cfg, FEEDFORWARD_NETWORK, default_args)
def build_transformer_layer(cfg, default_args=None):
"""Builder for transformer layer."""
return build_from_cfg(cfg, TRANSFORMER_LAYER, default_args)
def build_transformer_layer_sequence(cfg, default_args=None):
"""Builder for transformer encoder and transformer decoder."""
return build_from_cfg(cfg, TRANSFORMER_LAYER_SEQUENCE, default_args)
class AdaptivePadding(nn.Module):
"""Applies padding adaptively to the input.
    This module can make the input fully covered by the filter
    you specify. It supports two modes, "same" and "corner". The
    "same" mode is the same as the "SAME" padding mode in TensorFlow and
    pads zeros around the input. The "corner" mode pads zeros
    to the bottom right.
Args:
kernel_size (int | tuple): Size of the kernel. Default: 1.
stride (int | tuple): Stride of the filter. Default: 1.
dilation (int | tuple): Spacing between kernel elements.
Default: 1.
padding (str): Support "same" and "corner", "corner" mode
would pad zero to bottom right, and "same" mode would
pad zero around input. Default: "corner".
Example:
>>> kernel_size = 16
>>> stride = 16
>>> dilation = 1
>>> input = torch.rand(1, 1, 15, 17)
>>> adap_pad = AdaptivePadding(
>>> kernel_size=kernel_size,
>>> stride=stride,
>>> dilation=dilation,
>>> padding="corner")
>>> out = adap_pad(input)
>>> assert (out.shape[2], out.shape[3]) == (16, 32)
>>> input = torch.rand(1, 1, 16, 17)
>>> out = adap_pad(input)
>>> assert (out.shape[2], out.shape[3]) == (16, 32)
"""
def __init__(self, kernel_size=1, stride=1, dilation=1, padding='corner'):
super().__init__()
assert padding in ('same', 'corner')
kernel_size = to_2tuple(kernel_size)
stride = to_2tuple(stride)
dilation = to_2tuple(dilation)
self.padding = padding
self.kernel_size = kernel_size
self.stride = stride
self.dilation = dilation
def get_pad_shape(self, input_shape):
"""Calculate the padding size of input.
Args:
input_shape (:obj:`torch.Size`): arrange as (H, W).
Returns:
Tuple[int]: The padding size along the
original H and W directions
"""
input_h, input_w = input_shape
kernel_h, kernel_w = self.kernel_size
stride_h, stride_w = self.stride
output_h = math.ceil(input_h / stride_h)
output_w = math.ceil(input_w / stride_w)
pad_h = max((output_h - 1) * stride_h +
(kernel_h - 1) * self.dilation[0] + 1 - input_h, 0)
pad_w = max((output_w - 1) * stride_w +
(kernel_w - 1) * self.dilation[1] + 1 - input_w, 0)
return pad_h, pad_w
def forward(self, x):
"""Add padding to `x`
Args:
x (Tensor): Input tensor has shape (B, C, H, W).
Returns:
Tensor: The tensor with adaptive padding
"""
pad_h, pad_w = self.get_pad_shape(x.size()[-2:])
if pad_h > 0 or pad_w > 0:
if self.padding == 'corner':
x = F.pad(x, [0, pad_w, 0, pad_h])
elif self.padding == 'same':
x = F.pad(x, [
pad_w // 2, pad_w - pad_w // 2, pad_h // 2,
pad_h - pad_h // 2
])
return x
class PatchEmbed(BaseModule):
"""Image to Patch Embedding.
We use a conv layer to implement PatchEmbed.
Args:
in_channels (int): The num of input channels. Default: 3
embed_dims (int): The dimensions of embedding. Default: 768
conv_type (str): The type of convolution
to generate patch embedding. Default: "Conv2d".
kernel_size (int): The kernel_size of embedding conv. Default: 16.
stride (int): The slide stride of embedding conv.
Default: 16.
padding (int | tuple | string): The padding length of
embedding conv. When it is a string, it means the mode
of adaptive padding, support "same" and "corner" now.
Default: "corner".
dilation (int): The dilation rate of embedding conv. Default: 1.
bias (bool): Bias of embed conv. Default: True.
norm_cfg (dict, optional): Config dict for normalization layer.
Default: None.
input_size (int | tuple | None): The size of input, which will be
used to calculate the out size. Only works when `dynamic_size`
is False. Default: None.
init_cfg (`mmcv.ConfigDict`, optional): The Config for initialization.
Default: None.
"""
def __init__(self,
in_channels=3,
embed_dims=768,
conv_type='Conv2d',
kernel_size=16,
stride=16,
padding='corner',
dilation=1,
bias=True,
norm_cfg=None,
input_size=None,
init_cfg=None):
super().__init__(init_cfg=init_cfg)
self.embed_dims = embed_dims
if stride is None:
stride = kernel_size
kernel_size = to_2tuple(kernel_size)
stride = to_2tuple(stride)
dilation = to_2tuple(dilation)
if isinstance(padding, str):
self.adaptive_padding = AdaptivePadding(
kernel_size=kernel_size,
stride=stride,
dilation=dilation,
padding=padding)
# disable the padding of conv
padding = 0
else:
self.adaptive_padding = None
padding = to_2tuple(padding)
self.projection = build_conv_layer(
dict(type=conv_type),
in_channels=in_channels,
out_channels=embed_dims,
kernel_size=kernel_size,
stride=stride,
padding=padding,
dilation=dilation,
bias=bias)
if norm_cfg is not None:
self.norm = build_norm_layer(norm_cfg, embed_dims)[1]
else:
self.norm = None
if input_size:
input_size = to_2tuple(input_size)
# `init_out_size` would be used outside to
# calculate the num_patches
# e.g. when `use_abs_pos_embed` outside
self.init_input_size = input_size
if self.adaptive_padding:
pad_h, pad_w = self.adaptive_padding.get_pad_shape(input_size)
input_h, input_w = input_size
input_h = input_h + pad_h
input_w = input_w + pad_w
input_size = (input_h, input_w)
# https://pytorch.org/docs/stable/generated/torch.nn.Conv2d.html
h_out = (input_size[0] + 2 * padding[0] - dilation[0] *
(kernel_size[0] - 1) - 1) // stride[0] + 1
w_out = (input_size[1] + 2 * padding[1] - dilation[1] *
(kernel_size[1] - 1) - 1) // stride[1] + 1
self.init_out_size = (h_out, w_out)
else:
self.init_input_size = None
self.init_out_size = None
def forward(self, x):
"""
Args:
x (Tensor): Has shape (B, C, H, W). In most case, C is 3.
Returns:
tuple: Contains merged results and its spatial shape.
- x (Tensor): Has shape (B, out_h * out_w, embed_dims)
- out_size (tuple[int]): Spatial shape of x, arrange as
(out_h, out_w).
"""
if self.adaptive_padding:
x = self.adaptive_padding(x)
x = self.projection(x)
out_size = (x.shape[2], x.shape[3])
x = x.flatten(2).transpose(1, 2)
if self.norm is not None:
x = self.norm(x)
return x, out_size
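# Usage sketch (illustrative, assumed sizes): a standard 16x16 patch embedding.
#   embed = PatchEmbed(in_channels=3, embed_dims=768, kernel_size=16, stride=16)
#   tokens, (h, w) = embed(img)  # img: [B, 3, 224, 224] -> tokens: [B, 196, 768], (h, w) == (14, 14)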
class PatchMerging(BaseModule):
"""Merge patch feature map.
This layer groups feature map by kernel_size, and applies norm and linear
layers to the grouped feature map ((used in Swin Transformer)).
Our implementation uses `nn.Unfold` to
merge patches, which is about 25% faster than the original
implementation. However, we need to modify pretrained
models for compatibility.
Args:
        in_channels (int): The number of input channels.
out_channels (int): The num of output channels.
kernel_size (int | tuple, optional): the kernel size in the unfold
layer. Defaults to 2.
stride (int | tuple, optional): the stride of the sliding blocks in the
unfold layer. Default: None. (Would be set as `kernel_size`)
padding (int | tuple | string ): The padding length of
embedding conv. When it is a string, it means the mode
of adaptive padding, support "same" and "corner" now.
Default: "corner".
dilation (int | tuple, optional): dilation parameter in the unfold
layer. Default: 1.
bias (bool, optional): Whether to add bias in linear layer or not.
Defaults: False.
norm_cfg (dict, optional): Config dict for normalization layer.
Default: dict(type='LN').
init_cfg (dict, optional): The extra config for initialization.
Default: None.
"""
def __init__(self,
in_channels,
out_channels,
kernel_size=2,
stride=None,
padding='corner',
dilation=1,
bias=False,
norm_cfg=dict(type='LN'),
init_cfg=None):
super().__init__(init_cfg=init_cfg)
self.in_channels = in_channels
self.out_channels = out_channels
if stride:
stride = stride
else:
stride = kernel_size
kernel_size = to_2tuple(kernel_size)
stride = to_2tuple(stride)
dilation = to_2tuple(dilation)
if isinstance(padding, str):
self.adaptive_padding = AdaptivePadding(
kernel_size=kernel_size,
stride=stride,
dilation=dilation,
padding=padding)
# disable the padding of unfold
padding = 0
else:
self.adaptive_padding = None
padding = to_2tuple(padding)
self.sampler = nn.Unfold(
kernel_size=kernel_size,
dilation=dilation,
padding=padding,
stride=stride)
sample_dim = kernel_size[0] * kernel_size[1] * in_channels
if norm_cfg is not None:
self.norm = build_norm_layer(norm_cfg, sample_dim)[1]
else:
self.norm = None
self.reduction = nn.Linear(sample_dim, out_channels, bias=bias)
def forward(self, x, input_size):
"""
Args:
x (Tensor): Has shape (B, H*W, C_in).
input_size (tuple[int]): The spatial shape of x, arrange as (H, W).
Default: None.
Returns:
tuple: Contains merged results and its spatial shape.
- x (Tensor): Has shape (B, Merged_H * Merged_W, C_out)
- out_size (tuple[int]): Spatial shape of x, arrange as
(Merged_H, Merged_W).
"""
B, L, C = x.shape
assert isinstance(input_size, Sequence), f'Expect ' \
f'input_size is ' \
f'`Sequence` ' \
f'but get {input_size}'
H, W = input_size
assert L == H * W, 'input feature has wrong size'
x = x.view(B, H, W, C).permute([0, 3, 1, 2]) # B, C, H, W
if self.adaptive_padding:
x = self.adaptive_padding(x)
H, W = x.shape[-2:]
# Use nn.Unfold to merge patch. About 25% faster than original method,
# but need to modify pretrained model for compatibility
# if kernel_size=2 and stride=2, x should has shape (B, 4*C, H/2*W/2)
x = self.sampler(x)
out_h = (H + 2 * self.sampler.padding[0] - self.sampler.dilation[0] *
(self.sampler.kernel_size[0] - 1) -
1) // self.sampler.stride[0] + 1
out_w = (W + 2 * self.sampler.padding[1] - self.sampler.dilation[1] *
(self.sampler.kernel_size[1] - 1) -
1) // self.sampler.stride[1] + 1
output_size = (out_h, out_w)
x = x.transpose(1, 2) # B, H/2*W/2, 4*C
x = self.norm(x) if self.norm else x
x = self.reduction(x)
return x, output_size
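# Usage sketch (illustrative, assumed sizes): merging 2x2 neighbourhoods halves
# the spatial resolution and maps C_in to C_out:
#   merge = PatchMerging(in_channels=96, out_channels=192)
#   y, (h2, w2) = merge(x, (56, 56))  # x: [B, 56*56, 96] -> y: [B, 28*28, 192]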
@ATTENTION.register_module()
class MultiheadAttention(BaseModule):
"""A wrapper for ``torch.nn.MultiheadAttention``.
This module implements MultiheadAttention with identity connection,
and positional encoding is also passed as input.
Args:
embed_dims (int): The embedding dimension.
num_heads (int): Parallel attention heads.
attn_drop (float): A Dropout layer on attn_output_weights.
Default: 0.0.
proj_drop (float): A Dropout layer after `nn.MultiheadAttention`.
Default: 0.0.
dropout_layer (obj:`ConfigDict`): The dropout_layer used
when adding the shortcut.
init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization.
Default: None.
batch_first (bool): When it is True, Key, Query and Value are shape of
(batch, n, embed_dim), otherwise (n, batch, embed_dim).
Default to False.
"""
def __init__(self,
embed_dims,
num_heads,
attn_drop=0.,
proj_drop=0.,
dropout_layer=dict(type='Dropout', drop_prob=0.),
init_cfg=None,
batch_first=False,
**kwargs):
super().__init__(init_cfg)
if 'dropout' in kwargs:
warnings.warn(
'The arguments `dropout` in MultiheadAttention '
'has been deprecated, now you can separately '
'set `attn_drop`(float), proj_drop(float), '
'and `dropout_layer`(dict) ', DeprecationWarning)
attn_drop = kwargs['dropout']
dropout_layer['drop_prob'] = kwargs.pop('dropout')
self.embed_dims = embed_dims
self.num_heads = num_heads
self.batch_first = batch_first
self.attn = nn.MultiheadAttention(embed_dims, num_heads, attn_drop,
**kwargs)
self.proj_drop = nn.Dropout(proj_drop)
self.dropout_layer = build_dropout(
dropout_layer) if dropout_layer else nn.Identity()
@deprecated_api_warning({'residual': 'identity'},
cls_name='MultiheadAttention')
def forward(self,
query,
key=None,
value=None,
identity=None,
query_pos=None,
key_pos=None,
attn_mask=None,
key_padding_mask=None,
**kwargs):
"""Forward function for `MultiheadAttention`.
**kwargs allow passing a more general data flow when combining
with other operations in `transformerlayer`.
Args:
query (Tensor): The input query with shape [num_queries, bs,
embed_dims] if self.batch_first is False, else
                [bs, num_queries, embed_dims].
key (Tensor): The key tensor with shape [num_keys, bs,
embed_dims] if self.batch_first is False, else
[bs, num_keys, embed_dims] .
If None, the ``query`` will be used. Defaults to None.
value (Tensor): The value tensor with same shape as `key`.
Same in `nn.MultiheadAttention.forward`. Defaults to None.
If None, the `key` will be used.
identity (Tensor): This tensor, with the same shape as `query`,
will be used for the identity link.
If None, `query` will be used. Defaults to None.
query_pos (Tensor): The positional encoding for `query`, with
the same shape as `query`. If not None, it will
be added to `query` before forward function. Defaults to None.
key_pos (Tensor): The positional encoding for `key`, with the
same shape as `key`. Defaults to None. If not None, it will
be added to `key` before forward function. If None, and
`query_pos` has the same shape as `key`, then `query_pos`
will be used for `key_pos`. Defaults to None.
attn_mask (Tensor): ByteTensor mask with shape [num_queries,
num_keys]. Same in `nn.MultiheadAttention.forward`.
Defaults to None.
key_padding_mask (Tensor): ByteTensor with shape [bs, num_keys].
Defaults to None.
Returns:
Tensor: forwarded results with shape
[num_queries, bs, embed_dims]
if self.batch_first is False, else
[bs, num_queries, embed_dims].
"""
if key is None:
key = query
if value is None:
value = key
if identity is None:
identity = query
if key_pos is None:
if query_pos is not None:
# use query_pos if key_pos is not available
if query_pos.shape == key.shape:
key_pos = query_pos
else:
warnings.warn(f'position encoding of key is '
f'missing in {self.__class__.__name__}.')
if query_pos is not None:
query = query + query_pos
if key_pos is not None:
key = key + key_pos
# Because the dataflow('key', 'query', 'value') of
# ``torch.nn.MultiheadAttention`` is (num_query, batch,
# embed_dims), We should adjust the shape of dataflow from
# batch_first (batch, num_query, embed_dims) to num_query_first
# (num_query ,batch, embed_dims), and recover ``attn_output``
# from num_query_first to batch_first.
if self.batch_first:
query = query.transpose(0, 1)
key = key.transpose(0, 1)
value = value.transpose(0, 1)
out = self.attn(
query=query,
key=key,
value=value,
attn_mask=attn_mask,
key_padding_mask=key_padding_mask)[0]
if self.batch_first:
out = out.transpose(0, 1)
return identity + self.dropout_layer(self.proj_drop(out))
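# Illustrative usage sketch (not part of the original file): how this wrapper
# is typically called with a positional encoding and the built-in identity
# shortcut. The tensor names and sizes below are assumptions for the example.
#
#   attn = MultiheadAttention(embed_dims=256, num_heads=8, batch_first=False)
#   query = torch.rand(100, 2, 256)      # (num_queries, bs, embed_dims)
#   query_pos = torch.rand(100, 2, 256)  # added to query before attention
#   out = attn(query, query_pos=query_pos)  # key/value default to query;
#                                           # identity shortcut is added inside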
@FEEDFORWARD_NETWORK.register_module()
class FFN(BaseModule):
"""Implements feed-forward networks (FFNs) with identity connection.
Args:
embed_dims (int): The feature dimension. Same as
`MultiheadAttention`. Defaults: 256.
feedforward_channels (int): The hidden dimension of FFNs.
Defaults: 1024.
num_fcs (int, optional): The number of fully-connected layers in
FFNs. Default: 2.
act_cfg (dict, optional): The activation config for FFNs.
Default: dict(type='ReLU')
ffn_drop (float, optional): Probability of an element to be
zeroed in FFN. Default 0.0.
add_identity (bool, optional): Whether to add the
identity connection. Default: `True`.
dropout_layer (obj:`ConfigDict`): The dropout_layer used
when adding the shortcut.
init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization.
Default: None.
"""
@deprecated_api_warning(
{
'dropout': 'ffn_drop',
'add_residual': 'add_identity'
},
cls_name='FFN')
def __init__(self,
embed_dims=256,
feedforward_channels=1024,
num_fcs=2,
act_cfg=dict(type='ReLU', inplace=True),
ffn_drop=0.,
dropout_layer=None,
add_identity=True,
init_cfg=None,
**kwargs):
super().__init__(init_cfg)
assert num_fcs >= 2, 'num_fcs should be no less ' \
f'than 2. got {num_fcs}.'
self.embed_dims = embed_dims
self.feedforward_channels = feedforward_channels
self.num_fcs = num_fcs
self.act_cfg = act_cfg
self.activate = build_activation_layer(act_cfg)
layers = []
in_channels = embed_dims
for _ in range(num_fcs - 1):
layers.append(
Sequential(
Linear(in_channels, feedforward_channels), self.activate,
nn.Dropout(ffn_drop)))
in_channels = feedforward_channels
layers.append(Linear(feedforward_channels, embed_dims))
layers.append(nn.Dropout(ffn_drop))
self.layers = Sequential(*layers)
self.dropout_layer = build_dropout(
dropout_layer) if dropout_layer else torch.nn.Identity()
self.add_identity = add_identity
@deprecated_api_warning({'residual': 'identity'}, cls_name='FFN')
def forward(self, x, identity=None):
"""Forward function for `FFN`.
The function adds the shortcut `identity` (defaulting to `x`) to the output of the feed-forward layers when `add_identity` is True.
"""
out = self.layers(x)
if not self.add_identity:
return self.dropout_layer(out)
if identity is None:
identity = x
return identity + self.dropout_layer(out)
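# Illustrative usage sketch (not part of the original file): the FFN keeps the
# input dimension and adds an identity shortcut by default. Shapes below are
# example assumptions.
#
#   ffn = FFN(embed_dims=256, feedforward_channels=1024, num_fcs=2)
#   x = torch.rand(2, 100, 256)
#   y = ffn(x)               # same shape as x; equals x + dropout(MLP(x))
#   y = ffn(x, identity=x)   # identical result with an explicit shortcut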
@TRANSFORMER_LAYER.register_module()
class BaseTransformerLayer(BaseModule):
"""Base `TransformerLayer` for vision transformer.
It can be built from `mmcv.ConfigDict` and supports more flexible
customization, for example, using any number of `FFN` or `LN` layers and
using different kinds of `attention` by specifying a list of `ConfigDict`
named `attn_cfgs`. It is worth mentioning that it supports `prenorm`
when you specify `norm` as the first element of `operation_order`.
More details about `prenorm`: `On Layer Normalization in the
Transformer Architecture `_ .
Args:
attn_cfgs (list[`mmcv.ConfigDict`] | obj:`mmcv.ConfigDict` | None )):
Configs for `self_attention` or `cross_attention` modules,
The order of the configs in the list should be consistent with
corresponding attentions in operation_order.
If it is a dict, all of the attention modules in operation_order
will be built with this config. Default: None.
ffn_cfgs (list[`mmcv.ConfigDict`] | obj:`mmcv.ConfigDict` | None )):
Configs for FFN, The order of the configs in the list should be
consistent with corresponding ffn in operation_order.
If it is a dict, all of the attention modules in operation_order
will be built with this config.
operation_order (tuple[str]): The execution order of operation
in transformer. Such as ('self_attn', 'norm', 'ffn', 'norm').
Support `prenorm` when you specify the first element as `norm`.
Default: None.
norm_cfg (dict): Config dict for normalization layer.
Default: dict(type='LN').
init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization.
Default: None.
batch_first (bool): Key, Query and Value are shape
of (batch, n, embed_dim)
or (n, batch, embed_dim). Default to False.
"""
def __init__(self,
attn_cfgs=None,
ffn_cfgs=dict(
type='FFN',
embed_dims=256,
feedforward_channels=1024,
num_fcs=2,
ffn_drop=0.,
act_cfg=dict(type='ReLU', inplace=True),
),
operation_order=None,
norm_cfg=dict(type='LN'),
init_cfg=None,
batch_first=False,
**kwargs):
deprecated_args = dict(
feedforward_channels='feedforward_channels',
ffn_dropout='ffn_drop',
ffn_num_fcs='num_fcs')
for ori_name, new_name in deprecated_args.items():
if ori_name in kwargs:
warnings.warn(
f'The arguments `{ori_name}` in BaseTransformerLayer '
f'has been deprecated, now you should set `{new_name}` '
f'and other FFN related arguments '
f'to a dict named `ffn_cfgs`. ', DeprecationWarning)
ffn_cfgs[new_name] = kwargs[ori_name]
super().__init__(init_cfg)
self.batch_first = batch_first
assert set(operation_order) & {
'self_attn', 'norm', 'ffn', 'cross_attn'} == \
set(operation_order), f'The operation_order of' \
f' {self.__class__.__name__} should only ' \
f'contain operation types in ' \
f"{['self_attn', 'norm', 'ffn', 'cross_attn']}"
num_attn = operation_order.count('self_attn') + operation_order.count(
'cross_attn')
if isinstance(attn_cfgs, dict):
attn_cfgs = [copy.deepcopy(attn_cfgs) for _ in range(num_attn)]
else:
assert num_attn == len(attn_cfgs), f'The length ' \
f'of attn_cfgs {len(attn_cfgs)} is ' \
f'not consistent with the number of attentions ' \
f'in operation_order {operation_order}.'
self.num_attn = num_attn
self.operation_order = operation_order
self.norm_cfg = norm_cfg
self.pre_norm = operation_order[0] == 'norm'
self.attentions = ModuleList()
index = 0
for operation_name in operation_order:
if operation_name in ['self_attn', 'cross_attn']:
if 'batch_first' in attn_cfgs[index]:
assert self.batch_first == attn_cfgs[index]['batch_first']
else:
attn_cfgs[index]['batch_first'] = self.batch_first
attention = build_attention(attn_cfgs[index])
# Some custom attentions used as `self_attn`
# or `cross_attn` can have different behavior.
attention.operation_name = operation_name
self.attentions.append(attention)
index += 1
self.embed_dims = self.attentions[0].embed_dims
self.ffns = ModuleList()
num_ffns = operation_order.count('ffn')
if isinstance(ffn_cfgs, dict):
ffn_cfgs = ConfigDict(ffn_cfgs)
if isinstance(ffn_cfgs, dict):
ffn_cfgs = [copy.deepcopy(ffn_cfgs) for _ in range(num_ffns)]
assert len(ffn_cfgs) == num_ffns
for ffn_index in range(num_ffns):
if 'embed_dims' not in ffn_cfgs[ffn_index]:
ffn_cfgs[ffn_index]['embed_dims'] = self.embed_dims
else:
assert ffn_cfgs[ffn_index]['embed_dims'] == self.embed_dims
self.ffns.append(
build_feedforward_network(ffn_cfgs[ffn_index],
dict(type='FFN')))
self.norms = ModuleList()
num_norms = operation_order.count('norm')
for _ in range(num_norms):
self.norms.append(build_norm_layer(norm_cfg, self.embed_dims)[1])
def forward(self,
query,
key=None,
value=None,
query_pos=None,
key_pos=None,
attn_masks=None,
query_key_padding_mask=None,
key_padding_mask=None,
**kwargs):
"""Forward function for `TransformerDecoderLayer`.
**kwargs contains some specific arguments of attentions.
Args:
query (Tensor): The input query with shape
[num_queries, bs, embed_dims] if
self.batch_first is False, else
[bs, num_queries, embed_dims].
key (Tensor): The key tensor with shape [num_keys, bs,
embed_dims] if self.batch_first is False, else
[bs, num_keys, embed_dims] .
value (Tensor): The value tensor with same shape as `key`.
query_pos (Tensor): The positional encoding for `query`.
Default: None.
key_pos (Tensor): The positional encoding for `key`.
Default: None.
attn_masks (List[Tensor] | None): 2D Tensor used in
calculation of corresponding attention. The length of
it should equal to the number of `attention` in
`operation_order`. Default: None.
query_key_padding_mask (Tensor): ByteTensor for `query`, with
shape [bs, num_queries]. Only used in `self_attn` layer.
Defaults to None.
key_padding_mask (Tensor): ByteTensor for `key`, with
shape [bs, num_keys]. Default: None.
Returns:
Tensor: forwarded results with shape [num_queries, bs, embed_dims].
"""
norm_index = 0
attn_index = 0
ffn_index = 0
identity = query
if attn_masks is None:
attn_masks = [None for _ in range(self.num_attn)]
elif isinstance(attn_masks, torch.Tensor):
attn_masks = [
copy.deepcopy(attn_masks) for _ in range(self.num_attn)
]
warnings.warn(f'Use same attn_mask in all attentions in '
f'{self.__class__.__name__} ')
else:
assert len(attn_masks) == self.num_attn, f'The length of ' \
f'attn_masks {len(attn_masks)} must be equal ' \
f'to the number of attention in ' \
f'operation_order {self.num_attn}'
for layer in self.operation_order:
if layer == 'self_attn':
temp_key = temp_value = query
query = self.attentions[attn_index](
query,
temp_key,
temp_value,
identity if self.pre_norm else None,
query_pos=query_pos,
key_pos=query_pos,
attn_mask=attn_masks[attn_index],
key_padding_mask=query_key_padding_mask,
**kwargs)
attn_index += 1
identity = query
elif layer == 'norm':
query = self.norms[norm_index](query)
norm_index += 1
elif layer == 'cross_attn':
query = self.attentions[attn_index](
query,
key,
value,
identity if self.pre_norm else None,
query_pos=query_pos,
key_pos=key_pos,
attn_mask=attn_masks[attn_index],
key_padding_mask=key_padding_mask,
**kwargs)
attn_index += 1
identity = query
elif layer == 'ffn':
query = self.ffns[ffn_index](
query, identity if self.pre_norm else None)
ffn_index += 1
return query
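# Illustrative config sketch (not part of the original file): a post-norm,
# decoder-style layer with one self-attention and one cross-attention. The
# numbers of attentions/FFNs/norms must match `operation_order`; values below
# are example assumptions.
#
#   layer_cfg = dict(
#       type='BaseTransformerLayer',
#       attn_cfgs=dict(type='MultiheadAttention', embed_dims=256, num_heads=8),
#       ffn_cfgs=dict(type='FFN', embed_dims=256, feedforward_channels=1024),
#       operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
#                        'ffn', 'norm'))
#   # Putting 'norm' first instead would enable the pre-norm variant.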
@TRANSFORMER_LAYER_SEQUENCE.register_module()
class TransformerLayerSequence(BaseModule):
"""Base class for TransformerEncoder and TransformerDecoder in vision
transformer.
As the base class of Encoder and Decoder in vision transformers, it
supports customization such as specifying different kinds
of `transformer_layer` in `transformer_coder`.
Args:
transformerlayers (list[obj:`mmcv.ConfigDict`] |
obj:`mmcv.ConfigDict`): Config of transformerlayer
in TransformerCoder. If it is obj:`mmcv.ConfigDict`,
it would be repeated `num_layer` times to a
list[`mmcv.ConfigDict`]. Default: None.
num_layers (int): The number of `TransformerLayer`. Default: None.
init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization.
Default: None.
"""
def __init__(self, transformerlayers=None, num_layers=None, init_cfg=None):
super().__init__(init_cfg)
if isinstance(transformerlayers, dict):
transformerlayers = [
copy.deepcopy(transformerlayers) for _ in range(num_layers)
]
else:
assert isinstance(transformerlayers, list) and \
len(transformerlayers) == num_layers
self.num_layers = num_layers
self.layers = ModuleList()
for i in range(num_layers):
self.layers.append(build_transformer_layer(transformerlayers[i]))
self.embed_dims = self.layers[0].embed_dims
self.pre_norm = self.layers[0].pre_norm
def forward(self,
query,
key,
value,
query_pos=None,
key_pos=None,
attn_masks=None,
query_key_padding_mask=None,
key_padding_mask=None,
**kwargs):
"""Forward function for `TransformerCoder`.
Args:
query (Tensor): Input query with shape
`(num_queries, bs, embed_dims)`.
key (Tensor): The key tensor with shape
`(num_keys, bs, embed_dims)`.
value (Tensor): The value tensor with shape
`(num_keys, bs, embed_dims)`.
query_pos (Tensor): The positional encoding for `query`.
Default: None.
key_pos (Tensor): The positional encoding for `key`.
Default: None.
attn_masks (List[Tensor], optional): Each element is 2D Tensor
which is used in calculation of corresponding attention in
operation_order. Default: None.
query_key_padding_mask (Tensor): ByteTensor for `query`, with
shape [bs, num_queries]. Only used in self-attention
Default: None.
key_padding_mask (Tensor): ByteTensor for `key`, with
shape [bs, num_keys]. Default: None.
Returns:
Tensor: results with shape [num_queries, bs, embed_dims].
"""
for layer in self.layers:
query = layer(
query,
key,
value,
query_pos=query_pos,
key_pos=key_pos,
attn_masks=attn_masks,
query_key_padding_mask=query_key_padding_mask,
key_padding_mask=key_padding_mask,
**kwargs)
return query
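# Illustrative config sketch (not part of the original file): when a single
# ConfigDict is given, it is deep-copied `num_layers` times, so the stack
# below builds six identical layers. Values are example assumptions.
#
#   coder_cfg = dict(
#       type='TransformerLayerSequence',
#       num_layers=6,
#       transformerlayers=dict(
#           type='BaseTransformerLayer',
#           attn_cfgs=dict(type='MultiheadAttention', embed_dims=256,
#                          num_heads=8),
#           operation_order=('self_attn', 'norm', 'ffn', 'norm')))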
================================================
FILE: mmdet3d/models/utils/transformerdecoder.py
================================================
import copy
import warnings
import numpy as np
import torch
from mmcv.cnn import ConvModule, build_conv_layer, kaiming_init
from mmcv.runner import force_fp32
from torch import nn
import torch.nn.functional as F
from torch.nn.parameter import Parameter
from torch.nn import Linear
from torch.nn.init import xavier_uniform_, xavier_normal_, constant_
from mmdet3d.core import (circle_nms, draw_heatmap_gaussian, gaussian_radius,
xywhr2xyxyr, limit_period, PseudoSampler)
from mmdet3d.core.bbox.structures import rotation_3d_in_axis
from mmdet3d.core import Box3DMode, LiDARInstance3DBoxes
from mmdet3d.models import builder
from mmdet3d.models.builder import HEADS, build_loss
from mmdet3d.models.utils import clip_sigmoid
from mmdet3d.models.fusion_layers import apply_3d_transformation
from mmdet3d.ops.iou3d.iou3d_utils import nms_gpu
from mmdet.core import build_bbox_coder, multi_apply, build_assigner, build_sampler, AssignResult
from mmdet3d.ops.roiaware_pool3d import points_in_boxes_batch
class PositionEmbeddingLearnedLN(nn.Module):
"""
Absolute pos embedding, learned.
"""
def __init__(self, input_channel, num_pos_feats=288):
super().__init__()
self.position_embedding_head = nn.Sequential(
nn.Linear(input_channel, num_pos_feats),
nn.ReLU(inplace=True),
nn.Linear(num_pos_feats, num_pos_feats),
nn.LayerNorm(num_pos_feats),
)
def forward(self, xyz):
position_embedding = self.position_embedding_head(xyz)
position_embedding = position_embedding.transpose(1, 2).contiguous()
return position_embedding
class PositionEmbeddingLearned(nn.Module):
"""
Absolute pos embedding, learned.
"""
def __init__(self, input_channel, num_pos_feats=288):
super().__init__()
self.position_embedding_head = nn.Sequential(
nn.Conv1d(input_channel, num_pos_feats, kernel_size=1),
nn.BatchNorm1d(num_pos_feats),
nn.ReLU(inplace=True),
nn.Conv1d(num_pos_feats, num_pos_feats, kernel_size=1))
def forward(self, xyz):
xyz = xyz.transpose(1, 2).contiguous()
position_embedding = self.position_embedding_head(xyz)
return position_embedding
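# Shape sketch (not part of the original file), with example sizes: the module
# consumes coordinates in (B, P, input_channel) layout and returns embeddings
# in (B, num_pos_feats, P) layout, matching the channel-first (B, C, P)
# feature tensors used by TransformerDecoderLayer below.
#
#   pos_embed = PositionEmbeddingLearned(input_channel=2, num_pos_feats=128)
#   xyz = torch.rand(4, 200, 2)   # (B, P, 2), e.g. BEV query positions
#   emb = pos_embed(xyz)          # (4, 128, 200)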
class PositionEmbeddingLearnedwoNorm(nn.Module):
"""
Absolute pos embedding, learned.
"""
def __init__(self, input_channel, num_pos_feats=288):
super().__init__()
self.position_embedding_head = nn.Sequential(
nn.Conv1d(input_channel, num_pos_feats, kernel_size=1),
# nn.BatchNorm1d(num_pos_feats),
nn.ReLU(inplace=True),
nn.Conv1d(num_pos_feats, num_pos_feats, kernel_size=1))
def forward(self, xyz):
xyz = xyz.transpose(1, 2).contiguous()
position_embedding = self.position_embedding_head(xyz)
return position_embedding
class PositionEmbeddingLearnedMulti(nn.Module):
"""
Absolute pos embedding, learned.
"""
def __init__(self, input_channel, num_pos_feats=288, pos_num=2):
super().__init__()
self.position_embedding_heads = nn.ModuleList()
self.pos_num = pos_num
for i in range(pos_num):
self.position_embedding_heads.append(nn.Sequential(
nn.Conv1d(input_channel, num_pos_feats, kernel_size=1),
nn.BatchNorm1d(num_pos_feats),
nn.ReLU(inplace=True),
nn.Conv1d(num_pos_feats, num_pos_feats, kernel_size=1)
))
def forward(self, xyzs):
output = None
for i in range(self.pos_num):
xyz = xyzs[i].transpose(1, 2).contiguous()
position_embedding = self.position_embedding_heads[i](xyz)
if output is None:
output = position_embedding
else:
output = output + position_embedding
return output
class PositionEmbeddingLearnedMultiInput(nn.Module):
def __init__(self, input_channels, num_pos_feats=288):
super().__init__()
self.position_embedding_heads = nn.ModuleList()
self.pos_num = len(input_channels)
for i in range(self.pos_num):
pos_embed = PositionEmbeddingLearned(input_channels[i], num_pos_feats)
self.position_embedding_heads.append(pos_embed)
def forward(self, xyzs):
output = None
assert len(xyzs) == self.pos_num
for i in range(self.pos_num):
if output is None:
output = self.position_embedding_heads[i](xyzs[i])
else:
output = output + self.position_embedding_heads[i](xyzs[i])
return output
class TransformerDecoderLayer(nn.Module):
def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1, activation="relu",
self_posembed=None, cross_posembed=None, cross_only=False):
super().__init__()
self.cross_only = cross_only
if not self.cross_only:
self.self_attn = MultiheadAttention(d_model, nhead, dropout=dropout)
self.multihead_attn = MultiheadAttention(d_model, nhead, dropout=dropout)
# Implementation of Feedforward model
self.linear1 = nn.Linear(d_model, dim_feedforward)
self.dropout = nn.Dropout(dropout)
self.linear2 = nn.Linear(dim_feedforward, d_model)
self.norm1 = nn.LayerNorm(d_model)
self.norm2 = nn.LayerNorm(d_model)
self.norm3 = nn.LayerNorm(d_model)
self.dropout1 = nn.Dropout(dropout)
self.dropout2 = nn.Dropout(dropout)
self.dropout3 = nn.Dropout(dropout)
def _get_activation_fn(activation):
"""Return an activation function given a string"""
if activation == "relu":
return F.relu
if activation == "gelu":
return F.gelu
if activation == "glu":
return F.glu
raise RuntimeError(F"activation should be relu/gelu, not {activation}.")
self.activation = _get_activation_fn(activation)
self.self_posembed = self_posembed
self.cross_posembed = cross_posembed
def with_pos_embed(self, tensor, pos_embed):
return tensor if pos_embed is None else tensor + pos_embed
def forward(self, query, key, query_pos, key_pos, attn_mask=None, need_weights=False):
"""
:param query: B C Pq
:param key: B C Pk
:param query_pos: B Pq 3/6
:param key_pos: B Pk 3/6
:return:
"""
# NxCxP to PxNxC
if self.self_posembed is not None:
query_pos_embed = self.self_posembed(query_pos).permute(2, 0, 1)
else:
query_pos_embed = None
if self.cross_posembed is not None:
key_pos_embed = self.cross_posembed(key_pos).permute(2, 0, 1)
else:
key_pos_embed = None
query = query.permute(2, 0, 1)
key = key.permute(2, 0, 1)
if not self.cross_only:
q = k = v = self.with_pos_embed(query, query_pos_embed)
query2 = self.self_attn(q, k, value=v)[0]
query = query + self.dropout1(query2)
query = self.norm1(query)
query2, weights = self.multihead_attn(query=self.with_pos_embed(query, query_pos_embed),
key=self.with_pos_embed(key, key_pos_embed), value=self.with_pos_embed(key, key_pos_embed),
attn_mask=attn_mask)
query = query + self.dropout2(query2)
query = self.norm2(query)
query2 = self.linear2(self.dropout(self.activation(self.linear1(query))))
query = query + self.dropout3(query2)
query = self.norm3(query)
# PxNxC to NxCxP
query = query.permute(1, 2, 0)
if need_weights:
return query, weights
else:
return query
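# Illustrative usage sketch (not part of the original file): feature tensors
# are channel-first (B, C, P) and positions are (B, P, d); all sizes below are
# example assumptions.
#
#   layer = TransformerDecoderLayer(
#       d_model=128, nhead=8,
#       self_posembed=PositionEmbeddingLearned(2, 128),
#       cross_posembed=PositionEmbeddingLearned(2, 128))
#   query = torch.rand(4, 128, 200)     # B, C, Pq
#   key = torch.rand(4, 128, 5000)      # B, C, Pk
#   query_pos = torch.rand(4, 200, 2)   # B, Pq, 2
#   key_pos = torch.rand(4, 5000, 2)    # B, Pk, 2
#   out = layer(query, key, query_pos, key_pos)  # (4, 128, 200)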
class MultiheadAttention(nn.Module):
r"""Allows the model to jointly attend to information
from different representation subspaces.
See reference: Attention Is All You Need
.. math::
\text{MultiHead}(Q, K, V) = \text{Concat}(head_1,\dots,head_h)W^O
\text{where} head_i = \text{Attention}(QW_i^Q, KW_i^K, VW_i^V)
Args:
embed_dim: total dimension of the model.
num_heads: parallel attention heads.
dropout: a Dropout layer on attn_output_weights. Default: 0.0.
bias: add bias as module parameter. Default: True.
add_bias_kv: add bias to the key and value sequences at dim=0.
add_zero_attn: add a new batch of zeros to the key and
value sequences at dim=1.
kdim: total number of features in key. Default: None.
vdim: total number of features in value. Default: None.
Note: if kdim and vdim are None, they will be set to embed_dim such that
query, key, and value have the same number of features.
Examples::
>>> multihead_attn = nn.MultiheadAttention(embed_dim, num_heads)
>>> attn_output, attn_output_weights = multihead_attn(query, key, value)
"""
def __init__(self, embed_dim, num_heads, dropout=0., bias=True, add_bias_kv=False, add_zero_attn=False, kdim=None,
vdim=None):
super(MultiheadAttention, self).__init__()
self.embed_dim = embed_dim
self.kdim = kdim if kdim is not None else embed_dim
self.vdim = vdim if vdim is not None else embed_dim
self._qkv_same_embed_dim = self.kdim == embed_dim and self.vdim == embed_dim
self.num_heads = num_heads
self.dropout = dropout
self.head_dim = embed_dim // num_heads
assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads"
self.in_proj_weight = Parameter(torch.empty(3 * embed_dim, embed_dim))
if self._qkv_same_embed_dim is False:
self.q_proj_weight = Parameter(torch.Tensor(embed_dim, embed_dim))
self.k_proj_weight = Parameter(torch.Tensor(embed_dim, self.kdim))
self.v_proj_weight = Parameter(torch.Tensor(embed_dim, self.vdim))
if bias:
self.in_proj_bias = Parameter(torch.empty(3 * embed_dim))
else:
self.register_parameter('in_proj_bias', None)
self.out_proj = Linear(embed_dim, embed_dim, bias=bias)
if add_bias_kv:
self.bias_k = Parameter(torch.empty(1, 1, embed_dim))
self.bias_v = Parameter(torch.empty(1, 1, embed_dim))
else:
self.bias_k = self.bias_v = None
self.add_zero_attn = add_zero_attn
self._reset_parameters()
def _reset_parameters(self):
if self._qkv_same_embed_dim:
xavier_uniform_(self.in_proj_weight)
else:
xavier_uniform_(self.q_proj_weight)
xavier_uniform_(self.k_proj_weight)
xavier_uniform_(self.v_proj_weight)
if self.in_proj_bias is not None:
constant_(self.in_proj_bias, 0.)
constant_(self.out_proj.bias, 0.)
if self.bias_k is not None:
xavier_normal_(self.bias_k)
if self.bias_v is not None:
xavier_normal_(self.bias_v)
def forward(self, query, key, value, key_padding_mask=None, need_weights=True, attn_mask=None):
r"""
Args:
query, key, value: map a query and a set of key-value pairs to an output.
See "Attention Is All You Need" for more details.
key_padding_mask: if provided, specified padding elements in the key will
be ignored by the attention. This is a binary mask. When the value is True,
the corresponding value on the attention layer will be filled with -inf.
need_weights: output attn_output_weights.
attn_mask: mask that prevents attention to certain positions. This is an additive mask
(i.e. the values will be added to the attention layer).
Shape:
- Inputs:
- query: :math:`(L, N, E)` where L is the target sequence length, N is the batch size, E is
the embedding dimension.
- key: :math:`(S, N, E)`, where S is the source sequence length, N is the batch size, E is
the embedding dimension.
- value: :math:`(S, N, E)` where S is the source sequence length, N is the batch size, E is
the embedding dimension.
- key_padding_mask: :math:`(N, S)`, ByteTensor, where N is the batch size, S is the source sequence length.
- attn_mask: :math:`(L, S)` where L is the target sequence length, S is the source sequence length.
- Outputs:
- attn_output: :math:`(L, N, E)` where L is the target sequence length, N is the batch size,
E is the embedding dimension.
- attn_output_weights: :math:`(N, L, S)` where N is the batch size,
L is the target sequence length, S is the source sequence length.
"""
if hasattr(self, '_qkv_same_embed_dim') and self._qkv_same_embed_dim is False:
return multi_head_attention_forward(
query, key, value, self.embed_dim, self.num_heads,
self.in_proj_weight, self.in_proj_bias,
self.bias_k, self.bias_v, self.add_zero_attn,
self.dropout, self.out_proj.weight, self.out_proj.bias,
training=self.training,
key_padding_mask=key_padding_mask, need_weights=need_weights,
attn_mask=attn_mask, use_separate_proj_weight=True,
q_proj_weight=self.q_proj_weight, k_proj_weight=self.k_proj_weight,
v_proj_weight=self.v_proj_weight)
else:
if not hasattr(self, '_qkv_same_embed_dim'):
warnings.warn('A new version of MultiheadAttention module has been implemented. \
Please re-train your model with the new module',
UserWarning)
return multi_head_attention_forward(
query, key, value, self.embed_dim, self.num_heads,
self.in_proj_weight, self.in_proj_bias,
self.bias_k, self.bias_v, self.add_zero_attn,
self.dropout, self.out_proj.weight, self.out_proj.bias,
training=self.training,
key_padding_mask=key_padding_mask, need_weights=need_weights,
attn_mask=attn_mask)
def multi_head_attention_forward(query, # type: Tensor
key, # type: Tensor
value, # type: Tensor
embed_dim_to_check, # type: int
num_heads, # type: int
in_proj_weight, # type: Tensor
in_proj_bias, # type: Tensor
bias_k, # type: Optional[Tensor]
bias_v, # type: Optional[Tensor]
add_zero_attn, # type: bool
dropout_p, # type: float
out_proj_weight, # type: Tensor
out_proj_bias, # type: Tensor
training=True, # type: bool
key_padding_mask=None, # type: Optional[Tensor]
need_weights=True, # type: bool
attn_mask=None, # type: Optional[Tensor]
use_separate_proj_weight=False, # type: bool
q_proj_weight=None, # type: Optional[Tensor]
k_proj_weight=None, # type: Optional[Tensor]
v_proj_weight=None, # type: Optional[Tensor]
static_k=None, # type: Optional[Tensor]
static_v=None, # type: Optional[Tensor]
):
# type: (...) -> Tuple[Tensor, Optional[Tensor]]
r"""
Args:
query, key, value: map a query and a set of key-value pairs to an output.
See "Attention Is All You Need" for more details.
embed_dim_to_check: total dimension of the model.
num_heads: parallel attention heads.
in_proj_weight, in_proj_bias: input projection weight and bias.
bias_k, bias_v: bias of the key and value sequences to be added at dim=0.
add_zero_attn: add a new batch of zeros to the key and
value sequences at dim=1.
dropout_p: probability of an element to be zeroed.
out_proj_weight, out_proj_bias: the output projection weight and bias.
training: apply dropout if is ``True``.
key_padding_mask: if provided, specified padding elements in the key will
be ignored by the attention. This is a binary mask. When the value is True,
the corresponding value on the attention layer will be filled with -inf.
need_weights: output attn_output_weights.
attn_mask: mask that prevents attention to certain positions. This is an additive mask
(i.e. the values will be added to the attention layer).
use_separate_proj_weight: the function accepts the proj. weights for query, key,
and value in different forms. If false, in_proj_weight will be used, which is
a combination of q_proj_weight, k_proj_weight, v_proj_weight.
q_proj_weight, k_proj_weight, v_proj_weight, in_proj_bias: input projection weight and bias.
static_k, static_v: static key and value used for attention operators.
Shape:
Inputs:
- query: :math:`(L, N, E)` where L is the target sequence length, N is the batch size, E is
the embedding dimension.
- key: :math:`(S, N, E)`, where S is the source sequence length, N is the batch size, E is
the embedding dimension.
- value: :math:`(S, N, E)` where S is the source sequence length, N is the batch size, E is
the embedding dimension.
- key_padding_mask: :math:`(N, S)`, ByteTensor, where N is the batch size, S is the source sequence length.
- attn_mask: :math:`(L, S)` where L is the target sequence length, S is the source sequence length.
- static_k: :math:`(N*num_heads, S, E/num_heads)`, where S is the source sequence length,
N is the batch size, E is the embedding dimension. E/num_heads is the head dimension.
- static_v: :math:`(N*num_heads, S, E/num_heads)`, where S is the source sequence length,
N is the batch size, E is the embedding dimension. E/num_heads is the head dimension.
Outputs:
- attn_output: :math:`(L, N, E)` where L is the target sequence length, N is the batch size,
E is the embedding dimension.
- attn_output_weights: :math:`(N, L, S)` where N is the batch size,
L is the target sequence length, S is the source sequence length.
"""
qkv_same = torch.equal(query, key) and torch.equal(key, value)
kv_same = torch.equal(key, value)
tgt_len, bsz, embed_dim = query.size()
assert embed_dim == embed_dim_to_check
assert list(query.size()) == [tgt_len, bsz, embed_dim]
assert key.size() == value.size()
head_dim = embed_dim // num_heads
assert head_dim * num_heads == embed_dim, "embed_dim must be divisible by num_heads"
scaling = float(head_dim) ** -0.5
if use_separate_proj_weight is not True:
if qkv_same:
# self-attention
q, k, v = F.linear(query, in_proj_weight, in_proj_bias).chunk(3, dim=-1)
elif kv_same:
# encoder-decoder attention
# This is inline in_proj function with in_proj_weight and in_proj_bias
_b = in_proj_bias
_start = 0
_end = embed_dim
_w = in_proj_weight[_start:_end, :]
if _b is not None:
_b = _b[_start:_end]
q = F.linear(query, _w, _b)
if key is None:
assert value is None
k = None
v = None
else:
# This is inline in_proj function with in_proj_weight and in_proj_bias
_b = in_proj_bias
_start = embed_dim
_end = None
_w = in_proj_weight[_start:, :]
if _b is not None:
_b = _b[_start:]
k, v = F.linear(key, _w, _b).chunk(2, dim=-1)
else:
# This is inline in_proj function with in_proj_weight and in_proj_bias
_b = in_proj_bias
_start = 0
_end = embed_dim
_w = in_proj_weight[_start:_end, :]
if _b is not None:
_b = _b[_start:_end]
q = F.linear(query, _w, _b)
# This is inline in_proj function with in_proj_weight and in_proj_bias
_b = in_proj_bias
_start = embed_dim
_end = embed_dim * 2
_w = in_proj_weight[_start:_end, :]
if _b is not None:
_b = _b[_start:_end]
k = F.linear(key, _w, _b)
# This is inline in_proj function with in_proj_weight and in_proj_bias
_b = in_proj_bias
_start = embed_dim * 2
_end = None
_w = in_proj_weight[_start:, :]
if _b is not None:
_b = _b[_start:]
v = F.linear(value, _w, _b)
else:
q_proj_weight_non_opt = torch.jit._unwrap_optional(q_proj_weight)
len1, len2 = q_proj_weight_non_opt.size()
assert len1 == embed_dim and len2 == query.size(-1)
k_proj_weight_non_opt = torch.jit._unwrap_optional(k_proj_weight)
len1, len2 = k_proj_weight_non_opt.size()
assert len1 == embed_dim and len2 == key.size(-1)
v_proj_weight_non_opt = torch.jit._unwrap_optional(v_proj_weight)
len1, len2 = v_proj_weight_non_opt.size()
assert len1 == embed_dim and len2 == value.size(-1)
if in_proj_bias is not None:
q = F.linear(query, q_proj_weight_non_opt, in_proj_bias[0:embed_dim])
k = F.linear(key, k_proj_weight_non_opt, in_proj_bias[embed_dim:(embed_dim * 2)])
v = F.linear(value, v_proj_weight_non_opt, in_proj_bias[(embed_dim * 2):])
else:
q = F.linear(query, q_proj_weight_non_opt, in_proj_bias)
k = F.linear(key, k_proj_weight_non_opt, in_proj_bias)
v = F.linear(value, v_proj_weight_non_opt, in_proj_bias)
q = q * scaling
if bias_k is not None and bias_v is not None:
if static_k is None and static_v is None:
k = torch.cat([k, bias_k.repeat(1, bsz, 1)])
v = torch.cat([v, bias_v.repeat(1, bsz, 1)])
if attn_mask is not None:
attn_mask = torch.cat([attn_mask,
torch.zeros((attn_mask.size(0), 1),
dtype=attn_mask.dtype,
device=attn_mask.device)], dim=1)
if key_padding_mask is not None:
key_padding_mask = torch.cat(
[key_padding_mask, torch.zeros((key_padding_mask.size(0), 1),
dtype=key_padding_mask.dtype,
device=key_padding_mask.device)], dim=1)
else:
assert static_k is None, "bias cannot be added to static key."
assert static_v is None, "bias cannot be added to static value."
else:
assert bias_k is None
assert bias_v is None
q = q.contiguous().view(tgt_len, bsz * num_heads, head_dim).transpose(0, 1)
if k is not None:
k = k.contiguous().view(-1, bsz * num_heads, head_dim).transpose(0, 1)
if v is not None:
v = v.contiguous().view(-1, bsz * num_heads, head_dim).transpose(0, 1)
if static_k is not None:
assert static_k.size(0) == bsz * num_heads
assert static_k.size(2) == head_dim
k = static_k
if static_v is not None:
assert static_v.size(0) == bsz * num_heads
assert static_v.size(2) == head_dim
v = static_v
src_len = k.size(1)
if key_padding_mask is not None:
assert key_padding_mask.size(0) == bsz
assert key_padding_mask.size(1) == src_len
if add_zero_attn:
src_len += 1
k = torch.cat([k, torch.zeros((k.size(0), 1) + k.size()[2:], dtype=k.dtype, device=k.device)], dim=1)
v = torch.cat([v, torch.zeros((v.size(0), 1) + v.size()[2:], dtype=v.dtype, device=v.device)], dim=1)
if attn_mask is not None:
if len(attn_mask.shape) == 2:
attn_mask = torch.cat([attn_mask, torch.zeros((attn_mask.size(0), 1),
dtype=attn_mask.dtype,
device=attn_mask.device)], dim=1)
else:
attn_mask = torch.cat([attn_mask, torch.zeros((attn_mask.size(0), attn_mask.size(1), 1),
dtype=attn_mask.dtype,
device=attn_mask.device)], dim=2)
if key_padding_mask is not None:
key_padding_mask = torch.cat(
[key_padding_mask, torch.zeros((key_padding_mask.size(0), 1),
dtype=key_padding_mask.dtype,
device=key_padding_mask.device)], dim=1)
attn_output_weights = torch.bmm(q, k.transpose(1, 2))
assert list(attn_output_weights.size()) == [bsz * num_heads, tgt_len, src_len]
if attn_mask is not None:
if len(attn_mask.shape) == 2:
attn_mask = attn_mask.unsqueeze(0)
else:
attn_mask = attn_mask.unsqueeze(1).repeat(1, num_heads, 1, 1)
attn_mask = attn_mask.reshape(attn_mask.size(0)*num_heads, attn_mask.size(2), attn_mask.size(3))
attn_output_weights += attn_mask
if key_padding_mask is not None:
attn_output_weights = attn_output_weights.view(bsz, num_heads, tgt_len, src_len)
attn_output_weights = attn_output_weights.masked_fill(
key_padding_mask.unsqueeze(1).unsqueeze(2),
float('-inf'),
)
attn_output_weights = attn_output_weights.view(bsz * num_heads, tgt_len, src_len)
attn_output_weights = F.softmax(
attn_output_weights, dim=-1)
attn_output_weights = F.dropout(attn_output_weights, p=dropout_p, training=training)
attn_output = torch.bmm(attn_output_weights, v)
assert list(attn_output.size()) == [bsz * num_heads, tgt_len, head_dim]
attn_output = attn_output.transpose(0, 1).contiguous().view(tgt_len, bsz, embed_dim)
attn_output = F.linear(attn_output, out_proj_weight, out_proj_bias)
if need_weights:
# average attention weights over heads
attn_output_weights = attn_output_weights.view(bsz, num_heads, tgt_len, src_len)
return attn_output, attn_output_weights.sum(dim=1) / num_heads
else:
return attn_output, None
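# Masking sketch (not part of the original file): `attn_mask` is additive
# (e.g. 0 / -inf values summed onto the attention logits above), while
# `key_padding_mask` is boolean and blanks whole key positions per sample.
# Example shapes are assumptions.
#
#   L, S, N = 10, 20, 2
#   attn_mask = torch.zeros(L, S)              # allow everything
#   attn_mask[:, 15:] = float('-inf')          # forbid the last 5 keys
#   key_padding_mask = torch.zeros(N, S, dtype=torch.bool)
#   key_padding_mask[0, 18:] = True            # pad positions of sample 0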
================================================
FILE: mmdet3d/models/voxel_encoders/__init__.py
================================================
from .pillar_encoder import PillarFeatureNet
from .voxel_encoder import DynamicSimpleVFE, DynamicVFE, HardSimpleVFE, HardVFE
__all__ = [
'PillarFeatureNet', 'HardVFE', 'DynamicVFE', 'HardSimpleVFE',
'DynamicSimpleVFE'
]
================================================
FILE: mmdet3d/models/voxel_encoders/pillar_encoder.py
================================================
import torch
from mmcv.cnn import build_norm_layer
from mmcv.runner import force_fp32
from torch import nn
from mmdet3d.ops import DynamicScatter
from ..registry import VOXEL_ENCODERS
from .utils import PFNLayer, get_paddings_indicator
@VOXEL_ENCODERS.register_module()
class PillarFeatureNet(nn.Module):
"""Pillar Feature Net.
The network prepares the pillar features and performs forward pass
through PFNLayers.
Args:
in_channels (int, optional): Number of input features,
either x, y, z or x, y, z, r. Defaults to 4.
feat_channels (tuple, optional): Number of features in each of the
N PFNLayers. Defaults to (64, ).
with_distance (bool, optional): Whether to include Euclidean distance
to points. Defaults to False.
with_cluster_center (bool, optional): Whether to append the offset of
each point to the mean (cluster center) of its pillar. Defaults to True.
with_voxel_center (bool, optional): Whether to append the offset of
each point to its pillar center. Defaults to True.
voxel_size (tuple[float], optional): Size of voxels, only utilize x
and y size. Defaults to (0.2, 0.2, 4).
point_cloud_range (tuple[float], optional): Point cloud range, only
utilizes x and y min. Defaults to (0, -40, -3, 70.4, 40, 1).
norm_cfg (dict, optional): Config dict of normalization layers.
Defaults to dict(type='BN1d', eps=1e-3, momentum=0.01).
mode (str, optional): The mode to gather point features. Options are
'max' or 'avg'. Defaults to 'max'.
legacy (bool): Whether to keep the original (legacy) behavior, which
computes the pillar-center offsets in place on the input features;
if False, the offsets are computed in a new tensor. Defaults to True.
"""
def __init__(self,
in_channels=4,
feat_channels=(64, ),
with_distance=False,
with_cluster_center=True,
with_voxel_center=True,
voxel_size=(0.2, 0.2, 4),
point_cloud_range=(0, -40, -3, 70.4, 40, 1),
norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.01),
mode='max',
legacy=True):
super(PillarFeatureNet, self).__init__()
assert len(feat_channels) > 0
self.legacy = legacy
if with_cluster_center:
in_channels += 3
if with_voxel_center:
in_channels += 2
if with_distance:
in_channels += 1
self._with_distance = with_distance
self._with_cluster_center = with_cluster_center
self._with_voxel_center = with_voxel_center
self.fp16_enabled = False
# Create PillarFeatureNet layers
self.in_channels = in_channels
feat_channels = [in_channels] + list(feat_channels)
pfn_layers = []
for i in range(len(feat_channels) - 1):
in_filters = feat_channels[i]
out_filters = feat_channels[i + 1]
if i < len(feat_channels) - 2:
last_layer = False
else:
last_layer = True
pfn_layers.append(
PFNLayer(
in_filters,
out_filters,
norm_cfg=norm_cfg,
last_layer=last_layer,
mode=mode))
self.pfn_layers = nn.ModuleList(pfn_layers)
# Need pillar (voxel) size and x/y offset in order to calculate offset
self.vx = voxel_size[0]
self.vy = voxel_size[1]
self.x_offset = self.vx / 2 + point_cloud_range[0]
self.y_offset = self.vy / 2 + point_cloud_range[1]
self.point_cloud_range = point_cloud_range
@force_fp32(out_fp16=True)
def forward(self, features, num_points, coors):
"""Forward function.
Args:
features (torch.Tensor): Point features or raw points in shape
(N, M, C).
num_points (torch.Tensor): Number of points in each pillar.
coors (torch.Tensor): Coordinates of each voxel.
Returns:
torch.Tensor: Features of pillars.
"""
features_ls = [features]
# Find distance of x, y, and z from cluster center
if self._with_cluster_center:
points_mean = features[:, :, :3].sum(
dim=1, keepdim=True) / num_points.type_as(features).view(
-1, 1, 1)
f_cluster = features[:, :, :3] - points_mean
features_ls.append(f_cluster)
# Find distance of x, y, and z from pillar center
dtype = features.dtype
if self._with_voxel_center:
if not self.legacy:
f_center = torch.zeros_like(features[:, :, :2])
f_center[:, :, 0] = features[:, :, 0] - (
coors[:, 3].to(dtype).unsqueeze(1) * self.vx +
self.x_offset)
f_center[:, :, 1] = features[:, :, 1] - (
coors[:, 2].to(dtype).unsqueeze(1) * self.vy +
self.y_offset)
else:
f_center = features[:, :, :2]
f_center[:, :, 0] = f_center[:, :, 0] - (
coors[:, 3].type_as(features).unsqueeze(1) * self.vx +
self.x_offset)
f_center[:, :, 1] = f_center[:, :, 1] - (
coors[:, 2].type_as(features).unsqueeze(1) * self.vy +
self.y_offset)
features_ls.append(f_center)
if self._with_distance:
points_dist = torch.norm(features[:, :, :3], 2, 2, keepdim=True)
features_ls.append(points_dist)
# Combine together feature decorations
features = torch.cat(features_ls, dim=-1)
# The feature decorations were calculated without regard to whether
# pillar was empty. Need to ensure that
# empty pillars remain set to zeros.
voxel_count = features.shape[1]
mask = get_paddings_indicator(num_points, voxel_count, axis=0)
mask = torch.unsqueeze(mask, -1).type_as(features)
features *= mask
for pfn in self.pfn_layers:
features = pfn(features, num_points)
return features.squeeze()
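# Channel bookkeeping sketch (not part of the original file): with the default
# flags, each decorated point feature is
#   raw (x, y, z, r) = 4  +  cluster offset (dx, dy, dz) = 3  +
#   pillar-center offset (dx, dy) = 2  ->  9 input channels,
# which is why `in_channels` is increased in __init__ before building the
# PFNLayers (a single Linear(9, 64) for the default feat_channels=(64, )).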
@VOXEL_ENCODERS.register_module()
class DynamicPillarFeatureNet(PillarFeatureNet):
"""Pillar Feature Net using dynamic voxelization.
The network prepares the pillar features and performs forward pass
through PFNLayers. The main difference is that it is used for
dynamic voxels, which contains different number of points inside a voxel
without limits.
Args:
in_channels (int, optional): Number of input features,
either x, y, z or x, y, z, r. Defaults to 4.
feat_channels (tuple, optional): Number of features in each of the
N PFNLayers. Defaults to (64, ).
with_distance (bool, optional): Whether to include Euclidean distance
to points. Defaults to False.
with_cluster_center (bool, optional): Whether to append the offset of
each point to the mean (cluster center) of its pillar. Defaults to True.
with_voxel_center (bool, optional): Whether to append the offset of
each point to its pillar center. Defaults to True.
voxel_size (tuple[float], optional): Size of voxels, only utilize x
and y size. Defaults to (0.2, 0.2, 4).
point_cloud_range (tuple[float], optional): Point cloud range, only
utilizes x and y min. Defaults to (0, -40, -3, 70.4, 40, 1).
norm_cfg (dict, optional): Config dict of normalization layers.
Defaults to dict(type='BN1d', eps=1e-3, momentum=0.01).
mode (str, optional): The mode to gather point features. Options are
'max' or 'avg'. Defaults to 'max'.
"""
def __init__(self,
in_channels=4,
feat_channels=(64, ),
with_distance=False,
with_cluster_center=True,
with_voxel_center=True,
voxel_size=(0.2, 0.2, 4),
point_cloud_range=(0, -40, -3, 70.4, 40, 1),
norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.01),
mode='max'):
super(DynamicPillarFeatureNet, self).__init__(
in_channels,
feat_channels,
with_distance,
with_cluster_center=with_cluster_center,
with_voxel_center=with_voxel_center,
voxel_size=voxel_size,
point_cloud_range=point_cloud_range,
norm_cfg=norm_cfg,
mode=mode)
self.fp16_enabled = False
feat_channels = [self.in_channels] + list(feat_channels)
pfn_layers = []
# TODO: currently only support one PFNLayer
for i in range(len(feat_channels) - 1):
in_filters = feat_channels[i]
out_filters = feat_channels[i + 1]
if i > 0:
in_filters *= 2
norm_name, norm_layer = build_norm_layer(norm_cfg, out_filters)
pfn_layers.append(
nn.Sequential(
nn.Linear(in_filters, out_filters, bias=False), norm_layer,
nn.ReLU(inplace=True)))
self.num_pfn = len(pfn_layers)
self.pfn_layers = nn.ModuleList(pfn_layers)
self.pfn_scatter = DynamicScatter(voxel_size, point_cloud_range,
(mode != 'max'))
self.cluster_scatter = DynamicScatter(
voxel_size, point_cloud_range, average_points=True)
def map_voxel_center_to_point(self, pts_coors, voxel_mean, voxel_coors):
"""Map the centers of voxels to its corresponding points.
Args:
pts_coors (torch.Tensor): The coordinates of each points, shape
(M, 3), where M is the number of points.
voxel_mean (torch.Tensor): The mean or aggregated features of a
voxel, shape (N, C), where N is the number of voxels.
voxel_coors (torch.Tensor): The coordinates of each voxel.
Returns:
torch.Tensor: Corresponding voxel centers of each point, shape
(M, C), where M is the number of points.
"""
# Step 1: scatter voxel into canvas
# Calculate necessary things for canvas creation
canvas_y = int(
(self.point_cloud_range[4] - self.point_cloud_range[1]) / self.vy)
canvas_x = int(
(self.point_cloud_range[3] - self.point_cloud_range[0]) / self.vx)
canvas_channel = voxel_mean.size(1)
batch_size = pts_coors[-1, 0] + 1
canvas_len = canvas_y * canvas_x * batch_size
# Create the canvas for this sample
canvas = voxel_mean.new_zeros(canvas_channel, canvas_len)
# Only include non-empty pillars
indices = (
voxel_coors[:, 0] * canvas_y * canvas_x +
voxel_coors[:, 2] * canvas_x + voxel_coors[:, 3])
# Scatter the blob back to the canvas
canvas[:, indices.long()] = voxel_mean.t()
# Step 2: get voxel mean for each point
voxel_index = (
pts_coors[:, 0] * canvas_y * canvas_x +
pts_coors[:, 2] * canvas_x + pts_coors[:, 3])
center_per_point = canvas[:, voxel_index.long()].t()
return center_per_point
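# Index sketch (not part of the original file): each (batch, y, x) pillar
# coordinate is flattened to a single canvas column, so voxel features can be
# scattered once and then gathered per point. With example sizes
# canvas_x = 400 and canvas_y = 400, a coordinate (b=1, y=3, x=7) maps to
# 1 * 400 * 400 + 3 * 400 + 7 = 161207, used both when writing `voxel_mean`
# into the canvas and when reading it back for every point in that pillar.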
@force_fp32(out_fp16=True)
def forward(self, features, coors):
"""Forward function.
Args:
features (torch.Tensor): Point features or raw points in shape
(N, M, C).
coors (torch.Tensor): Coordinates of each voxel
Returns:
torch.Tensor: Features of pillars.
"""
features_ls = [features]
# Find distance of x, y, and z from cluster center
if self._with_cluster_center:
voxel_mean, mean_coors = self.cluster_scatter(features, coors)
points_mean = self.map_voxel_center_to_point(
coors, voxel_mean, mean_coors)
# TODO: maybe also do cluster for reflectivity
f_cluster = features[:, :3] - points_mean[:, :3]
features_ls.append(f_cluster)
# Find distance of x, y, and z from pillar center
if self._with_voxel_center:
f_center = features.new_zeros(size=(features.size(0), 2))
f_center[:, 0] = features[:, 0] - (
coors[:, 3].type_as(features) * self.vx + self.x_offset)
f_center[:, 1] = features[:, 1] - (
coors[:, 2].type_as(features) * self.vy + self.y_offset)
features_ls.append(f_center)
if self._with_distance:
points_dist = torch.norm(features[:, :3], 2, 1, keepdim=True)
features_ls.append(points_dist)
# Combine together feature decorations
features = torch.cat(features_ls, dim=-1)
for i, pfn in enumerate(self.pfn_layers):
point_feats = pfn(features)
voxel_feats, voxel_coors = self.pfn_scatter(point_feats, coors)
if i != len(self.pfn_layers) - 1:
# need to concat voxel feats if it is not the last pfn
feat_per_point = self.map_voxel_center_to_point(
coors, voxel_feats, voxel_coors)
features = torch.cat([point_feats, feat_per_point], dim=1)
return voxel_feats, voxel_coors
================================================
FILE: mmdet3d/models/voxel_encoders/utils.py
================================================
import torch
from mmcv.cnn import build_norm_layer
from mmcv.runner import auto_fp16
from torch import nn
from torch.nn import functional as F
def get_paddings_indicator(actual_num, max_num, axis=0):
"""Create boolean mask by actually number of a padded tensor.
Args:
actual_num (torch.Tensor): Actual number of points in each voxel.
max_num (int): Max number of points in each voxel
Returns:
torch.Tensor: Mask indicates which points are valid inside a voxel.
"""
actual_num = torch.unsqueeze(actual_num, axis + 1)
# tiled_actual_num: [N, M, 1]
max_num_shape = [1] * len(actual_num.shape)
max_num_shape[axis + 1] = -1
max_num = torch.arange(
max_num, dtype=torch.int, device=actual_num.device).view(max_num_shape)
# tiled_actual_num: [[3,3,3,3,3], [4,4,4,4,4], [2,2,2,2,2]]
# tiled_max_num: [[0,1,2,3,4], [0,1,2,3,4], [0,1,2,3,4]]
paddings_indicator = actual_num.int() > max_num
# paddings_indicator shape: [batch_size, max_num]
return paddings_indicator
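# Worked example (not part of the original file): with actual_num = [3, 4, 2]
# and max_num = 5, the broadcast comparison above yields
#   [[True, True, True, False, False],
#    [True, True, True, True,  False],
#    [True, True, False, False, False]]
# i.e. exactly the first actual_num[i] slots of each voxel are marked valid.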
class VFELayer(nn.Module):
"""Voxel Feature Encoder layer.
The voxel encoder is composed of a series of these layers.
This module does not support average pooling and only supports
max pooling to gather features inside a VFE.
Args:
in_channels (int): Number of input channels.
out_channels (int): Number of output channels.
norm_cfg (dict): Config dict of normalization layers
max_out (bool): Whether to aggregate the features of points inside
each voxel and only return voxel features.
cat_max (bool): Whether to concatenate the aggregated features
and pointwise features.
"""
def __init__(self,
in_channels,
out_channels,
norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.01),
max_out=True,
cat_max=True):
super(VFELayer, self).__init__()
self.fp16_enabled = False
self.cat_max = cat_max
self.max_out = max_out
# self.units = int(out_channels / 2)
self.norm = build_norm_layer(norm_cfg, out_channels)[1]
self.linear = nn.Linear(in_channels, out_channels, bias=False)
@auto_fp16(apply_to=('inputs'), out_fp32=True)
def forward(self, inputs):
"""Forward function.
Args:
inputs (torch.Tensor): Voxels features of shape (N, M, C).
N is the number of voxels, M is the number of points in
voxels, C is the number of channels of point features.
Returns:
torch.Tensor: Voxel features. There are three modes under which the
features have different meanings.
- `max_out=False`: Return point-wise features in
shape (N, M, C).
- `max_out=True` and `cat_max=False`: Return aggregated
voxel features in shape (N, C).
- `max_out=True` and `cat_max=True`: Return concatenated
point-wise features in shape (N, M, 2 * C).
"""
# [K, T, 7] tensordot [7, units] = [K, T, units]
voxel_count = inputs.shape[1]
x = self.linear(inputs)
x = self.norm(x.permute(0, 2, 1).contiguous()).permute(0, 2,
1).contiguous()
pointwise = F.relu(x)
# [K, T, units]
if self.max_out:
aggregated = torch.max(pointwise, dim=1, keepdim=True)[0]
else:
# this is for fusion layer
return pointwise
if not self.cat_max:
return aggregated.squeeze(1)
else:
# [K, 1, units]
repeated = aggregated.repeat(1, voxel_count, 1)
concatenated = torch.cat([pointwise, repeated], dim=2)
# [K, T, 2 * units]
return concatenated
class PFNLayer(nn.Module):
"""Pillar Feature Net Layer.
The Pillar Feature Net is composed of a series of these layers, but the
PointPillars paper results only used a single PFNLayer.
Args:
in_channels (int): Number of input channels.
out_channels (int): Number of output channels.
norm_cfg (dict): Config dict of normalization layers
last_layer (bool): If last_layer, there is no concatenation of
features.
mode (str): Pooling mode to gather features inside voxels.
Default to 'max'.
"""
def __init__(self,
in_channels,
out_channels,
norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.01),
last_layer=False,
mode='max'):
super().__init__()
self.fp16_enabled = False
self.name = 'PFNLayer'
self.last_vfe = last_layer
if not self.last_vfe:
out_channels = out_channels // 2
self.units = out_channels
self.norm = build_norm_layer(norm_cfg, self.units)[1]
self.linear = nn.Linear(in_channels, self.units, bias=False)
assert mode in ['max', 'avg']
self.mode = mode
@auto_fp16(apply_to=('inputs'), out_fp32=True)
def forward(self, inputs, num_voxels=None, aligned_distance=None):
"""Forward function.
Args:
inputs (torch.Tensor): Pillar/Voxel inputs with shape (N, M, C).
N is the number of voxels, M is the number of points in
voxels, C is the number of channels of point features.
num_voxels (torch.Tensor, optional): Number of points in each
voxel. Defaults to None.
aligned_distance (torch.Tensor, optional): The distance of
each points to the voxel center. Defaults to None.
Returns:
torch.Tensor: Features of Pillars.
"""
x = self.linear(inputs)
x = self.norm(x.permute(0, 2, 1).contiguous()).permute(0, 2,
1).contiguous()
x = F.relu(x)
if self.mode == 'max':
if aligned_distance is not None:
x = x.mul(aligned_distance.unsqueeze(-1))
x_max = torch.max(x, dim=1, keepdim=True)[0]
elif self.mode == 'avg':
if aligned_distance is not None:
x = x.mul(aligned_distance.unsqueeze(-1))
x_max = x.sum(
dim=1, keepdim=True) / num_voxels.type_as(inputs).view(
-1, 1, 1)
if self.last_vfe:
return x_max
else:
x_repeat = x_max.repeat(1, inputs.shape[1], 1)
x_concatenated = torch.cat([x, x_repeat], dim=2)
return x_concatenated
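# Shape sketch (not part of the original file), assuming example sizes: for a
# last-layer PFN with in_channels=9 and out_channels=64, an input of shape
# (N, 32, 9) is projected to (N, 32, 64) and max-pooled over the 32 points to
# (N, 1, 64); a non-last layer would instead halve the width to 32 units and
# concatenate the pooled vector back onto every point, giving (N, 32, 64).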
================================================
FILE: mmdet3d/models/voxel_encoders/voxel_encoder.py
================================================
import torch
from mmcv.cnn import build_norm_layer
from mmcv.runner import force_fp32
from torch import nn
from mmdet3d.ops import DynamicScatter
from .. import builder
from ..registry import VOXEL_ENCODERS
from .utils import VFELayer, get_paddings_indicator
@VOXEL_ENCODERS.register_module()
class HardSimpleVFE(nn.Module):
"""Simple voxel feature encoder used in SECOND.
It simply averages the values of points in a voxel.
Args:
num_features (int): Number of features to use. Default: 4.
"""
def __init__(self, num_features=4):
super(HardSimpleVFE, self).__init__()
self.num_features = num_features
self.fp16_enabled = False
@force_fp32(out_fp16=True)
def forward(self, features, num_points, coors):
"""Forward function.
Args:
features (torch.Tensor): Point features in shape
(N, M, 3(4)). N is the number of voxels and M is the maximum
number of points inside a single voxel.
num_points (torch.Tensor): Number of points in each voxel,
shape (N, ).
coors (torch.Tensor): Coordinates of voxels.
Returns:
torch.Tensor: Mean of points inside each voxel in shape (N, 3(4))
"""
points_mean = features[:, :, :self.num_features].sum(
dim=1, keepdim=False) / num_points.type_as(features).view(-1, 1)
return points_mean.contiguous()
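# Illustrative usage sketch (not part of the original file): shapes are
# example assumptions.
#
#   vfe = HardSimpleVFE(num_features=4)
#   features = torch.rand(1000, 32, 4)          # N voxels, up to 32 points
#   num_points = torch.randint(1, 33, (1000,))  # valid points per voxel
#   coors = torch.zeros(1000, 4, dtype=torch.int32)
#   out = vfe(features, num_points, coors)      # (1000, 4) mean per voxel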
@VOXEL_ENCODERS.register_module()
class DynamicSimpleVFE(nn.Module):
"""Simple dynamic voxel feature encoder used in DV-SECOND.
It simply averages the values of points in a voxel.
But the number of points in a voxel is dynamic and varies.
Args:
voxel_size (tuple[float]): Size of a single voxel
point_cloud_range (tuple[float]): Range of the point cloud and voxels
"""
def __init__(self,
voxel_size=(0.2, 0.2, 4),
point_cloud_range=(0, -40, -3, 70.4, 40, 1)):
super(DynamicSimpleVFE, self).__init__()
self.scatter = DynamicScatter(voxel_size, point_cloud_range, True)
self.fp16_enabled = False
@torch.no_grad()
@force_fp32(out_fp16=True)
def forward(self, features, coors):
"""Forward function.
Args:
features (torch.Tensor): Point features in shape
(N, 3(4)). N is the number of points.
coors (torch.Tensor): Coordinates of voxels.
Returns:
torch.Tensor: Mean of points inside each voxel in shape (M, 3(4)).
M is the number of voxels.
"""
# This function is used from the start of the voxelnet
# num_points: [concated_num_points]
features, features_coors = self.scatter(features, coors)
return features, features_coors
@VOXEL_ENCODERS.register_module()
class DynamicVFE(nn.Module):
"""Dynamic Voxel feature encoder used in DV-SECOND.
It encodes features of voxels and their points. It could also fuse
image feature into voxel features in a point-wise manner.
The number of points inside the voxel varies.
Args:
in_channels (int): Input channels of VFE. Defaults to 4.
feat_channels (list(int)): Channels of features in VFE.
with_distance (bool): Whether to use the L2 distance of points to the
origin point. Default False.
with_cluster_center (bool): Whether to use the distance to cluster
center of points inside a voxel. Default to False.
with_voxel_center (bool): Whether to use the distance to center of
voxel for each points inside a voxel. Default to False.
voxel_size (tuple[float]): Size of a single voxel. Default to
(0.2, 0.2, 4).
point_cloud_range (tuple[float]): The range of points or voxels.
Default to (0, -40, -3, 70.4, 40, 1).
norm_cfg (dict): Config dict of normalization layers.
mode (str): The mode when pooling features of points inside a voxel.
Available options include 'max' and 'avg'. Default to 'max'.
fusion_layer (dict | None): The config dict of fusion layer used in
multi-modal detectors. Default to None.
return_point_feats (bool): Whether to return the features of each
points. Default to False.
"""
def __init__(self,
in_channels=4,
feat_channels=[],
with_distance=False,
with_cluster_center=False,
with_voxel_center=False,
voxel_size=(0.2, 0.2, 4),
point_cloud_range=(0, -40, -3, 70.4, 40, 1),
norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.01),
mode='max',
fusion_layer=None,
return_point_feats=False):
super(DynamicVFE, self).__init__()
assert mode in ['avg', 'max']
assert len(feat_channels) > 0
if with_cluster_center:
in_channels += 3
if with_voxel_center:
in_channels += 3
if with_distance:
in_channels += 1  # the appended point-to-origin distance is a single channel
self.in_channels = in_channels
self._with_distance = with_distance
self._with_cluster_center = with_cluster_center
self._with_voxel_center = with_voxel_center
self.return_point_feats = return_point_feats
self.fp16_enabled = False
# Need pillar (voxel) size and x/y offset in order to calculate offset
self.vx = voxel_size[0]
self.vy = voxel_size[1]
self.vz = voxel_size[2]
self.x_offset = self.vx / 2 + point_cloud_range[0]
self.y_offset = self.vy / 2 + point_cloud_range[1]
self.z_offset = self.vz / 2 + point_cloud_range[2]
self.point_cloud_range = point_cloud_range
self.scatter = DynamicScatter(voxel_size, point_cloud_range, True)
feat_channels = [self.in_channels] + list(feat_channels)
vfe_layers = []
for i in range(len(feat_channels) - 1):
in_filters = feat_channels[i]
out_filters = feat_channels[i + 1]
if i > 0:
in_filters *= 2
norm_name, norm_layer = build_norm_layer(norm_cfg, out_filters)
vfe_layers.append(
nn.Sequential(
nn.Linear(in_filters, out_filters, bias=False), norm_layer,
nn.ReLU(inplace=True)))
self.vfe_layers = nn.ModuleList(vfe_layers)
self.num_vfe = len(vfe_layers)
self.vfe_scatter = DynamicScatter(voxel_size, point_cloud_range,
(mode != 'max'))
self.cluster_scatter = DynamicScatter(
voxel_size, point_cloud_range, average_points=True)
self.fusion_layer = None
if fusion_layer is not None:
self.fusion_layer = builder.build_fusion_layer(fusion_layer)
def map_voxel_center_to_point(self, pts_coors, voxel_mean, voxel_coors):
"""Map voxel features to its corresponding points.
Args:
pts_coors (torch.Tensor): Voxel coordinate of each point.
voxel_mean (torch.Tensor): Voxel features to be mapped.
voxel_coors (torch.Tensor): Coordinates of valid voxels
Returns:
torch.Tensor: Features or centers of each point.
"""
# Step 1: scatter voxel into canvas
# Calculate necessary things for canvas creation
canvas_z = int(
(self.point_cloud_range[5] - self.point_cloud_range[2]) / self.vz)
canvas_y = int(
(self.point_cloud_range[4] - self.point_cloud_range[1]) / self.vy)
canvas_x = int(
(self.point_cloud_range[3] - self.point_cloud_range[0]) / self.vx)
# canvas_channel = voxel_mean.size(1)
batch_size = pts_coors[-1, 0] + 1
canvas_len = canvas_z * canvas_y * canvas_x * batch_size
# Create the canvas for this sample
canvas = voxel_mean.new_zeros(canvas_len, dtype=torch.long)
# Only include non-empty pillars
indices = (
voxel_coors[:, 0] * canvas_z * canvas_y * canvas_x +
voxel_coors[:, 1] * canvas_y * canvas_x +
voxel_coors[:, 2] * canvas_x + voxel_coors[:, 3])
# Scatter the blob back to the canvas
canvas[indices.long()] = torch.arange(
start=0, end=voxel_mean.size(0), device=voxel_mean.device)
# Step 2: get voxel mean for each point
voxel_index = (
pts_coors[:, 0] * canvas_z * canvas_y * canvas_x +
pts_coors[:, 1] * canvas_y * canvas_x +
pts_coors[:, 2] * canvas_x + pts_coors[:, 3])
voxel_inds = canvas[voxel_index.long()]
center_per_point = voxel_mean[voxel_inds, ...]
return center_per_point
@force_fp32(out_fp16=True)
def forward(self,
features,
coors,
points=None,
img_feats=None,
img_metas=None):
"""Forward functions.
Args:
features (torch.Tensor): Features of voxels, shape is NxC.
coors (torch.Tensor): Coordinates of voxels, shape is Nx(1+NDim).
points (list[torch.Tensor], optional): Raw points used to guide the
multi-modality fusion. Defaults to None.
img_feats (list[torch.Tensor], optional): Image features used for
multi-modality fusion. Defaults to None.
img_metas (dict, optional): Meta information of the images. Defaults to None.
Returns:
tuple: If `return_point_feats` is False, returns the voxel features and
their coordinates. If `return_point_feats` is True, returns the
features of each point inside the voxels.
"""
features_ls = [features]
# Find distance of x, y, and z from cluster center
if self._with_cluster_center:
voxel_mean, mean_coors = self.cluster_scatter(features, coors)
points_mean = self.map_voxel_center_to_point(
coors, voxel_mean, mean_coors)
# TODO: maybe also do cluster for reflectivity
f_cluster = features[:, :3] - points_mean[:, :3]
features_ls.append(f_cluster)
# Find distance of x, y, and z from pillar center
if self._with_voxel_center:
f_center = features.new_zeros(size=(features.size(0), 3))
f_center[:, 0] = features[:, 0] - (
coors[:, 3].type_as(features) * self.vx + self.x_offset)
f_center[:, 1] = features[:, 1] - (
coors[:, 2].type_as(features) * self.vy + self.y_offset)
f_center[:, 2] = features[:, 2] - (
coors[:, 1].type_as(features) * self.vz + self.z_offset)
features_ls.append(f_center)
if self._with_distance:
points_dist = torch.norm(features[:, :3], 2, 1, keepdim=True)
features_ls.append(points_dist)
# Combine together feature decorations
features = torch.cat(features_ls, dim=-1)
for i, vfe in enumerate(self.vfe_layers):
point_feats = vfe(features)
if (i == len(self.vfe_layers) - 1 and self.fusion_layer is not None
and img_feats is not None):
point_feats = self.fusion_layer(img_feats, points, point_feats,
img_metas)
voxel_feats, voxel_coors = self.vfe_scatter(point_feats, coors)
if i != len(self.vfe_layers) - 1:
# need to concat voxel feats if it is not the last vfe
feat_per_point = self.map_voxel_center_to_point(
coors, voxel_feats, voxel_coors)
features = torch.cat([point_feats, feat_per_point], dim=1)
if self.return_point_feats:
return point_feats
return voxel_feats, voxel_coors
@VOXEL_ENCODERS.register_module()
class HardVFE(nn.Module):
"""Voxel feature encoder used in DV-SECOND.
It encodes features of voxels and their points. It can also fuse image
features into voxel features in a point-wise manner.
Args:
in_channels (int): Input channels of VFE. Defaults to 4.
feat_channels (list(int)): Channels of features in VFE.
with_distance (bool): Whether to use the L2 distance of points to the
origin point. Default False.
with_cluster_center (bool): Whether to use the distance to cluster
center of points inside a voxel. Default to False.
with_voxel_center (bool): Whether to use the distance to the center
of the voxel for each point inside the voxel. Default to False.
voxel_size (tuple[float]): Size of a single voxel. Default to
(0.2, 0.2, 4).
point_cloud_range (tuple[float]): The range of points or voxels.
Default to (0, -40, -3, 70.4, 40, 1).
norm_cfg (dict): Config dict of normalization layers.
mode (str): The mode when pooling features of points inside a voxel.
Available options include 'max' and 'avg'. Default to 'max'.
fusion_layer (dict | None): The config dict of fusion layer used in
multi-modal detectors. Default to None.
return_point_feats (bool): Whether to return the features of each
point. Default to False.
"""
def __init__(self,
in_channels=4,
feat_channels=[],
with_distance=False,
with_cluster_center=False,
with_voxel_center=False,
voxel_size=(0.2, 0.2, 4),
point_cloud_range=(0, -40, -3, 70.4, 40, 1),
norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.01),
mode='max',
fusion_layer=None,
return_point_feats=False):
super(HardVFE, self).__init__()
assert len(feat_channels) > 0
if with_cluster_center:
in_channels += 3
if with_voxel_center:
in_channels += 3
if with_distance:
in_channels += 3
self.in_channels = in_channels
self._with_distance = with_distance
self._with_cluster_center = with_cluster_center
self._with_voxel_center = with_voxel_center
self.return_point_feats = return_point_feats
self.fp16_enabled = False
# Need pillar (voxel) size and x/y offset to calculate pillar offset
self.vx = voxel_size[0]
self.vy = voxel_size[1]
self.vz = voxel_size[2]
self.x_offset = self.vx / 2 + point_cloud_range[0]
self.y_offset = self.vy / 2 + point_cloud_range[1]
self.z_offset = self.vz / 2 + point_cloud_range[2]
self.point_cloud_range = point_cloud_range
self.scatter = DynamicScatter(voxel_size, point_cloud_range, True)
feat_channels = [self.in_channels] + list(feat_channels)
vfe_layers = []
for i in range(len(feat_channels) - 1):
in_filters = feat_channels[i]
out_filters = feat_channels[i + 1]
if i > 0:
in_filters *= 2
# TODO: pass norm_cfg to VFE
# norm_name, norm_layer = build_norm_layer(norm_cfg, out_filters)
if i == (len(feat_channels) - 2):
cat_max = False
max_out = True
if fusion_layer:
max_out = False
else:
max_out = True
cat_max = True
vfe_layers.append(
VFELayer(
in_filters,
out_filters,
norm_cfg=norm_cfg,
max_out=max_out,
cat_max=cat_max))
self.vfe_layers = nn.ModuleList(vfe_layers)
self.num_vfe = len(vfe_layers)
self.fusion_layer = None
if fusion_layer is not None:
self.fusion_layer = builder.build_fusion_layer(fusion_layer)
@force_fp32(out_fp16=True)
def forward(self,
features,
num_points,
coors,
img_feats=None,
img_metas=None):
"""Forward functions.
Args:
features (torch.Tensor): Features of voxels, shape is MxNxC.
num_points (torch.Tensor): Number of points in each voxel.
coors (torch.Tensor): Coordinates of voxels, shape is Mx(1+NDim).
img_feats (list[torch.Tensor], optional): Image features used for
multi-modality fusion. Defaults to None.
img_metas (dict, optional): Meta information of the images. Defaults to None.
Returns:
tuple: If `return_point_feats` is False, returns the voxel features and
their coordinates. If `return_point_feats` is True, returns the
features of each point inside the voxels.
"""
features_ls = [features]
# Find distance of x, y, and z from cluster center
if self._with_cluster_center:
points_mean = (
features[:, :, :3].sum(dim=1, keepdim=True) /
num_points.type_as(features).view(-1, 1, 1))
# TODO: maybe also do cluster for reflectivity
f_cluster = features[:, :, :3] - points_mean
features_ls.append(f_cluster)
# Find distance of x, y, and z from pillar center
if self._with_voxel_center:
f_center = features.new_zeros(
size=(features.size(0), features.size(1), 3))
f_center[:, :, 0] = features[:, :, 0] - (
coors[:, 3].type_as(features).unsqueeze(1) * self.vx +
self.x_offset)
f_center[:, :, 1] = features[:, :, 1] - (
coors[:, 2].type_as(features).unsqueeze(1) * self.vy +
self.y_offset)
f_center[:, :, 2] = features[:, :, 2] - (
coors[:, 1].type_as(features).unsqueeze(1) * self.vz +
self.z_offset)
features_ls.append(f_center)
if self._with_distance:
points_dist = torch.norm(features[:, :, :3], 2, 2, keepdim=True)
features_ls.append(points_dist)
# Combine together feature decorations
voxel_feats = torch.cat(features_ls, dim=-1)
# The feature decorations were calculated without regard to whether
# pillar was empty.
# Need to ensure that empty voxels remain set to zeros.
voxel_count = voxel_feats.shape[1]
mask = get_paddings_indicator(num_points, voxel_count, axis=0)
voxel_feats *= mask.unsqueeze(-1).type_as(voxel_feats)
for i, vfe in enumerate(self.vfe_layers):
voxel_feats = vfe(voxel_feats)
if (self.fusion_layer is not None and img_feats is not None):
voxel_feats = self.fusion_with_mask(features, mask, voxel_feats,
coors, img_feats, img_metas)
return voxel_feats
def fusion_with_mask(self, features, mask, voxel_feats, coors, img_feats,
img_metas):
"""Fuse image and point features with mask.
Args:
features (torch.Tensor): Features of voxel, usually it is the
values of points in voxels.
mask (torch.Tensor): Mask indicates valid features in each voxel.
voxel_feats (torch.Tensor): Features of voxels.
coors (torch.Tensor): Coordinates of each single voxel.
img_feats (list[torch.Tensor]): Multi-scale feature maps of image.
img_metas (list(dict)): Meta information of image and points.
Returns:
torch.Tensor: Fused features of each voxel.
"""
# the features consist of a batch of points
batch_size = coors[-1, 0] + 1
points = []
for i in range(batch_size):
single_mask = (coors[:, 0] == i)
points.append(features[single_mask][mask[single_mask]])
point_feats = voxel_feats[mask]
point_feats = self.fusion_layer(img_feats, points, point_feats,
img_metas)
voxel_canvas = voxel_feats.new_zeros(
size=(voxel_feats.size(0), voxel_feats.size(1),
point_feats.size(-1)))
voxel_canvas[mask] = point_feats
out = torch.max(voxel_canvas, dim=1)[0]
return out
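# Illustrative sketch (not part of the original file): a minimal, CPU-only
# example of the voxel-center decoration computed in DynamicVFE.forward and
# HardVFE.forward above. The point values and voxel coordinates below are
# made up; voxel_size and point_cloud_range follow the class defaults
# (0.2, 0.2, 4) and (0, -40, -3, 70.4, 40, 1). Relies on the torch import at
# the top of this file.
if __name__ == '__main__':
    vx, vy, vz = 0.2, 0.2, 4.0
    pc_range = (0, -40, -3, 70.4, 40, 1)
    x_off = vx / 2 + pc_range[0]
    y_off = vy / 2 + pc_range[1]
    z_off = vz / 2 + pc_range[2]
    # two points with (x, y, z, intensity) and their voxel coors (batch, z, y, x)
    feats = torch.tensor([[0.5, 0.3, -1.0, 0.7], [10.2, -3.4, 0.1, 0.2]])
    coors = torch.tensor([[0, 0, 201, 2], [0, 0, 183, 51]])
    f_center = feats.new_zeros((feats.size(0), 3))
    f_center[:, 0] = feats[:, 0] - (coors[:, 3].float() * vx + x_off)
    f_center[:, 1] = feats[:, 1] - (coors[:, 2].float() * vy + y_off)
    f_center[:, 2] = feats[:, 2] - (coors[:, 1].float() * vz + z_off)
    print(f_center)  # per-point offset from the center of its own voxel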
================================================
FILE: mmdet3d/ops/__init__.py
================================================
from mmcv.ops import (RoIAlign, SigmoidFocalLoss, get_compiler_version,
get_compiling_cuda_version, nms, roi_align,
sigmoid_focal_loss)
from .ball_query import ball_query
from .furthest_point_sample import (Points_Sampler, furthest_point_sample,
furthest_point_sample_with_dist)
from .gather_points import gather_points
from .group_points import (GroupAll, QueryAndGroup, group_points,
grouping_operation)
from .interpolate import three_interpolate, three_nn
from .knn import knn
from .norm import NaiveSyncBatchNorm1d, NaiveSyncBatchNorm2d
from .pointnet_modules import (PointFPModule, PointSAModule, PointSAModuleMSG,
build_sa_module)
from .roiaware_pool3d import (RoIAwarePool3d, points_in_boxes_batch,
points_in_boxes_cpu, points_in_boxes_gpu)
from .sparse_block import (SparseBasicBlock, SparseBottleneck,
make_sparse_convmodule)
from .voxel import DynamicScatter, Voxelization, dynamic_scatter, voxelization
__all__ = [
'nms', 'soft_nms', 'RoIAlign', 'roi_align', 'get_compiler_version',
'get_compiling_cuda_version', 'NaiveSyncBatchNorm1d',
'NaiveSyncBatchNorm2d', 'batched_nms', 'Voxelization', 'voxelization',
'dynamic_scatter', 'DynamicScatter', 'sigmoid_focal_loss',
'SigmoidFocalLoss', 'SparseBasicBlock', 'SparseBottleneck',
'RoIAwarePool3d', 'points_in_boxes_gpu', 'points_in_boxes_cpu',
'make_sparse_convmodule', 'ball_query', 'knn', 'furthest_point_sample',
'furthest_point_sample_with_dist', 'three_interpolate', 'three_nn',
'gather_points', 'grouping_operation', 'group_points', 'GroupAll',
'QueryAndGroup', 'PointSAModule', 'PointSAModuleMSG', 'PointFPModule',
'points_in_boxes_batch', 'get_compiler_version',
'get_compiling_cuda_version', 'Points_Sampler', 'build_sa_module'
]
================================================
FILE: mmdet3d/ops/ball_query/__init__.py
================================================
from .ball_query import ball_query
__all__ = ['ball_query']
================================================
FILE: mmdet3d/ops/ball_query/ball_query.py
================================================
import torch
from torch.autograd import Function
from . import ball_query_ext
class BallQuery(Function):
"""Ball Query.
Find nearby points in spherical space.
"""
@staticmethod
def forward(ctx, min_radius: float, max_radius: float, sample_num: int,
xyz: torch.Tensor, center_xyz: torch.Tensor) -> torch.Tensor:
"""forward.
Args:
min_radius (float): minimum radius of the balls.
max_radius (float): maximum radius of the balls.
sample_num (int): maximum number of features in the balls.
xyz (Tensor): (B, N, 3) xyz coordinates of the features.
center_xyz (Tensor): (B, npoint, 3) centers of the ball query.
Returns:
Tensor: (B, npoint, nsample) tensor with the indices of
the features that form the query balls.
"""
assert center_xyz.is_contiguous()
assert xyz.is_contiguous()
assert min_radius < max_radius
B, N, _ = xyz.size()
npoint = center_xyz.size(1)
idx = torch.cuda.IntTensor(B, npoint, sample_num).zero_()
ball_query_ext.ball_query_wrapper(B, N, npoint, min_radius, max_radius,
sample_num, center_xyz, xyz, idx)
ctx.mark_non_differentiable(idx)
return idx
@staticmethod
def backward(ctx, a=None):
return None, None, None, None
ball_query = BallQuery.apply
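# Illustrative sketch (not part of the original file): calling ball_query,
# assuming the compiled ball_query_ext CUDA extension is available and a GPU
# is present. The shapes and radii below are made up and follow the
# BallQuery.forward docstring above.
if __name__ == '__main__':
    if torch.cuda.is_available():
        xyz = torch.rand(2, 1024, 3, device='cuda')    # (B, N, 3)
        centers = xyz[:, :128, :].contiguous()         # (B, npoint, 3)
        idx = ball_query(0.0, 0.8, 16, xyz, centers)   # (B, 128, 16), int32
        print(idx.shape, idx.dtype)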
================================================
FILE: mmdet3d/ops/ball_query/src/ball_query.cpp
================================================
// Modified from
// https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/ball_query.cpp
#include <torch/extension.h>
#include <torch/serialize/tensor.h>
#include <ATen/cuda/CUDAContext.h>
#include <cuda.h>
#include <cuda_runtime_api.h>
#include <THC/THC.h>
extern THCState *state;
#define CHECK_CUDA(x) \
TORCH_CHECK(x.type().is_cuda(), #x, " must be a CUDA tensor ")
#define CHECK_CONTIGUOUS(x) \
TORCH_CHECK(x.is_contiguous(), #x, " must be contiguous ")
#define CHECK_INPUT(x) \
CHECK_CUDA(x); \
CHECK_CONTIGUOUS(x)
int ball_query_wrapper(int b, int n, int m, float min_radius, float max_radius, int nsample,
at::Tensor new_xyz_tensor, at::Tensor xyz_tensor,
at::Tensor idx_tensor);
void ball_query_kernel_launcher(int b, int n, int m, float min_radius, float max_radius,
int nsample, const float *xyz, const float *new_xyz,
int *idx, cudaStream_t stream);
int ball_query_wrapper(int b, int n, int m, float min_radius, float max_radius, int nsample,
at::Tensor new_xyz_tensor, at::Tensor xyz_tensor,
at::Tensor idx_tensor) {
CHECK_INPUT(new_xyz_tensor);
CHECK_INPUT(xyz_tensor);
const float *new_xyz = new_xyz_tensor.data_ptr<float>();
const float *xyz = xyz_tensor.data_ptr<float>();
int *idx = idx_tensor.data_ptr<int>();
cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();
ball_query_kernel_launcher(b, n, m, min_radius, max_radius,
nsample, new_xyz, xyz, idx, stream);
return 1;
}
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("ball_query_wrapper", &ball_query_wrapper, "ball_query_wrapper");
}
================================================
FILE: mmdet3d/ops/ball_query/src/ball_query_cuda.cu
================================================
// Modified from
// https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/ball_query_gpu.cu
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#define THREADS_PER_BLOCK 256
#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))
__global__ void ball_query_kernel(int b, int n, int m,
float min_radius,
float max_radius,
int nsample,
const float *__restrict__ new_xyz,
const float *__restrict__ xyz,
int *__restrict__ idx) {
// new_xyz: (B, M, 3)
// xyz: (B, N, 3)
// output:
// idx: (B, M, nsample)
int bs_idx = blockIdx.y;
int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
if (bs_idx >= b || pt_idx >= m) return;
new_xyz += bs_idx * m * 3 + pt_idx * 3;
xyz += bs_idx * n * 3;
idx += bs_idx * m * nsample + pt_idx * nsample;
float max_radius2 = max_radius * max_radius;
float min_radius2 = min_radius * min_radius;
float new_x = new_xyz[0];
float new_y = new_xyz[1];
float new_z = new_xyz[2];
int cnt = 0;
for (int k = 0; k < n; ++k) {
float x = xyz[k * 3 + 0];
float y = xyz[k * 3 + 1];
float z = xyz[k * 3 + 2];
float d2 = (new_x - x) * (new_x - x) + (new_y - y) * (new_y - y) +
(new_z - z) * (new_z - z);
if (d2 == 0 || (d2 >= min_radius2 && d2 < max_radius2)) {
if (cnt == 0) {
for (int l = 0; l < nsample; ++l) {
idx[l] = k;
}
}
idx[cnt] = k;
++cnt;
if (cnt >= nsample) break;
}
}
}
void ball_query_kernel_launcher(int b, int n, int m, float min_radius, float max_radius,
int nsample, const float *new_xyz, const float *xyz,
int *idx, cudaStream_t stream) {
// new_xyz: (B, M, 3)
// xyz: (B, N, 3)
// output:
// idx: (B, M, nsample)
cudaError_t err;
dim3 blocks(DIVUP(m, THREADS_PER_BLOCK),
b); // blockIdx.x(col), blockIdx.y(row)
dim3 threads(THREADS_PER_BLOCK);
ball_query_kernel<<<blocks, threads, 0, stream>>>(b, n, m, min_radius, max_radius,
nsample, new_xyz, xyz, idx);
// cudaDeviceSynchronize(); // for using printf in kernel function
err = cudaGetLastError();
if (cudaSuccess != err) {
fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
exit(-1);
}
}
================================================
FILE: mmdet3d/ops/furthest_point_sample/__init__.py
================================================
from .furthest_point_sample import (furthest_point_sample,
furthest_point_sample_with_dist)
from .points_sampler import Points_Sampler
__all__ = [
'furthest_point_sample', 'furthest_point_sample_with_dist',
'Points_Sampler'
]
================================================
FILE: mmdet3d/ops/furthest_point_sample/furthest_point_sample.py
================================================
import torch
from torch.autograd import Function
from . import furthest_point_sample_ext
class FurthestPointSampling(Function):
"""Furthest Point Sampling.
Uses iterative furthest point sampling to select a set of features whose
corresponding points have the furthest distance.
"""
@staticmethod
def forward(ctx, points_xyz: torch.Tensor,
num_points: int) -> torch.Tensor:
"""forward.
Args:
points_xyz (Tensor): (B, N, 3) where N > num_points.
num_points (int): Number of points in the sampled set.
Returns:
Tensor: (B, num_points) indices of the sampled points.
"""
assert points_xyz.is_contiguous()
B, N = points_xyz.size()[:2]
output = torch.cuda.IntTensor(B, num_points)
temp = torch.cuda.FloatTensor(B, N).fill_(1e10)
furthest_point_sample_ext.furthest_point_sampling_wrapper(
B, N, num_points, points_xyz, temp, output)
ctx.mark_non_differentiable(output)
return output
@staticmethod
def backward(xyz, a=None):
return None, None
class FurthestPointSamplingWithDist(Function):
"""Furthest Point Sampling With Distance.
Uses iterative furthest point sampling to select a set of features whose
corresponding points have the furthest distance.
"""
@staticmethod
def forward(ctx, points_dist: torch.Tensor,
num_points: int) -> torch.Tensor:
"""forward.
Args:
points_dist (Tensor): (B, N, N) Distance between each point pair.
num_points (int): Number of points in the sampled set.
Returns:
Tensor: (B, num_points) indices of the sampled points.
"""
assert points_dist.is_contiguous()
B, N, _ = points_dist.size()
output = points_dist.new_zeros([B, num_points], dtype=torch.int32)
temp = points_dist.new_zeros([B, N]).fill_(1e10)
furthest_point_sample_ext.furthest_point_sampling_with_dist_wrapper(
B, N, num_points, points_dist, temp, output)
ctx.mark_non_differentiable(output)
return output
@staticmethod
def backward(xyz, a=None):
return None, None
furthest_point_sample = FurthestPointSampling.apply
furthest_point_sample_with_dist = FurthestPointSamplingWithDist.apply
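# Illustrative sketch (not part of the original file): running D-FPS on random
# points, assuming the furthest_point_sample_ext CUDA extension is built and a
# GPU is present. The distance-matrix variant takes a squared pairwise distance
# matrix, as FFPS_Sampler in points_sampler.py does.
if __name__ == '__main__':
    if torch.cuda.is_available():
        points = torch.rand(2, 4096, 3, device='cuda')          # (B, N, 3)
        idx = furthest_point_sample(points, 512)                # (B, 512), int32
        dist = torch.cdist(points, points) ** 2                 # (B, N, N)
        idx_d = furthest_point_sample_with_dist(dist, 512)      # (B, 512), int32
        print(idx.shape, idx_d.shape)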
================================================
FILE: mmdet3d/ops/furthest_point_sample/points_sampler.py
================================================
import torch
from mmcv.runner import force_fp32
from torch import nn as nn
from typing import List
from .furthest_point_sample import (furthest_point_sample,
furthest_point_sample_with_dist)
from .utils import calc_square_dist
def get_sampler_type(sampler_type):
"""Get the type and mode of points sampler.
Args:
sampler_type (str): The type of points sampler.
Valid values are "D-FPS", "F-FPS", and "FS".
Returns:
class: Points sampler type.
"""
if sampler_type == 'D-FPS':
sampler = DFPS_Sampler
elif sampler_type == 'F-FPS':
sampler = FFPS_Sampler
elif sampler_type == 'FS':
sampler = FS_Sampler
else:
raise ValueError('Only "sampler_type" of "D-FPS", "F-FPS", or "FS"'
f' are supported, got {sampler_type}')
return sampler
class Points_Sampler(nn.Module):
"""Points sampling.
Args:
num_point (list[int]): Number of sample points.
fps_mod_list (list[str]): Type of FPS method, valid modes are
['F-FPS', 'D-FPS', 'FS']. Default: ['D-FPS'].
F-FPS: using feature distances for FPS.
D-FPS: using Euclidean distances of points for FPS.
FS: using F-FPS and D-FPS simultaneously.
fps_sample_range_list (list[int]): Range of points to apply FPS.
Default: [-1].
"""
def __init__(self,
num_point: List[int],
fps_mod_list: List[str] = ['D-FPS'],
fps_sample_range_list: List[int] = [-1]):
super(Points_Sampler, self).__init__()
# FPS would be applied to different fps_mod in the list,
# so the length of the num_point should be equal to
# fps_mod_list and fps_sample_range_list.
assert len(num_point) == len(fps_mod_list) == len(
fps_sample_range_list)
self.num_point = num_point
self.fps_sample_range_list = fps_sample_range_list
self.samplers = nn.ModuleList()
for fps_mod in fps_mod_list:
self.samplers.append(get_sampler_type(fps_mod)())
self.fp16_enabled = False
@force_fp32()
def forward(self, points_xyz, features):
"""forward.
Args:
points_xyz (Tensor): (B, N, 3) xyz coordinates of the features.
features (Tensor): (B, C, N) Descriptors of the features.
Return:
Tensor: (B, npoint, sample_num) Indices of sampled points.
"""
indices = []
last_fps_end_index = 0
for fps_sample_range, sampler, npoint in zip(
self.fps_sample_range_list, self.samplers, self.num_point):
assert fps_sample_range < points_xyz.shape[1]
if fps_sample_range == -1:
sample_points_xyz = points_xyz[:, last_fps_end_index:]
sample_features = features[:, :, last_fps_end_index:]
else:
sample_points_xyz = \
points_xyz[:, last_fps_end_index:fps_sample_range]
sample_features = \
features[:, :, last_fps_end_index:fps_sample_range]
fps_idx = sampler(sample_points_xyz.contiguous(), sample_features,
npoint)
indices.append(fps_idx + last_fps_end_index)
last_fps_end_index += fps_sample_range
indices = torch.cat(indices, dim=1)
return indices
class DFPS_Sampler(nn.Module):
"""DFPS_Sampling.
Using Euclidean distances of points for FPS.
"""
def __init__(self):
super(DFPS_Sampler, self).__init__()
def forward(self, points, features, npoint):
"""Sampling points with D-FPS."""
fps_idx = furthest_point_sample(points.contiguous(), npoint)
return fps_idx
class FFPS_Sampler(nn.Module):
"""FFPS_Sampler.
Using feature distances for FPS.
"""
def __init__(self):
super(FFPS_Sampler, self).__init__()
def forward(self, points, features, npoint):
"""Sampling points with F-FPS."""
features_for_fps = torch.cat([points, features.transpose(1, 2)], dim=2)
features_dist = calc_square_dist(
features_for_fps, features_for_fps, norm=False)
fps_idx = furthest_point_sample_with_dist(features_dist, npoint)
return fps_idx
class FS_Sampler(nn.Module):
"""FS_Sampling.
Using F-FPS and D-FPS simultaneously.
"""
def __init__(self):
super(FS_Sampler, self).__init__()
def forward(self, points, features, npoint):
"""Sampling points with FS_Sampling."""
features_for_fps = torch.cat([points, features.transpose(1, 2)], dim=2)
features_dist = calc_square_dist(
features_for_fps, features_for_fps, norm=False)
fps_idx_ffps = furthest_point_sample_with_dist(features_dist, npoint)
fps_idx_dfps = furthest_point_sample(points, npoint)
fps_idx = torch.cat([fps_idx_ffps, fps_idx_dfps], dim=1)
return fps_idx
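# Illustrative sketch (not part of the original file): a two-branch sampler in
# the spirit of 3DSSD-style configs, taking 256 keypoints with F-FPS from the
# first 2048 points and 256 more with D-FPS from the rest. The numbers are made
# up; running it requires the compiled CUDA extensions and a GPU.
if __name__ == '__main__':
    if torch.cuda.is_available():
        sampler = Points_Sampler(
            num_point=[256, 256],
            fps_mod_list=['F-FPS', 'D-FPS'],
            fps_sample_range_list=[2048, -1])
        xyz = torch.rand(2, 4096, 3, device='cuda')      # (B, N, 3)
        feats = torch.rand(2, 16, 4096, device='cuda')   # (B, C, N)
        indices = sampler(xyz, feats)                    # (B, 512)
        print(indices.shape)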
================================================
FILE: mmdet3d/ops/furthest_point_sample/src/furthest_point_sample.cpp
================================================
// Modified from
// https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/sampling.cpp
#include <torch/extension.h>
#include <ATen/cuda/CUDAContext.h>
#include <cuda.h>
#include <cuda_runtime_api.h>
#include <THC/THC.h>
extern THCState *state;
int furthest_point_sampling_wrapper(int b, int n, int m,
at::Tensor points_tensor,
at::Tensor temp_tensor,
at::Tensor idx_tensor);
void furthest_point_sampling_kernel_launcher(int b, int n, int m,
const float *dataset, float *temp,
int *idxs, cudaStream_t stream);
int furthest_point_sampling_with_dist_wrapper(int b, int n, int m,
at::Tensor points_tensor,
at::Tensor temp_tensor,
at::Tensor idx_tensor);
void furthest_point_sampling_with_dist_kernel_launcher(int b, int n, int m,
const float *dataset,
float *temp, int *idxs,
cudaStream_t stream);
int furthest_point_sampling_wrapper(int b, int n, int m,
at::Tensor points_tensor,
at::Tensor temp_tensor,
at::Tensor idx_tensor) {
const float *points = points_tensor.data_ptr<float>();
float *temp = temp_tensor.data_ptr<float>();
int *idx = idx_tensor.data_ptr<int>();
cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();
furthest_point_sampling_kernel_launcher(b, n, m, points, temp, idx, stream);
return 1;
}
int furthest_point_sampling_with_dist_wrapper(int b, int n, int m,
at::Tensor points_tensor,
at::Tensor temp_tensor,
at::Tensor idx_tensor) {
const float *points = points_tensor.data_ptr<float>();
float *temp = temp_tensor.data_ptr<float>();
int *idx = idx_tensor.data_ptr<int>();
cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();
furthest_point_sampling_with_dist_kernel_launcher(b, n, m, points, temp, idx, stream);
return 1;
}
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("furthest_point_sampling_wrapper", &furthest_point_sampling_wrapper,
"furthest_point_sampling_wrapper");
m.def("furthest_point_sampling_with_dist_wrapper",
&furthest_point_sampling_with_dist_wrapper,
"furthest_point_sampling_with_dist_wrapper");
}
================================================
FILE: mmdet3d/ops/furthest_point_sample/src/furthest_point_sample_cuda.cu
================================================
// Modified from
// https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/sampling_gpu.cu
#include <cmath>
#include <stdio.h>
#include <stdlib.h>
#define TOTAL_THREADS 1024
#define THREADS_PER_BLOCK 256
#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))
inline int opt_n_threads(int work_size) {
const int pow_2 = std::log(static_cast<float>(work_size)) / std::log(2.0);
return max(min(1 << pow_2, TOTAL_THREADS), 1);
}
__device__ void __update(float *__restrict__ dists, int *__restrict__ dists_i,
int idx1, int idx2) {
const float v1 = dists[idx1], v2 = dists[idx2];
const int i1 = dists_i[idx1], i2 = dists_i[idx2];
dists[idx1] = max(v1, v2);
dists_i[idx1] = v2 > v1 ? i2 : i1;
}
template <unsigned int block_size>
__global__ void furthest_point_sampling_kernel(
int b, int n, int m, const float *__restrict__ dataset,
float *__restrict__ temp, int *__restrict__ idxs) {
// dataset: (B, N, 3)
// tmp: (B, N)
// output:
// idx: (B, M)
if (m <= 0) return;
__shared__ float dists[block_size];
__shared__ int dists_i[block_size];
int batch_index = blockIdx.x;
dataset += batch_index * n * 3;
temp += batch_index * n;
idxs += batch_index * m;
int tid = threadIdx.x;
const int stride = block_size;
int old = 0;
if (threadIdx.x == 0) idxs[0] = old;
__syncthreads();
for (int j = 1; j < m; j++) {
int besti = 0;
float best = -1;
float x1 = dataset[old * 3 + 0];
float y1 = dataset[old * 3 + 1];
float z1 = dataset[old * 3 + 2];
for (int k = tid; k < n; k += stride) {
float x2, y2, z2;
x2 = dataset[k * 3 + 0];
y2 = dataset[k * 3 + 1];
z2 = dataset[k * 3 + 2];
// float mag = (x2 * x2) + (y2 * y2) + (z2 * z2);
// if (mag <= 1e-3)
// continue;
float d =
(x2 - x1) * (x2 - x1) + (y2 - y1) * (y2 - y1) + (z2 - z1) * (z2 - z1);
float d2 = min(d, temp[k]);
temp[k] = d2;
besti = d2 > best ? k : besti;
best = d2 > best ? d2 : best;
}
dists[tid] = best;
dists_i[tid] = besti;
__syncthreads();
if (block_size >= 1024) {
if (tid < 512) {
__update(dists, dists_i, tid, tid + 512);
}
__syncthreads();
}
if (block_size >= 512) {
if (tid < 256) {
__update(dists, dists_i, tid, tid + 256);
}
__syncthreads();
}
if (block_size >= 256) {
if (tid < 128) {
__update(dists, dists_i, tid, tid + 128);
}
__syncthreads();
}
if (block_size >= 128) {
if (tid < 64) {
__update(dists, dists_i, tid, tid + 64);
}
__syncthreads();
}
if (block_size >= 64) {
if (tid < 32) {
__update(dists, dists_i, tid, tid + 32);
}
__syncthreads();
}
if (block_size >= 32) {
if (tid < 16) {
__update(dists, dists_i, tid, tid + 16);
}
__syncthreads();
}
if (block_size >= 16) {
if (tid < 8) {
__update(dists, dists_i, tid, tid + 8);
}
__syncthreads();
}
if (block_size >= 8) {
if (tid < 4) {
__update(dists, dists_i, tid, tid + 4);
}
__syncthreads();
}
if (block_size >= 4) {
if (tid < 2) {
__update(dists, dists_i, tid, tid + 2);
}
__syncthreads();
}
if (block_size >= 2) {
if (tid < 1) {
__update(dists, dists_i, tid, tid + 1);
}
__syncthreads();
}
old = dists_i[0];
if (tid == 0) idxs[j] = old;
}
}
void furthest_point_sampling_kernel_launcher(int b, int n, int m,
const float *dataset, float *temp,
int *idxs, cudaStream_t stream) {
// dataset: (B, N, 3)
// tmp: (B, N)
// output:
// idx: (B, M)
cudaError_t err;
unsigned int n_threads = opt_n_threads(n);
switch (n_threads) {
case 1024:
furthest_point_sampling_kernel<1024>
<<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);
break;
case 512:
furthest_point_sampling_kernel<512>
<<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);
break;
case 256:
furthest_point_sampling_kernel<256>
<<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);
break;
case 128:
furthest_point_sampling_kernel<128>
<<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);
break;
case 64:
furthest_point_sampling_kernel<64>
<<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);
break;
case 32:
furthest_point_sampling_kernel<32>
<<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);
break;
case 16:
furthest_point_sampling_kernel<16>
<<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);
break;
case 8:
furthest_point_sampling_kernel<8>
<<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);
break;
case 4:
furthest_point_sampling_kernel<4>
<<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);
break;
case 2:
furthest_point_sampling_kernel<2>
<<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);
break;
case 1:
furthest_point_sampling_kernel<1>
<<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);
break;
default:
furthest_point_sampling_kernel<512>
<<<b, n_threads, 0, stream>>>(b, n, m, dataset, temp, idxs);
}
err = cudaGetLastError();
if (cudaSuccess != err) {
fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
exit(-1);
}
}
// Modified from
// https://github.com/qiqihaer/3DSSD-pytorch/blob/master/lib/pointnet2/src/sampling_gpu.cu
template <unsigned int block_size>
__global__ void furthest_point_sampling_with_dist_kernel(
int b, int n, int m, const float *__restrict__ dataset,
float *__restrict__ temp, int *__restrict__ idxs) {
// dataset: (B, N, N)
// tmp: (B, N)
// output:
// idx: (B, M)
if (m <= 0)
return;
__shared__ float dists[block_size];
__shared__ int dists_i[block_size];
int batch_index = blockIdx.x;
dataset += batch_index * n * n;
temp += batch_index * n;
idxs += batch_index * m;
int tid = threadIdx.x;
const int stride = block_size;
int old = 0;
if (threadIdx.x == 0)
idxs[0] = old;
__syncthreads();
for (int j = 1; j < m; j++) {
int besti = 0;
float best = -1;
// float x1 = dataset[old * 3 + 0];
// float y1 = dataset[old * 3 + 1];
// float z1 = dataset[old * 3 + 2];
for (int k = tid; k < n; k += stride) {
// float x2, y2, z2;
// x2 = dataset[k * 3 + 0];
// y2 = dataset[k * 3 + 1];
// z2 = dataset[k * 3 + 2];
// float d = (x2 - x1) * (x2 - x1) + (y2 - y1) * (y2 - y1) + (z2 - z1) *
// (z2 - z1);
float d = dataset[old * n + k];
float d2 = min(d, temp[k]);
temp[k] = d2;
besti = d2 > best ? k : besti;
best = d2 > best ? d2 : best;
}
dists[tid] = best;
dists_i[tid] = besti;
__syncthreads();
if (block_size >= 1024) {
if (tid < 512) {
__update(dists, dists_i, tid, tid + 512);
}
__syncthreads();
}
if (block_size >= 512) {
if (tid < 256) {
__update(dists, dists_i, tid, tid + 256);
}
__syncthreads();
}
if (block_size >= 256) {
if (tid < 128) {
__update(dists, dists_i, tid, tid + 128);
}
__syncthreads();
}
if (block_size >= 128) {
if (tid < 64) {
__update(dists, dists_i, tid, tid + 64);
}
__syncthreads();
}
if (block_size >= 64) {
if (tid < 32) {
__update(dists, dists_i, tid, tid + 32);
}
__syncthreads();
}
if (block_size >= 32) {
if (tid < 16) {
__update(dists, dists_i, tid, tid + 16);
}
__syncthreads();
}
if (block_size >= 16) {
if (tid < 8) {
__update(dists, dists_i, tid, tid + 8);
}
__syncthreads();
}
if (block_size >= 8) {
if (tid < 4) {
__update(dists, dists_i, tid, tid + 4);
}
__syncthreads();
}
if (block_size >= 4) {
if (tid < 2) {
__update(dists, dists_i, tid, tid + 2);
}
__syncthreads();
}
if (block_size >= 2) {
if (tid < 1) {
__update(dists, dists_i, tid, tid + 1);
}
__syncthreads();
}
old = dists_i[0];
if (tid == 0)
idxs[j] = old;
}
}
void furthest_point_sampling_with_dist_kernel_launcher(int b, int n, int m,
const float *dataset,
float *temp, int *idxs,
cudaStream_t stream) {
// dataset: (B, N, N)
// temp: (B, N)
// output:
// idx: (B, M)
cudaError_t err;
unsigned int n_threads = opt_n_threads(n);
switch (n_threads) {
case 1024:
furthest_point_sampling_with_dist_kernel<1024><<<b, n_threads, 0, stream>>>(
b, n, m, dataset, temp, idxs);
break;
case 512:
furthest_point_sampling_with_dist_kernel<512><<<b, n_threads, 0, stream>>>(
b, n, m, dataset, temp, idxs);
break;
case 256:
furthest_point_sampling_with_dist_kernel<256><<<b, n_threads, 0, stream>>>(
b, n, m, dataset, temp, idxs);
break;
case 128:
furthest_point_sampling_with_dist_kernel<128><<<b, n_threads, 0, stream>>>(
b, n, m, dataset, temp, idxs);
break;
case 64:
furthest_point_sampling_with_dist_kernel<64><<<b, n_threads, 0, stream>>>(
b, n, m, dataset, temp, idxs);
break;
case 32:
furthest_point_sampling_with_dist_kernel<32><<<b, n_threads, 0, stream>>>(
b, n, m, dataset, temp, idxs);
break;
case 16:
furthest_point_sampling_with_dist_kernel<16><<<b, n_threads, 0, stream>>>(
b, n, m, dataset, temp, idxs);
break;
case 8:
furthest_point_sampling_with_dist_kernel<8><<<b, n_threads, 0, stream>>>(
b, n, m, dataset, temp, idxs);
break;
case 4:
furthest_point_sampling_with_dist_kernel<4><<<b, n_threads, 0, stream>>>(
b, n, m, dataset, temp, idxs);
break;
case 2:
furthest_point_sampling_with_dist_kernel<2><<<b, n_threads, 0, stream>>>(
b, n, m, dataset, temp, idxs);
break;
case 1:
furthest_point_sampling_with_dist_kernel<1><<<b, n_threads, 0, stream>>>(
b, n, m, dataset, temp, idxs);
break;
default:
furthest_point_sampling_with_dist_kernel<512><<<b, n_threads, 0, stream>>>(
b, n, m, dataset, temp, idxs);
}
err = cudaGetLastError();
if (cudaSuccess != err) {
fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
exit(-1);
}
}
================================================
FILE: mmdet3d/ops/furthest_point_sample/utils.py
================================================
import torch
def calc_square_dist(point_feat_a, point_feat_b, norm=True):
"""Calculating square distance between a and b.
Args:
point_feat_a (Tensor): (B, N, C) Feature vector of each point.
point_feat_b (Tensor): (B, M, C) Feature vector of each point.
norm (bool): Whether to normalize the distance.
Default: True.
Returns:
Tensor: (B, N, M) Distance between each pair of points.
"""
length_a = point_feat_a.shape[1]
length_b = point_feat_b.shape[1]
num_channel = point_feat_a.shape[-1]
# [bs, n, 1]
a_square = torch.sum(point_feat_a.unsqueeze(dim=2).pow(2), dim=-1)
# [bs, 1, m]
b_square = torch.sum(point_feat_b.unsqueeze(dim=1).pow(2), dim=-1)
a_square = a_square.repeat((1, 1, length_b)) # [bs, n, m]
b_square = b_square.repeat((1, length_a, 1)) # [bs, n, m]
coor = torch.matmul(point_feat_a, point_feat_b.transpose(1, 2))
dist = a_square + b_square - 2 * coor
if norm:
dist = torch.sqrt(dist) / num_channel
return dist
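# Illustrative sketch (not part of the original file): a CPU sanity check that
# the unnormalized output of calc_square_dist matches squared Euclidean
# distances from torch.cdist. The tensor shapes are arbitrary.
if __name__ == '__main__':
    a = torch.rand(2, 8, 5)                      # (B, N, C)
    b = torch.rand(2, 6, 5)                      # (B, M, C)
    d1 = calc_square_dist(a, b, norm=False)      # (B, N, M)
    d2 = torch.cdist(a, b).pow(2)
    print(torch.allclose(d1, d2, atol=1e-5))     # expected: True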
================================================
FILE: mmdet3d/ops/gather_points/__init__.py
================================================
from .gather_points import gather_points
__all__ = ['gather_points']
================================================
FILE: mmdet3d/ops/gather_points/gather_points.py
================================================
import torch
from torch.autograd import Function
from . import gather_points_ext
class GatherPoints(Function):
"""Gather Points.
Gather points with given index.
"""
@staticmethod
def forward(ctx, features: torch.Tensor,
indices: torch.Tensor) -> torch.Tensor:
"""forward.
Args:
features (Tensor): (B, C, N) features to gather.
indices (Tensor): (B, M) where M is the number of points.
Returns:
Tensor: (B, C, M) where M is the number of points.
"""
assert features.is_contiguous()
assert indices.is_contiguous()
B, npoint = indices.size()
_, C, N = features.size()
output = torch.cuda.FloatTensor(B, C, npoint)
gather_points_ext.gather_points_wrapper(B, C, N, npoint, features,
indices, output)
ctx.for_backwards = (indices, C, N)
ctx.mark_non_differentiable(indices)
return output
@staticmethod
def backward(ctx, grad_out):
idx, C, N = ctx.for_backwards
B, npoint = idx.size()
grad_features = torch.cuda.FloatTensor(B, C, N).zero_()
grad_out_data = grad_out.data.contiguous()
gather_points_ext.gather_points_grad_wrapper(B, C, N, npoint,
grad_out_data, idx,
grad_features.data)
return grad_features, None
gather_points = GatherPoints.apply
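# Illustrative sketch (not part of the original file): gathering per-point
# features at sampled indices, assuming the gather_points_ext CUDA extension
# is built and a GPU is present. In practice the int32 indices usually come
# from furthest_point_sample; random indices are used here for brevity.
if __name__ == '__main__':
    if torch.cuda.is_available():
        feats = torch.rand(2, 16, 1024, device='cuda')              # (B, C, N)
        idx = torch.randint(0, 1024, (2, 128),
                            dtype=torch.int32, device='cuda')       # (B, M)
        sampled = gather_points(feats, idx)                         # (B, C, M)
        print(sampled.shape)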
================================================
FILE: mmdet3d/ops/gather_points/src/gather_points.cpp
================================================
#include <torch/extension.h>
#include <ATen/cuda/CUDAContext.h>
#include <cuda.h>
#include <cuda_runtime_api.h>
#include <THC/THC.h>
extern THCState *state;
int gather_points_wrapper(int b, int c, int n, int npoints,
at::Tensor points_tensor, at::Tensor idx_tensor,
at::Tensor out_tensor);
void gather_points_kernel_launcher(int b, int c, int n, int npoints,
const float *points, const int *idx,
float *out, cudaStream_t stream);
int gather_points_grad_wrapper(int b, int c, int n, int npoints,
at::Tensor grad_out_tensor,
at::Tensor idx_tensor,
at::Tensor grad_points_tensor);
void gather_points_grad_kernel_launcher(int b, int c, int n, int npoints,
const float *grad_out, const int *idx,
float *grad_points,
cudaStream_t stream);
int gather_points_wrapper(int b, int c, int n, int npoints,
at::Tensor points_tensor, at::Tensor idx_tensor,
at::Tensor out_tensor) {
const float *points = points_tensor.data_ptr<float>();
const int *idx = idx_tensor.data_ptr<int>();
float *out = out_tensor.data_ptr<float>();
cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();
gather_points_kernel_launcher(b, c, n, npoints, points, idx, out, stream);
return 1;
}
int gather_points_grad_wrapper(int b, int c, int n, int npoints,
at::Tensor grad_out_tensor,
at::Tensor idx_tensor,
at::Tensor grad_points_tensor) {
const float *grad_out = grad_out_tensor.data_ptr<float>();
const int *idx = idx_tensor.data_ptr<int>();
float *grad_points = grad_points_tensor.data_ptr<float>();
cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();
gather_points_grad_kernel_launcher(b, c, n, npoints, grad_out, idx,
grad_points, stream);
return 1;
}
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("gather_points_wrapper", &gather_points_wrapper,
"gather_points_wrapper");
m.def("gather_points_grad_wrapper", &gather_points_grad_wrapper,
"gather_points_grad_wrapper");
}
================================================
FILE: mmdet3d/ops/gather_points/src/gather_points_cuda.cu
================================================
#include <stdio.h>
#include <stdlib.h>
#define TOTAL_THREADS 1024
#define THREADS_PER_BLOCK 256
#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))
__global__ void gather_points_kernel(int b, int c, int n, int m,
const float *__restrict__ points,
const int *__restrict__ idx,
float *__restrict__ out) {
// points: (B, C, N)
// idx: (B, M)
// output:
// out: (B, C, M)
int bs_idx = blockIdx.z;
int c_idx = blockIdx.y;
int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
if (bs_idx >= b || c_idx >= c || pt_idx >= m) return;
out += bs_idx * c * m + c_idx * m + pt_idx;
idx += bs_idx * m + pt_idx;
points += bs_idx * c * n + c_idx * n;
out[0] = points[idx[0]];
}
void gather_points_kernel_launcher(int b, int c, int n, int npoints,
const float *points, const int *idx,
float *out, cudaStream_t stream) {
// points: (B, C, N)
// idx: (B, npoints)
// output:
// out: (B, C, npoints)
cudaError_t err;
dim3 blocks(DIVUP(npoints, THREADS_PER_BLOCK), c,
b); // blockIdx.x(col), blockIdx.y(row)
dim3 threads(THREADS_PER_BLOCK);
gather_points_kernel<<<blocks, threads, 0, stream>>>(b, c, n, npoints, points,
idx, out);
err = cudaGetLastError();
if (cudaSuccess != err) {
fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
exit(-1);
}
}
__global__ void gather_points_grad_kernel(int b, int c, int n, int m,
const float *__restrict__ grad_out,
const int *__restrict__ idx,
float *__restrict__ grad_points) {
// grad_out: (B, C, M)
// idx: (B, M)
// output:
// grad_points: (B, C, N)
int bs_idx = blockIdx.z;
int c_idx = blockIdx.y;
int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
if (bs_idx >= b || c_idx >= c || pt_idx >= m) return;
grad_out += bs_idx * c * m + c_idx * m + pt_idx;
idx += bs_idx * m + pt_idx;
grad_points += bs_idx * c * n + c_idx * n;
atomicAdd(grad_points + idx[0], grad_out[0]);
}
void gather_points_grad_kernel_launcher(int b, int c, int n, int npoints,
const float *grad_out, const int *idx,
float *grad_points,
cudaStream_t stream) {
// grad_out: (B, C, npoints)
// idx: (B, npoints)
// output:
// grad_points: (B, C, N)
cudaError_t err;
dim3 blocks(DIVUP(npoints, THREADS_PER_BLOCK), c,
b); // blockIdx.x(col), blockIdx.y(row)
dim3 threads(THREADS_PER_BLOCK);
gather_points_grad_kernel<<<blocks, threads, 0, stream>>>(
b, c, n, npoints, grad_out, idx, grad_points);
err = cudaGetLastError();
if (cudaSuccess != err) {
fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
exit(-1);
}
}
================================================
FILE: mmdet3d/ops/group_points/__init__.py
================================================
from .group_points import GroupAll, QueryAndGroup, grouping_operation
__all__ = ['QueryAndGroup', 'GroupAll', 'grouping_operation']
================================================
FILE: mmdet3d/ops/group_points/group_points.py
================================================
import torch
from torch import nn as nn
from torch.autograd import Function
from typing import Tuple
from ..ball_query import ball_query
from . import group_points_ext
class QueryAndGroup(nn.Module):
"""Query and Group.
Groups with a ball query of radius
Args:
max_radius (float): The maximum radius of the balls.
sample_num (int): Maximum number of features to gather in the ball.
min_radius (float): The minimum radius of the balls.
use_xyz (bool): Whether to use xyz.
Default: True.
return_grouped_xyz (bool): Whether to return grouped xyz.
Default: False.
normalize_xyz (bool): Whether to normalize xyz.
Default: False.
uniform_sample (bool): Whether to sample uniformly.
Default: False
return_unique_cnt (bool): Whether to return the count of
unique samples.
Default: False.
"""
def __init__(self,
max_radius,
sample_num,
min_radius=0,
use_xyz=True,
return_grouped_xyz=False,
normalize_xyz=False,
uniform_sample=False,
return_unique_cnt=False):
super(QueryAndGroup, self).__init__()
self.max_radius = max_radius
self.min_radius = min_radius
self.sample_num = sample_num
self.use_xyz = use_xyz
self.return_grouped_xyz = return_grouped_xyz
self.normalize_xyz = normalize_xyz
self.uniform_sample = uniform_sample
self.return_unique_cnt = return_unique_cnt
if self.return_unique_cnt:
assert self.uniform_sample
def forward(self, points_xyz, center_xyz, features=None):
"""forward.
Args:
points_xyz (Tensor): (B, N, 3) xyz coordinates of the features.
center_xyz (Tensor): (B, npoint, 3) Centroids.
features (Tensor): (B, C, N) Descriptors of the features.
Return:
Tensor: (B, 3 + C, npoint, sample_num) Grouped feature.
"""
idx = ball_query(self.min_radius, self.max_radius, self.sample_num,
points_xyz, center_xyz)
if self.uniform_sample:
unique_cnt = torch.zeros((idx.shape[0], idx.shape[1]))
for i_batch in range(idx.shape[0]):
for i_region in range(idx.shape[1]):
unique_ind = torch.unique(idx[i_batch, i_region, :])
num_unique = unique_ind.shape[0]
unique_cnt[i_batch, i_region] = num_unique
sample_ind = torch.randint(
0,
num_unique, (self.sample_num - num_unique, ),
dtype=torch.long)
all_ind = torch.cat((unique_ind, unique_ind[sample_ind]))
idx[i_batch, i_region, :] = all_ind
xyz_trans = points_xyz.transpose(1, 2).contiguous()
# (B, 3, npoint, sample_num)
grouped_xyz = grouping_operation(xyz_trans, idx)
grouped_xyz -= center_xyz.transpose(1, 2).unsqueeze(-1)
if self.normalize_xyz:
grouped_xyz /= self.max_radius
if features is not None:
grouped_features = grouping_operation(features, idx)
if self.use_xyz:
# (B, C + 3, npoint, sample_num)
new_features = torch.cat([grouped_xyz, grouped_features],
dim=1)
else:
new_features = grouped_features
else:
assert (self.use_xyz
), 'Cannot have both features=None and use_xyz=False!'
new_features = grouped_xyz
ret = [new_features]
if self.return_grouped_xyz:
ret.append(grouped_xyz)
if self.return_unique_cnt:
ret.append(unique_cnt)
if len(ret) == 1:
return ret[0]
else:
return tuple(ret)
class GroupAll(nn.Module):
"""Group All.
Group xyz with feature.
Args:
use_xyz (bool): Whether to use xyz.
"""
def __init__(self, use_xyz: bool = True):
super().__init__()
self.use_xyz = use_xyz
def forward(self,
xyz: torch.Tensor,
new_xyz: torch.Tensor,
features: torch.Tensor = None):
"""forward.
Args:
xyz (Tensor): (B, N, 3) xyz coordinates of the features.
new_xyz (Tensor): Ignored.
features (Tensor): (B, C, N) features to group.
Return:
Tensor: (B, C + 3, 1, N) Grouped feature.
"""
grouped_xyz = xyz.transpose(1, 2).unsqueeze(2)
if features is not None:
grouped_features = features.unsqueeze(2)
if self.use_xyz:
new_features = torch.cat([grouped_xyz, grouped_features],
dim=1) # (B, 3 + C, 1, N)
else:
new_features = grouped_features
else:
new_features = grouped_xyz
return new_features
class GroupingOperation(Function):
"""Grouping Operation.
Group feature with given index.
"""
@staticmethod
def forward(ctx, features: torch.Tensor,
indices: torch.Tensor) -> torch.Tensor:
"""forward.
Args:
features (Tensor): (B, C, N) tensor of features to group.
indices (Tensor): (B, npoint, nsample) the indices of
features to group with.
Returns:
Tensor: (B, C, npoint, nsample) Grouped features.
"""
assert features.is_contiguous()
assert indices.is_contiguous()
B, nfeatures, nsample = indices.size()
_, C, N = features.size()
output = torch.cuda.FloatTensor(B, C, nfeatures, nsample)
group_points_ext.forward(B, C, N, nfeatures, nsample, features,
indices, output)
ctx.for_backwards = (indices, N)
return output
@staticmethod
def backward(ctx,
grad_out: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
"""backward.
Args:
grad_out (Tensor): (B, C, npoint, nsample) tensor of the gradients
of the output from forward.
Returns:
Tensor: (B, C, N) gradient of the features.
"""
idx, N = ctx.for_backwards
B, C, npoint, nsample = grad_out.size()
grad_features = torch.cuda.FloatTensor(B, C, N).zero_()
grad_out_data = grad_out.data.contiguous()
group_points_ext.backward(B, C, N, npoint, nsample, grad_out_data, idx,
grad_features.data)
return grad_features, None
grouping_operation = GroupingOperation.apply
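# Illustrative sketch (not part of the original file): grouping neighbouring
# point features around sampled centers with QueryAndGroup, assuming the
# ball_query and group_points CUDA extensions are built and a GPU is present.
# With use_xyz=True (the default) the grouped xyz offsets are concatenated to
# the grouped features.
if __name__ == '__main__':
    if torch.cuda.is_available():
        xyz = torch.rand(2, 1024, 3, device='cuda')      # (B, N, 3)
        centers = xyz[:, :64, :].contiguous()            # (B, npoint, 3)
        feats = torch.rand(2, 32, 1024, device='cuda')   # (B, C, N)
        grouper = QueryAndGroup(max_radius=0.4, sample_num=16)
        grouped = grouper(xyz, centers, feats)           # (B, C + 3, 64, 16)
        print(grouped.shape)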
================================================
FILE: mmdet3d/ops/group_points/src/group_points.cpp
================================================
// Modified from
// https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/group_points.cpp
#include <torch/extension.h>
#include <torch/serialize/tensor.h>
#include <ATen/cuda/CUDAContext.h>
#include <cuda.h>
#include <cuda_runtime_api.h>
#include <THC/THC.h>
extern THCState *state;
int group_points_wrapper(int b, int c, int n, int npoints, int nsample,
at::Tensor points_tensor, at::Tensor idx_tensor,
at::Tensor out_tensor);
void group_points_kernel_launcher(int b, int c, int n, int npoints, int nsample,
const float *points, const int *idx,
float *out, cudaStream_t stream);
int group_points_grad_wrapper(int b, int c, int n, int npoints, int nsample,
at::Tensor grad_out_tensor, at::Tensor idx_tensor,
at::Tensor grad_points_tensor);
void group_points_grad_kernel_launcher(int b, int c, int n, int npoints,
int nsample, const float *grad_out,
const int *idx, float *grad_points,
cudaStream_t stream);
int group_points_grad_wrapper(int b, int c, int n, int npoints, int nsample,
at::Tensor grad_out_tensor, at::Tensor idx_tensor,
at::Tensor grad_points_tensor) {
float *grad_points = grad_points_tensor.data_ptr<float>();
const int *idx = idx_tensor.data_ptr<int>();
const float *grad_out = grad_out_tensor.data_ptr<float>();
cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();
group_points_grad_kernel_launcher(b, c, n, npoints, nsample, grad_out, idx,
grad_points, stream);
return 1;
}
int group_points_wrapper(int b, int c, int n, int npoints, int nsample,
at::Tensor points_tensor, at::Tensor idx_tensor,
at::Tensor out_tensor) {
const float *points = points_tensor.data_ptr<float>();
const int *idx = idx_tensor.data_ptr<int>();
float *out = out_tensor.data_ptr<float>();
cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();
group_points_kernel_launcher(b, c, n, npoints, nsample, points, idx, out,
stream);
return 1;
}
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("forward", &group_points_wrapper, "group_points_wrapper");
m.def("backward", &group_points_grad_wrapper, "group_points_grad_wrapper");
}
================================================
FILE: mmdet3d/ops/group_points/src/group_points_cuda.cu
================================================
// Modified from
// https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/group_points_gpu.cu
#include <stdio.h>
#include <stdlib.h>
#define THREADS_PER_BLOCK 256
#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))
__global__ void group_points_grad_kernel(int b, int c, int n, int npoints,
int nsample,
const float *__restrict__ grad_out,
const int *__restrict__ idx,
float *__restrict__ grad_points) {
// grad_out: (B, C, npoints, nsample)
// idx: (B, npoints, nsample)
// output:
// grad_points: (B, C, N)
int bs_idx = blockIdx.z;
int c_idx = blockIdx.y;
int index = blockIdx.x * blockDim.x + threadIdx.x;
int pt_idx = index / nsample;
if (bs_idx >= b || c_idx >= c || pt_idx >= npoints) return;
int sample_idx = index % nsample;
grad_out += bs_idx * c * npoints * nsample + c_idx * npoints * nsample +
pt_idx * nsample + sample_idx;
idx += bs_idx * npoints * nsample + pt_idx * nsample + sample_idx;
atomicAdd(grad_points + bs_idx * c * n + c_idx * n + idx[0], grad_out[0]);
}
void group_points_grad_kernel_launcher(int b, int c, int n, int npoints,
int nsample, const float *grad_out,
const int *idx, float *grad_points,
cudaStream_t stream) {
// grad_out: (B, C, npoints, nsample)
// idx: (B, npoints, nsample)
// output:
// grad_points: (B, C, N)
cudaError_t err;
dim3 blocks(DIVUP(npoints * nsample, THREADS_PER_BLOCK), c,
b); // blockIdx.x(col), blockIdx.y(row)
dim3 threads(THREADS_PER_BLOCK);
group_points_grad_kernel<<<blocks, threads, 0, stream>>>(
b, c, n, npoints, nsample, grad_out, idx, grad_points);
err = cudaGetLastError();
if (cudaSuccess != err) {
fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
exit(-1);
}
}
__global__ void group_points_kernel(int b, int c, int n, int npoints,
int nsample,
const float *__restrict__ points,
const int *__restrict__ idx,
float *__restrict__ out) {
// points: (B, C, N)
// idx: (B, npoints, nsample)
// output:
// out: (B, C, npoints, nsample)
int bs_idx = blockIdx.z;
int c_idx = blockIdx.y;
int index = blockIdx.x * blockDim.x + threadIdx.x;
int pt_idx = index / nsample;
if (bs_idx >= b || c_idx >= c || pt_idx >= npoints) return;
int sample_idx = index % nsample;
idx += bs_idx * npoints * nsample + pt_idx * nsample + sample_idx;
int in_idx = bs_idx * c * n + c_idx * n + idx[0];
int out_idx = bs_idx * c * npoints * nsample + c_idx * npoints * nsample +
pt_idx * nsample + sample_idx;
out[out_idx] = points[in_idx];
}
void group_points_kernel_launcher(int b, int c, int n, int npoints, int nsample,
const float *points, const int *idx,
float *out, cudaStream_t stream) {
// points: (B, C, N)
// idx: (B, npoints, nsample)
// output:
// out: (B, C, npoints, nsample)
cudaError_t err;
dim3 blocks(DIVUP(npoints * nsample, THREADS_PER_BLOCK), c,
b); // blockIdx.x(col), blockIdx.y(row)
dim3 threads(THREADS_PER_BLOCK);
group_points_kernel<<<blocks, threads, 0, stream>>>(b, c, n, npoints, nsample,
points, idx, out);
// cudaDeviceSynchronize(); // for using printf in kernel function
err = cudaGetLastError();
if (cudaSuccess != err) {
fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
exit(-1);
}
}
================================================
FILE: mmdet3d/ops/interpolate/__init__.py
================================================
from .three_interpolate import three_interpolate
from .three_nn import three_nn
__all__ = ['three_nn', 'three_interpolate']
================================================
FILE: mmdet3d/ops/interpolate/src/interpolate.cpp
================================================
// Modified from
// https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/interpolate.cpp
#include <torch/extension.h>
#include <torch/serialize/tensor.h>
#include <ATen/cuda/CUDAContext.h>
#include <cuda.h>
#include <cuda_runtime_api.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <THC/THC.h>
extern THCState *state;
void three_nn_wrapper(int b, int n, int m, at::Tensor unknown_tensor,
at::Tensor known_tensor, at::Tensor dist2_tensor,
at::Tensor idx_tensor);
void three_nn_kernel_launcher(int b, int n, int m, const float *unknown,
const float *known, float *dist2, int *idx,
cudaStream_t stream);
void three_interpolate_wrapper(int b, int c, int m, int n,
at::Tensor points_tensor, at::Tensor idx_tensor,
at::Tensor weight_tensor, at::Tensor out_tensor);
void three_interpolate_kernel_launcher(int b, int c, int m, int n,
const float *points, const int *idx,
const float *weight, float *out,
cudaStream_t stream);
void three_interpolate_grad_wrapper(int b, int c, int n, int m,
at::Tensor grad_out_tensor,
at::Tensor idx_tensor,
at::Tensor weight_tensor,
at::Tensor grad_points_tensor);
void three_interpolate_grad_kernel_launcher(int b, int c, int n, int m,
const float *grad_out,
const int *idx, const float *weight,
float *grad_points,
cudaStream_t stream);
void three_nn_wrapper(int b, int n, int m, at::Tensor unknown_tensor,
at::Tensor known_tensor, at::Tensor dist2_tensor,
at::Tensor idx_tensor) {
const float *unknown = unknown_tensor.data_ptr<float>();
const float *known = known_tensor.data_ptr<float>();
float *dist2 = dist2_tensor.data_ptr<float>();
int *idx = idx_tensor.data_ptr<int>();
cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();
three_nn_kernel_launcher(b, n, m, unknown, known, dist2, idx, stream);
}
void three_interpolate_wrapper(int b, int c, int m, int n,
at::Tensor points_tensor, at::Tensor idx_tensor,
at::Tensor weight_tensor,
at::Tensor out_tensor) {
const float *points = points_tensor.data_ptr<float>();
const float *weight = weight_tensor.data_ptr<float>();
float *out = out_tensor.data_ptr<float>();
const int *idx = idx_tensor.data_ptr<int>();
cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();
three_interpolate_kernel_launcher(b, c, m, n, points, idx, weight, out,
stream);
}
void three_interpolate_grad_wrapper(int b, int c, int n, int m,
at::Tensor grad_out_tensor,
at::Tensor idx_tensor,
at::Tensor weight_tensor,
at::Tensor grad_points_tensor) {
const float *grad_out = grad_out_tensor.data_ptr<float>();
const float *weight = weight_tensor.data_ptr<float>();
float *grad_points = grad_points_tensor.data_ptr<float>();
const int *idx = idx_tensor.data_ptr<int>();
cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();
three_interpolate_grad_kernel_launcher(b, c, n, m, grad_out, idx, weight,
grad_points, stream);
}
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("three_nn_wrapper", &three_nn_wrapper, "three_nn_wrapper");
m.def("three_interpolate_wrapper", &three_interpolate_wrapper,
"three_interpolate_wrapper");
m.def("three_interpolate_grad_wrapper", &three_interpolate_grad_wrapper,
"three_interpolate_grad_wrapper");
}
================================================
FILE: mmdet3d/ops/interpolate/src/three_interpolate_cuda.cu
================================================
// Modified from
// https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/interpolate_gpu.cu
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#define THREADS_PER_BLOCK 256
#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))
__global__ void three_interpolate_kernel(int b, int c, int m, int n,
const float *__restrict__ points,
const int *__restrict__ idx,
const float *__restrict__ weight,
float *__restrict__ out) {
// points: (B, C, M)
// idx: (B, N, 3)
// weight: (B, N, 3)
// output:
// out: (B, C, N)
int bs_idx = blockIdx.z;
int c_idx = blockIdx.y;
int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
if (bs_idx >= b || c_idx >= c || pt_idx >= n) return;
weight += bs_idx * n * 3 + pt_idx * 3;
points += bs_idx * c * m + c_idx * m;
idx += bs_idx * n * 3 + pt_idx * 3;
out += bs_idx * c * n + c_idx * n;
out[pt_idx] = weight[0] * points[idx[0]] + weight[1] * points[idx[1]] +
weight[2] * points[idx[2]];
}
void three_interpolate_kernel_launcher(int b, int c, int m, int n,
const float *points, const int *idx,
const float *weight, float *out,
cudaStream_t stream) {
// points: (B, C, M)
// idx: (B, N, 3)
// weight: (B, N, 3)
// output:
// out: (B, C, N)
cudaError_t err;
dim3 blocks(DIVUP(n, THREADS_PER_BLOCK), c,
b); // blockIdx.x(col), blockIdx.y(row)
dim3 threads(THREADS_PER_BLOCK);
three_interpolate_kernel<<<blocks, threads, 0, stream>>>(b, c, m, n, points,
idx, weight, out);
err = cudaGetLastError();
if (cudaSuccess != err) {
fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
exit(-1);
}
}
__global__ void three_interpolate_grad_kernel(
int b, int c, int n, int m, const float *__restrict__ grad_out,
const int *__restrict__ idx, const float *__restrict__ weight,
float *__restrict__ grad_points) {
// grad_out: (B, C, N)
// weight: (B, N, 3)
// output:
// grad_points: (B, C, M)
int bs_idx = blockIdx.z;
int c_idx = blockIdx.y;
int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
if (bs_idx >= b || c_idx >= c || pt_idx >= n) return;
grad_out += bs_idx * c * n + c_idx * n + pt_idx;
weight += bs_idx * n * 3 + pt_idx * 3;
grad_points += bs_idx * c * m + c_idx * m;
idx += bs_idx * n * 3 + pt_idx * 3;
atomicAdd(grad_points + idx[0], grad_out[0] * weight[0]);
atomicAdd(grad_points + idx[1], grad_out[0] * weight[1]);
atomicAdd(grad_points + idx[2], grad_out[0] * weight[2]);
}
void three_interpolate_grad_kernel_launcher(int b, int c, int n, int m,
const float *grad_out,
const int *idx, const float *weight,
float *grad_points,
cudaStream_t stream) {
// grad_out: (B, C, N)
// weight: (B, N, 3)
// output:
// grad_points: (B, C, M)
cudaError_t err;
dim3 blocks(DIVUP(n, THREADS_PER_BLOCK), c,
b); // blockIdx.x(col), blockIdx.y(row)
dim3 threads(THREADS_PER_BLOCK);
three_interpolate_grad_kernel<<<blocks, threads, 0, stream>>>(
b, c, n, m, grad_out, idx, weight, grad_points);
err = cudaGetLastError();
if (cudaSuccess != err) {
fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
exit(-1);
}
}
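For clarity, the forward kernel's arithmetic can be reproduced in a few lines of plain PyTorch. The sketch below is not part of the repo; it only illustrates the indexing (gather the three neighbor features per output point, then take the weighted sum) and is far slower than the fused CUDA path.
import torch

def three_interpolate_reference(points, idx, weight):
    """Reference for the forward kernel above (sketch only).
    points: (B, C, M) float, idx: (B, N, 3) int64, weight: (B, N, 3) float.
    Returns (B, C, N)."""
    B, C, M = points.shape
    N = idx.shape[1]
    # Gather the three neighbor features for every output point: (B, C, N, 3)
    gathered = torch.gather(
        points.unsqueeze(3).expand(B, C, M, 3), 2,
        idx.unsqueeze(1).expand(B, C, N, 3))
    # Weighted sum over the three neighbors, matching the kernel's
    # out[pt] = w0 * p[idx0] + w1 * p[idx1] + w2 * p[idx2]
    return (gathered * weight.unsqueeze(1)).sum(dim=3)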
================================================
FILE: mmdet3d/ops/interpolate/src/three_nn_cuda.cu
================================================
// Modified from
// https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/interpolate_gpu.cu
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#define THREADS_PER_BLOCK 256
#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))
__global__ void three_nn_kernel(int b, int n, int m,
const float *__restrict__ unknown,
const float *__restrict__ known,
float *__restrict__ dist2,
int *__restrict__ idx) {
// unknown: (B, N, 3)
// known: (B, M, 3)
// output:
// dist2: (B, N, 3)
// idx: (B, N, 3)
int bs_idx = blockIdx.y;
int pt_idx = blockIdx.x * blockDim.x + threadIdx.x;
if (bs_idx >= b || pt_idx >= n) return;
unknown += bs_idx * n * 3 + pt_idx * 3;
known += bs_idx * m * 3;
dist2 += bs_idx * n * 3 + pt_idx * 3;
idx += bs_idx * n * 3 + pt_idx * 3;
float ux = unknown[0];
float uy = unknown[1];
float uz = unknown[2];
double best1 = 1e40, best2 = 1e40, best3 = 1e40;
int besti1 = 0, besti2 = 0, besti3 = 0;
for (int k = 0; k < m; ++k) {
float x = known[k * 3 + 0];
float y = known[k * 3 + 1];
float z = known[k * 3 + 2];
float d = (ux - x) * (ux - x) + (uy - y) * (uy - y) + (uz - z) * (uz - z);
if (d < best1) {
best3 = best2;
besti3 = besti2;
best2 = best1;
besti2 = besti1;
best1 = d;
besti1 = k;
} else if (d < best2) {
best3 = best2;
besti3 = besti2;
best2 = d;
besti2 = k;
} else if (d < best3) {
best3 = d;
besti3 = k;
}
}
dist2[0] = best1;
dist2[1] = best2;
dist2[2] = best3;
idx[0] = besti1;
idx[1] = besti2;
idx[2] = besti3;
}
void three_nn_kernel_launcher(int b, int n, int m, const float *unknown,
const float *known, float *dist2, int *idx,
cudaStream_t stream) {
// unknown: (B, N, 3)
// known: (B, M, 3)
// output:
// dist2: (B, N, 3)
// idx: (B, N, 3)
cudaError_t err;
dim3 blocks(DIVUP(n, THREADS_PER_BLOCK),
b); // blockIdx.x(col), blockIdx.y(row)
dim3 threads(THREADS_PER_BLOCK);
three_nn_kernel<<<blocks, threads, 0, stream>>>(b, n, m, unknown, known,
dist2, idx);
err = cudaGetLastError();
if (cudaSuccess != err) {
fprintf(stderr, "CUDA kernel failed : %s\n", cudaGetErrorString(err));
exit(-1);
}
}
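For reference, the same three-nearest-neighbor search can be expressed with torch.cdist and torch.topk. The sketch below is not part of the repo; like the kernel it returns squared distances, which the Python wrapper later converts with a square root.
import torch

def three_nn_reference(unknown, known):
    """Reference for the kernel above (sketch only).
    unknown: (B, N, 3), known: (B, M, 3).
    Returns squared distances and indices of the 3 closest known points,
    both of shape (B, N, 3)."""
    d2 = torch.cdist(unknown, known, p=2) ** 2        # pairwise squared distances (B, N, M)
    dist2, idx = torch.topk(d2, k=3, dim=2, largest=False)
    return dist2, idx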
================================================
FILE: mmdet3d/ops/interpolate/three_interpolate.py
================================================
import torch
from torch.autograd import Function
from typing import Tuple
from . import interpolate_ext
class ThreeInterpolate(Function):
@staticmethod
def forward(ctx, features: torch.Tensor, indices: torch.Tensor,
weight: torch.Tensor) -> torch.Tensor:
"""Performs weighted linear interpolation on 3 features.
Args:
features (Tensor): (B, C, M) feature descriptors to be
interpolated from
indices (Tensor): (B, N, 3) indices of the three nearest neighbors
of each target feature in ``features``
weight (Tensor): (B, N, 3) interpolation weights of the three neighbors
Returns:
Tensor: (B, C, N) tensor of the interpolated features
"""
assert features.is_contiguous()
assert indices.is_contiguous()
assert weight.is_contiguous()
B, c, m = features.size()
n = indices.size(1)
ctx.three_interpolate_for_backward = (indices, weight, m)
output = torch.cuda.FloatTensor(B, c, n)
interpolate_ext.three_interpolate_wrapper(B, c, m, n, features,
indices, weight, output)
return output
@staticmethod
def backward(
ctx, grad_out: torch.Tensor
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
"""Backward of three interpolate.
Args:
grad_out (Tensor): (B, C, N) tensor with gradients of outputs
Returns:
Tensor: (B, C, M) tensor with gradients of features
"""
idx, weight, m = ctx.three_interpolate_for_backward
B, c, n = grad_out.size()
grad_features = torch.cuda.FloatTensor(B, c, m).zero_()
grad_out_data = grad_out.data.contiguous()
interpolate_ext.three_interpolate_grad_wrapper(B, c, n, m,
grad_out_data, idx,
weight,
grad_features.data)
return grad_features, None, None
three_interpolate = ThreeInterpolate.apply
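A minimal usage sketch for the op defined above (not from the repo; shapes and values are arbitrary, the import path is an assumption, and a CUDA device is required since only a GPU kernel exists):
import torch
# from mmdet3d.ops import three_interpolate  # assumed import location
features = torch.randn(2, 16, 64).cuda()               # (B, C, M) source features
idx = torch.randint(0, 64, (2, 128, 3)).int().cuda()   # (B, N, 3) neighbor indices, int32
weight = torch.rand(2, 128, 3).cuda()
weight = weight / weight.sum(dim=2, keepdim=True)      # weights should sum to 1 per point
out = three_interpolate(features, idx, weight)         # (B, C, N) = (2, 16, 128)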
================================================
FILE: mmdet3d/ops/interpolate/three_nn.py
================================================
import torch
from torch.autograd import Function
from typing import Tuple
from . import interpolate_ext
class ThreeNN(Function):
@staticmethod
def forward(ctx, target: torch.Tensor,
source: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
"""Find the top-3 nearest neighbors of the target set from the source
set.
Args:
target (Tensor): shape (B, N, 3), points set that needs to
find the nearest neighbors.
source (Tensor): shape (B, M, 3), points set that is used
to find the nearest neighbors of points in target set.
Returns:
Tensor: shape (B, N, 3), L2 distances from each point in the
target set to its three nearest neighbors in the source set.
Tensor: shape (B, N, 3), indices of those three nearest
neighbors in the source set.
"""
assert target.is_contiguous()
assert source.is_contiguous()
B, N, _ = target.size()
m = source.size(1)
dist2 = torch.cuda.FloatTensor(B, N, 3)
idx = torch.cuda.IntTensor(B, N, 3)
interpolate_ext.three_nn_wrapper(B, N, m, target, source, dist2, idx)
ctx.mark_non_differentiable(idx)
return torch.sqrt(dist2), idx
@staticmethod
def backward(ctx, a=None, b=None):
return None, None
three_nn = ThreeNN.apply
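Together with three_interpolate, this op forms the PointNet++-style feature-propagation step: find the three nearest source points for every target point, turn the distances into normalized inverse-distance weights, and interpolate. A sketch, assuming both ops are importable from mmdet3d.ops and a CUDA device is available:
import torch
# from mmdet3d.ops import three_nn, three_interpolate  # assumed import location
target_xyz = torch.randn(2, 1024, 3).cuda()    # dense points that need features
source_xyz = torch.randn(2, 256, 3).cuda()     # sparse points carrying features
source_feats = torch.randn(2, 64, 256).cuda()  # (B, C, M)

dist, idx = three_nn(target_xyz, source_xyz)          # both (B, N, 3)
inv_d = 1.0 / (dist + 1e-8)                           # avoid division by zero
weight = inv_d / inv_d.sum(dim=2, keepdim=True)       # normalized inverse-distance weights
dense_feats = three_interpolate(source_feats, idx, weight)  # (B, 64, 1024)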
================================================
FILE: mmdet3d/ops/iou3d/__init__.py
================================================
from .iou3d_utils import boxes_iou_bev, nms_gpu, nms_normal_gpu
__all__ = ['boxes_iou_bev', 'nms_gpu', 'nms_normal_gpu']
================================================
FILE: mmdet3d/ops/iou3d/iou3d_utils.py
================================================
import torch
from . import iou3d_cuda
def boxes_iou_bev(boxes_a, boxes_b):
"""Calculate boxes IoU in the bird view.
Args:
boxes_a (torch.Tensor): Input boxes a with shape (M, 5).
boxes_b (torch.Tensor): Input boxes b with shape (N, 5).
Returns:
ans_iou (torch.Tensor): IoU result with shape (M, N).
"""
ans_iou = boxes_a.new_zeros(
torch.Size((boxes_a.shape[0], boxes_b.shape[0])))
iou3d_cuda.boxes_iou_bev_gpu(boxes_a.contiguous(), boxes_b.contiguous(),
ans_iou)
return ans_iou
def nms_gpu(boxes, scores, thresh, pre_maxsize=None, post_max_size=None):
"""Nms function with gpu implementation.
Args:
boxes (torch.Tensor): Input boxes with the shape of [N, 5]
([x1, y1, x2, y2, ry]).
scores (torch.Tensor): Scores of boxes with the shape of [N].
thresh (float): IoU threshold for suppression.
pre_maxsize (int, optional): Max number of boxes kept before NMS.
Default: None.
post_max_size (int, optional): Max number of boxes kept after NMS.
Default: None.
Returns:
torch.Tensor: Indices of the kept boxes, in descending score order.
"""
order = scores.sort(0, descending=True)[1]
if pre_maxsize is not None:
order = order[:pre_maxsize]
boxes = boxes[order].contiguous()
keep = torch.zeros(boxes.size(0), dtype=torch.long)
num_out = iou3d_cuda.nms_gpu(boxes, keep, thresh, boxes.device.index)
keep = order[keep[:num_out].cuda(boxes.device)].contiguous()
if post_max_size is not None:
keep = keep[:post_max_size]
return keep
def nms_normal_gpu(boxes, scores, thresh):
"""Normal non maximum suppression on GPU.
Args:
boxes (torch.Tensor): Input boxes with shape (N, 5).
scores (torch.Tensor): Scores of predicted boxes with shape (N).
thresh (float): IoU threshold of non-maximum suppression.
Returns:
torch.Tensor: Remaining indices with scores in descending order.
"""
order = scores.sort(0, descending=True)[1]
boxes = boxes[order].contiguous()
keep = torch.zeros(boxes.size(0), dtype=torch.long)
num_out = iou3d_cuda.nms_normal_gpu(boxes, keep, thresh,
boxes.device.index)
return order[keep[:num_out].cuda(boxes.device)].contiguous()
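A small usage sketch (not from the repo; values are made up and the import path is an assumption). The boxes follow the (x1, y1, x2, y2, ry) BEV layout documented in the CUDA bindings that follow:
import torch
# from mmdet3d.ops import boxes_iou_bev, nms_gpu  # assumed import location
boxes = torch.tensor([[0.0, 0.0, 2.0, 2.0, 0.0],
                      [0.1, 0.1, 2.1, 2.1, 0.0],
                      [5.0, 5.0, 7.0, 7.0, 0.0]]).cuda()
scores = torch.tensor([0.9, 0.8, 0.7]).cuda()
iou = boxes_iou_bev(boxes, boxes)          # (3, 3) pairwise BEV IoU
keep = nms_gpu(boxes, scores, thresh=0.5)  # box 1 heavily overlaps box 0 and should be dropped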
================================================
FILE: mmdet3d/ops/iou3d/src/iou3d.cpp
================================================
// Modified from
// https://github.com/open-mmlab/OpenPCDet/blob/master/pcdet/ops/iou3d_nms/src/iou3d_nms.cpp
/*
3D IoU Calculation and Rotated NMS (modified from 2D NMS written by others)
Written by Shaoshuai Shi
All Rights Reserved 2019-2020.
*/
#include <torch/serialize/tensor.h>
#include <torch/extension.h>
#include <vector>
#include <cuda.h>
#include <cuda_runtime_api.h>
#define CHECK_CUDA(x) \
TORCH_CHECK(x.device().is_cuda(), #x, " must be a CUDAtensor ")
#define CHECK_CONTIGUOUS(x) \
TORCH_CHECK(x.is_contiguous(), #x, " must be contiguous ")
#define CHECK_INPUT(x) \
CHECK_CUDA(x); \
CHECK_CONTIGUOUS(x)
#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))
#define CHECK_ERROR(ans) \
{ gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line,
bool abort = true) {
if (code != cudaSuccess) {
fprintf(stderr, "GPUassert: %s %s %d\n", cudaGetErrorString(code), file,
line);
if (abort) exit(code);
}
}
const int THREADS_PER_BLOCK_NMS = sizeof(unsigned long long) * 8;
void boxesoverlapLauncher(const int num_a, const float *boxes_a,
const int num_b, const float *boxes_b,
float *ans_overlap);
void boxesioubevLauncher(const int num_a, const float *boxes_a, const int num_b,
const float *boxes_b, float *ans_iou);
void nmsLauncher(const float *boxes, unsigned long long *mask, int boxes_num,
float nms_overlap_thresh);
void nmsNormalLauncher(const float *boxes, unsigned long long *mask,
int boxes_num, float nms_overlap_thresh);
int boxes_overlap_bev_gpu(at::Tensor boxes_a, at::Tensor boxes_b,
at::Tensor ans_overlap) {
// params boxes_a: (N, 5) [x1, y1, x2, y2, ry]
// params boxes_b: (M, 5)
// params ans_overlap: (N, M)
CHECK_INPUT(boxes_a);
CHECK_INPUT(boxes_b);
CHECK_INPUT(ans_overlap);
int num_a = boxes_a.size(0);
int num_b = boxes_b.size(0);
const float *boxes_a_data = boxes_a.data_ptr<float>();
const float *boxes_b_data = boxes_b.data_ptr<float>();
float *ans_overlap_data = ans_overlap.data_ptr<float>();
boxesoverlapLauncher(num_a, boxes_a_data, num_b, boxes_b_data,
ans_overlap_data);
return 1;
}
int boxes_iou_bev_gpu(at::Tensor boxes_a, at::Tensor boxes_b,
at::Tensor ans_iou) {
// params boxes_a: (N, 5) [x1, y1, x2, y2, ry]
// params boxes_b: (M, 5)
// params ans_iou: (N, M)
CHECK_INPUT(boxes_a);
CHECK_INPUT(boxes_b);
CHECK_INPUT(ans_iou);
int num_a = boxes_a.size(0);
int num_b = boxes_b.size(0);
const float *boxes_a_data = boxes_a.data_ptr<float>();
const float *boxes_b_data = boxes_b.data_ptr<float>();
float *ans_iou_data = ans_iou.data_ptr<float>();
boxesioubevLauncher(num_a, boxes_a_data, num_b, boxes_b_data, ans_iou_data);
return 1;
}
int nms_gpu(at::Tensor boxes, at::Tensor keep,
float nms_overlap_thresh, int device_id) {
// params boxes: (N, 5) [x1, y1, x2, y2, ry]
// params keep: (N)
CHECK_INPUT(boxes);
CHECK_CONTIGUOUS(keep);
cudaSetDevice(device_id);
int boxes_num = boxes.size(0);
const float *boxes_data = boxes.data_ptr<float>();
long *keep_data = keep.data_ptr<long>();
const int col_blocks = DIVUP(boxes_num, THREADS_PER_BLOCK_NMS);
unsigned long long *mask_data = NULL;
CHECK_ERROR(cudaMalloc((void **)&mask_data,
boxes_num * col_blocks * sizeof(unsigned long long)));
nmsLauncher(boxes_data, mask_data, boxes_num, nms_overlap_thresh);
// unsigned long long mask_cpu[boxes_num * col_blocks];
// unsigned long long *mask_cpu = new unsigned long long [boxes_num *
// col_blocks];
std::vector