Repository: NVlabs/BEV-Planner Branch: main Commit: 01c28d6db56a Files: 481 Total size: 4.2 MB Directory structure: gitextract_5bm2vozl/ ├── .gitignore ├── README.md ├── configs/ │ ├── _base_/ │ │ ├── datasets/ │ │ │ ├── coco_instance.py │ │ │ ├── kitti-3d-3class.py │ │ │ ├── kitti-3d-car.py │ │ │ ├── kitti-mono3d.py │ │ │ ├── lyft-3d.py │ │ │ ├── nuim_instance.py │ │ │ ├── nus-3d.py │ │ │ ├── nus-mono3d.py │ │ │ ├── range100_lyft-3d.py │ │ │ ├── s3dis-3d-5class.py │ │ │ ├── s3dis_seg-3d-13class.py │ │ │ ├── scannet-3d-18class.py │ │ │ ├── scannet_seg-3d-20class.py │ │ │ ├── sunrgbd-3d-10class.py │ │ │ ├── waymoD5-3d-3class.py │ │ │ └── waymoD5-3d-car.py │ │ ├── default_runtime.py │ │ ├── init.py │ │ ├── models/ │ │ │ ├── 3dssd.py │ │ │ ├── cascade_mask_rcnn_r50_fpn.py │ │ │ ├── centerpoint_01voxel_second_secfpn_nus.py │ │ │ ├── centerpoint_02pillar_second_secfpn_nus.py │ │ │ ├── dgcnn.py │ │ │ ├── fcaf3d.py │ │ │ ├── fcos3d.py │ │ │ ├── groupfree3d.py │ │ │ ├── h3dnet.py │ │ │ ├── hv_pointpillars_fpn_lyft.py │ │ │ ├── hv_pointpillars_fpn_nus.py │ │ │ ├── hv_pointpillars_fpn_range100_lyft.py │ │ │ ├── hv_pointpillars_secfpn_kitti.py │ │ │ ├── hv_pointpillars_secfpn_waymo.py │ │ │ ├── hv_second_secfpn_kitti.py │ │ │ ├── hv_second_secfpn_waymo.py │ │ │ ├── imvotenet_image.py │ │ │ ├── mask_rcnn_r50_fpn.py │ │ │ ├── paconv_cuda_ssg.py │ │ │ ├── paconv_ssg.py │ │ │ ├── parta2.py │ │ │ ├── pgd.py │ │ │ ├── point_rcnn.py │ │ │ ├── pointnet2_msg.py │ │ │ ├── pointnet2_ssg.py │ │ │ ├── smoke.py │ │ │ └── votenet.py │ │ └── schedules/ │ │ ├── cosine.py │ │ ├── cyclic_20e.py │ │ ├── cyclic_40e.py │ │ ├── mmdet_schedule_1x.py │ │ ├── schedule_2x.py │ │ ├── schedule_3x.py │ │ ├── seg_cosine_100e.py │ │ ├── seg_cosine_150e.py │ │ ├── seg_cosine_200e.py │ │ └── seg_cosine_50e.py │ └── bev_next/ │ ├── bev_planner.py │ ├── bev_planner_plus.py │ ├── bev_planner_plus_plus.py │ ├── bev_planner_w_map.py │ ├── det_pretrain_320x800_vov_36ep.py │ ├── det_pretrain_640x1600_vov_36ep.py │ └── map_pretrain.py ├── mmdet3d/ │ ├── __init__.py │ ├── apis/ │ │ ├── __init__.py │ │ ├── inference.py │ │ ├── test.py │ │ └── train.py │ ├── core/ │ │ ├── __init__.py │ │ ├── anchor/ │ │ │ ├── __init__.py │ │ │ └── anchor_3d_generator.py │ │ ├── bbox/ │ │ │ ├── __init__.py │ │ │ ├── assigners/ │ │ │ │ └── __init__.py │ │ │ ├── box_np_ops.py │ │ │ ├── coders/ │ │ │ │ ├── __init__.py │ │ │ │ ├── anchor_free_bbox_coder.py │ │ │ │ ├── centerpoint_bbox_coders.py │ │ │ │ ├── delta_xyzwhlr_bbox_coder.py │ │ │ │ ├── fcos3d_bbox_coder.py │ │ │ │ ├── groupfree3d_bbox_coder.py │ │ │ │ ├── monoflex_bbox_coder.py │ │ │ │ ├── partial_bin_based_bbox_coder.py │ │ │ │ ├── pgd_bbox_coder.py │ │ │ │ ├── point_xyzwhlr_bbox_coder.py │ │ │ │ └── smoke_bbox_coder.py │ │ │ ├── iou_calculators/ │ │ │ │ ├── __init__.py │ │ │ │ └── iou3d_calculator.py │ │ │ ├── samplers/ │ │ │ │ ├── __init__.py │ │ │ │ └── iou_neg_piecewise_sampler.py │ │ │ ├── structures/ │ │ │ │ ├── __init__.py │ │ │ │ ├── base_box3d.py │ │ │ │ ├── box_3d_mode.py │ │ │ │ ├── cam_box3d.py │ │ │ │ ├── coord_3d_mode.py │ │ │ │ ├── custom_box.py │ │ │ │ ├── depth_box3d.py │ │ │ │ ├── lidar_box3d.py │ │ │ │ └── utils.py │ │ │ ├── transforms.py │ │ │ └── util.py │ │ ├── evaluation/ │ │ │ ├── __init__.py │ │ │ ├── indoor_eval.py │ │ │ ├── instance_seg_eval.py │ │ │ ├── kitti_utils/ │ │ │ │ ├── __init__.py │ │ │ │ ├── eval.py │ │ │ │ └── rotate_iou.py │ │ │ ├── lyft_eval.py │ │ │ ├── scannet_utils/ │ │ │ │ ├── __init__.py │ │ │ │ ├── evaluate_semantic_instance.py │ │ │ │ └── util_3d.py │ │ │ 
├── seg_eval.py │ │ │ └── waymo_utils/ │ │ │ ├── __init__.py │ │ │ └── prediction_kitti_to_waymo.py │ │ ├── hook/ │ │ │ ├── __init__.py │ │ │ ├── ema.py │ │ │ ├── forge_load.py │ │ │ ├── sequentialsontrol.py │ │ │ └── utils.py │ │ ├── points/ │ │ │ ├── __init__.py │ │ │ ├── base_points.py │ │ │ ├── cam_points.py │ │ │ ├── depth_points.py │ │ │ └── lidar_points.py │ │ ├── post_processing/ │ │ │ ├── __init__.py │ │ │ ├── box3d_nms.py │ │ │ └── merge_augs.py │ │ ├── utils/ │ │ │ ├── __init__.py │ │ │ ├── array_converter.py │ │ │ └── gaussian.py │ │ ├── visualizer/ │ │ │ ├── __init__.py │ │ │ ├── image_vis.py │ │ │ ├── open3d_vis.py │ │ │ └── show_result.py │ │ └── voxel/ │ │ ├── __init__.py │ │ ├── builder.py │ │ └── voxel_generator.py │ ├── datasets/ │ │ ├── __init__.py │ │ ├── builder.py │ │ ├── custom_3d.py │ │ ├── custom_3d_seg.py │ │ ├── dataset_wrappers.py │ │ ├── evals/ │ │ │ ├── eval_utils.py │ │ │ ├── map_api.py │ │ │ ├── metric_utils.py │ │ │ └── nuscenes_eval_motion.py │ │ ├── evaluation/ │ │ │ ├── AP.py │ │ │ ├── __init__.py │ │ │ ├── distance.py │ │ │ ├── raster_eval.py │ │ │ └── vector_eval.py │ │ ├── kitti2d_dataset.py │ │ ├── kitti_dataset.py │ │ ├── kitti_mono_dataset.py │ │ ├── lyft_dataset.py │ │ ├── map_utils/ │ │ │ ├── mean_ap.py │ │ │ ├── tpfp.py │ │ │ └── tpfp_chamfer.py │ │ ├── nuscenes_dataset.py │ │ ├── nuscenes_eval.py │ │ ├── nuscenes_mono_dataset.py │ │ ├── occ_metrics.py │ │ ├── occupancy_eval.py │ │ ├── pipelines/ │ │ │ ├── __init__.py │ │ │ ├── compose.py │ │ │ ├── data_augment_utils.py │ │ │ ├── dbsampler.py │ │ │ ├── formating.py │ │ │ ├── loading.py │ │ │ ├── test_time_aug.py │ │ │ └── transforms_3d.py │ │ ├── s3dis_dataset.py │ │ ├── samplers/ │ │ │ ├── __init__.py │ │ │ ├── d_sampler.py │ │ │ └── infinite_group_each_sample_in_batch_sampler.py │ │ ├── scannet_dataset.py │ │ ├── semantickitti_dataset.py │ │ ├── sunrgbd_dataset.py │ │ ├── utils.py │ │ ├── vector_map.py │ │ └── waymo_dataset.py │ ├── models/ │ │ ├── __init__.py │ │ ├── backbones/ │ │ │ ├── __init__.py │ │ │ ├── base_pointnet.py │ │ │ ├── convnext.py │ │ │ ├── dgcnn.py │ │ │ ├── dla.py │ │ │ ├── load.py │ │ │ ├── mink_resnet.py │ │ │ ├── multi_backbone.py │ │ │ ├── nostem_regnet.py │ │ │ ├── pointnet2_sa_msg.py │ │ │ ├── pointnet2_sa_ssg.py │ │ │ ├── resnet.py │ │ │ ├── second.py │ │ │ ├── swin.py │ │ │ ├── vovnet.py │ │ │ └── vovnet2.py │ │ ├── builder.py │ │ ├── decode_heads/ │ │ │ ├── __init__.py │ │ │ ├── decode_head.py │ │ │ ├── dgcnn_head.py │ │ │ ├── paconv_head.py │ │ │ └── pointnet2_head.py │ │ ├── dense_heads/ │ │ │ ├── __init__.py │ │ │ ├── anchor3d_head.py │ │ │ ├── anchor_free_mono3d_head.py │ │ │ ├── base_conv_bbox_head.py │ │ │ ├── base_mono3d_dense_head.py │ │ │ ├── centerpoint_head.py │ │ │ ├── centerpoint_head_single_task.py │ │ │ ├── fcaf3d_head.py │ │ │ ├── fcos_mono3d_head.py │ │ │ ├── free_anchor3d_head.py │ │ │ ├── groupfree3d_head.py │ │ │ ├── monoflex_head.py │ │ │ ├── parta2_rpn_head.py │ │ │ ├── pgd_head.py │ │ │ ├── point_rpn_head.py │ │ │ ├── shape_aware_head.py │ │ │ ├── smoke_mono3d_head.py │ │ │ ├── ssd_3d_head.py │ │ │ ├── train_mixins.py │ │ │ └── vote_head.py │ │ ├── detectors/ │ │ │ ├── __init__.py │ │ │ ├── base.py │ │ │ ├── bevdet.py │ │ │ ├── centerpoint.py │ │ │ ├── dynamic_voxelnet.py │ │ │ ├── fcos_mono3d.py │ │ │ ├── groupfree3dnet.py │ │ │ ├── h3dnet.py │ │ │ ├── imvotenet.py │ │ │ ├── imvoxelnet.py │ │ │ ├── mink_single_stage.py │ │ │ ├── mvx_faster_rcnn.py │ │ │ ├── mvx_two_stage.py │ │ │ ├── parta2.py │ │ │ ├── point_rcnn.py │ │ │ ├── sassd.py │ │ 
│ ├── single_stage.py │ │ │ ├── single_stage_mono3d.py │ │ │ ├── smoke_mono3d.py │ │ │ ├── ssd3dnet.py │ │ │ ├── two_stage.py │ │ │ ├── votenet.py │ │ │ └── voxelnet.py │ │ ├── fbbev/ │ │ │ ├── __init__.py │ │ │ ├── detectors/ │ │ │ │ ├── __init__.py │ │ │ │ └── bev_planner.py │ │ │ ├── heads/ │ │ │ │ ├── __init__.py │ │ │ │ ├── occupancy_head.py │ │ │ │ └── yolox.py │ │ │ ├── modules/ │ │ │ │ ├── __init__.py │ │ │ │ ├── depth_net.py │ │ │ │ ├── fpn3d.py │ │ │ │ ├── frpn.py │ │ │ │ ├── occ_loss_utils/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── focal_loss.py │ │ │ │ │ ├── lovasz_softmax.py │ │ │ │ │ ├── nusc_param.py │ │ │ │ │ └── semkitti.py │ │ │ │ └── resnet3d.py │ │ │ ├── motion_head/ │ │ │ │ ├── __init__.py │ │ │ │ ├── motion_head.py │ │ │ │ ├── motion_planner_head.py │ │ │ │ └── traj_loss.py │ │ │ ├── planner_head/ │ │ │ │ ├── AD_mlp.py │ │ │ │ ├── __init__.py │ │ │ │ ├── metric_stp3.py │ │ │ │ ├── naive_planner.py │ │ │ │ ├── plan_loss.py │ │ │ │ └── plan_loss_gt.py │ │ │ ├── streammapnet/ │ │ │ │ ├── CustomMSDeformableAttention.py │ │ │ │ ├── __init__.py │ │ │ │ ├── cost.py │ │ │ │ ├── fp16_dattn.py │ │ │ │ ├── hungarian_lines_assigner.py │ │ │ │ ├── loss.py │ │ │ │ ├── map_utils.py │ │ │ │ ├── streammapnet_head.py │ │ │ │ ├── transformer.py │ │ │ │ └── utils.py │ │ │ ├── streampetr/ │ │ │ │ ├── __init__.py │ │ │ │ ├── hungarian_assigner_2d.py │ │ │ │ ├── hungarian_assigner_3d.py │ │ │ │ ├── match_cost.py │ │ │ │ ├── nms_free_coder.py │ │ │ │ ├── petr_transformer.py │ │ │ │ ├── streampetr_utils.py │ │ │ │ └── streampetr_v2.py │ │ │ ├── track_head/ │ │ │ │ ├── __init__.py │ │ │ │ ├── instances.py │ │ │ │ ├── losses/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── tracking_loss.py │ │ │ │ │ ├── tracking_loss_base.py │ │ │ │ │ ├── tracking_loss_combo.py │ │ │ │ │ ├── tracking_loss_mem_bank.py │ │ │ │ │ └── tracking_loss_prediction.py │ │ │ │ ├── runtime_tracker.py │ │ │ │ ├── streampetr_utils.py │ │ │ │ ├── track_nms_free_coder.py │ │ │ │ ├── trackpetr.py │ │ │ │ └── utils.py │ │ │ ├── utils/ │ │ │ │ ├── __init__.py │ │ │ │ ├── bricks.py │ │ │ │ ├── draw_bbox.py │ │ │ │ ├── eval_hook.py │ │ │ │ ├── grid_mask.py │ │ │ │ ├── timer_cp.py │ │ │ │ └── wechat_logger.py │ │ │ └── view_transformation/ │ │ │ ├── __init__.py │ │ │ ├── backward_projection/ │ │ │ │ ├── __init__.py │ │ │ │ ├── backward_projection.py │ │ │ │ └── bevformer_utils/ │ │ │ │ ├── __init__.py │ │ │ │ ├── bevformer.py │ │ │ │ ├── bevformer_encoder.py │ │ │ │ ├── custom_base_transformer_layer.py │ │ │ │ ├── multi_scale_deformable_attn_function.py │ │ │ │ ├── positional_encoding.py │ │ │ │ └── spatial_cross_attention_depth.py │ │ │ └── forward_projection/ │ │ │ ├── __init__.py │ │ │ └── view_transformer.py │ │ ├── fusion_layers/ │ │ │ ├── __init__.py │ │ │ ├── coord_transform.py │ │ │ ├── point_fusion.py │ │ │ └── vote_fusion.py │ │ ├── losses/ │ │ │ ├── __init__.py │ │ │ ├── axis_aligned_iou_loss.py │ │ │ ├── chamfer_distance.py │ │ │ ├── multibin_loss.py │ │ │ ├── paconv_regularization_loss.py │ │ │ ├── rotated_iou_loss.py │ │ │ └── uncertain_smooth_l1_loss.py │ │ ├── middle_encoders/ │ │ │ ├── __init__.py │ │ │ ├── pillar_scatter.py │ │ │ ├── sparse_encoder.py │ │ │ └── sparse_unet.py │ │ ├── model_utils/ │ │ │ ├── __init__.py │ │ │ ├── edge_fusion_module.py │ │ │ ├── transformer.py │ │ │ └── vote_module.py │ │ ├── necks/ │ │ │ ├── __init__.py │ │ │ ├── dla_neck.py │ │ │ ├── fpn.py │ │ │ ├── imvoxel_neck.py │ │ │ ├── lss_fpn.py │ │ │ ├── pointnet2_fp_neck.py │ │ │ ├── second_fpn.py │ │ │ └── view_transformer.py │ │ ├── roi_heads/ │ │ 
│ ├── __init__.py │ │ │ ├── base_3droi_head.py │ │ │ ├── bbox_heads/ │ │ │ │ ├── __init__.py │ │ │ │ ├── h3d_bbox_head.py │ │ │ │ ├── parta2_bbox_head.py │ │ │ │ └── point_rcnn_bbox_head.py │ │ │ ├── h3d_roi_head.py │ │ │ ├── mask_heads/ │ │ │ │ ├── __init__.py │ │ │ │ ├── pointwise_semantic_head.py │ │ │ │ └── primitive_head.py │ │ │ ├── part_aggregation_roi_head.py │ │ │ ├── point_rcnn_roi_head.py │ │ │ └── roi_extractors/ │ │ │ ├── __init__.py │ │ │ ├── single_roiaware_extractor.py │ │ │ └── single_roipoint_extractor.py │ │ ├── segmentors/ │ │ │ ├── __init__.py │ │ │ ├── base.py │ │ │ └── encoder_decoder.py │ │ ├── utils/ │ │ │ ├── __init__.py │ │ │ ├── clip_sigmoid.py │ │ │ ├── edge_indices.py │ │ │ ├── gen_keypoints.py │ │ │ ├── handle_objs.py │ │ │ └── mlp.py │ │ └── voxel_encoders/ │ │ ├── __init__.py │ │ ├── pillar_encoder.py │ │ ├── utils.py │ │ └── voxel_encoder.py │ ├── ops/ │ │ ├── __init__.py │ │ ├── bev_pool_v2/ │ │ │ ├── __init__.py │ │ │ ├── bev_pool.py │ │ │ └── src/ │ │ │ ├── bev_pool.cpp │ │ │ └── bev_pool_cuda.cu │ │ ├── dgcnn_modules/ │ │ │ ├── __init__.py │ │ │ ├── dgcnn_fa_module.py │ │ │ ├── dgcnn_fp_module.py │ │ │ └── dgcnn_gf_module.py │ │ ├── norm.py │ │ ├── ops_dcnv3/ │ │ │ ├── functions/ │ │ │ │ ├── __init__.py │ │ │ │ └── dcnv3_func.py │ │ │ ├── make.sh │ │ │ ├── modules/ │ │ │ │ ├── __init__.py │ │ │ │ └── dcnv3.py │ │ │ ├── setup.py │ │ │ ├── src/ │ │ │ │ ├── cpu/ │ │ │ │ │ ├── dcnv3_cpu.cpp │ │ │ │ │ └── dcnv3_cpu.h │ │ │ │ ├── cuda/ │ │ │ │ │ ├── dcnv3_cuda.cu │ │ │ │ │ ├── dcnv3_cuda.h │ │ │ │ │ └── dcnv3_im2col_cuda.cuh │ │ │ │ ├── dcnv3.h │ │ │ │ └── vision.cpp │ │ │ └── test.py │ │ ├── paconv/ │ │ │ ├── __init__.py │ │ │ ├── paconv.py │ │ │ └── utils.py │ │ ├── pointnet_modules/ │ │ │ ├── __init__.py │ │ │ ├── builder.py │ │ │ ├── paconv_sa_module.py │ │ │ ├── point_fp_module.py │ │ │ └── point_sa_module.py │ │ ├── sparse_block.py │ │ └── spconv/ │ │ ├── __init__.py │ │ └── overwrite_spconv/ │ │ ├── __init__.py │ │ └── write_spconv2.py │ ├── utils/ │ │ ├── __init__.py │ │ ├── collect_env.py │ │ ├── compat_cfg.py │ │ ├── logger.py │ │ ├── misc.py │ │ └── setup_env.py │ └── version.py ├── requirements/ │ ├── build.txt │ ├── docs.txt │ ├── mminstall.txt │ ├── optional.txt │ ├── readthedocs.txt │ ├── runtime.txt │ └── tests.txt └── tools/ ├── analysis_tools/ │ ├── analyze_logs.py │ ├── benchmark.py │ ├── benchmark_sequential.py │ ├── benchmark_trt.py │ ├── benchmark_view_transformer.py │ ├── create_video.py │ ├── generate_mask_based_on_lidar_points.py │ ├── get_flops.py │ ├── model_converter.py │ ├── occupancy_cbgs.py │ ├── vis.py │ └── vis_occupancy.py ├── create_data.py ├── create_data.sh ├── create_data_bev_planner.py ├── data_converter/ │ ├── __init__.py │ ├── create_gt_database.py │ ├── imgaug_demo.py │ ├── indoor_converter.py │ ├── kitti_converter.py │ ├── kitti_data_utils.py │ ├── lyft_converter.py │ ├── lyft_data_fixer.py │ ├── nuimage_converter.py │ ├── nuscenes_converter.py │ ├── nuscenes_prediction_tools.py │ ├── nuscenes_track_converter.py │ ├── s3dis_data_utils.py │ ├── scannet_data_utils.py │ ├── sunrgbd_data_utils.py │ └── waymo_converter.py ├── deployment/ │ ├── mmdet3d2torchserve.py │ ├── mmdet3d_handler.py │ └── test_torchserver.py ├── dist_test.sh ├── dist_train.sh ├── eval.py ├── misc/ │ ├── browse_dataset.py │ ├── download.sh │ ├── fuse_conv_bn.py │ ├── print_config.py │ ├── tmp.txt │ └── visualize_results.py ├── model_converters/ │ ├── convert_h3dnet_checkpoints.py │ ├── convert_votenet_checkpoints.py │ ├── publish_model.py │ └── 
regnet2mmdet.py ├── slurm_test.sh ├── slurm_train.sh ├── test.py ├── train.py ├── update_data_coords.py └── update_data_coords.sh ================================================ FILE CONTENTS ================================================ ================================================ FILE: .gitignore ================================================ # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] *.vsix *$py.class *.ipynb *.zip # C extensions *.so *.npz *.npy # Distribution / packaging .Python *.mp4 *.pth *.jpg *.jpeg *.png *.log *.json *.csv ckpts work_dirs nuscenes-mini nuscenes-mini/ work_dirs/ work_dirs_/ data/ tests/ test/ test2/ val/ ckpts/ build/ develop-eggs/ dist/ downloads/ eggs/ .eggs/ lib/ wandb/ lib64/ parts/ sdist/ var/ wheels/ *.egg-info/ .installed.cfg *.egg MANIFEST # PyInstaller # Usually these files are written by a python script from a template # before PyInstaller builds the exe, so as to inject date/other infos into it. *.manifest *.spec # Installer logs pip-log.txt pip-delete-this-directory.txt # Unit test / coverage reports htmlcov/ .tox/ .coverage .coverage.* .cache nosetests.xml coverage.xml *.cover .hypothesis/ .pytest_cache/ # Translations *.mo *.pot # Django stuff: *.log local_settings.py db.sqlite3 # Flask stuff: instance/ .webassets-cache # Scrapy stuff: .scrapy # Sphinx documentation docs/_build/ # PyBuilder target/ # Jupyter Notebook .ipynb_checkpoints # pyenv .python-version # celery beat schedule file celerybeat-schedule # SageMath parsed files *.sage.py # Environments .env .venv env/ venv/ ENV/ env.bak/ venv.bak/ barrier bicycle bus car construction_vehicle driveable_surface manmade motorcycle other_flat others pedestrian per sidewalk terrain traffic_cone trailer truck vegetation # Spyder project settings .spyderproject .spyproject # Rope project settings .ropeproject # mkdocs documentation /site # mypy .mypy_cache/ # cython generated cpp data .vscode .idea # custom *.pkl *.pkl.json *.log.json work_dirs/ exps/ *~ mmdet3d/.mim # Pytorch *.pth # demo *.jpg *.png data/s3dis/Stanford3dDataset_v1.2_Aligned_Version/ data/scannet/scans/ data/sunrgbd/OFFICIAL_SUNRGBD/ *.obj *.ply *.pdf # Waymo evaluation mmdet3d/core/evaluation/waymo_utils/compute_detection_metrics_main ================================================ FILE: README.md ================================================ # Is Ego Status All You Need for Open-Loop End-to-End Autonomous Driving? ### [arXiv](http://arxiv.org/abs/2312.03031) | [知乎](https://zhuanlan.zhihu.com/p/669454065) https://github.com/NVlabs/BEV-Planner/assets/27915819/93afa127-813f-4d36-b4f2-84f6b8d9b905 ## INTRODUCTION End-to-end autonomous driving recently emerged as a promising research direction to target autonomy from a full-stack perspective. Along this line, many of the latest works follow an open-loop evaluation setting on nuScenes to study the planning behavior. In this paper, we delve deeper into the problem by conducting thorough analyses and demystifying more devils in the details. We initially observed that the nuScenes dataset, characterized by relatively simple driving scenarios, leads to an under-utilization of perception information in end-to-end models incorporating ego status, such as the ego vehicle's velocity. These models tend to rely predominantly on the ego vehicle's status for future path planning. Beyond the limitations of the dataset, we also note that current metrics do not comprehensively assess planning quality, leading to potentially biased conclusions drawn from existing benchmarks. To address this issue, we introduce a new metric to evaluate whether the predicted trajectories adhere to the road. We further propose a simple baseline that achieves competitive results without relying on perception annotations. Given the current limitations of the benchmark and metrics, we suggest that the community reassess relevant prevailing research and be cautious about whether the continued pursuit of the state of the art would yield convincing and universal conclusions.
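The repository's own planning-metric code lives under `mmdet3d/models/fbbev/planner_head/` (e.g. `metric_stp3.py`, `plan_loss.py`); the snippet below is only a minimal sketch of the road-adherence idea described above, assuming ego-frame waypoints in meters and a hypothetical ego-centred drivable-area raster. The function name, mask layout, and resolution are illustrative assumptions, not this repo's API.

```python
import numpy as np

def offroad_rate(trajectories, drivable_mask, bev_range=(-50.0, 50.0), resolution=0.5):
    """Fraction of predicted waypoints that leave the drivable area.

    trajectories  : (N, T, 2) ego-frame (x, y) waypoints in meters.
    drivable_mask : (H, W) boolean BEV raster of the drivable surface covering
                    `bev_range` meters on both axes at `resolution` m/cell,
                    with the ego vehicle at the raster center.
    """
    lo, _ = bev_range
    xy = np.asarray(trajectories, dtype=np.float64)
    # Metric coordinates -> raster indices (row indexes y, column indexes x).
    cols = ((xy[..., 0] - lo) / resolution).astype(np.int64)
    rows = ((xy[..., 1] - lo) / resolution).astype(np.int64)
    h, w = drivable_mask.shape
    inside = (rows >= 0) & (rows < h) & (cols >= 0) & (cols < w)
    # Waypoints that fall outside the raster are counted as off-road.
    on_road = np.zeros(xy.shape[:-1], dtype=bool)
    on_road[inside] = drivable_mask[rows[inside], cols[inside]]
    return 1.0 - on_road.mean()

# Toy check: one trajectory stays at the origin, the other drives off the raster.
mask = np.ones((200, 200), dtype=bool)   # 100 m x 100 m at 0.5 m per cell
trajs = np.zeros((2, 6, 2))
trajs[1, :, 1] = 60.0                    # y = 60 m lies outside the 50 m range
print(offroad_rate(trajs, mask))         # 0.5
```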
## Start ### 1. Setting up Environment ### 2. Preparing Dataset ### 3. Training ### 4. Eval ================================================ FILE: configs/_base_/datasets/coco_instance.py ================================================ dataset_type = 'CocoDataset' data_root = 'data/coco/' img_norm_cfg = dict( mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) train_pipeline = [ dict(type='LoadImageFromFile'), dict(type='LoadAnnotations', with_bbox=True, with_mask=True), dict(type='Resize', img_scale=(1333, 800), keep_ratio=True), dict(type='RandomFlip', flip_ratio=0.5), dict(type='Normalize', **img_norm_cfg), dict(type='Pad', size_divisor=32), dict(type='DefaultFormatBundle'), dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']), ] test_pipeline = [ dict(type='LoadImageFromFile'), dict( type='MultiScaleFlipAug', img_scale=(1333, 800), flip=False, transforms=[ dict(type='Resize', keep_ratio=True), dict(type='RandomFlip'), dict(type='Normalize', **img_norm_cfg), dict(type='Pad', size_divisor=32), dict(type='ImageToTensor', keys=['img']), dict(type='Collect', keys=['img']), ]) ] data = dict( samples_per_gpu=2, workers_per_gpu=2, train=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_train2017.json', img_prefix=data_root + 'train2017/', pipeline=train_pipeline), val=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_val2017.json', img_prefix=data_root + 'val2017/', pipeline=test_pipeline), test=dict( type=dataset_type, ann_file=data_root + 'annotations/instances_val2017.json', img_prefix=data_root + 'val2017/', pipeline=test_pipeline)) evaluation = dict(metric=['bbox', 'segm']) ================================================ FILE: configs/_base_/datasets/kitti-3d-3class.py ================================================ # dataset settings dataset_type = 'KittiDataset' data_root = 'data/kitti/' class_names = ['Pedestrian', 'Cyclist', 'Car'] point_cloud_range = [0, -40, -3, 70.4, 40, 1] input_modality = dict(use_lidar=True, use_camera=False) file_client_args = dict(backend='disk') # Uncomment the following if use ceph or other file clients. # See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient # for more details.
# file_client_args = dict( # backend='petrel', # path_mapping=dict({ # './data/kitti/': # 's3://openmmlab/datasets/detection3d/kitti/', # 'data/kitti/': # 's3://openmmlab/datasets/detection3d/kitti/' # })) db_sampler = dict( data_root=data_root, info_path=data_root + 'kitti_dbinfos_train.pkl', rate=1.0, prepare=dict( filter_by_difficulty=[-1], filter_by_min_points=dict(Car=5, Pedestrian=10, Cyclist=10)), classes=class_names, sample_groups=dict(Car=12, Pedestrian=6, Cyclist=6), points_loader=dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=4, use_dim=4, file_client_args=file_client_args), file_client_args=file_client_args) train_pipeline = [ dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=4, use_dim=4, file_client_args=file_client_args), dict( type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True, file_client_args=file_client_args), dict(type='ObjectSample', db_sampler=db_sampler), dict( type='ObjectNoise', num_try=100, translation_std=[1.0, 1.0, 0.5], global_rot_range=[0.0, 0.0], rot_range=[-0.78539816, 0.78539816]), dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5), dict( type='GlobalRotScaleTrans', rot_range=[-0.78539816, 0.78539816], scale_ratio_range=[0.95, 1.05]), dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), dict(type='PointShuffle'), dict(type='DefaultFormatBundle3D', class_names=class_names), dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']) ] test_pipeline = [ dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=4, use_dim=4, file_client_args=file_client_args), dict( type='MultiScaleFlipAug3D', img_scale=(1333, 800), pts_scale_ratio=1, flip=False, transforms=[ dict( type='GlobalRotScaleTrans', rot_range=[0, 0], scale_ratio_range=[1., 1.], translation_std=[0, 0, 0]), dict(type='RandomFlip3D'), dict( type='PointsRangeFilter', point_cloud_range=point_cloud_range), dict( type='DefaultFormatBundle3D', class_names=class_names, with_label=False), dict(type='Collect3D', keys=['points']) ]) ] # construct a pipeline for data and gt loading in show function # please keep its loading function consistent with test_pipeline (e.g. client) eval_pipeline = [ dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=4, use_dim=4, file_client_args=file_client_args), dict( type='DefaultFormatBundle3D', class_names=class_names, with_label=False), dict(type='Collect3D', keys=['points']) ] data = dict( samples_per_gpu=6, workers_per_gpu=4, train=dict( type='RepeatDataset', times=2, dataset=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'kitti_infos_train.pkl', split='training', pts_prefix='velodyne_reduced', pipeline=train_pipeline, modality=input_modality, classes=class_names, test_mode=False, # we use box_type_3d='LiDAR' in kitti and nuscenes dataset # and box_type_3d='Depth' in sunrgbd and scannet dataset. 
box_type_3d='LiDAR', file_client_args=file_client_args)), val=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'kitti_infos_val.pkl', split='training', pts_prefix='velodyne_reduced', pipeline=test_pipeline, modality=input_modality, classes=class_names, test_mode=True, box_type_3d='LiDAR', file_client_args=file_client_args), test=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'kitti_infos_val.pkl', split='training', pts_prefix='velodyne_reduced', pipeline=test_pipeline, modality=input_modality, classes=class_names, test_mode=True, box_type_3d='LiDAR', file_client_args=file_client_args)) evaluation = dict(interval=1, pipeline=eval_pipeline) ================================================ FILE: configs/_base_/datasets/kitti-3d-car.py ================================================ # dataset settings dataset_type = 'KittiDataset' data_root = 'data/kitti/' class_names = ['Car'] point_cloud_range = [0, -40, -3, 70.4, 40, 1] input_modality = dict(use_lidar=True, use_camera=False) db_sampler = dict( data_root=data_root, info_path=data_root + 'kitti_dbinfos_train.pkl', rate=1.0, prepare=dict(filter_by_difficulty=[-1], filter_by_min_points=dict(Car=5)), classes=class_names, sample_groups=dict(Car=15)) file_client_args = dict(backend='disk') # Uncomment the following if use ceph or other file clients. # See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient # for more details. # file_client_args = dict( # backend='petrel', path_mapping=dict(data='s3://kitti_data/')) train_pipeline = [ dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=4, use_dim=4, file_client_args=file_client_args), dict( type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True, file_client_args=file_client_args), dict(type='ObjectSample', db_sampler=db_sampler), dict( type='ObjectNoise', num_try=100, translation_std=[1.0, 1.0, 0.5], global_rot_range=[0.0, 0.0], rot_range=[-0.78539816, 0.78539816]), dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5), dict( type='GlobalRotScaleTrans', rot_range=[-0.78539816, 0.78539816], scale_ratio_range=[0.95, 1.05]), dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), dict(type='PointShuffle'), dict(type='DefaultFormatBundle3D', class_names=class_names), dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']) ] test_pipeline = [ dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=4, use_dim=4, file_client_args=file_client_args), dict( type='MultiScaleFlipAug3D', img_scale=(1333, 800), pts_scale_ratio=1, flip=False, transforms=[ dict( type='GlobalRotScaleTrans', rot_range=[0, 0], scale_ratio_range=[1., 1.], translation_std=[0, 0, 0]), dict(type='RandomFlip3D'), dict( type='PointsRangeFilter', point_cloud_range=point_cloud_range), dict( type='DefaultFormatBundle3D', class_names=class_names, with_label=False), dict(type='Collect3D', keys=['points']) ]) ] # construct a pipeline for data and gt loading in show function # please keep its loading function consistent with test_pipeline (e.g. 
client) eval_pipeline = [ dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=4, use_dim=4, file_client_args=file_client_args), dict( type='DefaultFormatBundle3D', class_names=class_names, with_label=False), dict(type='Collect3D', keys=['points']) ] data = dict( samples_per_gpu=6, workers_per_gpu=4, train=dict( type='RepeatDataset', times=2, dataset=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'kitti_infos_train.pkl', split='training', pts_prefix='velodyne_reduced', pipeline=train_pipeline, modality=input_modality, classes=class_names, test_mode=False, # we use box_type_3d='LiDAR' in kitti and nuscenes dataset # and box_type_3d='Depth' in sunrgbd and scannet dataset. box_type_3d='LiDAR')), val=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'kitti_infos_val.pkl', split='training', pts_prefix='velodyne_reduced', pipeline=test_pipeline, modality=input_modality, classes=class_names, test_mode=True, box_type_3d='LiDAR'), test=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'kitti_infos_val.pkl', split='training', pts_prefix='velodyne_reduced', pipeline=test_pipeline, modality=input_modality, classes=class_names, test_mode=True, box_type_3d='LiDAR')) evaluation = dict(interval=1, pipeline=eval_pipeline) ================================================ FILE: configs/_base_/datasets/kitti-mono3d.py ================================================ dataset_type = 'KittiMonoDataset' data_root = 'data/kitti/' class_names = ['Pedestrian', 'Cyclist', 'Car'] input_modality = dict(use_lidar=False, use_camera=True) img_norm_cfg = dict( mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) train_pipeline = [ dict(type='LoadImageFromFileMono3D'), dict( type='LoadAnnotations3D', with_bbox=True, with_label=True, with_attr_label=False, with_bbox_3d=True, with_label_3d=True, with_bbox_depth=True), dict(type='Resize', img_scale=(1242, 375), keep_ratio=True), dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5), dict(type='Normalize', **img_norm_cfg), dict(type='Pad', size_divisor=32), dict(type='DefaultFormatBundle3D', class_names=class_names), dict( type='Collect3D', keys=[ 'img', 'gt_bboxes', 'gt_labels', 'gt_bboxes_3d', 'gt_labels_3d', 'centers2d', 'depths' ]), ] test_pipeline = [ dict(type='LoadImageFromFileMono3D'), dict( type='MultiScaleFlipAug', img_scale=(1242, 375), flip=False, transforms=[ dict(type='RandomFlip3D'), dict(type='Normalize', **img_norm_cfg), dict(type='Pad', size_divisor=32), dict( type='DefaultFormatBundle3D', class_names=class_names, with_label=False), dict(type='Collect3D', keys=['img']), ]) ] # construct a pipeline for data and gt loading in show function # please keep its loading function consistent with test_pipeline (e.g. 
client) eval_pipeline = [ dict(type='LoadImageFromFileMono3D'), dict( type='DefaultFormatBundle3D', class_names=class_names, with_label=False), dict(type='Collect3D', keys=['img']) ] data = dict( samples_per_gpu=2, workers_per_gpu=2, train=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'kitti_infos_train_mono3d.coco.json', info_file=data_root + 'kitti_infos_train.pkl', img_prefix=data_root, classes=class_names, pipeline=train_pipeline, modality=input_modality, test_mode=False, box_type_3d='Camera'), val=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'kitti_infos_val_mono3d.coco.json', info_file=data_root + 'kitti_infos_val.pkl', img_prefix=data_root, classes=class_names, pipeline=test_pipeline, modality=input_modality, test_mode=True, box_type_3d='Camera'), test=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'kitti_infos_val_mono3d.coco.json', info_file=data_root + 'kitti_infos_val.pkl', img_prefix=data_root, classes=class_names, pipeline=test_pipeline, modality=input_modality, test_mode=True, box_type_3d='Camera')) evaluation = dict(interval=2) ================================================ FILE: configs/_base_/datasets/lyft-3d.py ================================================ # If point cloud range is changed, the models should also change their point # cloud range accordingly point_cloud_range = [-80, -80, -5, 80, 80, 3] # For Lyft we usually do 9-class detection class_names = [ 'car', 'truck', 'bus', 'emergency_vehicle', 'other_vehicle', 'motorcycle', 'bicycle', 'pedestrian', 'animal' ] dataset_type = 'LyftDataset' data_root = 'data/lyft/' # Input modality for Lyft dataset, this is consistent with the submission # format which requires the information in input_modality. input_modality = dict( use_lidar=True, use_camera=False, use_radar=False, use_map=False, use_external=False) file_client_args = dict(backend='disk') # Uncomment the following if use ceph or other file clients. # See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient # for more details. 
# file_client_args = dict( # backend='petrel', # path_mapping=dict({ # './data/lyft/': 's3://lyft/lyft/', # 'data/lyft/': 's3://lyft/lyft/' # })) train_pipeline = [ dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=5, use_dim=5, file_client_args=file_client_args), dict( type='LoadPointsFromMultiSweeps', sweeps_num=10, file_client_args=file_client_args), dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True), dict( type='GlobalRotScaleTrans', rot_range=[-0.3925, 0.3925], scale_ratio_range=[0.95, 1.05], translation_std=[0, 0, 0]), dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5), dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), dict(type='PointShuffle'), dict(type='DefaultFormatBundle3D', class_names=class_names), dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']) ] test_pipeline = [ dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=5, use_dim=5, file_client_args=file_client_args), dict( type='LoadPointsFromMultiSweeps', sweeps_num=10, file_client_args=file_client_args), dict( type='MultiScaleFlipAug3D', img_scale=(1333, 800), pts_scale_ratio=1, flip=False, transforms=[ dict( type='GlobalRotScaleTrans', rot_range=[0, 0], scale_ratio_range=[1., 1.], translation_std=[0, 0, 0]), dict(type='RandomFlip3D'), dict( type='PointsRangeFilter', point_cloud_range=point_cloud_range), dict( type='DefaultFormatBundle3D', class_names=class_names, with_label=False), dict(type='Collect3D', keys=['points']) ]) ] # construct a pipeline for data and gt loading in show function # please keep its loading function consistent with test_pipeline (e.g. client) eval_pipeline = [ dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=5, use_dim=5, file_client_args=file_client_args), dict( type='LoadPointsFromMultiSweeps', sweeps_num=10, file_client_args=file_client_args), dict( type='DefaultFormatBundle3D', class_names=class_names, with_label=False), dict(type='Collect3D', keys=['points']) ] data = dict( samples_per_gpu=2, workers_per_gpu=2, train=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'lyft_infos_train.pkl', pipeline=train_pipeline, classes=class_names, modality=input_modality, test_mode=False), val=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'lyft_infos_val.pkl', pipeline=test_pipeline, classes=class_names, modality=input_modality, test_mode=True), test=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'lyft_infos_test.pkl', pipeline=test_pipeline, classes=class_names, modality=input_modality, test_mode=True)) # For Lyft dataset, we usually evaluate the model at the end of training. # Since the models are trained by 24 epochs by default, we set evaluation # interval to be 24. Please change the interval accordingly if you do not # use a default schedule. 
evaluation = dict(interval=24, pipeline=eval_pipeline) ================================================ FILE: configs/_base_/datasets/nuim_instance.py ================================================ dataset_type = 'CocoDataset' data_root = 'data/nuimages/' class_names = [ 'car', 'truck', 'trailer', 'bus', 'construction_vehicle', 'bicycle', 'motorcycle', 'pedestrian', 'traffic_cone', 'barrier' ] img_norm_cfg = dict( mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) train_pipeline = [ dict(type='LoadImageFromFile'), dict(type='LoadAnnotations', with_bbox=True, with_mask=True), dict( type='Resize', img_scale=[(1280, 720), (1920, 1080)], multiscale_mode='range', keep_ratio=True), dict(type='RandomFlip', flip_ratio=0.5), dict(type='Normalize', **img_norm_cfg), dict(type='Pad', size_divisor=32), dict(type='DefaultFormatBundle'), dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']), ] test_pipeline = [ dict(type='LoadImageFromFile'), dict( type='MultiScaleFlipAug', img_scale=(1600, 900), flip=False, transforms=[ dict(type='Resize', keep_ratio=True), dict(type='RandomFlip'), dict(type='Normalize', **img_norm_cfg), dict(type='Pad', size_divisor=32), dict(type='ImageToTensor', keys=['img']), dict(type='Collect', keys=['img']), ]) ] data = dict( samples_per_gpu=2, workers_per_gpu=2, train=dict( type=dataset_type, ann_file=data_root + 'annotations/nuimages_v1.0-train.json', img_prefix=data_root, classes=class_names, pipeline=train_pipeline), val=dict( type=dataset_type, ann_file=data_root + 'annotations/nuimages_v1.0-val.json', img_prefix=data_root, classes=class_names, pipeline=test_pipeline), test=dict( type=dataset_type, ann_file=data_root + 'annotations/nuimages_v1.0-val.json', img_prefix=data_root, classes=class_names, pipeline=test_pipeline)) evaluation = dict(metric=['bbox', 'segm']) ================================================ FILE: configs/_base_/datasets/nus-3d.py ================================================ # If point cloud range is changed, the models should also change their point # cloud range accordingly point_cloud_range = [-50, -50, -5, 50, 50, 3] # For nuScenes we usually do 10-class detection class_names = [ 'car', 'truck', 'trailer', 'bus', 'construction_vehicle', 'bicycle', 'motorcycle', 'pedestrian', 'traffic_cone', 'barrier' ] dataset_type = 'NuScenesDataset' data_root = 'data/nuscenes/' # Input modality for nuScenes dataset, this is consistent with the submission # format which requires the information in input_modality. input_modality = dict( use_lidar=True, use_camera=False, use_radar=False, use_map=False, use_external=False) file_client_args = dict(backend='disk') # Uncomment the following if use ceph or other file clients. # See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient # for more details. 
# file_client_args = dict( # backend='petrel', # path_mapping=dict({ # './data/nuscenes/': 's3://nuscenes/nuscenes/', # 'data/nuscenes/': 's3://nuscenes/nuscenes/' # })) train_pipeline = [ dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=5, use_dim=5, file_client_args=file_client_args), dict( type='LoadPointsFromMultiSweeps', sweeps_num=10, file_client_args=file_client_args), dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True), dict( type='GlobalRotScaleTrans', rot_range=[-0.3925, 0.3925], scale_ratio_range=[0.95, 1.05], translation_std=[0, 0, 0]), dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5), dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), dict(type='ObjectNameFilter', classes=class_names), dict(type='PointShuffle'), dict(type='DefaultFormatBundle3D', class_names=class_names), dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']) ] test_pipeline = [ dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=5, use_dim=5, file_client_args=file_client_args), dict( type='LoadPointsFromMultiSweeps', sweeps_num=10, file_client_args=file_client_args), dict( type='MultiScaleFlipAug3D', img_scale=(1333, 800), pts_scale_ratio=1, flip=False, transforms=[ dict( type='GlobalRotScaleTrans', rot_range=[0, 0], scale_ratio_range=[1., 1.], translation_std=[0, 0, 0]), dict(type='RandomFlip3D'), dict( type='PointsRangeFilter', point_cloud_range=point_cloud_range), dict( type='DefaultFormatBundle3D', class_names=class_names, with_label=False), dict(type='Collect3D', keys=['points']) ]) ] # construct a pipeline for data and gt loading in show function # please keep its loading function consistent with test_pipeline (e.g. client) eval_pipeline = [ dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=5, use_dim=5, file_client_args=file_client_args), dict( type='LoadPointsFromMultiSweeps', sweeps_num=10, file_client_args=file_client_args), dict( type='DefaultFormatBundle3D', class_names=class_names, with_label=False), dict(type='Collect3D', keys=['points']) ] data = dict( samples_per_gpu=4, workers_per_gpu=4, test_dataloader=dict(runner_type='EpochBasedRunner'), train=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'nuscenes_infos_train.pkl', pipeline=train_pipeline, classes=class_names, modality=input_modality, test_mode=False, # we use box_type_3d='LiDAR' in kitti and nuscenes dataset # and box_type_3d='Depth' in sunrgbd and scannet dataset. box_type_3d='LiDAR'), val=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'nuscenes_infos_val.pkl', pipeline=test_pipeline, classes=class_names, modality=input_modality, test_mode=True, box_type_3d='LiDAR'), test=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'nuscenes_infos_val.pkl', pipeline=test_pipeline, classes=class_names, modality=input_modality, test_mode=True, box_type_3d='LiDAR')) # For nuScenes dataset, we usually evaluate the model at the end of training. # Since the models are trained by 24 epochs by default, we set evaluation # interval to be 24. Please change the interval accordingly if you do not # use a default schedule. 
evaluation = dict(interval=24, pipeline=eval_pipeline) ================================================ FILE: configs/_base_/datasets/nus-mono3d.py ================================================ dataset_type = 'NuScenesMonoDataset' data_root = 'data/nuscenes/' class_names = [ 'car', 'truck', 'trailer', 'bus', 'construction_vehicle', 'bicycle', 'motorcycle', 'pedestrian', 'traffic_cone', 'barrier' ] # Input modality for nuScenes dataset, this is consistent with the submission # format which requires the information in input_modality. input_modality = dict( use_lidar=False, use_camera=True, use_radar=False, use_map=False, use_external=False) img_norm_cfg = dict( mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) train_pipeline = [ dict(type='LoadImageFromFileMono3D'), dict( type='LoadAnnotations3D', with_bbox=True, with_label=True, with_attr_label=True, with_bbox_3d=True, with_label_3d=True, with_bbox_depth=True), dict(type='Resize', img_scale=(1600, 900), keep_ratio=True), dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5), dict(type='Normalize', **img_norm_cfg), dict(type='Pad', size_divisor=32), dict(type='DefaultFormatBundle3D', class_names=class_names), dict( type='Collect3D', keys=[ 'img', 'gt_bboxes', 'gt_labels', 'attr_labels', 'gt_bboxes_3d', 'gt_labels_3d', 'centers2d', 'depths' ]), ] test_pipeline = [ dict(type='LoadImageFromFileMono3D'), dict( type='MultiScaleFlipAug', scale_factor=1.0, flip=False, transforms=[ dict(type='RandomFlip3D'), dict(type='Normalize', **img_norm_cfg), dict(type='Pad', size_divisor=32), dict( type='DefaultFormatBundle3D', class_names=class_names, with_label=False), dict(type='Collect3D', keys=['img']), ]) ] # construct a pipeline for data and gt loading in show function # please keep its loading function consistent with test_pipeline (e.g. client) eval_pipeline = [ dict(type='LoadImageFromFileMono3D'), dict( type='DefaultFormatBundle3D', class_names=class_names, with_label=False), dict(type='Collect3D', keys=['img']) ] data = dict( samples_per_gpu=2, workers_per_gpu=2, train=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'nuscenes_infos_train_mono3d.coco.json', img_prefix=data_root, classes=class_names, pipeline=train_pipeline, modality=input_modality, test_mode=False, box_type_3d='Camera'), val=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'nuscenes_infos_val_mono3d.coco.json', img_prefix=data_root, classes=class_names, pipeline=test_pipeline, modality=input_modality, test_mode=True, box_type_3d='Camera'), test=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'nuscenes_infos_val_mono3d.coco.json', img_prefix=data_root, classes=class_names, pipeline=test_pipeline, modality=input_modality, test_mode=True, box_type_3d='Camera')) evaluation = dict(interval=2) ================================================ FILE: configs/_base_/datasets/range100_lyft-3d.py ================================================ # If point cloud range is changed, the models should also change their point # cloud range accordingly point_cloud_range = [-100, -100, -5, 100, 100, 3] # For Lyft we usually do 9-class detection class_names = [ 'car', 'truck', 'bus', 'emergency_vehicle', 'other_vehicle', 'motorcycle', 'bicycle', 'pedestrian', 'animal' ] dataset_type = 'LyftDataset' data_root = 'data/lyft/' # Input modality for Lyft dataset, this is consistent with the submission # format which requires the information in input_modality. 
input_modality = dict( use_lidar=True, use_camera=False, use_radar=False, use_map=False, use_external=False) file_client_args = dict(backend='disk') # Uncomment the following if use ceph or other file clients. # See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient # for more details. # file_client_args = dict( # backend='petrel', # path_mapping=dict({ # './data/lyft/': 's3://lyft/lyft/', # 'data/lyft/': 's3://lyft/lyft/' # })) train_pipeline = [ dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=5, use_dim=5, file_client_args=file_client_args), dict( type='LoadPointsFromMultiSweeps', sweeps_num=10, file_client_args=file_client_args), dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True), dict( type='GlobalRotScaleTrans', rot_range=[-0.3925, 0.3925], scale_ratio_range=[0.95, 1.05], translation_std=[0, 0, 0]), dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5), dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), dict(type='PointShuffle'), dict(type='DefaultFormatBundle3D', class_names=class_names), dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']) ] test_pipeline = [ dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=5, use_dim=5, file_client_args=file_client_args), dict( type='LoadPointsFromMultiSweeps', sweeps_num=10, file_client_args=file_client_args), dict( type='MultiScaleFlipAug3D', img_scale=(1333, 800), pts_scale_ratio=1, flip=False, transforms=[ dict( type='GlobalRotScaleTrans', rot_range=[0, 0], scale_ratio_range=[1., 1.], translation_std=[0, 0, 0]), dict(type='RandomFlip3D'), dict( type='PointsRangeFilter', point_cloud_range=point_cloud_range), dict( type='DefaultFormatBundle3D', class_names=class_names, with_label=False), dict(type='Collect3D', keys=['points']) ]) ] # construct a pipeline for data and gt loading in show function # please keep its loading function consistent with test_pipeline (e.g. client) eval_pipeline = [ dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=5, use_dim=5, file_client_args=file_client_args), dict( type='LoadPointsFromMultiSweeps', sweeps_num=10, file_client_args=file_client_args), dict( type='DefaultFormatBundle3D', class_names=class_names, with_label=False), dict(type='Collect3D', keys=['points']) ] data = dict( samples_per_gpu=2, workers_per_gpu=2, train=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'lyft_infos_train.pkl', pipeline=train_pipeline, classes=class_names, modality=input_modality, test_mode=False), val=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'lyft_infos_val.pkl', pipeline=test_pipeline, classes=class_names, modality=input_modality, test_mode=True), test=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'lyft_infos_test.pkl', pipeline=test_pipeline, classes=class_names, modality=input_modality, test_mode=True)) # For Lyft dataset, we usually evaluate the model at the end of training. # Since the models are trained by 24 epochs by default, we set evaluation # interval to be 24. Please change the interval accordingly if you do not # use a default schedule. 
evaluation = dict(interval=24, pipeline=eval_pipeline) ================================================ FILE: configs/_base_/datasets/s3dis-3d-5class.py ================================================ # dataset settings dataset_type = 'S3DISDataset' data_root = './data/s3dis/' class_names = ('table', 'chair', 'sofa', 'bookcase', 'board') train_area = [1, 2, 3, 4, 6] test_area = 5 train_pipeline = [ dict( type='LoadPointsFromFile', coord_type='DEPTH', shift_height=True, load_dim=6, use_dim=[0, 1, 2, 3, 4, 5]), dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True), dict(type='PointSample', num_points=40000), dict( type='RandomFlip3D', sync_2d=False, flip_ratio_bev_horizontal=0.5, flip_ratio_bev_vertical=0.5), dict( type='GlobalRotScaleTrans', # following ScanNet dataset the rotation range is 5 degrees rot_range=[-0.087266, 0.087266], scale_ratio_range=[1.0, 1.0], shift_height=True), dict(type='DefaultFormatBundle3D', class_names=class_names), dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']) ] test_pipeline = [ dict( type='LoadPointsFromFile', coord_type='DEPTH', shift_height=True, load_dim=6, use_dim=[0, 1, 2, 3, 4, 5]), dict( type='MultiScaleFlipAug3D', img_scale=(1333, 800), pts_scale_ratio=1, flip=False, transforms=[ dict( type='GlobalRotScaleTrans', rot_range=[0, 0], scale_ratio_range=[1., 1.], translation_std=[0, 0, 0]), dict( type='RandomFlip3D', sync_2d=False, flip_ratio_bev_horizontal=0.5, flip_ratio_bev_vertical=0.5), dict(type='PointSample', num_points=40000), dict( type='DefaultFormatBundle3D', class_names=class_names, with_label=False), dict(type='Collect3D', keys=['points']) ]) ] # construct a pipeline for data and gt loading in show function # please keep its loading function consistent with test_pipeline (e.g. client) eval_pipeline = [ dict( type='LoadPointsFromFile', coord_type='DEPTH', shift_height=False, load_dim=6, use_dim=[0, 1, 2, 3, 4, 5]), dict( type='DefaultFormatBundle3D', class_names=class_names, with_label=False), dict(type='Collect3D', keys=['points']) ] data = dict( samples_per_gpu=8, workers_per_gpu=4, train=dict( type='RepeatDataset', times=5, dataset=dict( type='ConcatDataset', datasets=[ dict( type=dataset_type, data_root=data_root, ann_file=data_root + f's3dis_infos_Area_{i}.pkl', pipeline=train_pipeline, filter_empty_gt=False, classes=class_names, box_type_3d='Depth') for i in train_area ], separate_eval=False)), val=dict( type=dataset_type, data_root=data_root, ann_file=data_root + f's3dis_infos_Area_{test_area}.pkl', pipeline=test_pipeline, classes=class_names, test_mode=True, box_type_3d='Depth'), test=dict( type=dataset_type, data_root=data_root, ann_file=data_root + f's3dis_infos_Area_{test_area}.pkl', pipeline=test_pipeline, classes=class_names, test_mode=True, box_type_3d='Depth')) evaluation = dict(pipeline=eval_pipeline) ================================================ FILE: configs/_base_/datasets/s3dis_seg-3d-13class.py ================================================ # dataset settings dataset_type = 'S3DISSegDataset' data_root = './data/s3dis/' class_names = ('ceiling', 'floor', 'wall', 'beam', 'column', 'window', 'door', 'table', 'chair', 'sofa', 'bookcase', 'board', 'clutter') file_client_args = dict(backend='disk') # Uncomment the following if use ceph or other file clients. # See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient # for more details. 
# file_client_args = dict( # backend='petrel', # path_mapping=dict({ # './data/s3dis/': # 's3://openmmlab/datasets/detection3d/s3dis_processed/', # 'data/s3dis/': # 's3://openmmlab/datasets/detection3d/s3dis_processed/' # })) num_points = 4096 train_area = [1, 2, 3, 4, 6] test_area = 5 train_pipeline = [ dict( type='LoadPointsFromFile', file_client_args=file_client_args, coord_type='DEPTH', shift_height=False, use_color=True, load_dim=6, use_dim=[0, 1, 2, 3, 4, 5]), dict( type='LoadAnnotations3D', file_client_args=file_client_args, with_bbox_3d=False, with_label_3d=False, with_mask_3d=False, with_seg_3d=True), dict( type='PointSegClassMapping', valid_cat_ids=tuple(range(len(class_names))), max_cat_id=13), dict( type='IndoorPatchPointSample', num_points=num_points, block_size=1.0, ignore_index=len(class_names), use_normalized_coord=True, enlarge_size=0.2, min_unique_num=None), dict(type='NormalizePointsColor', color_mean=None), dict(type='DefaultFormatBundle3D', class_names=class_names), dict(type='Collect3D', keys=['points', 'pts_semantic_mask']) ] test_pipeline = [ dict( type='LoadPointsFromFile', file_client_args=file_client_args, coord_type='DEPTH', shift_height=False, use_color=True, load_dim=6, use_dim=[0, 1, 2, 3, 4, 5]), dict(type='NormalizePointsColor', color_mean=None), dict( # a wrapper in order to successfully call test function # actually we don't perform test-time-aug type='MultiScaleFlipAug3D', img_scale=(1333, 800), pts_scale_ratio=1, flip=False, transforms=[ dict( type='GlobalRotScaleTrans', rot_range=[0, 0], scale_ratio_range=[1., 1.], translation_std=[0, 0, 0]), dict( type='RandomFlip3D', sync_2d=False, flip_ratio_bev_horizontal=0.0, flip_ratio_bev_vertical=0.0), dict( type='DefaultFormatBundle3D', class_names=class_names, with_label=False), dict(type='Collect3D', keys=['points']) ]) ] # construct a pipeline for data and gt loading in show function # please keep its loading function consistent with test_pipeline (e.g. client) # we need to load gt seg_mask! 
eval_pipeline = [ dict( type='LoadPointsFromFile', file_client_args=file_client_args, coord_type='DEPTH', shift_height=False, use_color=True, load_dim=6, use_dim=[0, 1, 2, 3, 4, 5]), dict( type='LoadAnnotations3D', file_client_args=file_client_args, with_bbox_3d=False, with_label_3d=False, with_mask_3d=False, with_seg_3d=True), dict( type='PointSegClassMapping', valid_cat_ids=tuple(range(len(class_names))), max_cat_id=13), dict( type='DefaultFormatBundle3D', with_label=False, class_names=class_names), dict(type='Collect3D', keys=['points', 'pts_semantic_mask']) ] data = dict( samples_per_gpu=8, workers_per_gpu=4, # train on area 1, 2, 3, 4, 6 # test on area 5 train=dict( type=dataset_type, data_root=data_root, ann_files=[ data_root + f's3dis_infos_Area_{i}.pkl' for i in train_area ], pipeline=train_pipeline, classes=class_names, test_mode=False, ignore_index=len(class_names), scene_idxs=[ data_root + f'seg_info/Area_{i}_resampled_scene_idxs.npy' for i in train_area ], file_client_args=file_client_args), val=dict( type=dataset_type, data_root=data_root, ann_files=data_root + f's3dis_infos_Area_{test_area}.pkl', pipeline=test_pipeline, classes=class_names, test_mode=True, ignore_index=len(class_names), scene_idxs=data_root + f'seg_info/Area_{test_area}_resampled_scene_idxs.npy', file_client_args=file_client_args), test=dict( type=dataset_type, data_root=data_root, ann_files=data_root + f's3dis_infos_Area_{test_area}.pkl', pipeline=test_pipeline, classes=class_names, test_mode=True, ignore_index=len(class_names), file_client_args=file_client_args)) evaluation = dict(pipeline=eval_pipeline) ================================================ FILE: configs/_base_/datasets/scannet-3d-18class.py ================================================ # dataset settings dataset_type = 'ScanNetDataset' data_root = './data/scannet/' class_names = ('cabinet', 'bed', 'chair', 'sofa', 'table', 'door', 'window', 'bookshelf', 'picture', 'counter', 'desk', 'curtain', 'refrigerator', 'showercurtrain', 'toilet', 'sink', 'bathtub', 'garbagebin') file_client_args = dict(backend='disk') # Uncomment the following if use ceph or other file clients. # See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient # for more details. 
# file_client_args = dict( # backend='petrel', # path_mapping=dict({ # './data/scannet/': # 's3://openmmlab/datasets/detection3d/scannet_processed/', # 'data/scannet/': # 's3://openmmlab/datasets/detection3d/scannet_processed/' # })) train_pipeline = [ dict( type='LoadPointsFromFile', file_client_args=file_client_args, coord_type='DEPTH', shift_height=True, load_dim=6, use_dim=[0, 1, 2]), dict( type='LoadAnnotations3D', file_client_args=file_client_args, with_bbox_3d=True, with_label_3d=True, with_mask_3d=True, with_seg_3d=True), dict(type='GlobalAlignment', rotation_axis=2), dict( type='PointSegClassMapping', valid_cat_ids=(3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 16, 24, 28, 33, 34, 36, 39), max_cat_id=40), dict(type='PointSample', num_points=40000), dict( type='RandomFlip3D', sync_2d=False, flip_ratio_bev_horizontal=0.5, flip_ratio_bev_vertical=0.5), dict( type='GlobalRotScaleTrans', rot_range=[-0.087266, 0.087266], scale_ratio_range=[1.0, 1.0], shift_height=True), dict(type='DefaultFormatBundle3D', class_names=class_names), dict( type='Collect3D', keys=[ 'points', 'gt_bboxes_3d', 'gt_labels_3d', 'pts_semantic_mask', 'pts_instance_mask' ]) ] test_pipeline = [ dict( type='LoadPointsFromFile', file_client_args=file_client_args, coord_type='DEPTH', shift_height=True, load_dim=6, use_dim=[0, 1, 2]), dict(type='GlobalAlignment', rotation_axis=2), dict( type='MultiScaleFlipAug3D', img_scale=(1333, 800), pts_scale_ratio=1, flip=False, transforms=[ dict( type='GlobalRotScaleTrans', rot_range=[0, 0], scale_ratio_range=[1., 1.], translation_std=[0, 0, 0]), dict( type='RandomFlip3D', sync_2d=False, flip_ratio_bev_horizontal=0.5, flip_ratio_bev_vertical=0.5), dict(type='PointSample', num_points=40000), dict( type='DefaultFormatBundle3D', class_names=class_names, with_label=False), dict(type='Collect3D', keys=['points']) ]) ] # construct a pipeline for data and gt loading in show function # please keep its loading function consistent with test_pipeline (e.g. client) eval_pipeline = [ dict( type='LoadPointsFromFile', file_client_args=file_client_args, coord_type='DEPTH', shift_height=False, load_dim=6, use_dim=[0, 1, 2]), dict(type='GlobalAlignment', rotation_axis=2), dict( type='DefaultFormatBundle3D', class_names=class_names, with_label=False), dict(type='Collect3D', keys=['points']) ] data = dict( samples_per_gpu=8, workers_per_gpu=4, train=dict( type='RepeatDataset', times=5, dataset=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'scannet_infos_train.pkl', pipeline=train_pipeline, filter_empty_gt=False, classes=class_names, # we use box_type_3d='LiDAR' in kitti and nuscenes dataset # and box_type_3d='Depth' in sunrgbd and scannet dataset. 
box_type_3d='Depth', file_client_args=file_client_args)), val=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'scannet_infos_val.pkl', pipeline=test_pipeline, classes=class_names, test_mode=True, box_type_3d='Depth', file_client_args=file_client_args), test=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'scannet_infos_val.pkl', pipeline=test_pipeline, classes=class_names, test_mode=True, box_type_3d='Depth', file_client_args=file_client_args)) evaluation = dict(pipeline=eval_pipeline) ================================================ FILE: configs/_base_/datasets/scannet_seg-3d-20class.py ================================================ # dataset settings dataset_type = 'ScanNetSegDataset' data_root = './data/scannet/' class_names = ('wall', 'floor', 'cabinet', 'bed', 'chair', 'sofa', 'table', 'door', 'window', 'bookshelf', 'picture', 'counter', 'desk', 'curtain', 'refrigerator', 'showercurtrain', 'toilet', 'sink', 'bathtub', 'otherfurniture') num_points = 8192 file_client_args = dict(backend='disk') # Uncomment the following if use ceph or other file clients. # See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient # for more details. # file_client_args = dict( # backend='petrel', # path_mapping=dict({ # './data/scannet/': # 's3://openmmlab/datasets/detection3d/scannet_processed/', # 'data/scannet/': # 's3://openmmlab/datasets/detection3d/scannet_processed/' # })) train_pipeline = [ dict( type='LoadPointsFromFile', coord_type='DEPTH', shift_height=False, use_color=True, load_dim=6, use_dim=[0, 1, 2, 3, 4, 5], file_client_args=file_client_args), dict( type='LoadAnnotations3D', with_bbox_3d=False, with_label_3d=False, with_mask_3d=False, with_seg_3d=True, file_client_args=file_client_args), dict( type='PointSegClassMapping', valid_cat_ids=(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 16, 24, 28, 33, 34, 36, 39), max_cat_id=40), dict( type='IndoorPatchPointSample', num_points=num_points, block_size=1.5, ignore_index=len(class_names), use_normalized_coord=False, enlarge_size=0.2, min_unique_num=None), dict(type='NormalizePointsColor', color_mean=None), dict(type='DefaultFormatBundle3D', class_names=class_names), dict(type='Collect3D', keys=['points', 'pts_semantic_mask']) ] test_pipeline = [ dict( type='LoadPointsFromFile', coord_type='DEPTH', shift_height=False, use_color=True, load_dim=6, use_dim=[0, 1, 2, 3, 4, 5], file_client_args=file_client_args), dict(type='NormalizePointsColor', color_mean=None), dict( # a wrapper in order to successfully call test function # actually we don't perform test-time-aug type='MultiScaleFlipAug3D', img_scale=(1333, 800), pts_scale_ratio=1, flip=False, transforms=[ dict( type='GlobalRotScaleTrans', rot_range=[0, 0], scale_ratio_range=[1., 1.], translation_std=[0, 0, 0]), dict( type='RandomFlip3D', sync_2d=False, flip_ratio_bev_horizontal=0.0, flip_ratio_bev_vertical=0.0), dict( type='DefaultFormatBundle3D', class_names=class_names, with_label=False), dict(type='Collect3D', keys=['points']) ]) ] # construct a pipeline for data and gt loading in show function # please keep its loading function consistent with test_pipeline (e.g. client) # we need to load gt seg_mask! 
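The PointSegClassMapping step used in the ScanNet segmentation pipelines above and below remaps the raw ScanNet category ids (0..40) to the 20 contiguous training labels and collapses everything else onto len(class_names) == 20, which is exactly the ignore_index used by the dataset entries further down. A standalone sketch of that behaviour (illustration only, not the library source):

import numpy as np

valid_cat_ids = (1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 16, 24, 28, 33,
                 34, 36, 39)
max_cat_id = 40

# Lookup table: valid raw ids -> 0..19, all other raw ids -> 20 (ignored).
cat_id2class = np.full(max_cat_id + 1, len(valid_cat_ids), dtype=np.int64)
for train_id, cat_id in enumerate(valid_cat_ids):
    cat_id2class[cat_id] = train_id

raw_mask = np.array([1, 39, 13, 40])   # wall, otherfurniture, two unmapped ids
print(cat_id2class[raw_mask])          # [ 0 19 20 20]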
eval_pipeline = [ dict( type='LoadPointsFromFile', coord_type='DEPTH', shift_height=False, use_color=True, load_dim=6, use_dim=[0, 1, 2, 3, 4, 5], file_client_args=file_client_args), dict( type='LoadAnnotations3D', with_bbox_3d=False, with_label_3d=False, with_mask_3d=False, with_seg_3d=True, file_client_args=file_client_args), dict( type='PointSegClassMapping', valid_cat_ids=(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 16, 24, 28, 33, 34, 36, 39), max_cat_id=40), dict( type='DefaultFormatBundle3D', with_label=False, class_names=class_names), dict(type='Collect3D', keys=['points', 'pts_semantic_mask']) ] data = dict( samples_per_gpu=8, workers_per_gpu=4, train=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'scannet_infos_train.pkl', pipeline=train_pipeline, classes=class_names, test_mode=False, ignore_index=len(class_names), scene_idxs=data_root + 'seg_info/train_resampled_scene_idxs.npy', file_client_args=file_client_args), val=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'scannet_infos_val.pkl', pipeline=test_pipeline, classes=class_names, test_mode=True, ignore_index=len(class_names), file_client_args=file_client_args), test=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'scannet_infos_val.pkl', pipeline=test_pipeline, classes=class_names, test_mode=True, ignore_index=len(class_names), file_client_args=file_client_args)) evaluation = dict(pipeline=eval_pipeline) ================================================ FILE: configs/_base_/datasets/sunrgbd-3d-10class.py ================================================ dataset_type = 'SUNRGBDDataset' data_root = 'data/sunrgbd/' class_names = ('bed', 'table', 'sofa', 'chair', 'toilet', 'desk', 'dresser', 'night_stand', 'bookshelf', 'bathtub') file_client_args = dict(backend='disk') # Uncomment the following if use ceph or other file clients. # See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient # for more details. # file_client_args = dict( # backend='petrel', # path_mapping=dict({ # './data/sunrgbd/': # 's3://openmmlab/datasets/detection3d/sunrgbd_processed/', # 'data/sunrgbd/': # 's3://openmmlab/datasets/detection3d/sunrgbd_processed/' # })) train_pipeline = [ dict( type='LoadPointsFromFile', coord_type='DEPTH', shift_height=True, load_dim=6, use_dim=[0, 1, 2], file_client_args=file_client_args), dict(type='LoadAnnotations3D', file_client_args=file_client_args), dict( type='RandomFlip3D', sync_2d=False, flip_ratio_bev_horizontal=0.5, ), dict( type='GlobalRotScaleTrans', rot_range=[-0.523599, 0.523599], scale_ratio_range=[0.85, 1.15], shift_height=True), dict(type='PointSample', num_points=20000), dict(type='DefaultFormatBundle3D', class_names=class_names), dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']) ] test_pipeline = [ dict( type='LoadPointsFromFile', coord_type='DEPTH', shift_height=True, load_dim=6, use_dim=[0, 1, 2], file_client_args=file_client_args), dict( type='MultiScaleFlipAug3D', img_scale=(1333, 800), pts_scale_ratio=1, flip=False, transforms=[ dict( type='GlobalRotScaleTrans', rot_range=[0, 0], scale_ratio_range=[1., 1.], translation_std=[0, 0, 0]), dict( type='RandomFlip3D', sync_2d=False, flip_ratio_bev_horizontal=0.5, ), dict(type='PointSample', num_points=20000), dict( type='DefaultFormatBundle3D', class_names=class_names, with_label=False), dict(type='Collect3D', keys=['points']) ]) ] # construct a pipeline for data and gt loading in show function # please keep its loading function consistent with test_pipeline (e.g. 
client) eval_pipeline = [ dict( type='LoadPointsFromFile', coord_type='DEPTH', shift_height=False, load_dim=6, use_dim=[0, 1, 2], file_client_args=file_client_args), dict( type='DefaultFormatBundle3D', class_names=class_names, with_label=False), dict(type='Collect3D', keys=['points']) ] data = dict( samples_per_gpu=16, workers_per_gpu=4, train=dict( type='RepeatDataset', times=5, dataset=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'sunrgbd_infos_train.pkl', pipeline=train_pipeline, classes=class_names, filter_empty_gt=False, # we use box_type_3d='LiDAR' in kitti and nuscenes dataset # and box_type_3d='Depth' in sunrgbd and scannet dataset. box_type_3d='Depth', file_client_args=file_client_args)), val=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'sunrgbd_infos_val.pkl', pipeline=test_pipeline, classes=class_names, test_mode=True, box_type_3d='Depth', file_client_args=file_client_args), test=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'sunrgbd_infos_val.pkl', pipeline=test_pipeline, classes=class_names, test_mode=True, box_type_3d='Depth', file_client_args=file_client_args)) evaluation = dict(pipeline=eval_pipeline) ================================================ FILE: configs/_base_/datasets/waymoD5-3d-3class.py ================================================ # dataset settings # D5 in the config name means the whole dataset is divided into 5 folds # We only use one fold for efficient experiments dataset_type = 'WaymoDataset' data_root = 'data/waymo/kitti_format/' file_client_args = dict(backend='disk') # Uncomment the following if use ceph or other file clients. # See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient # for more details. # file_client_args = dict( # backend='petrel', path_mapping=dict(data='s3://waymo_data/')) class_names = ['Car', 'Pedestrian', 'Cyclist'] point_cloud_range = [-74.88, -74.88, -2, 74.88, 74.88, 4] input_modality = dict(use_lidar=True, use_camera=False) db_sampler = dict( data_root=data_root, info_path=data_root + 'waymo_dbinfos_train.pkl', rate=1.0, prepare=dict( filter_by_difficulty=[-1], filter_by_min_points=dict(Car=5, Pedestrian=10, Cyclist=10)), classes=class_names, sample_groups=dict(Car=15, Pedestrian=10, Cyclist=10), points_loader=dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=6, use_dim=[0, 1, 2, 3, 4], file_client_args=file_client_args)) train_pipeline = [ dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=6, use_dim=5, file_client_args=file_client_args), dict( type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True, file_client_args=file_client_args), dict(type='ObjectSample', db_sampler=db_sampler), dict( type='RandomFlip3D', sync_2d=False, flip_ratio_bev_horizontal=0.5, flip_ratio_bev_vertical=0.5), dict( type='GlobalRotScaleTrans', rot_range=[-0.78539816, 0.78539816], scale_ratio_range=[0.95, 1.05]), dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), dict(type='PointShuffle'), dict(type='DefaultFormatBundle3D', class_names=class_names), dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']) ] test_pipeline = [ dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=6, use_dim=5, file_client_args=file_client_args), dict( type='MultiScaleFlipAug3D', img_scale=(1333, 800), pts_scale_ratio=1, flip=False, transforms=[ dict( type='GlobalRotScaleTrans', rot_range=[0, 0], scale_ratio_range=[1., 1.], translation_std=[0, 0, 
0]), dict(type='RandomFlip3D'), dict( type='PointsRangeFilter', point_cloud_range=point_cloud_range), dict( type='DefaultFormatBundle3D', class_names=class_names, with_label=False), dict(type='Collect3D', keys=['points']) ]) ] # construct a pipeline for data and gt loading in show function # please keep its loading function consistent with test_pipeline (e.g. client) eval_pipeline = [ dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=6, use_dim=5, file_client_args=file_client_args), dict( type='DefaultFormatBundle3D', class_names=class_names, with_label=False), dict(type='Collect3D', keys=['points']) ] data = dict( samples_per_gpu=2, workers_per_gpu=4, train=dict( type='RepeatDataset', times=2, dataset=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'waymo_infos_train.pkl', split='training', pipeline=train_pipeline, modality=input_modality, classes=class_names, test_mode=False, # we use box_type_3d='LiDAR' in kitti and nuscenes dataset # and box_type_3d='Depth' in sunrgbd and scannet dataset. box_type_3d='LiDAR', # load one frame every five frames load_interval=5)), val=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'waymo_infos_val.pkl', split='training', pipeline=test_pipeline, modality=input_modality, classes=class_names, test_mode=True, box_type_3d='LiDAR'), test=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'waymo_infos_val.pkl', split='training', pipeline=test_pipeline, modality=input_modality, classes=class_names, test_mode=True, box_type_3d='LiDAR')) evaluation = dict(interval=24, pipeline=eval_pipeline) ================================================ FILE: configs/_base_/datasets/waymoD5-3d-car.py ================================================ # dataset settings # D5 in the config name means the whole dataset is divided into 5 folds # We only use one fold for efficient experiments dataset_type = 'WaymoDataset' data_root = 'data/waymo/kitti_format/' file_client_args = dict(backend='disk') # Uncomment the following if use ceph or other file clients. # See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient # for more details. 
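The "D5" in the config names above and below is realised through load_interval=5 on the training split, i.e. only every fifth annotated frame is kept. The actual subsampling happens while the dataset loads waymo_infos_train.pkl; the snippet below is only an equivalent toy illustration:

# Toy stand-in for the list of infos read from waymo_infos_train.pkl.
data_infos = [dict(sample_idx=i) for i in range(20)]

load_interval = 5                        # as set on the train dataset above
data_infos = data_infos[::load_interval]

print([info['sample_idx'] for info in data_infos])   # [0, 5, 10, 15]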
# file_client_args = dict( # backend='petrel', path_mapping=dict(data='s3://waymo_data/')) class_names = ['Car'] point_cloud_range = [-74.88, -74.88, -2, 74.88, 74.88, 4] input_modality = dict(use_lidar=True, use_camera=False) db_sampler = dict( data_root=data_root, info_path=data_root + 'waymo_dbinfos_train.pkl', rate=1.0, prepare=dict(filter_by_difficulty=[-1], filter_by_min_points=dict(Car=5)), classes=class_names, sample_groups=dict(Car=15), points_loader=dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=6, use_dim=[0, 1, 2, 3, 4], file_client_args=file_client_args)) train_pipeline = [ dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=6, use_dim=5, file_client_args=file_client_args), dict( type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True, file_client_args=file_client_args), dict(type='ObjectSample', db_sampler=db_sampler), dict( type='RandomFlip3D', sync_2d=False, flip_ratio_bev_horizontal=0.5, flip_ratio_bev_vertical=0.5), dict( type='GlobalRotScaleTrans', rot_range=[-0.78539816, 0.78539816], scale_ratio_range=[0.95, 1.05]), dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), dict(type='PointShuffle'), dict(type='DefaultFormatBundle3D', class_names=class_names), dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']) ] test_pipeline = [ dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=6, use_dim=5, file_client_args=file_client_args), dict( type='MultiScaleFlipAug3D', img_scale=(1333, 800), pts_scale_ratio=1, flip=False, transforms=[ dict( type='GlobalRotScaleTrans', rot_range=[0, 0], scale_ratio_range=[1., 1.], translation_std=[0, 0, 0]), dict(type='RandomFlip3D'), dict( type='PointsRangeFilter', point_cloud_range=point_cloud_range), dict( type='DefaultFormatBundle3D', class_names=class_names, with_label=False), dict(type='Collect3D', keys=['points']) ]) ] # construct a pipeline for data and gt loading in show function # please keep its loading function consistent with test_pipeline (e.g. client) eval_pipeline = [ dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=6, use_dim=5, file_client_args=file_client_args), dict( type='DefaultFormatBundle3D', class_names=class_names, with_label=False), dict(type='Collect3D', keys=['points']) ] data = dict( samples_per_gpu=2, workers_per_gpu=4, train=dict( type='RepeatDataset', times=2, dataset=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'waymo_infos_train.pkl', split='training', pipeline=train_pipeline, modality=input_modality, classes=class_names, test_mode=False, # we use box_type_3d='LiDAR' in kitti and nuscenes dataset # and box_type_3d='Depth' in sunrgbd and scannet dataset. 
box_type_3d='LiDAR', # load one frame every five frames load_interval=5)), val=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'waymo_infos_val.pkl', split='training', pipeline=test_pipeline, modality=input_modality, classes=class_names, test_mode=True, box_type_3d='LiDAR'), test=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'waymo_infos_val.pkl', split='training', pipeline=test_pipeline, modality=input_modality, classes=class_names, test_mode=True, box_type_3d='LiDAR')) evaluation = dict(interval=24, pipeline=eval_pipeline) ================================================ FILE: configs/_base_/default_runtime.py ================================================ checkpoint_config = dict(interval=1) # yapf:disable push # By default we use textlogger hook and tensorboard # For more loggers see # https://mmcv.readthedocs.io/en/latest/api.html#mmcv.runner.LoggerHook log_config = dict( interval=50, hooks=[ dict(type='TextLoggerHook'), dict(type='TensorboardLoggerHook') ]) # yapf:enable dist_params = dict(backend='nccl') log_level = 'INFO' work_dir = None load_from = None resume_from = None workflow = [('train', 1)] # disable opencv multithreading to avoid system being overloaded opencv_num_threads = 0 # set multi-process start method as `fork` to speed up the training mp_start_method = 'fork' ================================================ FILE: configs/_base_/init.py ================================================ # Copyright (c) Phigent Robotics. All rights reserved. _base_ = ['../_base_/datasets/nus-3d.py', '../_base_/default_runtime.py'] # Global # If point cloud range is changed, the models should also change their point # cloud range accordingly point_cloud_range = [-40, -40, -1.0, 40, 40, 5.4] # For nuScenes we usually do 10-class detection class_names = [ 'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier', 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone' ] data_config = { 'cams': [ 'CAM_FRONT_LEFT', 'CAM_FRONT', 'CAM_FRONT_RIGHT', 'CAM_BACK_LEFT', 'CAM_BACK', 'CAM_BACK_RIGHT' ], 'Ncams': 6, 'input_size': (256, 704), 'src_size': (900, 1600), # Augmentation 'resize': (-0.06, 0.11), 'rot': (-5.4, 5.4), 'flip': True, 'crop_h': (0.0, 0.0), 'resize_test': 0.00, } use_checkpoint = True sync_bn = True # Model grid_config = { 'x': [-40, 40, 0.8], 'y': [-40, 40, 0.8], 'z': [-1, 5.4, 0.8], 'depth': [2.0, 42.0, 0.5], } depth_categories = 80 #(grid_config['depth'][1]-grid_config['depth'][0])//grid_config['depth'][2] use_custom_eval_hook=True bda_aug_conf = dict( rot_lim=(-22.5, 22.5), scale_lim=(1., 1.), flip_dx_ratio=0.5, flip_dy_ratio=0.5) num_Z_anchors = 8 voxel_size = [0.1, 0.1, 0.1] bev_h_ = 100 bev_w_ = 100 _dim_ = 256 _pos_dim_ = _dim_//2 _ffn_dim_ = _dim_ * 2 _num_levels_= 1 numC_Trans=80 empty_idx = 0 # noise 0-->255 num_cls = 18 # 0 free, 1-16 obj visible_mask = False img_norm_cfg = None cascade_ratio = 4 sample_from_voxel = False sample_from_img = False occ_size = [200, 200, 16] voxel_out_indices = (0, 1, 2) voxel_out_channel = 256 voxel_channels = [64, 64*2, 64*4] model = dict( type='NewBEV', use_depth_supervision=True, img_backbone=dict( # pretrained='ckpts/resnet50-0676ba61.pth', type='ResNet', depth=50, num_stages=4, out_indices=(2, 3), frozen_stages=-1, norm_cfg=dict(type='BN', requires_grad=True), norm_eval=False, with_cp=use_checkpoint, style='pytorch'), img_neck=dict( type='CustomFPN', in_channels=[1024, 2048], out_channels=_dim_, num_outs=1, start_level=0, with_cp=use_checkpoint, out_ids=[0]), depth_net=dict( 
type='CM_DepthNet', in_channels=_dim_, context_channels=numC_Trans, downsample=16, grid_config=grid_config, depth_channels=depth_categories, with_cp=use_checkpoint, loss_depth_weight=1., use_dcn=False, ), img_view_transformer=dict( type='LSSViewTransformerFunction3D', grid_config=grid_config, input_size=data_config['input_size'], # in_channels=256, # out_channels=numC_Trans, downsample=16), frpn=None, bevformer_encoder=None, img_bev_encoder_backbone=dict( type='CustomResNet3D', depth=18, with_cp=use_checkpoint, block_strides=[1, 2, 2], n_input_channels=numC_Trans, block_inplanes=voxel_channels, out_indices=voxel_out_indices, norm_cfg=dict(type='SyncBN', requires_grad=True), ), img_bev_encoder_neck=dict( type='FPN3D', with_cp=use_checkpoint, in_channels=voxel_channels, out_channels=voxel_out_channel, norm_cfg=dict(type='SyncBN', requires_grad=True), ), occupancy_head= dict( type='OccHead', with_cp=use_checkpoint, norm_cfg=dict(type='SyncBN', requires_grad=True), soft_weights=True, cascade_ratio=cascade_ratio, sample_from_voxel=sample_from_voxel, sample_from_img=sample_from_img, final_occ_size=occ_size, fine_topk=15000, empty_idx=empty_idx, num_level=len(voxel_out_indices), in_channels=[voxel_out_channel] * len(voxel_out_indices), out_channel=num_cls, point_cloud_range=point_cloud_range, loss_weight_cfg=dict( loss_voxel_ce_weight=1.0, loss_voxel_sem_scal_weight=1.0, loss_voxel_geo_scal_weight=1.0, loss_voxel_lovasz_weight=1.0, ), ), pts_bbox_head=None) # Data dataset_type = 'NuScenesDataset' data_root = 'data/nuscenes/' file_client_args = dict(backend='disk') occupancy_path = '/mount/data/occupancy_cvpr2023/gts' dense_lidar_prefix = '/mount/data/nuscenes/' train_pipeline = [ dict( type='PrepareImageInputs', is_train=True, data_config=data_config), dict( type='LoadAnnotationsBEVDepth', bda_aug_conf=bda_aug_conf, classes=class_names), dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=5, use_dim=5, dense_lidar_prefix=dense_lidar_prefix, file_client_args=file_client_args), dict(type='PointToMultiViewDepth', downsample=1, grid_config=grid_config), dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), dict(type='ObjectNameFilter', classes=class_names), # dict(type='LoadBEVMask', point_cloud_range=point_cloud_range, bev_size=(bev_h_, bev_w_)), dict(type='LoadOccupancy', ignore_nonvisible=True, occupancy_path=occupancy_path), # dict(type='PadMultiViewImage'), dict(type='DefaultFormatBundle3D', class_names=class_names), dict( type='Collect3D', keys=['img_inputs', 'gt_bboxes_3d', 'gt_labels_3d', 'gt_bev_mask', 'gt_occupancy', 'gt_depth' ]) ] test_pipeline = [ dict(type='PrepareImageInputs', data_config=data_config), dict( type='LoadAnnotationsBEVDepth', bda_aug_conf=bda_aug_conf, classes=class_names, is_train=False), dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=5, use_dim=5, dense_lidar_prefix=dense_lidar_prefix, file_client_args=file_client_args), dict(type='LoadBEVMask'), dict(type='LoadOccupancy', occupancy_path=occupancy_path), dict( type='MultiScaleFlipAug3D', img_scale=(1333, 800), pts_scale_ratio=1, flip=False, transforms=[ dict( type='DefaultFormatBundle3D', class_names=class_names, with_label=False), dict(type='Collect3D', keys=['points', 'img_inputs', 'gt_bev_mask', 'gt_occupancy', 'visible_mask']) ]) ] input_modality = dict( use_lidar=False, use_camera=True, use_radar=False, use_map=False, use_external=False) share_data_config = dict( type=dataset_type, classes=class_names, modality=input_modality, img_info_prototype='bevdet', 
occupancy_path=occupancy_path, ) test_data_config = dict( pipeline=test_pipeline, ann_file=data_root + 'bevdetv2-nuscenes_infos_val.pkl') data = dict( samples_per_gpu=2, workers_per_gpu=6, test_dataloader=dict(runner_type='EpochBasedRunner'), train=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'bevdetv2-nuscenes_infos_train.pkl', pipeline=train_pipeline, classes=class_names, test_mode=False, use_valid_flag=True, modality=input_modality, img_info_prototype='bevdet', # we use box_type_3d='LiDAR' in kitti and nuscenes dataset # and box_type_3d='Depth' in sunrgbd and scannet dataset. box_type_3d='LiDAR'), val=test_data_config, test=test_data_config) for key in ['val', 'test']: data[key].update(share_data_config) # data['train']['dataset'].update(share_data_config) # Optimizer optimizer = dict(type='AdamW', lr=1.4e-4, weight_decay=1e-2) optimizer_config = dict(grad_clip=dict(max_norm=5, norm_type=2)) lr_config = dict( policy='step', warmup='linear', warmup_iters=200, warmup_ratio=0.001, step=[1,]) runner = dict(type='EpochBasedRunner', max_epochs=1) log_config = dict( interval=50, hooks=[ dict(type='WechatLoggerHook'), dict(type='TextLoggerHook'), # dict(type='TensorboardLoggerHook') ]) custom_hooks = [ dict( type='MEGVIIEMAHook', init_updates=10560, priority='NORMAL', ), dict( type='ForgeLoadWorker', priority='VERY_LOW', ), ] # load_from = 'ckpt1s/r50_256x705_depth_pretrain.pth' evaluation = dict(interval=12, pipeline=test_pipeline) fp16 = dict(loss_scale='dynamic') # checkpoint_config = dict(interval=5) # find_unused_parameters=True # Input shape: (256, 704) # Flops: 192.3 GFLOPs # Params: 58.39 M # find_unused_parameters=True ================================================ FILE: configs/_base_/models/3dssd.py ================================================ model = dict( type='SSD3DNet', backbone=dict( type='PointNet2SAMSG', in_channels=4, num_points=(4096, 512, (256, 256)), radii=((0.2, 0.4, 0.8), (0.4, 0.8, 1.6), (1.6, 3.2, 4.8)), num_samples=((32, 32, 64), (32, 32, 64), (32, 32, 32)), sa_channels=(((16, 16, 32), (16, 16, 32), (32, 32, 64)), ((64, 64, 128), (64, 64, 128), (64, 96, 128)), ((128, 128, 256), (128, 192, 256), (128, 256, 256))), aggregation_channels=(64, 128, 256), fps_mods=(('D-FPS'), ('FS'), ('F-FPS', 'D-FPS')), fps_sample_range_lists=((-1), (-1), (512, -1)), norm_cfg=dict(type='BN2d', eps=1e-3, momentum=0.1), sa_cfg=dict( type='PointSAModuleMSG', pool_mod='max', use_xyz=True, normalize_xyz=False)), bbox_head=dict( type='SSD3DHead', in_channels=256, vote_module_cfg=dict( in_channels=256, num_points=256, gt_per_seed=1, conv_channels=(128, ), conv_cfg=dict(type='Conv1d'), norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.1), with_res_feat=False, vote_xyz_range=(3.0, 3.0, 2.0)), vote_aggregation_cfg=dict( type='PointSAModuleMSG', num_point=256, radii=(4.8, 6.4), sample_nums=(16, 32), mlp_channels=((256, 256, 256, 512), (256, 256, 512, 1024)), norm_cfg=dict(type='BN2d', eps=1e-3, momentum=0.1), use_xyz=True, normalize_xyz=False, bias=True), pred_layer_cfg=dict( in_channels=1536, shared_conv_channels=(512, 128), cls_conv_channels=(128, ), reg_conv_channels=(128, ), conv_cfg=dict(type='Conv1d'), norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.1), bias=True), conv_cfg=dict(type='Conv1d'), norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.1), objectness_loss=dict( type='CrossEntropyLoss', use_sigmoid=True, reduction='sum', loss_weight=1.0), center_loss=dict( type='SmoothL1Loss', reduction='sum', loss_weight=1.0), dir_class_loss=dict( type='CrossEntropyLoss', 
reduction='sum', loss_weight=1.0), dir_res_loss=dict( type='SmoothL1Loss', reduction='sum', loss_weight=1.0), size_res_loss=dict( type='SmoothL1Loss', reduction='sum', loss_weight=1.0), corner_loss=dict( type='SmoothL1Loss', reduction='sum', loss_weight=1.0), vote_loss=dict(type='SmoothL1Loss', reduction='sum', loss_weight=1.0)), # model training and testing settings train_cfg=dict( sample_mod='spec', pos_distance_thr=10.0, expand_dims_length=0.05), test_cfg=dict( nms_cfg=dict(type='nms', iou_thr=0.1), sample_mod='spec', score_thr=0.0, per_class_proposal=True, max_output_num=100)) ================================================ FILE: configs/_base_/models/cascade_mask_rcnn_r50_fpn.py ================================================ # model settings model = dict( type='CascadeRCNN', pretrained='torchvision://resnet50', backbone=dict( type='ResNet', depth=50, num_stages=4, out_indices=(0, 1, 2, 3), frozen_stages=1, norm_cfg=dict(type='BN', requires_grad=True), norm_eval=True, style='pytorch'), neck=dict( type='FPN', in_channels=[256, 512, 1024, 2048], out_channels=256, num_outs=5), rpn_head=dict( type='RPNHead', in_channels=256, feat_channels=256, anchor_generator=dict( type='AnchorGenerator', scales=[8], ratios=[0.5, 1.0, 2.0], strides=[4, 8, 16, 32, 64]), bbox_coder=dict( type='DeltaXYWHBBoxCoder', target_means=[.0, .0, .0, .0], target_stds=[1.0, 1.0, 1.0, 1.0]), loss_cls=dict( type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0)), roi_head=dict( type='CascadeRoIHead', num_stages=3, stage_loss_weights=[1, 0.5, 0.25], bbox_roi_extractor=dict( type='SingleRoIExtractor', roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0), out_channels=256, featmap_strides=[4, 8, 16, 32]), bbox_head=[ dict( type='Shared2FCBBoxHead', in_channels=256, fc_out_channels=1024, roi_feat_size=7, num_classes=80, bbox_coder=dict( type='DeltaXYWHBBoxCoder', target_means=[0., 0., 0., 0.], target_stds=[0.1, 0.1, 0.2, 0.2]), reg_class_agnostic=True, loss_cls=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0)), dict( type='Shared2FCBBoxHead', in_channels=256, fc_out_channels=1024, roi_feat_size=7, num_classes=80, bbox_coder=dict( type='DeltaXYWHBBoxCoder', target_means=[0., 0., 0., 0.], target_stds=[0.05, 0.05, 0.1, 0.1]), reg_class_agnostic=True, loss_cls=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0)), dict( type='Shared2FCBBoxHead', in_channels=256, fc_out_channels=1024, roi_feat_size=7, num_classes=80, bbox_coder=dict( type='DeltaXYWHBBoxCoder', target_means=[0., 0., 0., 0.], target_stds=[0.033, 0.033, 0.067, 0.067]), reg_class_agnostic=True, loss_cls=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), loss_bbox=dict(type='SmoothL1Loss', beta=1.0, loss_weight=1.0)) ], mask_roi_extractor=dict( type='SingleRoIExtractor', roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=0), out_channels=256, featmap_strides=[4, 8, 16, 32]), mask_head=dict( type='FCNMaskHead', num_convs=4, in_channels=256, conv_out_channels=256, num_classes=80, loss_mask=dict( type='CrossEntropyLoss', use_mask=True, loss_weight=1.0))), # model training and testing settings train_cfg=dict( rpn=dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.7, neg_iou_thr=0.3, min_pos_iou=0.3, match_low_quality=True, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=256, 
pos_fraction=0.5, neg_pos_ub=-1, add_gt_as_proposals=False), allowed_border=0, pos_weight=-1, debug=False), rpn_proposal=dict( nms_pre=2000, nms_post=2000, max_per_img=2000, nms=dict(type='nms', iou_threshold=0.7), min_bbox_size=0), rcnn=[ dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.5, neg_iou_thr=0.5, min_pos_iou=0.5, match_low_quality=False, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=512, pos_fraction=0.25, neg_pos_ub=-1, add_gt_as_proposals=True), mask_size=28, pos_weight=-1, debug=False), dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.6, neg_iou_thr=0.6, min_pos_iou=0.6, match_low_quality=False, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=512, pos_fraction=0.25, neg_pos_ub=-1, add_gt_as_proposals=True), mask_size=28, pos_weight=-1, debug=False), dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.7, neg_iou_thr=0.7, min_pos_iou=0.7, match_low_quality=False, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=512, pos_fraction=0.25, neg_pos_ub=-1, add_gt_as_proposals=True), mask_size=28, pos_weight=-1, debug=False) ]), test_cfg=dict( rpn=dict( nms_pre=1000, nms_post=1000, max_per_img=1000, nms=dict(type='nms', iou_threshold=0.7), min_bbox_size=0), rcnn=dict( score_thr=0.05, nms=dict(type='nms', iou_threshold=0.5), max_per_img=100, mask_thr_binary=0.5))) ================================================ FILE: configs/_base_/models/centerpoint_01voxel_second_secfpn_nus.py ================================================ voxel_size = [0.1, 0.1, 0.2] model = dict( type='CenterPoint', pts_voxel_layer=dict( max_num_points=10, voxel_size=voxel_size, max_voxels=(90000, 120000)), pts_voxel_encoder=dict(type='HardSimpleVFE', num_features=5), pts_middle_encoder=dict( type='SparseEncoder', in_channels=5, sparse_shape=[41, 1024, 1024], output_channels=128, order=('conv', 'norm', 'act'), encoder_channels=((16, 16, 32), (32, 32, 64), (64, 64, 128), (128, 128)), encoder_paddings=((0, 0, 1), (0, 0, 1), (0, 0, [0, 1, 1]), (0, 0)), block_type='basicblock'), pts_backbone=dict( type='SECOND', in_channels=256, out_channels=[128, 256], layer_nums=[5, 5], layer_strides=[1, 2], norm_cfg=dict(type='BN', eps=1e-3, momentum=0.01), conv_cfg=dict(type='Conv2d', bias=False)), pts_neck=dict( type='SECONDFPN', in_channels=[128, 256], out_channels=[256, 256], upsample_strides=[1, 2], norm_cfg=dict(type='BN', eps=1e-3, momentum=0.01), upsample_cfg=dict(type='deconv', bias=False), use_conv_for_no_stride=True), pts_bbox_head=dict( type='CenterHead', in_channels=sum([256, 256]), tasks=[ dict(num_class=1, class_names=['car']), dict(num_class=2, class_names=['truck', 'construction_vehicle']), dict(num_class=2, class_names=['bus', 'trailer']), dict(num_class=1, class_names=['barrier']), dict(num_class=2, class_names=['motorcycle', 'bicycle']), dict(num_class=2, class_names=['pedestrian', 'traffic_cone']), ], common_heads=dict( reg=(2, 2), height=(1, 2), dim=(3, 2), rot=(2, 2), vel=(2, 2)), share_conv_channel=64, bbox_coder=dict( type='CenterPointBBoxCoder', post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], max_num=500, score_threshold=0.1, out_size_factor=8, voxel_size=voxel_size[:2], code_size=9), separate_head=dict( type='SeparateHead', init_bias=-2.19, final_kernel=3), loss_cls=dict(type='GaussianFocalLoss', reduction='mean'), loss_bbox=dict(type='L1Loss', reduction='mean', loss_weight=0.25), norm_bbox=True), # model training and testing settings train_cfg=dict( pts=dict( grid_size=[1024, 1024, 40], voxel_size=voxel_size, out_size_factor=8, 
dense_reg=1, gaussian_overlap=0.1, max_objs=500, min_radius=2, code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2])), test_cfg=dict( pts=dict( post_center_limit_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], max_per_img=500, max_pool_nms=False, min_radius=[4, 12, 10, 1, 0.85, 0.175], score_threshold=0.1, out_size_factor=8, voxel_size=voxel_size[:2], nms_type='rotate', pre_max_size=1000, post_max_size=83, nms_thr=0.2))) ================================================ FILE: configs/_base_/models/centerpoint_02pillar_second_secfpn_nus.py ================================================ voxel_size = [0.2, 0.2, 8] model = dict( type='CenterPoint', pts_voxel_layer=dict( max_num_points=20, voxel_size=voxel_size, max_voxels=(30000, 40000)), pts_voxel_encoder=dict( type='PillarFeatureNet', in_channels=5, feat_channels=[64], with_distance=False, voxel_size=(0.2, 0.2, 8), norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.01), legacy=False), pts_middle_encoder=dict( type='PointPillarsScatter', in_channels=64, output_shape=(512, 512)), pts_backbone=dict( type='SECOND', in_channels=64, out_channels=[64, 128, 256], layer_nums=[3, 5, 5], layer_strides=[2, 2, 2], norm_cfg=dict(type='BN', eps=1e-3, momentum=0.01), conv_cfg=dict(type='Conv2d', bias=False)), pts_neck=dict( type='SECONDFPN', in_channels=[64, 128, 256], out_channels=[128, 128, 128], upsample_strides=[0.5, 1, 2], norm_cfg=dict(type='BN', eps=1e-3, momentum=0.01), upsample_cfg=dict(type='deconv', bias=False), use_conv_for_no_stride=True), pts_bbox_head=dict( type='CenterHead', in_channels=sum([128, 128, 128]), tasks=[ dict(num_class=1, class_names=['car']), dict(num_class=2, class_names=['truck', 'construction_vehicle']), dict(num_class=2, class_names=['bus', 'trailer']), dict(num_class=1, class_names=['barrier']), dict(num_class=2, class_names=['motorcycle', 'bicycle']), dict(num_class=2, class_names=['pedestrian', 'traffic_cone']), ], common_heads=dict( reg=(2, 2), height=(1, 2), dim=(3, 2), rot=(2, 2), vel=(2, 2)), share_conv_channel=64, bbox_coder=dict( type='CenterPointBBoxCoder', post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], max_num=500, score_threshold=0.1, out_size_factor=4, voxel_size=voxel_size[:2], code_size=9), separate_head=dict( type='SeparateHead', init_bias=-2.19, final_kernel=3), loss_cls=dict(type='GaussianFocalLoss', reduction='mean'), loss_bbox=dict(type='L1Loss', reduction='mean', loss_weight=0.25), norm_bbox=True), # model training and testing settings train_cfg=dict( pts=dict( grid_size=[512, 512, 1], voxel_size=voxel_size, out_size_factor=4, dense_reg=1, gaussian_overlap=0.1, max_objs=500, min_radius=2, code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2])), test_cfg=dict( pts=dict( post_center_limit_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], max_per_img=500, max_pool_nms=False, min_radius=[4, 12, 10, 1, 0.85, 0.175], score_threshold=0.1, pc_range=[-51.2, -51.2], out_size_factor=4, voxel_size=voxel_size[:2], nms_type='rotate', pre_max_size=1000, post_max_size=83, nms_thr=0.2))) ================================================ FILE: configs/_base_/models/dgcnn.py ================================================ # model settings model = dict( type='EncoderDecoder3D', backbone=dict( type='DGCNNBackbone', in_channels=9, # [xyz, rgb, normal_xyz], modified with dataset num_samples=(20, 20, 20), knn_modes=('D-KNN', 'F-KNN', 'F-KNN'), radius=(None, None, None), gf_channels=((64, 64), (64, 64), (64, )), fa_channels=(1024, ), act_cfg=dict(type='LeakyReLU', negative_slope=0.2)), 
decode_head=dict( type='DGCNNHead', fp_channels=(1216, 512), channels=256, dropout_ratio=0.5, conv_cfg=dict(type='Conv1d'), norm_cfg=dict(type='BN1d'), act_cfg=dict(type='LeakyReLU', negative_slope=0.2), loss_decode=dict( type='CrossEntropyLoss', use_sigmoid=False, class_weight=None, # modified with dataset loss_weight=1.0)), # model training and testing settings train_cfg=dict(), test_cfg=dict(mode='slide')) ================================================ FILE: configs/_base_/models/fcaf3d.py ================================================ model = dict( type='MinkSingleStage3DDetector', voxel_size=.01, backbone=dict(type='MinkResNet', in_channels=3, depth=34), head=dict( type='FCAF3DHead', in_channels=(64, 128, 256, 512), out_channels=128, voxel_size=.01, pts_prune_threshold=100000, pts_assign_threshold=27, pts_center_threshold=18, n_classes=18, n_reg_outs=6), train_cfg=dict(), test_cfg=dict(nms_pre=1000, iou_thr=.5, score_thr=.01)) ================================================ FILE: configs/_base_/models/fcos3d.py ================================================ model = dict( type='FCOSMono3D', backbone=dict( type='ResNet', depth=101, num_stages=4, out_indices=(0, 1, 2, 3), frozen_stages=1, norm_cfg=dict(type='BN', requires_grad=False), norm_eval=True, style='caffe', init_cfg=dict( type='Pretrained', checkpoint='open-mmlab://detectron2/resnet101_caffe')), neck=dict( type='FPN', in_channels=[256, 512, 1024, 2048], out_channels=256, start_level=1, add_extra_convs='on_output', num_outs=5, relu_before_extra_convs=True), bbox_head=dict( type='FCOSMono3DHead', num_classes=10, in_channels=256, stacked_convs=2, feat_channels=256, use_direction_classifier=True, diff_rad_by_sin=True, pred_attrs=True, pred_velo=True, dir_offset=0.7854, # pi/4 dir_limit_offset=0, strides=[8, 16, 32, 64, 128], group_reg_dims=(2, 1, 3, 1, 2), # offset, depth, size, rot, velo cls_branch=(256, ), reg_branch=( (256, ), # offset (256, ), # depth (256, ), # size (256, ), # rot () # velo ), dir_branch=(256, ), attr_branch=(256, ), loss_cls=dict( type='FocalLoss', use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=1.0), loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0), loss_dir=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), loss_attr=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), loss_centerness=dict( type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), bbox_coder=dict(type='FCOS3DBBoxCoder', code_size=9), norm_on_bbox=True, centerness_on_reg=True, center_sampling=True, conv_bias=True, dcn_on_last_conv=True), train_cfg=dict( allowed_border=0, code_weight=[1.0, 1.0, 0.2, 1.0, 1.0, 1.0, 1.0, 0.05, 0.05], pos_weight=-1, debug=False), test_cfg=dict( use_rotate_nms=True, nms_across_levels=False, nms_pre=1000, nms_thr=0.8, score_thr=0.05, min_bbox_size=0, max_per_img=200)) ================================================ FILE: configs/_base_/models/groupfree3d.py ================================================ model = dict( type='GroupFree3DNet', backbone=dict( type='PointNet2SASSG', in_channels=3, num_points=(2048, 1024, 512, 256), radius=(0.2, 0.4, 0.8, 1.2), num_samples=(64, 32, 16, 16), sa_channels=((64, 64, 128), (128, 128, 256), (128, 128, 256), (128, 128, 256)), fp_channels=((256, 256), (256, 288)), norm_cfg=dict(type='BN2d'), sa_cfg=dict( type='PointSAModule', pool_mod='max', use_xyz=True, normalize_xyz=True)), bbox_head=dict( type='GroupFree3DHead', in_channels=288, num_decoder_layers=6, num_proposal=256, transformerlayers=dict( 
type='BaseTransformerLayer', attn_cfgs=dict( type='GroupFree3DMHA', embed_dims=288, num_heads=8, attn_drop=0.1, dropout_layer=dict(type='Dropout', drop_prob=0.1)), ffn_cfgs=dict( embed_dims=288, feedforward_channels=2048, ffn_drop=0.1, act_cfg=dict(type='ReLU', inplace=True)), operation_order=('self_attn', 'norm', 'cross_attn', 'norm', 'ffn', 'norm')), pred_layer_cfg=dict( in_channels=288, shared_conv_channels=(288, 288), bias=True), sampling_objectness_loss=dict( type='FocalLoss', use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=8.0), objectness_loss=dict( type='FocalLoss', use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=1.0), center_loss=dict( type='SmoothL1Loss', reduction='sum', loss_weight=10.0), dir_class_loss=dict( type='CrossEntropyLoss', reduction='sum', loss_weight=1.0), dir_res_loss=dict( type='SmoothL1Loss', reduction='sum', loss_weight=10.0), size_class_loss=dict( type='CrossEntropyLoss', reduction='sum', loss_weight=1.0), size_res_loss=dict( type='SmoothL1Loss', beta=1.0, reduction='sum', loss_weight=10.0), semantic_loss=dict( type='CrossEntropyLoss', reduction='sum', loss_weight=1.0)), # model training and testing settings train_cfg=dict(sample_mod='kps'), test_cfg=dict( sample_mod='kps', nms_thr=0.25, score_thr=0.0, per_class_proposal=True, prediction_stages='last')) ================================================ FILE: configs/_base_/models/h3dnet.py ================================================ primitive_z_cfg = dict( type='PrimitiveHead', num_dims=2, num_classes=18, primitive_mode='z', upper_thresh=100.0, surface_thresh=0.5, vote_module_cfg=dict( in_channels=256, vote_per_seed=1, gt_per_seed=1, conv_channels=(256, 256), conv_cfg=dict(type='Conv1d'), norm_cfg=dict(type='BN1d'), norm_feats=True, vote_loss=dict( type='ChamferDistance', mode='l1', reduction='none', loss_dst_weight=10.0)), vote_aggregation_cfg=dict( type='PointSAModule', num_point=1024, radius=0.3, num_sample=16, mlp_channels=[256, 128, 128, 128], use_xyz=True, normalize_xyz=True), feat_channels=(128, 128), conv_cfg=dict(type='Conv1d'), norm_cfg=dict(type='BN1d'), objectness_loss=dict( type='CrossEntropyLoss', class_weight=[0.4, 0.6], reduction='mean', loss_weight=30.0), center_loss=dict( type='ChamferDistance', mode='l1', reduction='sum', loss_src_weight=0.5, loss_dst_weight=0.5), semantic_reg_loss=dict( type='ChamferDistance', mode='l1', reduction='sum', loss_src_weight=0.5, loss_dst_weight=0.5), semantic_cls_loss=dict( type='CrossEntropyLoss', reduction='sum', loss_weight=1.0), train_cfg=dict( dist_thresh=0.2, var_thresh=1e-2, lower_thresh=1e-6, num_point=100, num_point_line=10, line_thresh=0.2)) primitive_xy_cfg = dict( type='PrimitiveHead', num_dims=1, num_classes=18, primitive_mode='xy', upper_thresh=100.0, surface_thresh=0.5, vote_module_cfg=dict( in_channels=256, vote_per_seed=1, gt_per_seed=1, conv_channels=(256, 256), conv_cfg=dict(type='Conv1d'), norm_cfg=dict(type='BN1d'), norm_feats=True, vote_loss=dict( type='ChamferDistance', mode='l1', reduction='none', loss_dst_weight=10.0)), vote_aggregation_cfg=dict( type='PointSAModule', num_point=1024, radius=0.3, num_sample=16, mlp_channels=[256, 128, 128, 128], use_xyz=True, normalize_xyz=True), feat_channels=(128, 128), conv_cfg=dict(type='Conv1d'), norm_cfg=dict(type='BN1d'), objectness_loss=dict( type='CrossEntropyLoss', class_weight=[0.4, 0.6], reduction='mean', loss_weight=30.0), center_loss=dict( type='ChamferDistance', mode='l1', reduction='sum', loss_src_weight=0.5, loss_dst_weight=0.5), semantic_reg_loss=dict( 
type='ChamferDistance', mode='l1', reduction='sum', loss_src_weight=0.5, loss_dst_weight=0.5), semantic_cls_loss=dict( type='CrossEntropyLoss', reduction='sum', loss_weight=1.0), train_cfg=dict( dist_thresh=0.2, var_thresh=1e-2, lower_thresh=1e-6, num_point=100, num_point_line=10, line_thresh=0.2)) primitive_line_cfg = dict( type='PrimitiveHead', num_dims=0, num_classes=18, primitive_mode='line', upper_thresh=100.0, surface_thresh=0.5, vote_module_cfg=dict( in_channels=256, vote_per_seed=1, gt_per_seed=1, conv_channels=(256, 256), conv_cfg=dict(type='Conv1d'), norm_cfg=dict(type='BN1d'), norm_feats=True, vote_loss=dict( type='ChamferDistance', mode='l1', reduction='none', loss_dst_weight=10.0)), vote_aggregation_cfg=dict( type='PointSAModule', num_point=1024, radius=0.3, num_sample=16, mlp_channels=[256, 128, 128, 128], use_xyz=True, normalize_xyz=True), feat_channels=(128, 128), conv_cfg=dict(type='Conv1d'), norm_cfg=dict(type='BN1d'), objectness_loss=dict( type='CrossEntropyLoss', class_weight=[0.4, 0.6], reduction='mean', loss_weight=30.0), center_loss=dict( type='ChamferDistance', mode='l1', reduction='sum', loss_src_weight=1.0, loss_dst_weight=1.0), semantic_reg_loss=dict( type='ChamferDistance', mode='l1', reduction='sum', loss_src_weight=1.0, loss_dst_weight=1.0), semantic_cls_loss=dict( type='CrossEntropyLoss', reduction='sum', loss_weight=2.0), train_cfg=dict( dist_thresh=0.2, var_thresh=1e-2, lower_thresh=1e-6, num_point=100, num_point_line=10, line_thresh=0.2)) model = dict( type='H3DNet', backbone=dict( type='MultiBackbone', num_streams=4, suffixes=['net0', 'net1', 'net2', 'net3'], conv_cfg=dict(type='Conv1d'), norm_cfg=dict(type='BN1d', eps=1e-5, momentum=0.01), act_cfg=dict(type='ReLU'), backbones=dict( type='PointNet2SASSG', in_channels=4, num_points=(2048, 1024, 512, 256), radius=(0.2, 0.4, 0.8, 1.2), num_samples=(64, 32, 16, 16), sa_channels=((64, 64, 128), (128, 128, 256), (128, 128, 256), (128, 128, 256)), fp_channels=((256, 256), (256, 256)), norm_cfg=dict(type='BN2d'), sa_cfg=dict( type='PointSAModule', pool_mod='max', use_xyz=True, normalize_xyz=True))), rpn_head=dict( type='VoteHead', vote_module_cfg=dict( in_channels=256, vote_per_seed=1, gt_per_seed=3, conv_channels=(256, 256), conv_cfg=dict(type='Conv1d'), norm_cfg=dict(type='BN1d'), norm_feats=True, vote_loss=dict( type='ChamferDistance', mode='l1', reduction='none', loss_dst_weight=10.0)), vote_aggregation_cfg=dict( type='PointSAModule', num_point=256, radius=0.3, num_sample=16, mlp_channels=[256, 128, 128, 128], use_xyz=True, normalize_xyz=True), pred_layer_cfg=dict( in_channels=128, shared_conv_channels=(128, 128), bias=True), conv_cfg=dict(type='Conv1d'), norm_cfg=dict(type='BN1d'), objectness_loss=dict( type='CrossEntropyLoss', class_weight=[0.2, 0.8], reduction='sum', loss_weight=5.0), center_loss=dict( type='ChamferDistance', mode='l2', reduction='sum', loss_src_weight=10.0, loss_dst_weight=10.0), dir_class_loss=dict( type='CrossEntropyLoss', reduction='sum', loss_weight=1.0), dir_res_loss=dict( type='SmoothL1Loss', reduction='sum', loss_weight=10.0), size_class_loss=dict( type='CrossEntropyLoss', reduction='sum', loss_weight=1.0), size_res_loss=dict( type='SmoothL1Loss', reduction='sum', loss_weight=10.0), semantic_loss=dict( type='CrossEntropyLoss', reduction='sum', loss_weight=1.0)), roi_head=dict( type='H3DRoIHead', primitive_list=[primitive_z_cfg, primitive_xy_cfg, primitive_line_cfg], bbox_head=dict( type='H3DBboxHead', gt_per_seed=3, num_proposal=256, suface_matching_cfg=dict( type='PointSAModule', 
num_point=256 * 6, radius=0.5, num_sample=32, mlp_channels=[128 + 6, 128, 64, 32], use_xyz=True, normalize_xyz=True), line_matching_cfg=dict( type='PointSAModule', num_point=256 * 12, radius=0.5, num_sample=32, mlp_channels=[128 + 12, 128, 64, 32], use_xyz=True, normalize_xyz=True), feat_channels=(128, 128), primitive_refine_channels=[128, 128, 128], upper_thresh=100.0, surface_thresh=0.5, line_thresh=0.5, conv_cfg=dict(type='Conv1d'), norm_cfg=dict(type='BN1d'), objectness_loss=dict( type='CrossEntropyLoss', class_weight=[0.2, 0.8], reduction='sum', loss_weight=5.0), center_loss=dict( type='ChamferDistance', mode='l2', reduction='sum', loss_src_weight=10.0, loss_dst_weight=10.0), dir_class_loss=dict( type='CrossEntropyLoss', reduction='sum', loss_weight=0.1), dir_res_loss=dict( type='SmoothL1Loss', reduction='sum', loss_weight=10.0), size_class_loss=dict( type='CrossEntropyLoss', reduction='sum', loss_weight=0.1), size_res_loss=dict( type='SmoothL1Loss', reduction='sum', loss_weight=10.0), semantic_loss=dict( type='CrossEntropyLoss', reduction='sum', loss_weight=0.1), cues_objectness_loss=dict( type='CrossEntropyLoss', class_weight=[0.3, 0.7], reduction='mean', loss_weight=5.0), cues_semantic_loss=dict( type='CrossEntropyLoss', class_weight=[0.3, 0.7], reduction='mean', loss_weight=5.0), proposal_objectness_loss=dict( type='CrossEntropyLoss', class_weight=[0.2, 0.8], reduction='none', loss_weight=5.0), primitive_center_loss=dict( type='MSELoss', reduction='none', loss_weight=1.0))), # model training and testing settings train_cfg=dict( rpn=dict( pos_distance_thr=0.3, neg_distance_thr=0.6, sample_mod='vote'), rpn_proposal=dict(use_nms=False), rcnn=dict( pos_distance_thr=0.3, neg_distance_thr=0.6, sample_mod='vote', far_threshold=0.6, near_threshold=0.3, mask_surface_threshold=0.3, label_surface_threshold=0.3, mask_line_threshold=0.3, label_line_threshold=0.3)), test_cfg=dict( rpn=dict( sample_mod='seed', nms_thr=0.25, score_thr=0.05, per_class_proposal=True, use_nms=False), rcnn=dict( sample_mod='seed', nms_thr=0.25, score_thr=0.05, per_class_proposal=True))) ================================================ FILE: configs/_base_/models/hv_pointpillars_fpn_lyft.py ================================================ _base_ = './hv_pointpillars_fpn_nus.py' # model settings (based on nuScenes model settings) # Voxel size for voxel encoder # Usually voxel size is changed consistently with the point cloud range # If point cloud range is modified, do remember to change all related # keys in the config. model = dict( pts_voxel_layer=dict( max_num_points=20, point_cloud_range=[-80, -80, -5, 80, 80, 3], max_voxels=(60000, 60000)), pts_voxel_encoder=dict( feat_channels=[64], point_cloud_range=[-80, -80, -5, 80, 80, 3]), pts_middle_encoder=dict(output_shape=[640, 640]), pts_bbox_head=dict( num_classes=9, anchor_generator=dict( ranges=[[-80, -80, -1.8, 80, 80, -1.8]], custom_values=[]), bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder', code_size=7)), # model training settings (based on nuScenes model settings) train_cfg=dict(pts=dict(code_weight=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]))) ================================================ FILE: configs/_base_/models/hv_pointpillars_fpn_nus.py ================================================ # model settings # Voxel size for voxel encoder # Usually voxel size is changed consistently with the point cloud range # If point cloud range is modified, do remember to change all related # keys in the config. 
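A small consistency check illustrating the warning in the comment above: the PointPillarsScatter grid has to equal the x/y extent of point_cloud_range divided by the pillar size, which is why the Lyft variant earlier in this section switches output_shape to [640, 640] when it widens the range to +-80 m, while the nuScenes base below keeps [400, 400] for +-50 m. Sketch only, not library code:

def bev_grid_shape(point_cloud_range, voxel_size):
    x_min, y_min, _, x_max, y_max, _ = point_cloud_range
    # PointPillarsScatter takes output_shape as [ny, nx].
    return [round((y_max - y_min) / voxel_size[1]),
            round((x_max - x_min) / voxel_size[0])]

print(bev_grid_shape([-50, -50, -5, 50, 50, 3], [0.25, 0.25, 8]))   # [400, 400] nuScenes base
print(bev_grid_shape([-80, -80, -5, 80, 80, 3], [0.25, 0.25, 8]))   # [640, 640] Lyft override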
voxel_size = [0.25, 0.25, 8] model = dict( type='MVXFasterRCNN', pts_voxel_layer=dict( max_num_points=64, point_cloud_range=[-50, -50, -5, 50, 50, 3], voxel_size=voxel_size, max_voxels=(30000, 40000)), pts_voxel_encoder=dict( type='HardVFE', in_channels=4, feat_channels=[64, 64], with_distance=False, voxel_size=voxel_size, with_cluster_center=True, with_voxel_center=True, point_cloud_range=[-50, -50, -5, 50, 50, 3], norm_cfg=dict(type='naiveSyncBN1d', eps=1e-3, momentum=0.01)), pts_middle_encoder=dict( type='PointPillarsScatter', in_channels=64, output_shape=[400, 400]), pts_backbone=dict( type='SECOND', in_channels=64, norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01), layer_nums=[3, 5, 5], layer_strides=[2, 2, 2], out_channels=[64, 128, 256]), pts_neck=dict( type='FPN', norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01), act_cfg=dict(type='ReLU'), in_channels=[64, 128, 256], out_channels=256, start_level=0, num_outs=3), pts_bbox_head=dict( type='Anchor3DHead', num_classes=10, in_channels=256, feat_channels=256, use_direction_classifier=True, anchor_generator=dict( type='AlignedAnchor3DRangeGenerator', ranges=[[-50, -50, -1.8, 50, 50, -1.8]], scales=[1, 2, 4], sizes=[ [2.5981, 0.8660, 1.], # 1.5 / sqrt(3) [1.7321, 0.5774, 1.], # 1 / sqrt(3) [1., 1., 1.], [0.4, 0.4, 1], ], custom_values=[0, 0], rotations=[0, 1.57], reshape_out=True), assigner_per_size=False, diff_rad_by_sin=True, dir_offset=-0.7854, # -pi / 4 bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder', code_size=9), loss_cls=dict( type='FocalLoss', use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=1.0), loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0), loss_dir=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.2)), # model training and testing settings train_cfg=dict( pts=dict( assigner=dict( type='MaxIoUAssigner', iou_calculator=dict(type='BboxOverlapsNearest3D'), pos_iou_thr=0.6, neg_iou_thr=0.3, min_pos_iou=0.3, ignore_iof_thr=-1), allowed_border=0, code_weight=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2], pos_weight=-1, debug=False)), test_cfg=dict( pts=dict( use_rotate_nms=True, nms_across_levels=False, nms_pre=1000, nms_thr=0.2, score_thr=0.05, min_bbox_size=0, max_num=500))) ================================================ FILE: configs/_base_/models/hv_pointpillars_fpn_range100_lyft.py ================================================ _base_ = './hv_pointpillars_fpn_nus.py' # model settings (based on nuScenes model settings) # Voxel size for voxel encoder # Usually voxel size is changed consistently with the point cloud range # If point cloud range is modified, do remember to change all related # keys in the config. 
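The _base_ line above pulls in hv_pointpillars_fpn_nus.py, and the model dict that follows only overrides the keys it names; mmcv merges the rest in recursively. A quick way to inspect the merged result, assuming mmcv 1.x is installed and the snippet is run from the repository root:

from mmcv import Config

cfg = Config.fromfile(
    'configs/_base_/models/hv_pointpillars_fpn_range100_lyft.py')
print(cfg.model.pts_middle_encoder.output_shape)   # [800, 800], overridden in this file
print(cfg.model.pts_backbone.type)                 # 'SECOND', inherited from the nus base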
model = dict( pts_voxel_layer=dict( max_num_points=20, point_cloud_range=[-100, -100, -5, 100, 100, 3], max_voxels=(60000, 60000)), pts_voxel_encoder=dict( feat_channels=[64], point_cloud_range=[-100, -100, -5, 100, 100, 3]), pts_middle_encoder=dict(output_shape=[800, 800]), pts_bbox_head=dict( num_classes=9, anchor_generator=dict( ranges=[[-100, -100, -1.8, 100, 100, -1.8]], custom_values=[]), bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder', code_size=7)), # model training settings (based on nuScenes model settings) train_cfg=dict(pts=dict(code_weight=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]))) ================================================ FILE: configs/_base_/models/hv_pointpillars_secfpn_kitti.py ================================================ voxel_size = [0.16, 0.16, 4] model = dict( type='VoxelNet', voxel_layer=dict( max_num_points=32, # max_points_per_voxel point_cloud_range=[0, -39.68, -3, 69.12, 39.68, 1], voxel_size=voxel_size, max_voxels=(16000, 40000) # (training, testing) max_voxels ), voxel_encoder=dict( type='PillarFeatureNet', in_channels=4, feat_channels=[64], with_distance=False, voxel_size=voxel_size, point_cloud_range=[0, -39.68, -3, 69.12, 39.68, 1]), middle_encoder=dict( type='PointPillarsScatter', in_channels=64, output_shape=[496, 432]), backbone=dict( type='SECOND', in_channels=64, layer_nums=[3, 5, 5], layer_strides=[2, 2, 2], out_channels=[64, 128, 256]), neck=dict( type='SECONDFPN', in_channels=[64, 128, 256], upsample_strides=[1, 2, 4], out_channels=[128, 128, 128]), bbox_head=dict( type='Anchor3DHead', num_classes=3, in_channels=384, feat_channels=384, use_direction_classifier=True, assign_per_class=True, anchor_generator=dict( type='AlignedAnchor3DRangeGenerator', ranges=[ [0, -39.68, -0.6, 69.12, 39.68, -0.6], [0, -39.68, -0.6, 69.12, 39.68, -0.6], [0, -39.68, -1.78, 69.12, 39.68, -1.78], ], sizes=[[0.8, 0.6, 1.73], [1.76, 0.6, 1.73], [3.9, 1.6, 1.56]], rotations=[0, 1.57], reshape_out=False), diff_rad_by_sin=True, bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder'), loss_cls=dict( type='FocalLoss', use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=1.0), loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=2.0), loss_dir=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.2)), # model training and testing settings train_cfg=dict( assigner=[ dict( # for Pedestrian type='MaxIoUAssigner', iou_calculator=dict(type='BboxOverlapsNearest3D'), pos_iou_thr=0.5, neg_iou_thr=0.35, min_pos_iou=0.35, ignore_iof_thr=-1), dict( # for Cyclist type='MaxIoUAssigner', iou_calculator=dict(type='BboxOverlapsNearest3D'), pos_iou_thr=0.5, neg_iou_thr=0.35, min_pos_iou=0.35, ignore_iof_thr=-1), dict( # for Car type='MaxIoUAssigner', iou_calculator=dict(type='BboxOverlapsNearest3D'), pos_iou_thr=0.6, neg_iou_thr=0.45, min_pos_iou=0.45, ignore_iof_thr=-1), ], allowed_border=0, pos_weight=-1, debug=False), test_cfg=dict( use_rotate_nms=True, nms_across_levels=False, nms_thr=0.01, score_thr=0.1, min_bbox_size=0, nms_pre=100, max_num=50)) ================================================ FILE: configs/_base_/models/hv_pointpillars_secfpn_waymo.py ================================================ # model settings # Voxel size for voxel encoder # Usually voxel size is changed consistently with the point cloud range # If point cloud range is modified, do remember to change all related # keys in the config. 
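In the KITTI pillars head above (and in the Waymo configs that follow), the anchor ranges, anchor sizes and the train-time assigners are per-class lists that must stay index-aligned; the "# for Pedestrian / Cyclist / Car" comments mark that ordering, and the class names below are an assumption taken from the matching kitti-3d-3class dataset config. A short sketch of the pairing:

class_names = ['Pedestrian', 'Cyclist', 'Car']     # assumed ordering
anchor_z = [-0.6, -0.6, -1.78]                     # z value of each anchor range above
anchor_sizes = [[0.8, 0.6, 1.73], [1.76, 0.6, 1.73], [3.9, 1.6, 1.56]]
pos_iou_thrs = [0.5, 0.5, 0.6]                     # one MaxIoUAssigner entry per class

for name, z, size, thr in zip(class_names, anchor_z, anchor_sizes, pos_iou_thrs):
    print(f'{name}: anchor z={z}, size={size}, pos_iou_thr={thr}')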
voxel_size = [0.32, 0.32, 6] model = dict( type='MVXFasterRCNN', pts_voxel_layer=dict( max_num_points=20, point_cloud_range=[-74.88, -74.88, -2, 74.88, 74.88, 4], voxel_size=voxel_size, max_voxels=(32000, 32000)), pts_voxel_encoder=dict( type='HardVFE', in_channels=5, feat_channels=[64], with_distance=False, voxel_size=voxel_size, with_cluster_center=True, with_voxel_center=True, point_cloud_range=[-74.88, -74.88, -2, 74.88, 74.88, 4], norm_cfg=dict(type='naiveSyncBN1d', eps=1e-3, momentum=0.01)), pts_middle_encoder=dict( type='PointPillarsScatter', in_channels=64, output_shape=[468, 468]), pts_backbone=dict( type='SECOND', in_channels=64, norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01), layer_nums=[3, 5, 5], layer_strides=[1, 2, 2], out_channels=[64, 128, 256]), pts_neck=dict( type='SECONDFPN', norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01), in_channels=[64, 128, 256], upsample_strides=[1, 2, 4], out_channels=[128, 128, 128]), pts_bbox_head=dict( type='Anchor3DHead', num_classes=3, in_channels=384, feat_channels=384, use_direction_classifier=True, anchor_generator=dict( type='AlignedAnchor3DRangeGenerator', ranges=[[-74.88, -74.88, -0.0345, 74.88, 74.88, -0.0345], [-74.88, -74.88, -0.1188, 74.88, 74.88, -0.1188], [-74.88, -74.88, 0, 74.88, 74.88, 0]], sizes=[ [4.73, 2.08, 1.77], # car [1.81, 0.84, 1.77], # cyclist [0.91, 0.84, 1.74] # pedestrian ], rotations=[0, 1.57], reshape_out=False), diff_rad_by_sin=True, dir_offset=-0.7854, # -pi / 4 bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder', code_size=7), loss_cls=dict( type='FocalLoss', use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=1.0), loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0), loss_dir=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.2)), # model training and testing settings train_cfg=dict( pts=dict( assigner=[ dict( # car type='MaxIoUAssigner', iou_calculator=dict(type='BboxOverlapsNearest3D'), pos_iou_thr=0.55, neg_iou_thr=0.4, min_pos_iou=0.4, ignore_iof_thr=-1), dict( # cyclist type='MaxIoUAssigner', iou_calculator=dict(type='BboxOverlapsNearest3D'), pos_iou_thr=0.5, neg_iou_thr=0.3, min_pos_iou=0.3, ignore_iof_thr=-1), dict( # pedestrian type='MaxIoUAssigner', iou_calculator=dict(type='BboxOverlapsNearest3D'), pos_iou_thr=0.5, neg_iou_thr=0.3, min_pos_iou=0.3, ignore_iof_thr=-1), ], allowed_border=0, code_weight=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], pos_weight=-1, debug=False)), test_cfg=dict( pts=dict( use_rotate_nms=True, nms_across_levels=False, nms_pre=4096, nms_thr=0.25, score_thr=0.1, min_bbox_size=0, max_num=500))) ================================================ FILE: configs/_base_/models/hv_second_secfpn_kitti.py ================================================ voxel_size = [0.05, 0.05, 0.1] model = dict( type='VoxelNet', voxel_layer=dict( max_num_points=5, point_cloud_range=[0, -40, -3, 70.4, 40, 1], voxel_size=voxel_size, max_voxels=(16000, 40000)), voxel_encoder=dict(type='HardSimpleVFE'), middle_encoder=dict( type='SparseEncoder', in_channels=4, sparse_shape=[41, 1600, 1408], order=('conv', 'norm', 'act')), backbone=dict( type='SECOND', in_channels=256, layer_nums=[5, 5], layer_strides=[1, 2], out_channels=[128, 256]), neck=dict( type='SECONDFPN', in_channels=[128, 256], upsample_strides=[1, 2], out_channels=[256, 256]), bbox_head=dict( type='Anchor3DHead', num_classes=3, in_channels=512, feat_channels=512, use_direction_classifier=True, anchor_generator=dict( type='Anchor3DRangeGenerator', ranges=[ [0, -40.0, -0.6, 70.4, 40.0, -0.6], [0, -40.0, 
-0.6, 70.4, 40.0, -0.6], [0, -40.0, -1.78, 70.4, 40.0, -1.78], ], sizes=[[0.8, 0.6, 1.73], [1.76, 0.6, 1.73], [3.9, 1.6, 1.56]], rotations=[0, 1.57], reshape_out=False), diff_rad_by_sin=True, bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder'), loss_cls=dict( type='FocalLoss', use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=1.0), loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=2.0), loss_dir=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.2)), # model training and testing settings train_cfg=dict( assigner=[ dict( # for Pedestrian type='MaxIoUAssigner', iou_calculator=dict(type='BboxOverlapsNearest3D'), pos_iou_thr=0.35, neg_iou_thr=0.2, min_pos_iou=0.2, ignore_iof_thr=-1), dict( # for Cyclist type='MaxIoUAssigner', iou_calculator=dict(type='BboxOverlapsNearest3D'), pos_iou_thr=0.35, neg_iou_thr=0.2, min_pos_iou=0.2, ignore_iof_thr=-1), dict( # for Car type='MaxIoUAssigner', iou_calculator=dict(type='BboxOverlapsNearest3D'), pos_iou_thr=0.6, neg_iou_thr=0.45, min_pos_iou=0.45, ignore_iof_thr=-1), ], allowed_border=0, pos_weight=-1, debug=False), test_cfg=dict( use_rotate_nms=True, nms_across_levels=False, nms_thr=0.01, score_thr=0.1, min_bbox_size=0, nms_pre=100, max_num=50)) ================================================ FILE: configs/_base_/models/hv_second_secfpn_waymo.py ================================================ # model settings # Voxel size for voxel encoder # Usually voxel size is changed consistently with the point cloud range # If point cloud range is modified, do remember to change all related # keys in the config. voxel_size = [0.08, 0.08, 0.1] model = dict( type='VoxelNet', voxel_layer=dict( max_num_points=10, point_cloud_range=[-76.8, -51.2, -2, 76.8, 51.2, 4], voxel_size=voxel_size, max_voxels=(80000, 90000)), voxel_encoder=dict(type='HardSimpleVFE', num_features=5), middle_encoder=dict( type='SparseEncoder', in_channels=5, sparse_shape=[61, 1280, 1920], order=('conv', 'norm', 'act')), backbone=dict( type='SECOND', in_channels=384, norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01), layer_nums=[5, 5], layer_strides=[1, 2], out_channels=[128, 256]), neck=dict( type='SECONDFPN', norm_cfg=dict(type='naiveSyncBN2d', eps=1e-3, momentum=0.01), in_channels=[128, 256], upsample_strides=[1, 2], out_channels=[256, 256]), bbox_head=dict( type='Anchor3DHead', num_classes=3, in_channels=512, feat_channels=512, use_direction_classifier=True, anchor_generator=dict( type='AlignedAnchor3DRangeGenerator', ranges=[[-76.8, -51.2, -0.0345, 76.8, 51.2, -0.0345], [-76.8, -51.2, 0, 76.8, 51.2, 0], [-76.8, -51.2, -0.1188, 76.8, 51.2, -0.1188]], sizes=[ [4.73, 2.08, 1.77], # car [0.91, 0.84, 1.74], # pedestrian [1.81, 0.84, 1.77] # cyclist ], rotations=[0, 1.57], reshape_out=False), diff_rad_by_sin=True, dir_offset=-0.7854, # -pi / 4 bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder', code_size=7), loss_cls=dict( type='FocalLoss', use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=1.0), loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0), loss_dir=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.2)), # model training and testing settings train_cfg=dict( assigner=[ dict( # car type='MaxIoUAssigner', iou_calculator=dict(type='BboxOverlapsNearest3D'), pos_iou_thr=0.55, neg_iou_thr=0.4, min_pos_iou=0.4, ignore_iof_thr=-1), dict( # pedestrian type='MaxIoUAssigner', iou_calculator=dict(type='BboxOverlapsNearest3D'), pos_iou_thr=0.5, neg_iou_thr=0.3, min_pos_iou=0.3, ignore_iof_thr=-1), dict( # cyclist type='MaxIoUAssigner', 
iou_calculator=dict(type='BboxOverlapsNearest3D'), pos_iou_thr=0.5, neg_iou_thr=0.3, min_pos_iou=0.3, ignore_iof_thr=-1) ], allowed_border=0, code_weight=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], pos_weight=-1, debug=False), test_cfg=dict( use_rotate_nms=True, nms_across_levels=False, nms_pre=4096, nms_thr=0.25, score_thr=0.1, min_bbox_size=0, max_num=500)) ================================================ FILE: configs/_base_/models/imvotenet_image.py ================================================ model = dict( type='ImVoteNet', img_backbone=dict( type='ResNet', depth=50, num_stages=4, out_indices=(0, 1, 2, 3), frozen_stages=1, norm_cfg=dict(type='BN', requires_grad=False), norm_eval=True, style='caffe'), img_neck=dict( type='FPN', in_channels=[256, 512, 1024, 2048], out_channels=256, num_outs=5), img_rpn_head=dict( type='RPNHead', in_channels=256, feat_channels=256, anchor_generator=dict( type='AnchorGenerator', scales=[8], ratios=[0.5, 1.0, 2.0], strides=[4, 8, 16, 32, 64]), bbox_coder=dict( type='DeltaXYWHBBoxCoder', target_means=[.0, .0, .0, .0], target_stds=[1.0, 1.0, 1.0, 1.0]), loss_cls=dict( type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), loss_bbox=dict(type='L1Loss', loss_weight=1.0)), img_roi_head=dict( type='StandardRoIHead', bbox_roi_extractor=dict( type='SingleRoIExtractor', roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0), out_channels=256, featmap_strides=[4, 8, 16, 32]), bbox_head=dict( type='Shared2FCBBoxHead', in_channels=256, fc_out_channels=1024, roi_feat_size=7, num_classes=10, bbox_coder=dict( type='DeltaXYWHBBoxCoder', target_means=[0., 0., 0., 0.], target_stds=[0.1, 0.1, 0.2, 0.2]), reg_class_agnostic=False, loss_cls=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), loss_bbox=dict(type='L1Loss', loss_weight=1.0))), # model training and testing settings train_cfg=dict( img_rpn=dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.7, neg_iou_thr=0.3, min_pos_iou=0.3, match_low_quality=True, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=256, pos_fraction=0.5, neg_pos_ub=-1, add_gt_as_proposals=False), allowed_border=-1, pos_weight=-1, debug=False), img_rpn_proposal=dict( nms_across_levels=False, nms_pre=2000, nms_post=1000, max_per_img=1000, nms=dict(type='nms', iou_threshold=0.7), min_bbox_size=0), img_rcnn=dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.5, neg_iou_thr=0.5, min_pos_iou=0.5, match_low_quality=False, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=512, pos_fraction=0.25, neg_pos_ub=-1, add_gt_as_proposals=True), pos_weight=-1, debug=False)), test_cfg=dict( img_rpn=dict( nms_across_levels=False, nms_pre=1000, nms_post=1000, max_per_img=1000, nms=dict(type='nms', iou_threshold=0.7), min_bbox_size=0), img_rcnn=dict( score_thr=0.05, nms=dict(type='nms', iou_threshold=0.5), max_per_img=100))) ================================================ FILE: configs/_base_/models/mask_rcnn_r50_fpn.py ================================================ # model settings model = dict( type='MaskRCNN', pretrained='torchvision://resnet50', backbone=dict( type='ResNet', depth=50, num_stages=4, out_indices=(0, 1, 2, 3), frozen_stages=1, norm_cfg=dict(type='BN', requires_grad=True), norm_eval=True, style='pytorch'), neck=dict( type='FPN', in_channels=[256, 512, 1024, 2048], out_channels=256, num_outs=5), rpn_head=dict( type='RPNHead', in_channels=256, feat_channels=256, anchor_generator=dict( type='AnchorGenerator', scales=[8], ratios=[0.5, 1.0, 2.0], strides=[4, 8, 16, 32, 64]), bbox_coder=dict( 
type='DeltaXYWHBBoxCoder', target_means=[.0, .0, .0, .0], target_stds=[1.0, 1.0, 1.0, 1.0]), loss_cls=dict( type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), loss_bbox=dict(type='L1Loss', loss_weight=1.0)), roi_head=dict( type='StandardRoIHead', bbox_roi_extractor=dict( type='SingleRoIExtractor', roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0), out_channels=256, featmap_strides=[4, 8, 16, 32]), bbox_head=dict( type='Shared2FCBBoxHead', in_channels=256, fc_out_channels=1024, roi_feat_size=7, num_classes=80, bbox_coder=dict( type='DeltaXYWHBBoxCoder', target_means=[0., 0., 0., 0.], target_stds=[0.1, 0.1, 0.2, 0.2]), reg_class_agnostic=False, loss_cls=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), loss_bbox=dict(type='L1Loss', loss_weight=1.0)), mask_roi_extractor=dict( type='SingleRoIExtractor', roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=0), out_channels=256, featmap_strides=[4, 8, 16, 32]), mask_head=dict( type='FCNMaskHead', num_convs=4, in_channels=256, conv_out_channels=256, num_classes=80, loss_mask=dict( type='CrossEntropyLoss', use_mask=True, loss_weight=1.0))), # model training and testing settings train_cfg=dict( rpn=dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.7, neg_iou_thr=0.3, min_pos_iou=0.3, match_low_quality=True, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=256, pos_fraction=0.5, neg_pos_ub=-1, add_gt_as_proposals=False), allowed_border=-1, pos_weight=-1, debug=False), rpn_proposal=dict( nms_across_levels=False, nms_pre=2000, nms_post=1000, max_per_img=1000, nms=dict(type='nms', iou_threshold=0.7), min_bbox_size=0), rcnn=dict( assigner=dict( type='MaxIoUAssigner', pos_iou_thr=0.5, neg_iou_thr=0.5, min_pos_iou=0.5, match_low_quality=True, ignore_iof_thr=-1), sampler=dict( type='RandomSampler', num=512, pos_fraction=0.25, neg_pos_ub=-1, add_gt_as_proposals=True), mask_size=28, pos_weight=-1, debug=False)), test_cfg=dict( rpn=dict( nms_across_levels=False, nms_pre=1000, nms_post=1000, max_per_img=1000, nms=dict(type='nms', iou_threshold=0.7), min_bbox_size=0), rcnn=dict( score_thr=0.05, nms=dict(type='nms', iou_threshold=0.5), max_per_img=100, mask_thr_binary=0.5))) ================================================ FILE: configs/_base_/models/paconv_cuda_ssg.py ================================================ _base_ = './paconv_ssg.py' model = dict( backbone=dict( sa_cfg=dict( type='PAConvCUDASAModule', scorenet_cfg=dict(mlp_channels=[8, 16, 16])))) ================================================ FILE: configs/_base_/models/paconv_ssg.py ================================================ # model settings model = dict( type='EncoderDecoder3D', backbone=dict( type='PointNet2SASSG', in_channels=9, # [xyz, rgb, normalized_xyz] num_points=(1024, 256, 64, 16), radius=(None, None, None, None), # use kNN instead of ball query num_samples=(32, 32, 32, 32), sa_channels=((32, 32, 64), (64, 64, 128), (128, 128, 256), (256, 256, 512)), fp_channels=(), norm_cfg=dict(type='BN2d', momentum=0.1), sa_cfg=dict( type='PAConvSAModule', pool_mod='max', use_xyz=True, normalize_xyz=False, paconv_num_kernels=[16, 16, 16], paconv_kernel_input='w_neighbor', scorenet_input='w_neighbor_dist', scorenet_cfg=dict( mlp_channels=[16, 16, 16], score_norm='softmax', temp_factor=1.0, last_bn=False))), decode_head=dict( type='PAConvHead', # PAConv model's decoder takes skip connections from backbone # different from PointNet++, it also concats input features in the last # level of decoder, leading to `128 + 6` as the
channel number fp_channels=((768, 256, 256), (384, 256, 256), (320, 256, 128), (128 + 6, 128, 128, 128)), channels=128, dropout_ratio=0.5, conv_cfg=dict(type='Conv1d'), norm_cfg=dict(type='BN1d'), act_cfg=dict(type='ReLU'), loss_decode=dict( type='CrossEntropyLoss', use_sigmoid=False, class_weight=None, # should be modified with dataset loss_weight=1.0)), # correlation loss to regularize PAConv's kernel weights loss_regularization=dict( type='PAConvRegularizationLoss', reduction='sum', loss_weight=10.0), # model training and testing settings train_cfg=dict(), test_cfg=dict(mode='slide')) ================================================ FILE: configs/_base_/models/parta2.py ================================================ # model settings voxel_size = [0.05, 0.05, 0.1] point_cloud_range = [0, -40, -3, 70.4, 40, 1] model = dict( type='PartA2', voxel_layer=dict( max_num_points=5, # max_points_per_voxel point_cloud_range=point_cloud_range, voxel_size=voxel_size, max_voxels=(16000, 40000) # (training, testing) max_voxels ), voxel_encoder=dict(type='HardSimpleVFE'), middle_encoder=dict( type='SparseUNet', in_channels=4, sparse_shape=[41, 1600, 1408], order=('conv', 'norm', 'act')), backbone=dict( type='SECOND', in_channels=256, layer_nums=[5, 5], layer_strides=[1, 2], out_channels=[128, 256]), neck=dict( type='SECONDFPN', in_channels=[128, 256], upsample_strides=[1, 2], out_channels=[256, 256]), rpn_head=dict( type='PartA2RPNHead', num_classes=3, in_channels=512, feat_channels=512, use_direction_classifier=True, anchor_generator=dict( type='Anchor3DRangeGenerator', ranges=[[0, -40.0, -0.6, 70.4, 40.0, -0.6], [0, -40.0, -0.6, 70.4, 40.0, -0.6], [0, -40.0, -1.78, 70.4, 40.0, -1.78]], sizes=[[0.8, 0.6, 1.73], [1.76, 0.6, 1.73], [3.9, 1.6, 1.56]], rotations=[0, 1.57], reshape_out=False), diff_rad_by_sin=True, assigner_per_size=True, assign_per_class=True, bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder'), loss_cls=dict( type='FocalLoss', use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=1.0), loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=2.0), loss_dir=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.2)), roi_head=dict( type='PartAggregationROIHead', num_classes=3, semantic_head=dict( type='PointwiseSemanticHead', in_channels=16, extra_width=0.2, seg_score_thr=0.3, num_classes=3, loss_seg=dict( type='FocalLoss', use_sigmoid=True, reduction='sum', gamma=2.0, alpha=0.25, loss_weight=1.0), loss_part=dict( type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0)), seg_roi_extractor=dict( type='Single3DRoIAwareExtractor', roi_layer=dict( type='RoIAwarePool3d', out_size=14, max_pts_per_voxel=128, mode='max')), part_roi_extractor=dict( type='Single3DRoIAwareExtractor', roi_layer=dict( type='RoIAwarePool3d', out_size=14, max_pts_per_voxel=128, mode='avg')), bbox_head=dict( type='PartA2BboxHead', num_classes=3, seg_in_channels=16, part_in_channels=4, seg_conv_channels=[64, 64], part_conv_channels=[64, 64], merge_conv_channels=[128, 128], down_conv_channels=[128, 256], bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder'), shared_fc_channels=[256, 512, 512, 512], cls_channels=[256, 256], reg_channels=[256, 256], dropout_ratio=0.1, roi_feat_size=14, with_corner_loss=True, loss_bbox=dict( type='SmoothL1Loss', beta=1.0 / 9.0, reduction='sum', loss_weight=1.0), loss_cls=dict( type='CrossEntropyLoss', use_sigmoid=True, reduction='sum', loss_weight=1.0))), # model training and testing settings train_cfg=dict( rpn=dict( assigner=[ dict( # for Pedestrian type='MaxIoUAssigner', 
iou_calculator=dict(type='BboxOverlapsNearest3D'), pos_iou_thr=0.5, neg_iou_thr=0.35, min_pos_iou=0.35, ignore_iof_thr=-1), dict( # for Cyclist type='MaxIoUAssigner', iou_calculator=dict(type='BboxOverlapsNearest3D'), pos_iou_thr=0.5, neg_iou_thr=0.35, min_pos_iou=0.35, ignore_iof_thr=-1), dict( # for Car type='MaxIoUAssigner', iou_calculator=dict(type='BboxOverlapsNearest3D'), pos_iou_thr=0.6, neg_iou_thr=0.45, min_pos_iou=0.45, ignore_iof_thr=-1) ], allowed_border=0, pos_weight=-1, debug=False), rpn_proposal=dict( nms_pre=9000, nms_post=512, max_num=512, nms_thr=0.8, score_thr=0, use_rotate_nms=False), rcnn=dict( assigner=[ dict( # for Pedestrian type='MaxIoUAssigner', iou_calculator=dict( type='BboxOverlaps3D', coordinate='lidar'), pos_iou_thr=0.55, neg_iou_thr=0.55, min_pos_iou=0.55, ignore_iof_thr=-1), dict( # for Cyclist type='MaxIoUAssigner', iou_calculator=dict( type='BboxOverlaps3D', coordinate='lidar'), pos_iou_thr=0.55, neg_iou_thr=0.55, min_pos_iou=0.55, ignore_iof_thr=-1), dict( # for Car type='MaxIoUAssigner', iou_calculator=dict( type='BboxOverlaps3D', coordinate='lidar'), pos_iou_thr=0.55, neg_iou_thr=0.55, min_pos_iou=0.55, ignore_iof_thr=-1) ], sampler=dict( type='IoUNegPiecewiseSampler', num=128, pos_fraction=0.55, neg_piece_fractions=[0.8, 0.2], neg_iou_piece_thrs=[0.55, 0.1], neg_pos_ub=-1, add_gt_as_proposals=False, return_iou=True), cls_pos_thr=0.75, cls_neg_thr=0.25)), test_cfg=dict( rpn=dict( nms_pre=1024, nms_post=100, max_num=100, nms_thr=0.7, score_thr=0, use_rotate_nms=True), rcnn=dict( use_rotate_nms=True, use_raw_score=True, nms_thr=0.01, score_thr=0.1))) ================================================ FILE: configs/_base_/models/pgd.py ================================================ _base_ = './fcos3d.py' # model settings model = dict( bbox_head=dict( _delete_=True, type='PGDHead', num_classes=10, in_channels=256, stacked_convs=2, feat_channels=256, use_direction_classifier=True, diff_rad_by_sin=True, pred_attrs=True, pred_velo=True, pred_bbox2d=True, pred_keypoints=False, dir_offset=0.7854, # pi/4 strides=[8, 16, 32, 64, 128], group_reg_dims=(2, 1, 3, 1, 2), # offset, depth, size, rot, velo cls_branch=(256, ), reg_branch=( (256, ), # offset (256, ), # depth (256, ), # size (256, ), # rot () # velo ), dir_branch=(256, ), attr_branch=(256, ), loss_cls=dict( type='FocalLoss', use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=1.0), loss_bbox=dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0), loss_dir=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), loss_attr=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), loss_centerness=dict( type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), norm_on_bbox=True, centerness_on_reg=True, center_sampling=True, conv_bias=True, dcn_on_last_conv=True, use_depth_classifier=True, depth_branch=(256, ), depth_range=(0, 50), depth_unit=10, division='uniform', depth_bins=6, bbox_coder=dict(type='PGDBBoxCoder', code_size=9)), test_cfg=dict(nms_pre=1000, nms_thr=0.8, score_thr=0.01, max_per_img=200)) ================================================ FILE: configs/_base_/models/point_rcnn.py ================================================ model = dict( type='PointRCNN', backbone=dict( type='PointNet2SAMSG', in_channels=4, num_points=(4096, 1024, 256, 64), radii=((0.1, 0.5), (0.5, 1.0), (1.0, 2.0), (2.0, 4.0)), num_samples=((16, 32), (16, 32), (16, 32), (16, 32)), sa_channels=(((16, 16, 32), (32, 32, 64)), ((64, 64, 128), (64, 96, 128)), ((128, 196, 256), (128, 196, 256)), ((256, 
256, 512), (256, 384, 512))), fps_mods=(('D-FPS'), ('D-FPS'), ('D-FPS'), ('D-FPS')), fps_sample_range_lists=((-1), (-1), (-1), (-1)), aggregation_channels=(None, None, None, None), dilated_group=(False, False, False, False), out_indices=(0, 1, 2, 3), norm_cfg=dict(type='BN2d', eps=1e-3, momentum=0.1), sa_cfg=dict( type='PointSAModuleMSG', pool_mod='max', use_xyz=True, normalize_xyz=False)), neck=dict( type='PointNetFPNeck', fp_channels=((1536, 512, 512), (768, 512, 512), (608, 256, 256), (257, 128, 128))), rpn_head=dict( type='PointRPNHead', num_classes=3, enlarge_width=0.1, pred_layer_cfg=dict( in_channels=128, cls_linear_channels=(256, 256), reg_linear_channels=(256, 256)), cls_loss=dict( type='FocalLoss', use_sigmoid=True, reduction='sum', gamma=2.0, alpha=0.25, loss_weight=1.0), bbox_loss=dict( type='SmoothL1Loss', beta=1.0 / 9.0, reduction='sum', loss_weight=1.0), bbox_coder=dict( type='PointXYZWHLRBBoxCoder', code_size=8, # code_size: (center residual (3), size regression (3), # torch.cos(yaw) (1), torch.sin(yaw) (1) use_mean_size=True, mean_size=[[3.9, 1.6, 1.56], [0.8, 0.6, 1.73], [1.76, 0.6, 1.73]])), roi_head=dict( type='PointRCNNRoIHead', point_roi_extractor=dict( type='Single3DRoIPointExtractor', roi_layer=dict(type='RoIPointPool3d', num_sampled_points=512)), bbox_head=dict( type='PointRCNNBboxHead', num_classes=1, pred_layer_cfg=dict( in_channels=512, cls_conv_channels=(256, 256), reg_conv_channels=(256, 256), bias=True), in_channels=5, # 5 = 3 (xyz) + scores + depth mlp_channels=[128, 128], num_points=(128, 32, -1), radius=(0.2, 0.4, 100), num_samples=(16, 16, 16), sa_channels=((128, 128, 128), (128, 128, 256), (256, 256, 512)), with_corner_loss=True), depth_normalizer=70.0), # model training and testing settings train_cfg=dict( pos_distance_thr=10.0, rpn=dict( nms_cfg=dict( use_rotate_nms=True, iou_thr=0.8, nms_pre=9000, nms_post=512), score_thr=None), rcnn=dict( assigner=[ dict( # for Car type='MaxIoUAssigner', iou_calculator=dict( type='BboxOverlaps3D', coordinate='lidar'), pos_iou_thr=0.55, neg_iou_thr=0.55, min_pos_iou=0.55, ignore_iof_thr=-1, match_low_quality=False), dict( # for Pedestrian type='MaxIoUAssigner', iou_calculator=dict( type='BboxOverlaps3D', coordinate='lidar'), pos_iou_thr=0.55, neg_iou_thr=0.55, min_pos_iou=0.55, ignore_iof_thr=-1, match_low_quality=False), dict( # for Cyclist type='MaxIoUAssigner', iou_calculator=dict( type='BboxOverlaps3D', coordinate='lidar'), pos_iou_thr=0.55, neg_iou_thr=0.55, min_pos_iou=0.55, ignore_iof_thr=-1, match_low_quality=False) ], sampler=dict( type='IoUNegPiecewiseSampler', num=128, pos_fraction=0.5, neg_piece_fractions=[0.8, 0.2], neg_iou_piece_thrs=[0.55, 0.1], neg_pos_ub=-1, add_gt_as_proposals=False, return_iou=True), cls_pos_thr=0.7, cls_neg_thr=0.25)), test_cfg=dict( rpn=dict( nms_cfg=dict( use_rotate_nms=True, iou_thr=0.85, nms_pre=9000, nms_post=512), score_thr=None), rcnn=dict(use_rotate_nms=True, nms_thr=0.1, score_thr=0.1))) ================================================ FILE: configs/_base_/models/pointnet2_msg.py ================================================ _base_ = './pointnet2_ssg.py' # model settings model = dict( backbone=dict( _delete_=True, type='PointNet2SAMSG', in_channels=6, # [xyz, rgb], should be modified with dataset num_points=(1024, 256, 64, 16), radii=((0.05, 0.1), (0.1, 0.2), (0.2, 0.4), (0.4, 0.8)), num_samples=((16, 32), (16, 32), (16, 32), (16, 32)), sa_channels=(((16, 16, 32), (32, 32, 64)), ((64, 64, 128), (64, 96, 128)), ((128, 196, 256), (128, 196, 256)), ((256, 256, 512), (256, 
384, 512))), aggregation_channels=(None, None, None, None), fps_mods=(('D-FPS'), ('D-FPS'), ('D-FPS'), ('D-FPS')), fps_sample_range_lists=((-1), (-1), (-1), (-1)), dilated_group=(False, False, False, False), out_indices=(0, 1, 2, 3), sa_cfg=dict( type='PointSAModuleMSG', pool_mod='max', use_xyz=True, normalize_xyz=False)), decode_head=dict( fp_channels=((1536, 256, 256), (512, 256, 256), (352, 256, 128), (128, 128, 128, 128)))) ================================================ FILE: configs/_base_/models/pointnet2_ssg.py ================================================ # model settings model = dict( type='EncoderDecoder3D', backbone=dict( type='PointNet2SASSG', in_channels=6, # [xyz, rgb], should be modified with dataset num_points=(1024, 256, 64, 16), radius=(0.1, 0.2, 0.4, 0.8), num_samples=(32, 32, 32, 32), sa_channels=((32, 32, 64), (64, 64, 128), (128, 128, 256), (256, 256, 512)), fp_channels=(), norm_cfg=dict(type='BN2d'), sa_cfg=dict( type='PointSAModule', pool_mod='max', use_xyz=True, normalize_xyz=False)), decode_head=dict( type='PointNet2Head', fp_channels=((768, 256, 256), (384, 256, 256), (320, 256, 128), (128, 128, 128, 128)), channels=128, dropout_ratio=0.5, conv_cfg=dict(type='Conv1d'), norm_cfg=dict(type='BN1d'), act_cfg=dict(type='ReLU'), loss_decode=dict( type='CrossEntropyLoss', use_sigmoid=False, class_weight=None, # should be modified with dataset loss_weight=1.0)), # model training and testing settings train_cfg=dict(), test_cfg=dict(mode='slide')) ================================================ FILE: configs/_base_/models/smoke.py ================================================ model = dict( type='SMOKEMono3D', backbone=dict( type='DLANet', depth=34, in_channels=3, norm_cfg=dict(type='GN', num_groups=32), init_cfg=dict( type='Pretrained', checkpoint='http://dl.yf.io/dla/models/imagenet/dla34-ba72cf86.pth' )), neck=dict( type='DLANeck', in_channels=[16, 32, 64, 128, 256, 512], start_level=2, end_level=5, norm_cfg=dict(type='GN', num_groups=32)), bbox_head=dict( type='SMOKEMono3DHead', num_classes=3, in_channels=64, dim_channel=[3, 4, 5], ori_channel=[6, 7], stacked_convs=0, feat_channels=64, use_direction_classifier=False, diff_rad_by_sin=False, pred_attrs=False, pred_velo=False, dir_offset=0, strides=None, group_reg_dims=(8, ), cls_branch=(256, ), reg_branch=((256, ), ), num_attrs=0, bbox_code_size=7, dir_branch=(), attr_branch=(), bbox_coder=dict( type='SMOKECoder', base_depth=(28.01, 16.32), base_dims=((0.88, 1.73, 0.67), (1.78, 1.70, 0.58), (3.88, 1.63, 1.53)), code_size=7), loss_cls=dict(type='GaussianFocalLoss', loss_weight=1.0), loss_bbox=dict(type='L1Loss', reduction='sum', loss_weight=1 / 300), loss_dir=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), loss_attr=None, conv_bias=True, dcn_on_last_conv=False), train_cfg=None, test_cfg=dict(topK=100, local_maximum_kernel=3, max_per_img=100)) ================================================ FILE: configs/_base_/models/votenet.py ================================================ model = dict( type='VoteNet', backbone=dict( type='PointNet2SASSG', in_channels=4, num_points=(2048, 1024, 512, 256), radius=(0.2, 0.4, 0.8, 1.2), num_samples=(64, 32, 16, 16), sa_channels=((64, 64, 128), (128, 128, 256), (128, 128, 256), (128, 128, 256)), fp_channels=((256, 256), (256, 256)), norm_cfg=dict(type='BN2d'), sa_cfg=dict( type='PointSAModule', pool_mod='max', use_xyz=True, normalize_xyz=True)), bbox_head=dict( type='VoteHead', vote_module_cfg=dict( in_channels=256, vote_per_seed=1, gt_per_seed=3, 
conv_channels=(256, 256), conv_cfg=dict(type='Conv1d'), norm_cfg=dict(type='BN1d'), norm_feats=True, vote_loss=dict( type='ChamferDistance', mode='l1', reduction='none', loss_dst_weight=10.0)), vote_aggregation_cfg=dict( type='PointSAModule', num_point=256, radius=0.3, num_sample=16, mlp_channels=[256, 128, 128, 128], use_xyz=True, normalize_xyz=True), pred_layer_cfg=dict( in_channels=128, shared_conv_channels=(128, 128), bias=True), conv_cfg=dict(type='Conv1d'), norm_cfg=dict(type='BN1d'), objectness_loss=dict( type='CrossEntropyLoss', class_weight=[0.2, 0.8], reduction='sum', loss_weight=5.0), center_loss=dict( type='ChamferDistance', mode='l2', reduction='sum', loss_src_weight=10.0, loss_dst_weight=10.0), dir_class_loss=dict( type='CrossEntropyLoss', reduction='sum', loss_weight=1.0), dir_res_loss=dict( type='SmoothL1Loss', reduction='sum', loss_weight=10.0), size_class_loss=dict( type='CrossEntropyLoss', reduction='sum', loss_weight=1.0), size_res_loss=dict( type='SmoothL1Loss', reduction='sum', loss_weight=10.0 / 3.0), semantic_loss=dict( type='CrossEntropyLoss', reduction='sum', loss_weight=1.0)), # model training and testing settings train_cfg=dict( pos_distance_thr=0.3, neg_distance_thr=0.6, sample_mod='vote'), test_cfg=dict( sample_mod='seed', nms_thr=0.25, score_thr=0.05, per_class_proposal=True)) ================================================ FILE: configs/_base_/schedules/cosine.py ================================================ # This schedule is mainly used by models with dynamic voxelization # optimizer lr = 0.003 # max learning rate optimizer = dict( type='AdamW', lr=lr, betas=(0.95, 0.99), # the momentum is changed during training weight_decay=0.001) optimizer_config = dict(grad_clip=dict(max_norm=10, norm_type=2)) lr_config = dict( policy='CosineAnnealing', warmup='linear', warmup_iters=1000, warmup_ratio=1.0 / 10, min_lr_ratio=1e-5) momentum_config = None runner = dict(type='EpochBasedRunner', max_epochs=40) ================================================ FILE: configs/_base_/schedules/cyclic_20e.py ================================================ # For nuScenes dataset, we usually evaluate the model at the end of training. # Since the models are trained for 20 epochs by default, we set the evaluation # interval to 20. Please change the interval accordingly if you do not # use a default schedule. # optimizer # This schedule is mainly used by models on nuScenes dataset optimizer = dict(type='AdamW', lr=1e-4, weight_decay=0.01) # max_norm=10 is better for SECOND optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) lr_config = dict( policy='cyclic', target_ratio=(10, 1e-4), cyclic_times=1, step_ratio_up=0.4, ) momentum_config = dict( policy='cyclic', target_ratio=(0.85 / 0.95, 1), cyclic_times=1, step_ratio_up=0.4, ) # runtime settings runner = dict(type='EpochBasedRunner', max_epochs=20) ================================================ FILE: configs/_base_/schedules/cyclic_40e.py ================================================ # The schedule is usually used by models trained on KITTI dataset # The learning rate set in the cyclic schedule is the initial learning rate # rather than the max learning rate. Since the target_ratio is (10, 1e-4), # the learning rate will change from 0.0018 to 0.018, then go to 0.0018*1e-4 lr = 0.0018 # The optimizer follows the setting in SECOND.Pytorch, but here we use # the official AdamW optimizer implemented by PyTorch.
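# Worked example of the resulting schedule (assuming the mmcv cyclic
# LR/momentum updater semantics referenced below, where target_ratio scales
# the initial value up and then down over the cycle):
#   peak lr  = 0.0018 * 10   = 0.018
#   final lr = 0.0018 * 1e-4 = 1.8e-7
# The momentum (beta1 of AdamW) moves in the opposite direction, dipping from
# 0.95 to 0.95 * (0.85 / 0.95) = 0.85 and back to 0.95.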
optimizer = dict(type='AdamW', lr=lr, betas=(0.95, 0.99), weight_decay=0.01) optimizer_config = dict(grad_clip=dict(max_norm=10, norm_type=2)) # We use cyclic learning rate and momentum schedule following SECOND.Pytorch # https://github.com/traveller59/second.pytorch/blob/3aba19c9688274f75ebb5e576f65cfe54773c021/torchplus/train/learning_schedules_fastai.py#L69 # noqa # We implement them in mmcv, for more details, please refer to # https://github.com/open-mmlab/mmcv/blob/f48241a65aebfe07db122e9db320c31b685dc674/mmcv/runner/hooks/lr_updater.py#L327 # noqa # https://github.com/open-mmlab/mmcv/blob/f48241a65aebfe07db122e9db320c31b685dc674/mmcv/runner/hooks/momentum_updater.py#L130 # noqa lr_config = dict( policy='cyclic', target_ratio=(10, 1e-4), cyclic_times=1, step_ratio_up=0.4, ) momentum_config = dict( policy='cyclic', target_ratio=(0.85 / 0.95, 1), cyclic_times=1, step_ratio_up=0.4, ) # Although the max_epochs is 40, this schedule is usually used with # RepeatDataset with repeat ratio N, thus the actual max epoch # number could be Nx40 runner = dict(type='EpochBasedRunner', max_epochs=40) ================================================ FILE: configs/_base_/schedules/mmdet_schedule_1x.py ================================================ # optimizer optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001) optimizer_config = dict(grad_clip=None) # learning policy lr_config = dict( policy='step', warmup='linear', warmup_iters=500, warmup_ratio=0.001, step=[8, 11]) runner = dict(type='EpochBasedRunner', max_epochs=12) ================================================ FILE: configs/_base_/schedules/schedule_2x.py ================================================ # optimizer # This schedule is mainly used by models on nuScenes dataset optimizer = dict(type='AdamW', lr=0.001, weight_decay=0.01) # max_norm=10 is better for SECOND optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) lr_config = dict( policy='step', warmup='linear', warmup_iters=1000, warmup_ratio=1.0 / 1000, step=[20, 23]) momentum_config = None # runtime settings runner = dict(type='EpochBasedRunner', max_epochs=24) ================================================ FILE: configs/_base_/schedules/schedule_3x.py ================================================ # optimizer # This schedule is mainly used by models on indoor dataset, # e.g., VoteNet on SUNRGBD and ScanNet lr = 0.008 # max learning rate optimizer = dict(type='AdamW', lr=lr, weight_decay=0.01) optimizer_config = dict(grad_clip=dict(max_norm=10, norm_type=2)) lr_config = dict(policy='step', warmup=None, step=[24, 32]) # runtime settings runner = dict(type='EpochBasedRunner', max_epochs=36) ================================================ FILE: configs/_base_/schedules/seg_cosine_100e.py ================================================ # optimizer # This schedule is mainly used on S3DIS dataset in segmentation task optimizer = dict(type='SGD', lr=0.1, momentum=0.9, weight_decay=0.0001) optimizer_config = dict(grad_clip=None) lr_config = dict(policy='CosineAnnealing', warmup=None, min_lr=1e-5) # runtime settings runner = dict(type='EpochBasedRunner', max_epochs=100) ================================================ FILE: configs/_base_/schedules/seg_cosine_150e.py ================================================ # optimizer # This schedule is mainly used on S3DIS dataset in segmentation task optimizer = dict(type='SGD', lr=0.2, weight_decay=0.0001, momentum=0.9) optimizer_config = dict(grad_clip=None) lr_config = dict(policy='CosineAnnealing',
warmup=None, min_lr=0.002) momentum_config = None # runtime settings runner = dict(type='EpochBasedRunner', max_epochs=150) ================================================ FILE: configs/_base_/schedules/seg_cosine_200e.py ================================================ # optimizer # This schedule is mainly used on ScanNet dataset in segmentation task optimizer = dict(type='Adam', lr=0.001, weight_decay=0.01) optimizer_config = dict(grad_clip=None) lr_config = dict(policy='CosineAnnealing', warmup=None, min_lr=1e-5) momentum_config = None # runtime settings runner = dict(type='EpochBasedRunner', max_epochs=200) ================================================ FILE: configs/_base_/schedules/seg_cosine_50e.py ================================================ # optimizer # This schedule is mainly used on S3DIS dataset in segmentation task optimizer = dict(type='Adam', lr=0.001, weight_decay=0.001) optimizer_config = dict(grad_clip=None) lr_config = dict(policy='CosineAnnealing', warmup=None, min_lr=1e-5) momentum_config = None # runtime settings runner = dict(type='EpochBasedRunner', max_epochs=50) ================================================ FILE: configs/bev_next/bev_planner.py ================================================ # Copyright (c) 2023-2024, NVIDIA Corporation & Affiliates. All rights reserved. # # This work is made available under the Nvidia Source Code License-NC. # To view a copy of this license, visit # TODO: add license here # we follow the online training settings from solofusion num_gpus = 8 samples_per_gpu = 4 num_iters_per_epoch = int(28130 // (num_gpus * samples_per_gpu) ) num_epochs = 12 checkpoint_epoch_interval = 1 use_custom_eval_hook=True # Each nuScenes sequence is ~40 keyframes long. Our training procedure samples # sequences first, then loads frames from the sampled sequence in order # starting from the first frame. This reduces training step-to-step diversity, # lowering performance. To increase diversity, we split each training sequence # in half to ~20 keyframes, and sample these shorter sequences during training. # During testing, we do not do this splitting. train_sequences_split_num = 4 test_sequences_split_num = 1 # By default, 3D detection datasets randomly choose another sample if there is # no GT object in the current sample. This does not make sense when doing # sequential sampling of frames, so we disable it. filter_empty_gt = False # Long-Term Fusion Parameters do_history = False history_cat_num = 4 history_cat_conv_out_channels = 160 _base_ = ['../_base_/datasets/nus-3d.py', '../_base_/default_runtime.py'] # Global # If point cloud range is changed, the models should also change their point # cloud range accordingly # bev configs roi_size = (102.4, 102.4) bev_h = 128 bev_w = 128 point_cloud_range = [-roi_size[0]/2, -roi_size[1]/2, -5, roi_size[0]/2, roi_size[1]/2, 3] # For nuScenes we usually do 10-class detection class_names = [ 'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier', 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone' ] data_config = { 'cams': [ 'CAM_FRONT_LEFT', 'CAM_FRONT', 'CAM_FRONT_RIGHT', 'CAM_BACK_LEFT', 'CAM_BACK', 'CAM_BACK_RIGHT' ], 'Ncams': 6, 'input_size': (256, 704), 'src_size': (900, 1600), # Augmentation 'resize': (0.38, 0.55), 'rot': (0, 0), 'flip': True, 'crop_h': (0.0, 0.0), 'resize_test': 0.00, } bda_aug_conf = dict( rot_lim=(-0, 0), scale_lim=(1., 1.), flip_dx_ratio=0., flip_dy_ratio=0.) 
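# Derived quantities in this config, written out as a quick sanity check
# (simple arithmetic on the values defined above and below):
#   num_iters_per_epoch = 28130 // (8 * 4) = 879 iterations
#   point_cloud_range   = [-102.4/2, -102.4/2, -5, 102.4/2, 102.4/2, 3]
#                       = [-51.2, -51.2, -5, 51.2, 51.2, 3]
#   BEV grid            = (51.2 * 2) / 0.8 = 128 cells per side (= bev_h = bev_w)
#   depth_categories    = (60.0 - 1.0) / 0.5 = 118 depth bins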
voxel_size = [0.2, 0.2, 8] use_checkpoint = False sync_bn = True # Model grid_config = { 'x': [-51.2, 51.2, 0.8], 'y': [-51.2, 51.2, 0.8], 'z': [-5, 3, 8], 'depth': [1.0, 60.0, 0.5], } depth_categories = 118 #(grid_config['depth'][1]-grid_config['depth'][0])//grid_config['depth'][2] numC_Trans=80 _dim_ = 256 ### occupancy config empty_idx = 18 # noise 0-->255 num_cls = 19 # 0 others, 1-16 obj, 17 free fix_void = num_cls == 19 ### map_classes = ['divider', 'ped_crossing', 'boundary'] map_num_vec = 100 map_fixed_ptsnum_per_gt_line = 20 # now only support fixed_pts > 0 map_fixed_ptsnum_per_pred_line = 20 map_eval_use_same_gt_sample_num_flag = True map_num_classes = len(map_classes) embed_dims = 256 num_feat_levels = 1 norm_cfg = dict(type='BN2d') num_queries = 100 # category configs cat2id = { 'ped_crossing': 0, 'divider': 1, 'boundary': 2, } num_class = max(list(cat2id.values())) + 1 num_points = 20 permute = True with_ego_as_agent = False ### model = dict( type='BEVPlanner', use_depth_supervision=False, fix_void=fix_void, do_history = do_history, history_cat_num=history_cat_num, single_bev_num_channels=numC_Trans, fuse_history_bev=True, use_grid_mask=True, align_prev_bev=False, img_backbone=dict( type='ResNet', depth=50, num_stages=4, out_indices=(2, 3), frozen_stages=-1, norm_cfg=dict(type='BN2d', requires_grad=False), norm_eval=True, with_cp=False, # pretrained='torchvision://resnet50', init_cfg=dict( type='Pretrained', checkpoint="ckpts/cascade_mask_rcnn_r50_fpn_coco-20e_20e_nuim_20201009_124951-40963960.pth", prefix='backbone.'), style='pytorch'), img_neck=dict( type='CustomFPN', in_channels=[1024, 2048], out_channels=_dim_, num_outs=1, start_level=0, with_cp=use_checkpoint, out_ids=[0]), depth_net=dict( type='CM_DepthNet', # camera-aware depth net in_channels=_dim_, context_channels=numC_Trans, downsample=16, grid_config=grid_config, depth_channels=depth_categories, with_cp=use_checkpoint, loss_depth_weight=1., aspp_mid_channels=96, use_dcn=False, ), forward_projection=dict( type='LSSViewTransformerFunction', grid_config=grid_config, input_size=data_config['input_size'], downsample=16), frpn=None, backward_projection=None, img_bev_encoder_backbone=dict( type='CustomResNet', numC_input=numC_Trans, num_channels=[numC_Trans * 2, numC_Trans * 4, numC_Trans * 8]), img_bev_encoder_neck=dict( type='FPN_LSS', in_channels=numC_Trans * 8 + numC_Trans * 2, out_channels=256), occupancy_head=None, img_det_2d_head=None, pts_bbox_head=None, map_head=None, motion_head=None, planner_head=dict( type='NaivePlannerHead' ), # model training and testing settings train_cfg=dict(pts=dict( grid_size=[512, 512, 1], voxel_size=voxel_size, point_cloud_range=point_cloud_range, out_size_factor=4, assigner=None), ) ) # Data dataset_type = 'NuScenesDataset' data_root = 'data/nuscenes/' file_client_args = dict(backend='disk') occupancy_path = '/mount/data/occupancy_cvpr2023/gts' normalize_cfg = dict( mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) train_pipeline = [ dict( type='PrepareImageInputs', is_train=True, normalize_cfg=normalize_cfg, data_config=data_config), dict( type='LoadAnnotationsBEVDepth', bda_aug_conf=bda_aug_conf, with_2d_bbox=True, classes=class_names), dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=5, use_dim=5, file_client_args=file_client_args), dict( type='LoadVectorMap2', data_root = data_root, point_cloud_range =point_cloud_range, map_classes = ['divider', 'ped_crossing', 'boundary'], map_num_vec = 100, map_fixed_ptsnum_per_line = 20, # now only support 
fixed_pts > 0, map_eval_use_same_gt_sample_num_flag = True, map_num_classes = 3, ), dict(type='PointToMultiViewDepth', downsample=1, grid_config=grid_config), dict(type='LoadGTMotion'), dict(type='LoadGTPlaner'), dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), dict(type='ObjectNameFilter', classes=class_names), # dict(type='VisualInputsAndGT'), # dict(type='LoadOccupancy', ignore_nonvisible=True, fix_void=fix_void, occupancy_path=occupancy_path), dict(type='DefaultFormatBundle3D', class_names=class_names), dict( type='Collect3D', keys=['img_inputs', 'gt_bboxes_3d', 'gt_labels_3d', 'gt_depth', 'gt_bboxes_2d', 'gt_labels_2d', 'centers2d', 'depths2d', 'map_gt_labels_3d', 'map_gt_bboxes_3d' ] + ['gt_agent_fut_traj', 'gt_agent_fut_traj_mask']+ ['gt_ego_lcf_feat', 'gt_ego_fut_trajs', 'gt_ego_his_trajs', 'gt_ego_fut_cmd', 'gt_ego_fut_masks'] ) ] test_pipeline = [ dict( type='CustomDistMultiScaleFlipAug3D', tta=False, transforms=[ dict(type='PrepareImageInputs', # img_corruptions='sun', data_config=data_config, normalize_cfg=normalize_cfg), dict( type='LoadAnnotationsBEVDepth', bda_aug_conf=bda_aug_conf, classes=class_names, with_2d_bbox=True, is_train=False), dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=5, use_dim=5, file_client_args=file_client_args), dict( type='LoadVectorMap', data_root = data_root, point_cloud_range =point_cloud_range, map_classes = ['divider', 'ped_crossing', 'boundary'], map_num_vec = 100, map_fixed_ptsnum_per_line = 20, # now only support fixed_pts > 0, map_eval_use_same_gt_sample_num_flag = True, map_num_classes = 3, ), dict(type='LoadGTPlaner'), dict(type='LoadGTMotion', with_ego_as_agent=with_ego_as_agent), dict(type='LoadFutBoxInfo'), dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), dict(type='ObjectNameFilter', classes=class_names), dict( type='DefaultFormatBundle3D', class_names=class_names, with_label=False), dict(type='Collect3D', keys=['points', 'img_inputs', 'gt_bboxes_3d', 'gt_labels_3d', 'map_gt_bboxes_3d', 'map_gt_labels_3d']+ ['gt_agent_fut_traj', 'gt_agent_fut_traj_mask']+['gt_ego_lcf_feat', 'gt_ego_fut_trajs', 'gt_ego_his_trajs', 'gt_ego_fut_cmd', 'gt_ego_fut_masks']+ ['gt_fut_segmentations', 'gt_fut_segmentations_plus', 'fut_boxes_in_cur_ego_list'] ) ] ) ] input_modality = dict( use_lidar=False, use_camera=True, use_radar=False, use_map=False, use_external=False) share_data_config = dict( type=dataset_type, classes=class_names, modality=input_modality, img_info_prototype='bevdet', occupancy_path=occupancy_path, data_root=data_root, use_sequence_group_flag=True, ) test_data_config = dict( pipeline=test_pipeline, map_ann_file=data_root + 'nuscenes_map_infos_102x102_val.pkl', map_eval_cfg=dict( region = (102.4, 102.4) # (H, W) ), load_fut_bbox_info=True, sequences_split_num=test_sequences_split_num, ann_file=data_root + 'bev-next-nuscenes_infos_val.pkl') data = dict( samples_per_gpu=samples_per_gpu, workers_per_gpu=2, test_dataloader=dict(runner_type='IterBasedRunnerEval'), train=dict( type=dataset_type, ann_file=data_root + 'bev-next-nuscenes_infos_train.pkl', pipeline=train_pipeline, test_mode=False, use_valid_flag=True, sequences_split_num=train_sequences_split_num, filter_empty_gt=filter_empty_gt, # we use box_type_3d='LiDAR' in kitti and nuscenes dataset # and box_type_3d='Depth' in sunrgbd and scannet dataset. 
box_type_3d='LiDAR'), val=test_data_config, test=test_data_config) for key in ['train', 'val', 'test']: data[key].update(share_data_config) optimizer = dict( type='AdamW', lr=1e-4, # bs 8: 2e-4 || bs 16: 4e-4 paramwise_cfg=dict( custom_keys={ 'img_backbone': dict(lr_mult=0.1), }), weight_decay=0.01) optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) lr_config = dict( policy='CosineAnnealing', warmup='linear', warmup_iters=500, warmup_ratio=1.0 / 3, min_lr_ratio=1e-3, ) runner = dict(type='IterBasedRunner', max_iters=num_epochs * num_iters_per_epoch) checkpoint_config = dict( interval=checkpoint_epoch_interval * num_iters_per_epoch) evaluation = dict( interval=num_epochs * num_iters_per_epoch, pipeline=test_pipeline) log_config = dict( interval=50, hooks=[ dict(type='TextLoggerHook'), ]) custom_hooks = [ dict( type='MEGVIIEMAHook', init_updates=10560, priority='NORMAL', interval=2*num_iters_per_epoch, ), dict( type='SequentialControlHook', temporal_start_iter=0, ), # dict( # type='TimerCP', # ) ] # load_from = None # resume_from = None ================================================ FILE: configs/bev_next/bev_planner_plus.py ================================================ # Copyright (c) 2023-2024, NVIDIA Corporation & Affiliates. All rights reserved. # # This work is made available under the Nvidia Source Code License-NC. # To view a copy of this license, visit # TODO: add license here # we follow the online training settings from solofusion num_gpus = 8 samples_per_gpu = 4 num_iters_per_epoch = int(28130 // (num_gpus * samples_per_gpu) ) num_epochs = 12 checkpoint_epoch_interval = 1 use_custom_eval_hook=True # Each nuScenes sequence is ~40 keyframes long. Our training procedure samples # sequences first, then loads frames from the sampled sequence in order # starting from the first frame. This reduces training step-to-step diversity, # lowering performance. To increase diversity, we split each training sequence # in half to ~20 keyframes, and sample these shorter sequences during training. # During testing, we do not do this splitting. train_sequences_split_num = 4 test_sequences_split_num = 1 # By default, 3D detection datasets randomly choose another sample if there is # no GT object in the current sample. This does not make sense when doing # sequential sampling of frames, so we disable it. filter_empty_gt = False # Long-Term Fusion Parameters do_history = False history_cat_num = 4 history_cat_conv_out_channels = 160 _base_ = ['../_base_/datasets/nus-3d.py', '../_base_/default_runtime.py'] # Global # If point cloud range is changed, the models should also change their point # cloud range accordingly # bev configs roi_size = (102.4, 102.4) bev_h = 128 bev_w = 128 point_cloud_range = [-roi_size[0]/2, -roi_size[1]/2, -5, roi_size[0]/2, roi_size[1]/2, 3] # For nuScenes we usually do 10-class detection class_names = [ 'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier', 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone' ] data_config = { 'cams': [ 'CAM_FRONT_LEFT', 'CAM_FRONT', 'CAM_FRONT_RIGHT', 'CAM_BACK_LEFT', 'CAM_BACK', 'CAM_BACK_RIGHT' ], 'Ncams': 6, 'input_size': (256, 704), 'src_size': (900, 1600), # Augmentation 'resize': (0.38, 0.55), 'rot': (0, 0), 'flip': True, 'crop_h': (0.0, 0.0), 'resize_test': 0.00, } bda_aug_conf = dict( rot_lim=(-0, 0), scale_lim=(1., 1.), flip_dx_ratio=0., flip_dy_ratio=0.) 
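# Minimal sketch for inspecting this config offline (assumes the usual mmcv
# Config API; the path is relative to the repository root):
#   from mmcv import Config
#   cfg = Config.fromfile('configs/bev_next/bev_planner_plus.py')
#   print(cfg.model.type)            # 'BEVPlanner'
#   print(cfg.data.samples_per_gpu)  # 4
#   print(cfg.runner.max_iters)      # num_epochs * num_iters_per_epoch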
voxel_size = [0.2, 0.2, 8] use_checkpoint = False sync_bn = True # Model grid_config = { 'x': [-51.2, 51.2, 0.8], 'y': [-51.2, 51.2, 0.8], 'z': [-5, 3, 8], 'depth': [1.0, 60.0, 0.5], } depth_categories = 118 #(grid_config['depth'][1]-grid_config['depth'][0])//grid_config['depth'][2] numC_Trans=80 _dim_ = 256 ### occupancy config empty_idx = 18 # noise 0-->255 num_cls = 19 # 0 others, 1-16 obj, 17 free fix_void = num_cls == 19 ### map_classes = ['divider', 'ped_crossing', 'boundary'] map_num_vec = 100 map_fixed_ptsnum_per_gt_line = 20 # now only support fixed_pts > 0 map_fixed_ptsnum_per_pred_line = 20 map_eval_use_same_gt_sample_num_flag = True map_num_classes = len(map_classes) embed_dims = 256 num_feat_levels = 1 norm_cfg = dict(type='BN2d') num_queries = 100 # category configs cat2id = { 'ped_crossing': 0, 'divider': 1, 'boundary': 2, } num_class = max(list(cat2id.values())) + 1 num_points = 20 permute = True with_ego_as_agent = False ### model = dict( type='BEVPlanner', use_depth_supervision=False, fix_void=fix_void, do_history = do_history, history_cat_num=history_cat_num, single_bev_num_channels=numC_Trans, fuse_history_bev=True, use_grid_mask=True, align_prev_bev=False, with_ego_status=True, img_backbone=dict( type='ResNet', depth=50, num_stages=4, out_indices=(2, 3), frozen_stages=-1, norm_cfg=dict(type='BN2d', requires_grad=False), norm_eval=True, with_cp=False, # pretrained='torchvision://resnet50', init_cfg=dict( type='Pretrained', checkpoint="ckpts/cascade_mask_rcnn_r50_fpn_coco-20e_20e_nuim_20201009_124951-40963960.pth", prefix='backbone.'), style='pytorch'), img_neck=dict( type='CustomFPN', in_channels=[1024, 2048], out_channels=_dim_, num_outs=1, start_level=0, with_cp=use_checkpoint, out_ids=[0]), depth_net=dict( type='CM_DepthNet', # camera-aware depth net in_channels=_dim_, context_channels=numC_Trans, downsample=16, grid_config=grid_config, depth_channels=depth_categories, with_cp=use_checkpoint, loss_depth_weight=1., aspp_mid_channels=96, use_dcn=False, ), forward_projection=dict( type='LSSViewTransformerFunction', grid_config=grid_config, input_size=data_config['input_size'], downsample=16), frpn=None, backward_projection=None, img_bev_encoder_backbone=dict( type='CustomResNet', numC_input=numC_Trans, num_channels=[numC_Trans * 2, numC_Trans * 4, numC_Trans * 8]), img_bev_encoder_neck=dict( type='FPN_LSS', in_channels=numC_Trans * 8 + numC_Trans * 2, out_channels=256), occupancy_head=None, img_det_2d_head=None, pts_bbox_head=None, map_head=None, motion_head=None, planner_head=dict( type='NaivePlannerHead' ), # model training and testing settings train_cfg=dict(pts=dict( grid_size=[512, 512, 1], voxel_size=voxel_size, point_cloud_range=point_cloud_range, out_size_factor=4, assigner=None), ) ) # Data dataset_type = 'NuScenesDataset' data_root = 'data/nuscenes/' file_client_args = dict(backend='disk') occupancy_path = '/mount/data/occupancy_cvpr2023/gts' normalize_cfg = dict( mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) train_pipeline = [ dict( type='PrepareImageInputs', is_train=True, normalize_cfg=normalize_cfg, data_config=data_config), dict( type='LoadAnnotationsBEVDepth', bda_aug_conf=bda_aug_conf, with_2d_bbox=True, classes=class_names), dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=5, use_dim=5, file_client_args=file_client_args), dict( type='LoadVectorMap2', data_root = data_root, point_cloud_range =point_cloud_range, map_classes = ['divider', 'ped_crossing', 'boundary'], map_num_vec = 100, map_fixed_ptsnum_per_line = 
20, # now only support fixed_pts > 0, map_eval_use_same_gt_sample_num_flag = True, map_num_classes = 3, ), dict(type='PointToMultiViewDepth', downsample=1, grid_config=grid_config), dict(type='LoadGTMotion'), dict(type='LoadGTPlaner'), dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), dict(type='ObjectNameFilter', classes=class_names), # dict(type='VisualInputsAndGT'), # dict(type='LoadOccupancy', ignore_nonvisible=True, fix_void=fix_void, occupancy_path=occupancy_path), dict(type='DefaultFormatBundle3D', class_names=class_names), dict( type='Collect3D', keys=['img_inputs', 'gt_bboxes_3d', 'gt_labels_3d', 'gt_depth', 'gt_bboxes_2d', 'gt_labels_2d', 'centers2d', 'depths2d', 'map_gt_labels_3d', 'map_gt_bboxes_3d' ] + ['gt_agent_fut_traj', 'gt_agent_fut_traj_mask']+['can_bus_info']+ ['gt_ego_lcf_feat', 'gt_ego_fut_trajs', 'gt_ego_his_trajs', 'gt_ego_fut_cmd', 'gt_ego_fut_masks'] ) ] test_pipeline = [ dict( type='CustomDistMultiScaleFlipAug3D', tta=False, transforms=[ dict(type='PrepareImageInputs', # img_corruptions='sun', data_config=data_config, normalize_cfg=normalize_cfg), dict( type='LoadAnnotationsBEVDepth', bda_aug_conf=bda_aug_conf, classes=class_names, with_2d_bbox=True, is_train=False), dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=5, use_dim=5, file_client_args=file_client_args), dict( type='LoadVectorMap', data_root = data_root, point_cloud_range =point_cloud_range, map_classes = ['divider', 'ped_crossing', 'boundary'], map_num_vec = 100, map_fixed_ptsnum_per_line = 20, # now only support fixed_pts > 0, map_eval_use_same_gt_sample_num_flag = True, map_num_classes = 3, ), dict(type='LoadGTPlaner'), dict(type='LoadGTMotion', with_ego_as_agent=with_ego_as_agent), dict(type='LoadFutBoxInfo'), dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), dict(type='ObjectNameFilter', classes=class_names), dict( type='DefaultFormatBundle3D', class_names=class_names, with_label=False), dict(type='Collect3D', keys=['points', 'img_inputs', 'gt_bboxes_3d', 'gt_labels_3d', 'map_gt_bboxes_3d', 'map_gt_labels_3d']+ ['gt_agent_fut_traj', 'gt_agent_fut_traj_mask']+['gt_ego_lcf_feat', 'gt_ego_fut_trajs', 'gt_ego_his_trajs', 'gt_ego_fut_cmd', 'gt_ego_fut_masks']+ ['gt_fut_segmentations', 'gt_fut_segmentations_plus', 'fut_boxes_in_cur_ego_list']+ ['can_bus_info'] ) ] ) ] input_modality = dict( use_lidar=False, use_camera=True, use_radar=False, use_map=False, use_external=False) share_data_config = dict( type=dataset_type, classes=class_names, modality=input_modality, img_info_prototype='bevdet', occupancy_path=occupancy_path, data_root=data_root, use_sequence_group_flag=True, ) test_data_config = dict( pipeline=test_pipeline, map_ann_file=data_root + 'nuscenes_map_infos_102x102_val.pkl', map_eval_cfg=dict( region = (102.4, 102.4) # (H, W) ), load_fut_bbox_info=True, sequences_split_num=test_sequences_split_num, ann_file=data_root + 'bev-next-nuscenes_infos_val.pkl') data = dict( samples_per_gpu=samples_per_gpu, workers_per_gpu=2, test_dataloader=dict(runner_type='IterBasedRunnerEval'), train=dict( type=dataset_type, ann_file=data_root + 'bev-next-nuscenes_infos_train.pkl', pipeline=train_pipeline, test_mode=False, use_valid_flag=True, sequences_split_num=train_sequences_split_num, filter_empty_gt=filter_empty_gt, # we use box_type_3d='LiDAR' in kitti and nuscenes dataset # and box_type_3d='Depth' in sunrgbd and scannet dataset. 
box_type_3d='LiDAR'), val=test_data_config, test=test_data_config) for key in ['train', 'val', 'test']: data[key].update(share_data_config) optimizer = dict( type='AdamW', lr=1e-4, # bs 8: 2e-4 || bs 16: 4e-4 paramwise_cfg=dict( custom_keys={ 'img_backbone': dict(lr_mult=0.1), }), weight_decay=0.01) optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) lr_config = dict( policy='CosineAnnealing', warmup='linear', warmup_iters=500, warmup_ratio=1.0 / 3, min_lr_ratio=1e-3, ) runner = dict(type='IterBasedRunner', max_iters=num_epochs * num_iters_per_epoch) checkpoint_config = dict( interval=checkpoint_epoch_interval * num_iters_per_epoch) evaluation = dict( interval=num_epochs * num_iters_per_epoch, pipeline=test_pipeline) log_config = dict( interval=50, hooks=[ dict(type='TextLoggerHook'), ]) custom_hooks = [ dict( type='MEGVIIEMAHook', init_updates=10560, priority='NORMAL', interval=2*num_iters_per_epoch, ), dict( type='SequentialControlHook', temporal_start_iter=0, ), # dict( # type='TimerCP', # ) ] # load_from = None # resume_from = None ================================================ FILE: configs/bev_next/bev_planner_plus_plus.py ================================================ # Copyright (c) 2023-2024, NVIDIA Corporation & Affiliates. All rights reserved. # # This work is made available under the Nvidia Source Code License-NC. # To view a copy of this license, visit # TODO: add license here # we follow the online training settings from solofusion num_gpus = 8 samples_per_gpu = 4 num_iters_per_epoch = int(28130 // (num_gpus * samples_per_gpu) ) num_epochs = 12 checkpoint_epoch_interval = 1 use_custom_eval_hook=True # Each nuScenes sequence is ~40 keyframes long. Our training procedure samples # sequences first, then loads frames from the sampled sequence in order # starting from the first frame. This reduces training step-to-step diversity, # lowering performance. To increase diversity, we split each training sequence # in half to ~20 keyframes, and sample these shorter sequences during training. # During testing, we do not do this splitting. train_sequences_split_num = 4 test_sequences_split_num = 1 # By default, 3D detection datasets randomly choose another sample if there is # no GT object in the current sample. This does not make sense when doing # sequential sampling of frames, so we disable it. filter_empty_gt = False # Long-Term Fusion Parameters do_history = False history_cat_num = 4 history_cat_conv_out_channels = 160 _base_ = ['../_base_/datasets/nus-3d.py', '../_base_/default_runtime.py'] # Global # If point cloud range is changed, the models should also change their point # cloud range accordingly # bev configs roi_size = (102.4, 102.4) bev_h = 128 bev_w = 128 point_cloud_range = [-roi_size[0]/2, -roi_size[1]/2, -5, roi_size[0]/2, roi_size[1]/2, 3] # For nuScenes we usually do 10-class detection class_names = [ 'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier', 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone' ] data_config = { 'cams': [ 'CAM_FRONT_LEFT', 'CAM_FRONT', 'CAM_FRONT_RIGHT', 'CAM_BACK_LEFT', 'CAM_BACK', 'CAM_BACK_RIGHT' ], 'Ncams': 6, 'input_size': (256, 704), 'src_size': (900, 1600), # Augmentation 'resize': (0.38, 0.55), 'rot': (0, 0), 'flip': True, 'crop_h': (0.0, 0.0), 'resize_test': 0.00, } bda_aug_conf = dict( rot_lim=(-0, 0), scale_lim=(1., 1.), flip_dx_ratio=0., flip_dy_ratio=0.) 
voxel_size = [0.2, 0.2, 8] use_checkpoint = False sync_bn = True # Model grid_config = { 'x': [-51.2, 51.2, 0.8], 'y': [-51.2, 51.2, 0.8], 'z': [-5, 3, 8], 'depth': [1.0, 60.0, 0.5], } depth_categories = 118 #(grid_config['depth'][1]-grid_config['depth'][0])//grid_config['depth'][2] numC_Trans=80 _dim_ = 256 ### occupancy config empty_idx = 18 # noise 0-->255 num_cls = 19 # 0 others, 1-16 obj, 17 free fix_void = num_cls == 19 ### map_classes = ['divider', 'ped_crossing', 'boundary'] map_num_vec = 100 map_fixed_ptsnum_per_gt_line = 20 # now only support fixed_pts > 0 map_fixed_ptsnum_per_pred_line = 20 map_eval_use_same_gt_sample_num_flag = True map_num_classes = len(map_classes) embed_dims = 256 num_feat_levels = 1 norm_cfg = dict(type='BN2d') num_queries = 100 # category configs cat2id = { 'ped_crossing': 0, 'divider': 1, 'boundary': 2, } num_class = max(list(cat2id.values())) + 1 num_points = 20 permute = True with_ego_as_agent = False ### model = dict( type='BEVPlanner', use_depth_supervision=False, fix_void=fix_void, do_history = do_history, history_cat_num=history_cat_num, single_bev_num_channels=numC_Trans, fuse_history_bev=True, use_grid_mask=True, align_prev_bev=False, with_ego_status=True, img_backbone=dict( type='ResNet', depth=50, num_stages=4, out_indices=(2, 3), frozen_stages=-1, norm_cfg=dict(type='BN2d', requires_grad=False), norm_eval=True, with_cp=False, # pretrained='torchvision://resnet50', init_cfg=dict( type='Pretrained', checkpoint="ckpts/cascade_mask_rcnn_r50_fpn_coco-20e_20e_nuim_20201009_124951-40963960.pth", prefix='backbone.'), style='pytorch'), img_neck=dict( type='CustomFPN', in_channels=[1024, 2048], out_channels=_dim_, num_outs=1, start_level=0, with_cp=use_checkpoint, out_ids=[0]), depth_net=dict( type='CM_DepthNet', # camera-aware depth net in_channels=_dim_, context_channels=numC_Trans, downsample=16, grid_config=grid_config, depth_channels=depth_categories, with_cp=use_checkpoint, loss_depth_weight=1., aspp_mid_channels=96, use_dcn=False, ), forward_projection=dict( type='LSSViewTransformerFunction', grid_config=grid_config, input_size=data_config['input_size'], downsample=16), frpn=None, backward_projection=None, img_bev_encoder_backbone=dict( type='CustomResNet', numC_input=numC_Trans, num_channels=[numC_Trans * 2, numC_Trans * 4, numC_Trans * 8]), img_bev_encoder_neck=dict( type='FPN_LSS', in_channels=numC_Trans * 8 + numC_Trans * 2, out_channels=256), occupancy_head=None, img_det_2d_head=None, pts_bbox_head=None, map_head=None, motion_head=None, planner_head=dict( type='NaivePlannerHead', with_ego_status=True, ), # model training and testing settings train_cfg=dict(pts=dict( grid_size=[512, 512, 1], voxel_size=voxel_size, point_cloud_range=point_cloud_range, out_size_factor=4, assigner=None), ) ) # Data dataset_type = 'NuScenesDataset' data_root = 'data/nuscenes/' file_client_args = dict(backend='disk') occupancy_path = '/mount/data/occupancy_cvpr2023/gts' normalize_cfg = dict( mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) train_pipeline = [ dict( type='PrepareImageInputs', is_train=True, normalize_cfg=normalize_cfg, data_config=data_config), dict( type='LoadAnnotationsBEVDepth', bda_aug_conf=bda_aug_conf, with_2d_bbox=True, classes=class_names), dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=5, use_dim=5, file_client_args=file_client_args), dict( type='LoadVectorMap2', data_root = data_root, point_cloud_range =point_cloud_range, map_classes = ['divider', 'ped_crossing', 'boundary'], map_num_vec = 100, 
map_fixed_ptsnum_per_line = 20, # now only support fixed_pts > 0, map_eval_use_same_gt_sample_num_flag = True, map_num_classes = 3, ), dict(type='PointToMultiViewDepth', downsample=1, grid_config=grid_config), dict(type='LoadGTMotion'), dict(type='LoadGTPlaner'), dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), dict(type='ObjectNameFilter', classes=class_names), # dict(type='VisualInputsAndGT'), # dict(type='LoadOccupancy', ignore_nonvisible=True, fix_void=fix_void, occupancy_path=occupancy_path), dict(type='DefaultFormatBundle3D', class_names=class_names), dict( type='Collect3D', keys=['img_inputs', 'gt_bboxes_3d', 'gt_labels_3d', 'gt_depth', 'gt_bboxes_2d', 'gt_labels_2d', 'centers2d', 'depths2d', 'map_gt_labels_3d', 'map_gt_bboxes_3d' ] + ['gt_agent_fut_traj', 'gt_agent_fut_traj_mask']+ ['can_bus_info']+ ['gt_ego_lcf_feat', 'gt_ego_fut_trajs', 'gt_ego_his_trajs', 'gt_ego_fut_cmd', 'gt_ego_fut_masks'] ) ] test_pipeline = [ dict( type='CustomDistMultiScaleFlipAug3D', tta=False, transforms=[ dict(type='PrepareImageInputs', # img_corruptions='sun', data_config=data_config, normalize_cfg=normalize_cfg), dict( type='LoadAnnotationsBEVDepth', bda_aug_conf=bda_aug_conf, classes=class_names, with_2d_bbox=True, is_train=False), dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=5, use_dim=5, file_client_args=file_client_args), dict( type='LoadVectorMap', data_root = data_root, point_cloud_range =point_cloud_range, map_classes = ['divider', 'ped_crossing', 'boundary'], map_num_vec = 100, map_fixed_ptsnum_per_line = 20, # now only support fixed_pts > 0, map_eval_use_same_gt_sample_num_flag = True, map_num_classes = 3, ), dict(type='LoadGTPlaner'), dict(type='LoadGTMotion', with_ego_as_agent=with_ego_as_agent), dict(type='LoadFutBoxInfo'), dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), dict(type='ObjectNameFilter', classes=class_names), dict( type='DefaultFormatBundle3D', class_names=class_names, with_label=False), dict(type='Collect3D', keys=['points', 'img_inputs', 'gt_bboxes_3d', 'gt_labels_3d', 'map_gt_bboxes_3d', 'map_gt_labels_3d']+ ['gt_agent_fut_traj', 'gt_agent_fut_traj_mask']+['gt_ego_lcf_feat', 'gt_ego_fut_trajs', 'gt_ego_his_trajs', 'gt_ego_fut_cmd', 'gt_ego_fut_masks']+ ['gt_fut_segmentations', 'gt_fut_segmentations_plus', 'fut_boxes_in_cur_ego_list'] + ['can_bus_info'] ) ] ) ] input_modality = dict( use_lidar=False, use_camera=True, use_radar=False, use_map=False, use_external=False) share_data_config = dict( type=dataset_type, classes=class_names, modality=input_modality, img_info_prototype='bevdet', occupancy_path=occupancy_path, data_root=data_root, use_sequence_group_flag=True, ) test_data_config = dict( pipeline=test_pipeline, map_ann_file=data_root + 'nuscenes_map_infos_102x102_val.pkl', map_eval_cfg=dict( region = (102.4, 102.4) # (H, W) ), load_fut_bbox_info=True, sequences_split_num=test_sequences_split_num, ann_file=data_root + 'bev-next-nuscenes_infos_val.pkl') data = dict( samples_per_gpu=samples_per_gpu, workers_per_gpu=2, test_dataloader=dict(runner_type='IterBasedRunnerEval'), train=dict( type=dataset_type, ann_file=data_root + 'bev-next-nuscenes_infos_train.pkl', pipeline=train_pipeline, test_mode=False, use_valid_flag=True, sequences_split_num=train_sequences_split_num, filter_empty_gt=filter_empty_gt, # we use box_type_3d='LiDAR' in kitti and nuscenes dataset # and box_type_3d='Depth' in sunrgbd and scannet dataset. 
box_type_3d='LiDAR'), val=test_data_config, test=test_data_config) for key in ['train', 'val', 'test']: data[key].update(share_data_config) optimizer = dict( type='AdamW', lr=1e-4, # bs 8: 2e-4 || bs 16: 4e-4 paramwise_cfg=dict( custom_keys={ 'img_backbone': dict(lr_mult=0.1), }), weight_decay=0.01) optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) lr_config = dict( policy='CosineAnnealing', warmup='linear', warmup_iters=500, warmup_ratio=1.0 / 3, min_lr_ratio=1e-3, ) runner = dict(type='IterBasedRunner', max_iters=num_epochs * num_iters_per_epoch) checkpoint_config = dict( interval=checkpoint_epoch_interval * num_iters_per_epoch) evaluation = dict( interval=num_epochs * num_iters_per_epoch, pipeline=test_pipeline) log_config = dict( interval=50, hooks=[ dict(type='TextLoggerHook'), ]) custom_hooks = [ dict( type='MEGVIIEMAHook', init_updates=10560, priority='NORMAL', interval=2*num_iters_per_epoch, ), dict( type='SequentialControlHook', temporal_start_iter=0, ), # dict( # type='TimerCP', # ) ] # load_from = None # resume_from = None ================================================ FILE: configs/bev_next/bev_planner_w_map.py ================================================ # Copyright (c) 2023-2024, NVIDIA Corporation & Affiliates. All rights reserved. # # This work is made available under the Nvidia Source Code License-NC. # To view a copy of this license, visit # TODO: add license here # we follow the online training settings from solofusion num_gpus = 8 samples_per_gpu = 4 num_iters_per_epoch = int(28130 // (num_gpus * samples_per_gpu) ) num_epochs = 12 checkpoint_epoch_interval = 1 use_custom_eval_hook=True # Each nuScenes sequence is ~40 keyframes long. Our training procedure samples # sequences first, then loads frames from the sampled sequence in order # starting from the first frame. This reduces training step-to-step diversity, # lowering performance. To increase diversity, we split each training sequence # in half to ~20 keyframes, and sample these shorter sequences during training. # During testing, we do not do this splitting. train_sequences_split_num = 4 test_sequences_split_num = 1 # By default, 3D detection datasets randomly choose another sample if there is # no GT object in the current sample. This does not make sense when doing # sequential sampling of frames, so we disable it. filter_empty_gt = False # Long-Term Fusion Parameters do_history = False history_cat_num = 4 history_cat_conv_out_channels = 160 _base_ = ['../_base_/datasets/nus-3d.py', '../_base_/default_runtime.py'] # Global # If point cloud range is changed, the models should also change their point # cloud range accordingly # bev configs roi_size = (102.4, 102.4) bev_h = 128 bev_w = 128 point_cloud_range = [-roi_size[0]/2, -roi_size[1]/2, -5, roi_size[0]/2, roi_size[1]/2, 3] # For nuScenes we usually do 10-class detection class_names = [ 'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier', 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone' ] data_config = { 'cams': [ 'CAM_FRONT_LEFT', 'CAM_FRONT', 'CAM_FRONT_RIGHT', 'CAM_BACK_LEFT', 'CAM_BACK', 'CAM_BACK_RIGHT' ], 'Ncams': 6, 'input_size': (256, 704), 'src_size': (900, 1600), # Augmentation 'resize': (0.38, 0.55), 'rot': (0, 0), 'flip': True, 'crop_h': (0.0, 0.0), 'resize_test': 0.00, } bda_aug_conf = dict( rot_lim=(-0, 0), scale_lim=(1., 1.), flip_dx_ratio=0., flip_dy_ratio=0.) 
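# Sanity check of the BEV grid and depth-bin sizes used below (illustrative
# arithmetic only, derived from values already present in this config):
#   BEV cell size     = roi_size[0] / bev_w = 102.4 / 128 = 0.8 m,
#                       matching the 0.8 m step in grid_config['x'] / ['y']
#   depth_categories  = (60.0 - 1.0) / 0.5 = 118 discrete depth bins
#   point_cloud_range = [-51.2, -51.2, -5, 51.2, 51.2, 3], i.e. roi_size centered on the ego vehicle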
voxel_size = [0.2, 0.2, 8] use_checkpoint = False sync_bn = True # Model grid_config = { 'x': [-51.2, 51.2, 0.8], 'y': [-51.2, 51.2, 0.8], 'z': [-5, 3, 8], 'depth': [1.0, 60.0, 0.5], } depth_categories = 118 #(grid_config['depth'][1]-grid_config['depth'][0])//grid_config['depth'][2] numC_Trans=80 _dim_ = 256 ### occupancy config empty_idx = 18 # noise 0-->255 num_cls = 19 # 0 others, 1-16 obj, 17 free fix_void = num_cls == 19 ### map_classes = ['divider', 'ped_crossing', 'boundary'] map_num_vec = 100 map_fixed_ptsnum_per_gt_line = 20 # now only support fixed_pts > 0 map_fixed_ptsnum_per_pred_line = 20 map_eval_use_same_gt_sample_num_flag = True map_num_classes = len(map_classes) embed_dims = 256 num_feat_levels = 1 norm_cfg = dict(type='BN2d') num_queries = 100 # category configs cat2id = { 'ped_crossing': 0, 'divider': 1, 'boundary': 2, } num_class = max(list(cat2id.values())) + 1 num_points = 20 permute = True with_ego_as_agent = False ### model = dict( type='BEVPlanner', use_depth_supervision=False, fix_void=fix_void, do_history = do_history, history_cat_num=history_cat_num, single_bev_num_channels=numC_Trans, fuse_history_bev=True, use_grid_mask=True, align_prev_bev=False, img_backbone=dict( type='ResNet', depth=50, num_stages=4, out_indices=(2, 3), frozen_stages=-1, norm_cfg=dict(type='BN2d', requires_grad=False), norm_eval=True, with_cp=True, pretrained='torchvision://resnet50', style='pytorch'), img_neck=dict( type='CustomFPN', in_channels=[1024, 2048], out_channels=_dim_, num_outs=1, start_level=0, with_cp=use_checkpoint, out_ids=[0]), depth_net=dict( type='CM_DepthNet', # camera-aware depth net in_channels=_dim_, context_channels=numC_Trans, downsample=16, grid_config=grid_config, depth_channels=depth_categories, with_cp=use_checkpoint, loss_depth_weight=1., aspp_mid_channels=96, use_dcn=False, ), forward_projection=dict( type='LSSViewTransformerFunction', grid_config=grid_config, input_size=data_config['input_size'], downsample=16), frpn=None, backward_projection=None, img_bev_encoder_backbone=dict( type='CustomResNet', numC_input=numC_Trans, num_channels=[numC_Trans * 2, numC_Trans * 4, numC_Trans * 8]), img_bev_encoder_neck=dict( type='FPN_LSS', in_channels=numC_Trans * 8 + numC_Trans * 2, out_channels=256), occupancy_head=None, img_det_2d_head=None, pts_bbox_head=None, map_head=dict( type='MapDetectorHead', num_queries=num_queries, embed_dims=embed_dims, num_classes=num_class, in_channels=embed_dims, num_points=num_points, roi_size=roi_size, coord_dim=2, different_heads=False, predict_refine=False, sync_cls_avg_factor=True, streaming_cfg=dict( streaming=False, batch_size=samples_per_gpu, topk=int(num_queries*(1/3)), trans_loss_weight=0.1, ), # streaming_cfg=None, transformer=dict( type='MapTransformer', num_feature_levels=1, num_points=num_points, coord_dim=2, encoder=dict( type='PlaceHolderEncoder', embed_dims=embed_dims, ), decoder=dict( type='MapTransformerDecoder_new', num_layers=6, return_intermediate=True, transformerlayers=dict( type='MapTransformerLayer', attn_cfgs=[ dict( type='MultiheadAttention', embed_dims=embed_dims, num_heads=8, attn_drop=0.1, proj_drop=0.1, ), dict( type='CustomMSDeformableAttention', embed_dims=embed_dims, num_heads=8, num_levels=1, num_points=num_points, dropout=0.1, ), ], ffn_cfgs=dict( type='FFN', embed_dims=embed_dims, feedforward_channels=embed_dims*2, num_fcs=2, ffn_drop=0.1, act_cfg=dict(type='ReLU', inplace=True), ), feedforward_channels=embed_dims*2, ffn_dropout=0.1, # operation_order=('norm', 'self_attn', 'norm', 'cross_attn', # 
'norm', 'ffn',) operation_order=('self_attn', 'norm', 'cross_attn', 'norm', 'ffn', 'norm') ) ) ), loss_cls=dict( type='FocalLoss', use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=0.5 ), loss_reg=dict( type='LinesL1Loss', loss_weight=5.0, beta=0.01, ), assigner=dict( type='HungarianLinesAssigner', cost=dict( type='MapQueriesCost', cls_cost=dict(type='FocalLossCost', weight=0.5), reg_cost=dict(type='LinesL1Cost', weight=5.0, beta=0.01, permute=permute), ), ), ), motion_head=None, planner_head=dict( type='NaivePlannerHead', use_map_info=True, loss_plan_reg=dict(type='L1Loss', loss_weight=20.0), loss_plan_col=dict(type='PlanCollisionLoss', loss_weight=20.0), ), # model training and testing settings train_cfg=dict(pts=dict( grid_size=[512, 512, 1], voxel_size=voxel_size, point_cloud_range=point_cloud_range, out_size_factor=4, assigner=None), ) ) # Data dataset_type = 'NuScenesDataset' data_root = 'data/nuscenes/' file_client_args = dict(backend='disk') occupancy_path = '/mount/data/occupancy_cvpr2023/gts' normalize_cfg = dict( mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) train_pipeline = [ dict( type='PrepareImageInputs', is_train=True, normalize_cfg=normalize_cfg, data_config=data_config), dict( type='LoadAnnotationsBEVDepth', bda_aug_conf=bda_aug_conf, with_2d_bbox=True, classes=class_names), dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=5, use_dim=5, file_client_args=file_client_args), dict( type='LoadVectorMap2', data_root = data_root, point_cloud_range =point_cloud_range, map_classes = ['divider', 'ped_crossing', 'boundary'], map_num_vec = 100, map_fixed_ptsnum_per_line = 20, # now only support fixed_pts > 0, map_eval_use_same_gt_sample_num_flag = True, map_num_classes = 3, ), dict(type='PointToMultiViewDepth', downsample=1, grid_config=grid_config), dict(type='LoadGTMotion'), dict(type='LoadGTPlaner'), dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), dict(type='ObjectNameFilter', classes=class_names), # dict(type='VisualInputsAndGT'), # dict(type='LoadOccupancy', ignore_nonvisible=True, fix_void=fix_void, occupancy_path=occupancy_path), dict(type='DefaultFormatBundle3D', class_names=class_names), dict( type='Collect3D', keys=['img_inputs', 'gt_bboxes_3d', 'gt_labels_3d', 'gt_depth', 'gt_bboxes_2d', 'gt_labels_2d', 'centers2d', 'depths2d', 'map_gt_labels_3d', 'map_gt_bboxes_3d' ] + ['gt_agent_fut_traj', 'gt_agent_fut_traj_mask']+ ['gt_ego_lcf_feat', 'gt_ego_fut_trajs', 'gt_ego_his_trajs', 'gt_ego_fut_cmd', 'gt_ego_fut_masks'] ) ] test_pipeline = [ dict( type='CustomDistMultiScaleFlipAug3D', tta=False, transforms=[ dict(type='PrepareImageInputs', # img_corruptions='sun', data_config=data_config, normalize_cfg=normalize_cfg), dict( type='LoadAnnotationsBEVDepth', bda_aug_conf=bda_aug_conf, classes=class_names, with_2d_bbox=True, is_train=False), dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=5, use_dim=5, file_client_args=file_client_args), dict( type='LoadVectorMap', data_root = data_root, point_cloud_range =point_cloud_range, map_classes = ['divider', 'ped_crossing', 'boundary'], map_num_vec = 100, map_fixed_ptsnum_per_line = 20, # now only support fixed_pts > 0, map_eval_use_same_gt_sample_num_flag = True, map_num_classes = 3, ), dict(type='LoadGTPlaner'), dict(type='LoadGTMotion', with_ego_as_agent=with_ego_as_agent), dict(type='LoadFutBoxInfo'), dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), dict(type='ObjectNameFilter', classes=class_names), dict( type='DefaultFormatBundle3D', 
class_names=class_names, with_label=False), dict(type='Collect3D', keys=['points', 'img_inputs', 'gt_bboxes_3d', 'gt_labels_3d', 'map_gt_bboxes_3d', 'map_gt_labels_3d']+ ['gt_agent_fut_traj', 'gt_agent_fut_traj_mask']+['gt_ego_lcf_feat', 'gt_ego_fut_trajs', 'gt_ego_his_trajs', 'gt_ego_fut_cmd', 'gt_ego_fut_masks']+ ['gt_fut_segmentations', 'gt_fut_segmentations_plus', 'fut_boxes_in_cur_ego_list'] ) ] ) ] input_modality = dict( use_lidar=False, use_camera=True, use_radar=False, use_map=False, use_external=False) share_data_config = dict( type=dataset_type, classes=class_names, modality=input_modality, img_info_prototype='bevdet', occupancy_path=occupancy_path, data_root=data_root, use_sequence_group_flag=True, ) test_data_config = dict( pipeline=test_pipeline, map_ann_file=data_root + 'nuscenes_map_infos_102x102_val.pkl', map_eval_cfg=dict( region = (102.4, 102.4) # (H, W) ), load_fut_bbox_info=True, sequences_split_num=test_sequences_split_num, ann_file=data_root + 'bev-next-nuscenes_infos_val.pkl') data = dict( samples_per_gpu=samples_per_gpu, workers_per_gpu=2, test_dataloader=dict(runner_type='IterBasedRunnerEval'), train=dict( type=dataset_type, ann_file=data_root + 'bev-next-nuscenes_infos_train.pkl', pipeline=train_pipeline, test_mode=False, use_valid_flag=True, sequences_split_num=train_sequences_split_num, filter_empty_gt=filter_empty_gt, # we use box_type_3d='LiDAR' in kitti and nuscenes dataset # and box_type_3d='Depth' in sunrgbd and scannet dataset. box_type_3d='LiDAR'), val=test_data_config, test=test_data_config) for key in ['train', 'val', 'test']: data[key].update(share_data_config) optimizer = dict( type='AdamW', lr=1e-4, # bs 8: 2e-4 || bs 16: 4e-4 paramwise_cfg=dict( custom_keys={ 'img_backbone': dict(lr_mult=0.1), }), weight_decay=0.01) optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) lr_config = dict( policy='CosineAnnealing', warmup='linear', warmup_iters=500, warmup_ratio=1.0 / 3, min_lr_ratio=1e-3, ) runner = dict(type='IterBasedRunner', max_iters=num_epochs * num_iters_per_epoch) checkpoint_config = dict( interval=checkpoint_epoch_interval * num_iters_per_epoch) evaluation = dict( interval=num_epochs * num_iters_per_epoch, pipeline=test_pipeline) log_config = dict( interval=50, hooks=[ dict(type='TextLoggerHook'), ]) custom_hooks = [ dict( type='MEGVIIEMAHook', init_updates=10560, priority='NORMAL', interval=2*num_iters_per_epoch, ), dict( type='SequentialControlHook', temporal_start_iter=0, ), # dict( # type='TimerCP', # ) ] # load_from = None # resume_from = None ================================================ FILE: configs/bev_next/det_pretrain_320x800_vov_36ep.py ================================================ # Copyright (c) 2023-2024, NVIDIA Corporation & Affiliates. All rights reserved. # # This work is made available under the Nvidia Source Code License-NC. # To view a copy of this license, visit # TODO: add license here # we follow the online training settings from solofusion num_gpus = 8 samples_per_gpu = 2 num_iters_per_epoch = int(28130 // (num_gpus * samples_per_gpu) ) num_epochs = 36 checkpoint_epoch_interval = 2 use_custom_eval_hook=True # Each nuScenes sequence is ~40 keyframes long. Our training procedure samples # sequences first, then loads frames from the sampled sequence in order # starting from the first frame. This reduces training step-to-step diversity, # lowering performance. To increase diversity, we split each training sequence # in half to ~20 keyframes, and sample these shorter sequences during training. 
# During testing, we do not do this splitting. train_sequences_split_num = 4 test_sequences_split_num = 1 # By default, 3D detection datasets randomly choose another sample if there is # no GT object in the current sample. This does not make sense when doing # sequential sampling of frames, so we disable it. filter_empty_gt = False # Long-Term Fusion Parameters do_history = False history_cat_num = 4 history_cat_conv_out_channels = 160 _base_ = ['../_base_/datasets/nus-3d.py', '../_base_/default_runtime.py'] # Global # If point cloud range is changed, the models should also change their point # cloud range accordingly point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0] # For nuScenes we usually do 10-class detection class_names = [ 'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier', 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone' ] data_config = { 'cams': [ 'CAM_FRONT_LEFT', 'CAM_FRONT', 'CAM_FRONT_RIGHT', 'CAM_BACK_LEFT', 'CAM_BACK', 'CAM_BACK_RIGHT' ], 'Ncams': 6, 'input_size': (320, 800), 'src_size': (900, 1600), # Augmentation 'resize': (0.47, 0.625), 'rot': (0, 0), 'flip': True, 'crop_h': (0.0, 0.0), 'resize_test': 0.00, } bda_aug_conf = dict( rot_lim=(-22.5, 22.5), scale_lim=(1., 1.), flip_dx_ratio=0.5, flip_dy_ratio=0.5) voxel_size = [0.2, 0.2, 8] use_checkpoint = False sync_bn = True # Model grid_config = { 'x': [-51.2, 51.2, 0.8], 'y': [-51.2, 51.2, 0.8], 'z': [-5, 3, 8], 'depth': [1.0, 60.0, 0.5], } depth_categories = 118 #(grid_config['depth'][1]-grid_config['depth'][0])//grid_config['depth'][2] numC_Trans=80 _dim_ = 256 empty_idx = 18 # noise 0-->255 num_cls = 19 # 0 others, 1-16 obj, 17 free fix_void = num_cls == 19 model = dict( type='BEVPlanner', use_depth_supervision=True, fix_void=fix_void, do_history = do_history, history_cat_num=history_cat_num, single_bev_num_channels=numC_Trans, use_grid_mask=True, with_ego_status=False, img_backbone=dict( type='VoVNetCP', ###use checkpoint to save memory spec_name='V-99-eSE', norm_eval=True, frozen_stages=-1, input_ch=3, out_features=('stage4','stage5',)), img_neck=dict( type='CustomFPN', in_channels=[768, 1024], out_channels=_dim_, num_outs=1, start_level=0, with_cp=use_checkpoint, out_ids=[0]), depth_net=dict( type='CM_DepthNet', # camera-aware depth net in_channels=_dim_, context_channels=numC_Trans, downsample=16, grid_config=grid_config, depth_channels=depth_categories, with_cp=use_checkpoint, loss_depth_weight=3., aspp_mid_channels=96, use_dcn=False, ), forward_projection=dict( type='LSSViewTransformerFunction', grid_config=grid_config, input_size=data_config['input_size'], downsample=16), frpn=None, backward_projection=None, img_bev_encoder_backbone=dict( type='CustomResNet', numC_input=numC_Trans, num_channels=[numC_Trans * 2, numC_Trans * 4, numC_Trans * 8]), img_bev_encoder_neck=dict( type='FPN_LSS', in_channels=numC_Trans * 8 + numC_Trans * 2, out_channels=256), occupancy_head=None, img_det_2d_head=dict( type='YOLOXHeadCustom', num_classes=10, in_channels=80, strides=[16], train_cfg=dict(assigner=dict(type='SimOTAAssigner', center_radius=2.5)), test_cfg=dict(score_thr=0.01, nms=dict(type='nms', iou_threshold=0.65)), ), pts_bbox_head=dict( type='SparseHead4BEV', num_classes=10, in_channels=_dim_, num_query=300, memory_len=512, topk_proposals=128, num_propagated=128, scalar=10, ##noise groups noise_scale = 1.0, dn_weight= 1.0, ##dn loss weight split = 0.75, ###positive rate with_dn=True, with_ego_pos=True, match_with_velo=False, code_weights = [2.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], 
transformer=dict( type='Detr3DTransformer', decoder=dict( type='Detr3DTransformerDecoder', embed_dims=_dim_, num_layers=6, transformerlayers=dict( type='Detr3DTemporalDecoderLayer', batch_first=True, attn_cfgs=[ dict( type='SparseBEVSelfAttention', embed_dims=_dim_, num_heads=8, dropout=0.0), dict( type='DeformableFeatureAggregationCuda', embed_dims=_dim_, num_groups=8, num_levels=1, # num_cams=6, dropout=0.0, num_pts=13, bias=2.), ], feedforward_channels=2048, ffn_dropout=0.0, with_cp=True, ###use checkpoint to save memory operation_order=('self_attn', 'norm', 'cross_attn', 'norm', 'ffn', 'norm')), )), bbox_coder=dict( type='NMSFreeCoder', post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], pc_range=point_cloud_range, max_num=300, num_classes=10), loss_cls=dict( type='FocalLoss', use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=2.0), loss_bbox=dict(type='L1Loss', loss_weight=0.25), loss_iou=dict(type='GIoULoss', loss_weight=0.0),), map_head=None, # model training and testing settings train_cfg=dict(pts=dict( grid_size=[512, 512, 1], voxel_size=voxel_size, point_cloud_range=point_cloud_range, out_size_factor=4, assigner=dict( type='HungarianAssigner3D', cls_cost=dict(type='FocalLossCost', weight=2.0), reg_cost=dict(type='BBox3DL1Cost', weight=0.25), iou_cost=dict(type='IoUCost', weight=0.0), # Fake cost. This is just to make it compatible with DETR head. pc_range=point_cloud_range), ), ) ) # Data dataset_type = 'NuScenesDataset' data_root = 'data/nuscenes/' file_client_args = dict(backend='disk') occupancy_path = '/mount/data/occupancy_cvpr2023/gts' normalize_cfg = dict( mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) train_pipeline = [ dict( type='PrepareImageInputs', is_train=True, normalize_cfg=normalize_cfg, data_config=data_config), dict( type='LoadAnnotationsBEVDepth', bda_aug_conf=bda_aug_conf, with_2d_bbox=True, classes=class_names), dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=5, use_dim=5, file_client_args=file_client_args), dict(type='PointToMultiViewDepth', downsample=1, grid_config=grid_config), dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), dict(type='ObjectNameFilter', classes=class_names), # dict(type='VisualInputsAndGT'), # dict(type='LoadOccupancy', ignore_nonvisible=True, fix_void=fix_void, occupancy_path=occupancy_path), dict(type='DefaultFormatBundle3D', class_names=class_names), dict( type='Collect3D', keys=['img_inputs', 'gt_bboxes_3d', 'gt_labels_3d', 'gt_depth', 'gt_bboxes_2d', 'gt_labels_2d', 'centers2d', 'depths2d', ] + ['can_bus_info']) ] test_pipeline = [ dict( type='CustomDistMultiScaleFlipAug3D', tta=False, transforms=[ dict(type='PrepareImageInputs', data_config=data_config, normalize_cfg=normalize_cfg), dict( type='LoadAnnotationsBEVDepth', bda_aug_conf=bda_aug_conf, classes=class_names, with_2d_bbox=True, is_train=False), dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=5, use_dim=5, file_client_args=file_client_args), dict( type='DefaultFormatBundle3D', class_names=class_names, with_label=False), dict(type='Collect3D', keys=['points', 'img_inputs', 'gt_bboxes_3d', 'gt_labels_3d'] + ['can_bus_info']) ] ) ] input_modality = dict( use_lidar=False, use_camera=True, use_radar=False, use_map=False, use_external=False) share_data_config = dict( type=dataset_type, classes=class_names, modality=input_modality, img_info_prototype='bevdet', occupancy_path=occupancy_path, data_root=data_root, use_sequence_group_flag=True, ) test_data_config = dict( pipeline=test_pipeline, 
sequences_split_num=test_sequences_split_num, ann_file=data_root + 'bev-next-nuscenes_infos_val.pkl') data = dict( samples_per_gpu=samples_per_gpu, workers_per_gpu=6, test_dataloader=dict(runner_type='IterBasedRunnerEval'), train=dict( type=dataset_type, ann_file=data_root + 'bev-next-nuscenes_infos_train.pkl', pipeline=train_pipeline, classes=class_names, test_mode=False, use_valid_flag=True, modality=input_modality, img_info_prototype='bevdet', sequences_split_num=train_sequences_split_num, use_sequence_group_flag=True, filter_empty_gt=filter_empty_gt, # we use box_type_3d='LiDAR' in kitti and nuscenes dataset # and box_type_3d='Depth' in sunrgbd and scannet dataset. box_type_3d='LiDAR'), val=test_data_config, test=test_data_config) for key in ['val', 'test']: data[key].update(share_data_config) optimizer = dict( type='AdamW', lr=4e-4, # bs 8: 2e-4 || bs 16: 4e-4 paramwise_cfg=dict( custom_keys={ 'img_backbone': dict(lr_mult=0.1), }), weight_decay=0.01) optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) lr_config = dict( policy='CosineAnnealing', warmup='linear', warmup_iters=500, warmup_ratio=1.0 / 3, min_lr_ratio=1e-3, ) runner = dict(type='IterBasedRunner', max_iters=num_epochs * num_iters_per_epoch) checkpoint_config = dict( interval=checkpoint_epoch_interval * num_iters_per_epoch) evaluation = dict( interval=num_epochs * num_iters_per_epoch, pipeline=test_pipeline) log_config = dict( interval=50, hooks=[ dict(type='TextLoggerHook'), ]) custom_hooks = [ dict( type='MEGVIIEMAHook', init_updates=10560, priority='NORMAL', interval=checkpoint_epoch_interval*num_iters_per_epoch, ), dict( type='SequentialControlHook', temporal_start_iter= num_iters_per_epoch*2, ), dict( type='TimerCP', ) ] load_from = 'ckpts/fcos3d_vovnet_imgbackbone-remapped.pth' ================================================ FILE: configs/bev_next/det_pretrain_640x1600_vov_36ep.py ================================================ # Copyright (c) 2023-2024, NVIDIA Corporation & Affiliates. All rights reserved. # # This work is made available under the Nvidia Source Code License-NC. # To view a copy of this license, visit # TODO: add license here # we follow the online training settings from solofusion num_gpus = 8 samples_per_gpu = 2 num_iters_per_epoch = int(28130 // (num_gpus * samples_per_gpu) ) num_epochs = 36 checkpoint_epoch_interval = 2 use_custom_eval_hook=True # Each nuScenes sequence is ~40 keyframes long. Our training procedure samples # sequences first, then loads frames from the sampled sequence in order # starting from the first frame. This reduces training step-to-step diversity, # lowering performance. To increase diversity, we split each training sequence # in half to ~20 keyframes, and sample these shorter sequences during training. # During testing, we do not do this splitting. train_sequences_split_num = 4 test_sequences_split_num = 1 # By default, 3D detection datasets randomly choose another sample if there is # no GT object in the current sample. This does not make sense when doing # sequential sampling of frames, so we disable it. 
filter_empty_gt = False # Long-Term Fusion Parameters do_history = False history_cat_num = 4 history_cat_conv_out_channels = 160 _base_ = ['../_base_/datasets/nus-3d.py', '../_base_/default_runtime.py'] # Global # If point cloud range is changed, the models should also change their point # cloud range accordingly point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0] # For nuScenes we usually do 10-class detection class_names = [ 'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier', 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone' ] data_config = { 'cams': [ 'CAM_FRONT_LEFT', 'CAM_FRONT', 'CAM_FRONT_RIGHT', 'CAM_BACK_LEFT', 'CAM_BACK', 'CAM_BACK_RIGHT' ], 'Ncams': 6, 'input_size': (640, 1600), 'src_size': (900, 1600), # Augmentation 'resize': (0.94, 1.25), 'rot': (0, 0), 'flip': True, 'crop_h': (0.0, 0.0), 'resize_test': 0.00, } bda_aug_conf = dict( rot_lim=(-22.5, 22.5), scale_lim=(1., 1.), flip_dx_ratio=0.5, flip_dy_ratio=0.5) voxel_size = [0.2, 0.2, 8] use_checkpoint = False sync_bn = True # Model grid_config = { 'x': [-51.2, 51.2, 0.8], 'y': [-51.2, 51.2, 0.8], 'z': [-5, 3, 8], 'depth': [1.0, 60.0, 0.5], } depth_categories = 118 #(grid_config['depth'][1]-grid_config['depth'][0])//grid_config['depth'][2] numC_Trans=80 _dim_ = 256 empty_idx = 18 # noise 0-->255 num_cls = 19 # 0 others, 1-16 obj, 17 free fix_void = num_cls == 19 model = dict( type='BEVPlanner', use_depth_supervision=True, fix_void=fix_void, do_history = do_history, history_cat_num=history_cat_num, single_bev_num_channels=numC_Trans, use_grid_mask=True, with_ego_status=False, img_backbone=dict( type='VoVNetCP', ###use checkpoint to save memory spec_name='V-99-eSE', norm_eval=True, frozen_stages=-1, input_ch=3, out_features=('stage4','stage5',)), img_neck=dict( type='CustomFPN', in_channels=[768, 1024], out_channels=_dim_, num_outs=1, start_level=0, with_cp=use_checkpoint, out_ids=[0]), depth_net=dict( type='CM_DepthNet', # camera-aware depth net in_channels=_dim_, context_channels=numC_Trans, downsample=16, grid_config=grid_config, depth_channels=depth_categories, with_cp=use_checkpoint, loss_depth_weight=3., aspp_mid_channels=96, use_dcn=False, ), forward_projection=dict( type='LSSViewTransformerFunction', grid_config=grid_config, input_size=data_config['input_size'], downsample=16), frpn=None, backward_projection=None, img_bev_encoder_backbone=dict( type='CustomResNet', numC_input=numC_Trans, num_channels=[numC_Trans * 2, numC_Trans * 4, numC_Trans * 8]), img_bev_encoder_neck=dict( type='FPN_LSS', in_channels=numC_Trans * 8 + numC_Trans * 2, out_channels=256), occupancy_head=None, img_det_2d_head=dict( type='YOLOXHeadCustom', num_classes=10, in_channels=80, strides=[16], train_cfg=dict(assigner=dict(type='SimOTAAssigner', center_radius=2.5)), test_cfg=dict(score_thr=0.01, nms=dict(type='nms', iou_threshold=0.65)), ), pts_bbox_head=dict( type='SparseHead4BEV', num_classes=10, in_channels=_dim_, num_query=300, memory_len=512, topk_proposals=128, num_propagated=128, scalar=10, ##noise groups noise_scale = 1.0, dn_weight= 1.0, ##dn loss weight split = 0.75, ###positive rate with_dn=True, with_ego_pos=True, match_with_velo=False, code_weights = [2.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], transformer=dict( type='Detr3DTransformer', decoder=dict( type='Detr3DTransformerDecoder', embed_dims=_dim_, num_layers=6, transformerlayers=dict( type='Detr3DTemporalDecoderLayer', batch_first=True, attn_cfgs=[ dict( type='SparseBEVSelfAttention', embed_dims=_dim_, num_heads=8, dropout=0.0), dict( 
type='DeformableFeatureAggregationCuda', embed_dims=_dim_, num_groups=8, num_levels=1, # num_cams=6, dropout=0.0, num_pts=13, bias=2.), ], feedforward_channels=2048, ffn_dropout=0.0, with_cp=True, ###use checkpoint to save memory operation_order=('self_attn', 'norm', 'cross_attn', 'norm', 'ffn', 'norm')), )), bbox_coder=dict( type='NMSFreeCoder', post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], pc_range=point_cloud_range, max_num=300, num_classes=10), loss_cls=dict( type='FocalLoss', use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=2.0), loss_bbox=dict(type='L1Loss', loss_weight=0.25), loss_iou=dict(type='GIoULoss', loss_weight=0.0),), map_head=None, # model training and testing settings train_cfg=dict(pts=dict( grid_size=[512, 512, 1], voxel_size=voxel_size, point_cloud_range=point_cloud_range, out_size_factor=4, assigner=dict( type='HungarianAssigner3D', cls_cost=dict(type='FocalLossCost', weight=2.0), reg_cost=dict(type='BBox3DL1Cost', weight=0.25), iou_cost=dict(type='IoUCost', weight=0.0), # Fake cost. This is just to make it compatible with DETR head. pc_range=point_cloud_range), ), ) ) # Data dataset_type = 'NuScenesDataset' data_root = 'data/nuscenes/' file_client_args = dict(backend='disk') occupancy_path = '/mount/data/occupancy_cvpr2023/gts' normalize_cfg = dict( mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) train_pipeline = [ dict( type='PrepareImageInputs', is_train=True, normalize_cfg=normalize_cfg, data_config=data_config), dict( type='LoadAnnotationsBEVDepth', bda_aug_conf=bda_aug_conf, with_2d_bbox=True, classes=class_names), dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=5, use_dim=5, file_client_args=file_client_args), dict(type='PointToMultiViewDepth', downsample=1, grid_config=grid_config), dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), dict(type='ObjectNameFilter', classes=class_names), # dict(type='VisualInputsAndGT'), # dict(type='LoadOccupancy', ignore_nonvisible=True, fix_void=fix_void, occupancy_path=occupancy_path), dict(type='DefaultFormatBundle3D', class_names=class_names), dict( type='Collect3D', keys=['img_inputs', 'gt_bboxes_3d', 'gt_labels_3d', 'gt_depth', 'gt_bboxes_2d', 'gt_labels_2d', 'centers2d', 'depths2d', ] + ['can_bus_info']) ] test_pipeline = [ dict( type='CustomDistMultiScaleFlipAug3D', tta=False, transforms=[ dict(type='PrepareImageInputs', data_config=data_config, normalize_cfg=normalize_cfg), dict( type='LoadAnnotationsBEVDepth', bda_aug_conf=bda_aug_conf, classes=class_names, with_2d_bbox=True, is_train=False), dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=5, use_dim=5, file_client_args=file_client_args), dict( type='DefaultFormatBundle3D', class_names=class_names, with_label=False), dict(type='Collect3D', keys=['points', 'img_inputs', 'gt_bboxes_3d', 'gt_labels_3d'] + ['can_bus_info']) ] ) ] input_modality = dict( use_lidar=False, use_camera=True, use_radar=False, use_map=False, use_external=False) share_data_config = dict( type=dataset_type, classes=class_names, modality=input_modality, img_info_prototype='bevdet', occupancy_path=occupancy_path, data_root=data_root, use_sequence_group_flag=True, ) test_data_config = dict( pipeline=test_pipeline, sequences_split_num=test_sequences_split_num, ann_file=data_root + 'bev-next-nuscenes_infos_val.pkl') data = dict( samples_per_gpu=samples_per_gpu, workers_per_gpu=6, test_dataloader=dict(runner_type='IterBasedRunnerEval'), train=dict( type=dataset_type, ann_file=data_root + 
'bev-next-nuscenes_infos_train.pkl', pipeline=train_pipeline, classes=class_names, test_mode=False, use_valid_flag=True, modality=input_modality, img_info_prototype='bevdet', sequences_split_num=train_sequences_split_num, use_sequence_group_flag=True, filter_empty_gt=filter_empty_gt, # we use box_type_3d='LiDAR' in kitti and nuscenes dataset # and box_type_3d='Depth' in sunrgbd and scannet dataset. box_type_3d='LiDAR'), val=test_data_config, test=test_data_config) for key in ['val', 'test']: data[key].update(share_data_config) optimizer = dict( type='AdamW', lr=4e-4, # bs 8: 2e-4 || bs 16: 4e-4 paramwise_cfg=dict( custom_keys={ 'img_backbone': dict(lr_mult=0.1), }), weight_decay=0.01) optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) lr_config = dict( policy='CosineAnnealing', warmup='linear', warmup_iters=500, warmup_ratio=1.0 / 3, min_lr_ratio=1e-3, ) runner = dict(type='IterBasedRunner', max_iters=num_epochs * num_iters_per_epoch) checkpoint_config = dict( interval=checkpoint_epoch_interval * num_iters_per_epoch) evaluation = dict( interval=num_epochs * num_iters_per_epoch, pipeline=test_pipeline) log_config = dict( interval=50, hooks=[ dict(type='TextLoggerHook'), ]) custom_hooks = [ dict( type='MEGVIIEMAHook', init_updates=10560, priority='NORMAL', interval=checkpoint_epoch_interval*num_iters_per_epoch, ), dict( type='SequentialControlHook', temporal_start_iter= num_iters_per_epoch*2, ), dict( type='TimerCP', ) ] load_from = 'ckpts/fcos3d_vovnet_imgbackbone-remapped.pth' ================================================ FILE: configs/bev_next/map_pretrain.py ================================================ # Copyright (c) 2023-2024, NVIDIA Corporation & Affiliates. All rights reserved. # # This work is made available under the Nvidia Source Code License-NC. # To view a copy of this license, visit # TODO: add license here # we follow the online training settings from solofusion num_gpus = 8 samples_per_gpu = 4 num_iters_per_epoch = int(28130 // (num_gpus * samples_per_gpu) ) num_epochs = 60 checkpoint_epoch_interval = 12 use_custom_eval_hook=True # Each nuScenes sequence is ~40 keyframes long. Our training procedure samples # sequences first, then loads frames from the sampled sequence in order # starting from the first frame. This reduces training step-to-step diversity, # lowering performance. To increase diversity, we split each training sequence # in half to ~20 keyframes, and sample these shorter sequences during training. # During testing, we do not do this splitting. train_sequences_split_num = 4 test_sequences_split_num = 1 # By default, 3D detection datasets randomly choose another sample if there is # no GT object in the current sample. This does not make sense when doing # sequential sampling of frames, so we disable it. 
filter_empty_gt = False # Long-Term Fusion Parameters do_history = False history_cat_num = 4 history_cat_conv_out_channels = 160 _base_ = ['../_base_/datasets/nus-3d.py', '../_base_/default_runtime.py'] # Global # If point cloud range is changed, the models should also change their point # cloud range accordingly # bev configs roi_size = (102.4, 102.4) bev_h = 128 bev_w = 128 point_cloud_range = [-roi_size[0]/2, -roi_size[1]/2, -5, roi_size[0]/2, roi_size[1]/2, 3] # For nuScenes we usually do 10-class detection class_names = [ 'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier', 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone' ] data_config = { 'cams': [ 'CAM_FRONT_LEFT', 'CAM_FRONT', 'CAM_FRONT_RIGHT', 'CAM_BACK_LEFT', 'CAM_BACK', 'CAM_BACK_RIGHT' ], 'Ncams': 6, 'input_size': (256, 704), 'src_size': (900, 1600), # Augmentation 'resize': (0.38, 0.55), 'rot': (0, 0), 'flip': True, 'crop_h': (0.0, 0.0), 'resize_test': 0.00, } bda_aug_conf = dict( rot_lim=(-0, 0), scale_lim=(1., 1.), flip_dx_ratio=0., flip_dy_ratio=0.) voxel_size = [0.2, 0.2, 8] use_checkpoint = False sync_bn = True # Model grid_config = { 'x': [-51.2, 51.2, 0.8], 'y': [-51.2, 51.2, 0.8], 'z': [-5, 3, 8], 'depth': [1.0, 60.0, 0.5], } depth_categories = 118 #(grid_config['depth'][1]-grid_config['depth'][0])//grid_config['depth'][2] numC_Trans=80 _dim_ = 256 ### occupancy config empty_idx = 18 # noise 0-->255 num_cls = 19 # 0 others, 1-16 obj, 17 free fix_void = num_cls == 19 ### map_classes = ['divider', 'ped_crossing', 'boundary'] map_num_vec = 100 map_fixed_ptsnum_per_gt_line = 20 # now only support fixed_pts > 0 map_fixed_ptsnum_per_pred_line = 20 map_eval_use_same_gt_sample_num_flag = True map_num_classes = len(map_classes) embed_dims = 256 num_feat_levels = 1 norm_cfg = dict(type='BN2d') num_queries = 100 # category configs cat2id = { 'ped_crossing': 0, 'divider': 1, 'boundary': 2, } num_class = max(list(cat2id.values())) + 1 num_points = 20 permute = True with_ego_as_agent = False ### model = dict( type='BEVPlanner', use_depth_supervision=True, fix_void=fix_void, do_history = do_history, history_cat_num=history_cat_num, single_bev_num_channels=numC_Trans, fuse_history_bev=True, use_grid_mask=True, align_prev_bev=False, img_backbone=dict( init_cfg=dict( type='Pretrained', checkpoint="ckpts/cascade_mask_rcnn_r50_fpn_coco-20e_20e_nuim_20201009_124951-40963960.pth", prefix='backbone.'), type='ResNet', depth=50, num_stages=4, out_indices=(2, 3), frozen_stages=-1, norm_cfg=dict(type='BN2d', requires_grad=False), norm_eval=True, with_cp=False, style='pytorch'), img_neck=dict( type='CustomFPN', in_channels=[1024, 2048], out_channels=_dim_, num_outs=1, start_level=0, with_cp=use_checkpoint, out_ids=[0]), depth_net=dict( type='CM_DepthNet', # camera-aware depth net in_channels=_dim_, context_channels=numC_Trans, downsample=16, grid_config=grid_config, depth_channels=depth_categories, with_cp=use_checkpoint, loss_depth_weight=1., aspp_mid_channels=96, use_dcn=False, ), forward_projection=dict( type='LSSViewTransformerFunction', grid_config=grid_config, input_size=data_config['input_size'], downsample=16), frpn=None, backward_projection=None, img_bev_encoder_backbone=dict( type='CustomResNet', numC_input=numC_Trans, num_channels=[numC_Trans * 2, numC_Trans * 4, numC_Trans * 8]), img_bev_encoder_neck=dict( type='FPN_LSS', in_channels=numC_Trans * 8 + numC_Trans * 2, out_channels=256), occupancy_head=None, img_det_2d_head=None, pts_bbox_head=None, map_head=dict( type='MapDetectorHead', num_queries=num_queries, 
embed_dims=embed_dims, num_classes=num_class, in_channels=embed_dims, num_points=num_points, roi_size=roi_size, coord_dim=2, different_heads=False, predict_refine=False, sync_cls_avg_factor=True, streaming_cfg=dict( streaming=False, batch_size=samples_per_gpu, topk=int(num_queries*(1/3)), trans_loss_weight=0.1, ), # streaming_cfg=None, transformer=dict( type='MapTransformer', num_feature_levels=1, num_points=num_points, coord_dim=2, encoder=dict( type='PlaceHolderEncoder', embed_dims=embed_dims, ), decoder=dict( type='MapTransformerDecoder_new', num_layers=6, return_intermediate=True, transformerlayers=dict( type='MapTransformerLayer', attn_cfgs=[ dict( type='MultiheadAttention', embed_dims=embed_dims, num_heads=8, attn_drop=0.1, proj_drop=0.1, ), dict( type='CustomMSDeformableAttention', embed_dims=embed_dims, num_heads=8, num_levels=1, num_points=num_points, dropout=0.1, ), ], ffn_cfgs=dict( type='FFN', embed_dims=embed_dims, feedforward_channels=embed_dims*2, num_fcs=2, ffn_drop=0.1, act_cfg=dict(type='ReLU', inplace=True), ), feedforward_channels=embed_dims*2, ffn_dropout=0.1, # operation_order=('norm', 'self_attn', 'norm', 'cross_attn', # 'norm', 'ffn',) operation_order=('self_attn', 'norm', 'cross_attn', 'norm', 'ffn', 'norm') ) ) ), loss_cls=dict( type='FocalLoss', use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=0.5 ), loss_reg=dict( type='LinesL1Loss', loss_weight=5.0, beta=0.01, ), assigner=dict( type='HungarianLinesAssigner', cost=dict( type='MapQueriesCost', cls_cost=dict(type='FocalLossCost', weight=0.5), reg_cost=dict(type='LinesL1Cost', weight=5.0, beta=0.01, permute=permute), ), ), ), motion_head=None, planner_head=None, # model training and testing settings train_cfg=dict(pts=dict( grid_size=[512, 512, 1], voxel_size=voxel_size, point_cloud_range=point_cloud_range, out_size_factor=4, assigner=None), ) ) # Data dataset_type = 'NuScenesDataset' data_root = 'data/nuscenes/' file_client_args = dict(backend='disk') occupancy_path = '/mount/data/occupancy_cvpr2023/gts' normalize_cfg = dict( mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) train_pipeline = [ dict( type='PrepareImageInputs', is_train=True, normalize_cfg=normalize_cfg, data_config=data_config), dict( type='LoadAnnotationsBEVDepth', bda_aug_conf=bda_aug_conf, with_2d_bbox=True, classes=class_names), dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=5, use_dim=5, file_client_args=file_client_args), dict( type='LoadVectorMap2', data_root = data_root, point_cloud_range =point_cloud_range, map_classes = ['divider', 'ped_crossing', 'boundary'], map_num_vec = 100, map_fixed_ptsnum_per_line = 20, # now only support fixed_pts > 0, map_eval_use_same_gt_sample_num_flag = True, map_num_classes = 3, ), dict(type='PointToMultiViewDepth', downsample=1, grid_config=grid_config), dict(type='LoadGTMotion'), dict(type='LoadGTPlaner'), dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), dict(type='ObjectNameFilter', classes=class_names), # dict(type='VisualInputsAndGT'), # dict(type='LoadOccupancy', ignore_nonvisible=True, fix_void=fix_void, occupancy_path=occupancy_path), dict(type='DefaultFormatBundle3D', class_names=class_names), dict( type='Collect3D', keys=['img_inputs', 'gt_bboxes_3d', 'gt_labels_3d', 'gt_depth', 'gt_bboxes_2d', 'gt_labels_2d', 'centers2d', 'depths2d', 'map_gt_labels_3d', 'map_gt_bboxes_3d' ] + ['gt_agent_fut_traj', 'gt_agent_fut_traj_mask']+ ['gt_ego_lcf_feat', 'gt_ego_fut_trajs', 'gt_ego_his_trajs', 'gt_ego_fut_cmd', 'gt_ego_fut_masks'] ) ] 
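# For reference, pipeline lists such as train_pipeline above are plain lists of
# transform configs; they are built into a single callable via Compose (see
# mmdet3d/datasets/pipelines/compose.py and its use in mmdet3d/apis/inference.py).
# A minimal usage sketch, assuming the registry-based build used elsewhere in this repo:
#
#   from mmdet3d.datasets.pipelines import Compose
#   pipeline = Compose(train_pipeline)
#   results = pipeline(input_dict)  # each transform reads and writes keys in `results`
#   # the final Collect3D transform keeps only the keys listed above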
test_pipeline = [ dict( type='CustomDistMultiScaleFlipAug3D', tta=False, transforms=[ dict(type='PrepareImageInputs', # img_corruptions='sun', data_config=data_config, normalize_cfg=normalize_cfg), dict( type='LoadAnnotationsBEVDepth', bda_aug_conf=bda_aug_conf, classes=class_names, with_2d_bbox=True, is_train=False), dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=5, use_dim=5, file_client_args=file_client_args), dict( type='LoadVectorMap', data_root = data_root, point_cloud_range =point_cloud_range, map_classes = ['divider', 'ped_crossing', 'boundary'], map_num_vec = 100, map_fixed_ptsnum_per_line = 20, # now only support fixed_pts > 0, map_eval_use_same_gt_sample_num_flag = True, map_num_classes = 3, ), dict(type='LoadGTPlaner'), dict(type='LoadGTMotion', with_ego_as_agent=with_ego_as_agent), dict(type='LoadFutBoxInfo'), dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), dict(type='ObjectNameFilter', classes=class_names), dict( type='DefaultFormatBundle3D', class_names=class_names, with_label=False), dict(type='Collect3D', keys=['points', 'img_inputs', 'gt_bboxes_3d', 'gt_labels_3d', 'map_gt_bboxes_3d', 'map_gt_labels_3d']+ ['gt_agent_fut_traj', 'gt_agent_fut_traj_mask']+['gt_ego_lcf_feat', 'gt_ego_fut_trajs', 'gt_ego_his_trajs', 'gt_ego_fut_cmd', 'gt_ego_fut_masks']+ ['gt_fut_segmentations'] ) ] ) ] input_modality = dict( use_lidar=False, use_camera=True, use_radar=False, use_map=False, use_external=False) share_data_config = dict( type=dataset_type, classes=class_names, modality=input_modality, img_info_prototype='bevdet', occupancy_path=occupancy_path, data_root=data_root, use_sequence_group_flag=True, ) test_data_config = dict( pipeline=test_pipeline, map_ann_file=data_root + 'nuscenes_map_infos_102x102_val.pkl', map_eval_cfg=dict( region = (102.4, 102.4) # (H, W) ), load_fut_bbox_info=True, sequences_split_num=test_sequences_split_num, ann_file=data_root + 'bev-next-nuscenes_infos_val.pkl') data = dict( samples_per_gpu=samples_per_gpu, workers_per_gpu=2, test_dataloader=dict(runner_type='IterBasedRunnerEval'), train=dict( type=dataset_type, ann_file=data_root + 'bev-next-nuscenes_infos_train.pkl', pipeline=train_pipeline, test_mode=False, use_valid_flag=True, sequences_split_num=train_sequences_split_num, filter_empty_gt=filter_empty_gt, # we use box_type_3d='LiDAR' in kitti and nuscenes dataset # and box_type_3d='Depth' in sunrgbd and scannet dataset. 
box_type_3d='LiDAR'), val=test_data_config, test=test_data_config) for key in ['train', 'val', 'test']: data[key].update(share_data_config) optimizer = dict( type='AdamW', lr=1e-4, paramwise_cfg=dict( custom_keys={ 'img_backbone': dict(lr_mult=0.1), }), weight_decay=0.01) optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) lr_config = dict( policy='CosineAnnealing', warmup='linear', warmup_iters=500, warmup_ratio=1.0 / 3, min_lr_ratio=1e-3, ) runner = dict(type='IterBasedRunner', max_iters=num_epochs * num_iters_per_epoch) checkpoint_config = dict( interval=checkpoint_epoch_interval * num_iters_per_epoch) evaluation = dict( interval=num_epochs * num_iters_per_epoch, pipeline=test_pipeline) log_config = dict( interval=50, hooks=[ dict(type='TextLoggerHook'), ]) custom_hooks = [ dict( type='MEGVIIEMAHook', init_updates=10560, priority='NORMAL', interval=checkpoint_epoch_interval*num_iters_per_epoch, ), dict( type='SequentialControlHook', temporal_start_iter=0, ), dict( type='TimerCP', ) ] # load_from = None # resume_from = None ================================================ FILE: mmdet3d/__init__.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import mmcv import mmdet import mmseg from .version import __version__, short_version def digit_version(version_str): digit_version = [] for x in version_str.split('.'): if x.isdigit(): digit_version.append(int(x)) elif x.find('rc') != -1: patch_version = x.split('rc') digit_version.append(int(patch_version[0]) - 1) digit_version.append(int(patch_version[1])) return digit_version mmcv_minimum_version = '1.5.2' mmcv_maximum_version = '1.7.0' mmcv_version = digit_version(mmcv.__version__) assert (mmcv_version >= digit_version(mmcv_minimum_version) and mmcv_version <= digit_version(mmcv_maximum_version)), \ f'MMCV=={mmcv.__version__} is used but incompatible. ' \ f'Please install mmcv>={mmcv_minimum_version}, <={mmcv_maximum_version}.' mmdet_minimum_version = '2.24.0' mmdet_maximum_version = '3.0.0' mmdet_version = digit_version(mmdet.__version__) assert (mmdet_version >= digit_version(mmdet_minimum_version) and mmdet_version <= digit_version(mmdet_maximum_version)), \ f'MMDET=={mmdet.__version__} is used but incompatible. ' \ f'Please install mmdet>={mmdet_minimum_version}, ' \ f'<={mmdet_maximum_version}.' mmseg_minimum_version = '0.20.0' mmseg_maximum_version = '1.0.0' mmseg_version = digit_version(mmseg.__version__) assert (mmseg_version >= digit_version(mmseg_minimum_version) and mmseg_version <= digit_version(mmseg_maximum_version)), \ f'MMSEG=={mmseg.__version__} is used but incompatible. ' \ f'Please install mmseg>={mmseg_minimum_version}, ' \ f'<={mmseg_maximum_version}.' __all__ = ['__version__', 'short_version'] ================================================ FILE: mmdet3d/apis/__init__.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. 
from .inference import (convert_SyncBN, inference_detector, inference_mono_3d_detector, inference_multi_modality_detector, inference_segmentor, init_model, show_result_meshlab) from .test import single_gpu_test from .train import init_random_seed, train_model __all__ = [ 'inference_detector', 'init_model', 'single_gpu_test', 'inference_mono_3d_detector', 'show_result_meshlab', 'convert_SyncBN', 'train_model', 'inference_multi_modality_detector', 'inference_segmentor', 'init_random_seed' ] ================================================ FILE: mmdet3d/apis/inference.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import re from copy import deepcopy from os import path as osp import mmcv import numpy as np import torch from mmcv.parallel import collate, scatter from mmcv.runner import load_checkpoint from mmdet3d.core import (Box3DMode, CameraInstance3DBoxes, Coord3DMode, DepthInstance3DBoxes, LiDARInstance3DBoxes, show_multi_modality_result, show_result, show_seg_result) from mmdet3d.core.bbox import get_box_type from mmdet3d.datasets.pipelines import Compose from mmdet3d.models import build_model from mmdet3d.utils import get_root_logger def convert_SyncBN(config): """Convert config's naiveSyncBN to BN. Args: config (str or :obj:`mmcv.Config`): Config file path or the config object. """ if isinstance(config, dict): for item in config: if item == 'norm_cfg': config[item]['type'] = config[item]['type']. \ replace('naiveSyncBN', 'BN') else: convert_SyncBN(config[item]) def init_model(config, checkpoint=None, device='cuda:0'): """Initialize a model from config file, which could be a 3D detector or a 3D segmentor. Args: config (str or :obj:`mmcv.Config`): Config file path or the config object. checkpoint (str, optional): Checkpoint path. If left as None, the model will not load any weights. device (str): Device to use. Returns: nn.Module: The constructed detector. """ if isinstance(config, str): config = mmcv.Config.fromfile(config) elif not isinstance(config, mmcv.Config): raise TypeError('config must be a filename or Config object, ' f'but got {type(config)}') config.model.pretrained = None convert_SyncBN(config.model) config.model.train_cfg = None model = build_model(config.model, test_cfg=config.get('test_cfg')) if checkpoint is not None: checkpoint = load_checkpoint(model, checkpoint, map_location='cpu') if 'CLASSES' in checkpoint['meta']: model.CLASSES = checkpoint['meta']['CLASSES'] else: model.CLASSES = config.class_names if 'PALETTE' in checkpoint['meta']: # 3D Segmentor model.PALETTE = checkpoint['meta']['PALETTE'] model.cfg = config # save the config in the model for convenience if device != 'cpu': torch.cuda.set_device(device) else: logger = get_root_logger() logger.warning('Don\'t suggest using CPU device. ' 'Some functions are not supported for now.') model.to(device) model.eval() return model def inference_detector(model, pcd): """Inference point cloud with the detector. Args: model (nn.Module): The loaded detector. pcd (str): Point cloud files. Returns: tuple: Predicted results and data from pipeline. 
""" cfg = model.cfg device = next(model.parameters()).device # model device if not isinstance(pcd, str): cfg = cfg.copy() # set loading pipeline type cfg.data.test.pipeline[0].type = 'LoadPointsFromDict' # build the data pipeline test_pipeline = deepcopy(cfg.data.test.pipeline) test_pipeline = Compose(test_pipeline) box_type_3d, box_mode_3d = get_box_type(cfg.data.test.box_type_3d) if isinstance(pcd, str): # load from point clouds file data = dict( pts_filename=pcd, box_type_3d=box_type_3d, box_mode_3d=box_mode_3d, # for ScanNet demo we need axis_align_matrix ann_info=dict(axis_align_matrix=np.eye(4)), sweeps=[], # set timestamp = 0 timestamp=[0], img_fields=[], bbox3d_fields=[], pts_mask_fields=[], pts_seg_fields=[], bbox_fields=[], mask_fields=[], seg_fields=[]) else: # load from http data = dict( points=pcd, box_type_3d=box_type_3d, box_mode_3d=box_mode_3d, # for ScanNet demo we need axis_align_matrix ann_info=dict(axis_align_matrix=np.eye(4)), sweeps=[], # set timestamp = 0 timestamp=[0], img_fields=[], bbox3d_fields=[], pts_mask_fields=[], pts_seg_fields=[], bbox_fields=[], mask_fields=[], seg_fields=[]) data = test_pipeline(data) data = collate([data], samples_per_gpu=1) if next(model.parameters()).is_cuda: # scatter to specified GPU data = scatter(data, [device.index])[0] else: # this is a workaround to avoid the bug of MMDataParallel data['img_metas'] = data['img_metas'][0].data data['points'] = data['points'][0].data # forward the model with torch.no_grad(): result = model(return_loss=False, rescale=True, **data) return result, data def inference_multi_modality_detector(model, pcd, image, ann_file): """Inference point cloud with the multi-modality detector. Args: model (nn.Module): The loaded detector. pcd (str): Point cloud files. image (str): Image files. ann_file (str): Annotation files. Returns: tuple: Predicted results and data from pipeline. """ cfg = model.cfg device = next(model.parameters()).device # model device # build the data pipeline test_pipeline = deepcopy(cfg.data.test.pipeline) test_pipeline = Compose(test_pipeline) box_type_3d, box_mode_3d = get_box_type(cfg.data.test.box_type_3d) # get data info containing calib data_infos = mmcv.load(ann_file) image_idx = int(re.findall(r'\d+', image)[-1]) # xxx/sunrgbd_000017.jpg for x in data_infos: if int(x['image']['image_idx']) != image_idx: continue info = x break data = dict( pts_filename=pcd, img_prefix=osp.dirname(image), img_info=dict(filename=osp.basename(image)), box_type_3d=box_type_3d, box_mode_3d=box_mode_3d, img_fields=[], bbox3d_fields=[], pts_mask_fields=[], pts_seg_fields=[], bbox_fields=[], mask_fields=[], seg_fields=[]) data = test_pipeline(data) # TODO: this code is dataset-specific. Move lidar2img and # depth2img to .pkl annotations in the future. 
# LiDAR to image conversion if box_mode_3d == Box3DMode.LIDAR: rect = info['calib']['R0_rect'].astype(np.float32) Trv2c = info['calib']['Tr_velo_to_cam'].astype(np.float32) P2 = info['calib']['P2'].astype(np.float32) lidar2img = P2 @ rect @ Trv2c data['img_metas'][0].data['lidar2img'] = lidar2img # Depth to image conversion elif box_mode_3d == Box3DMode.DEPTH: rt_mat = info['calib']['Rt'] # follow Coord3DMode.convert_point rt_mat = np.array([[1, 0, 0], [0, 0, -1], [0, 1, 0] ]) @ rt_mat.transpose(1, 0) depth2img = info['calib']['K'] @ rt_mat data['img_metas'][0].data['depth2img'] = depth2img data = collate([data], samples_per_gpu=1) if next(model.parameters()).is_cuda: # scatter to specified GPU data = scatter(data, [device.index])[0] else: # this is a workaround to avoid the bug of MMDataParallel data['img_metas'] = data['img_metas'][0].data data['points'] = data['points'][0].data data['img'] = data['img'][0].data # forward the model with torch.no_grad(): result = model(return_loss=False, rescale=True, **data) return result, data def inference_mono_3d_detector(model, image, ann_file): """Inference image with the monocular 3D detector. Args: model (nn.Module): The loaded detector. image (str): Image files. ann_file (str): Annotation files. Returns: tuple: Predicted results and data from pipeline. """ cfg = model.cfg device = next(model.parameters()).device # model device # build the data pipeline test_pipeline = deepcopy(cfg.data.test.pipeline) test_pipeline = Compose(test_pipeline) box_type_3d, box_mode_3d = get_box_type(cfg.data.test.box_type_3d) # get data info containing calib data_infos = mmcv.load(ann_file) # find the info corresponding to this image for x in data_infos['images']: if osp.basename(x['file_name']) != osp.basename(image): continue img_info = x break data = dict( img_prefix=osp.dirname(image), img_info=dict(filename=osp.basename(image)), box_type_3d=box_type_3d, box_mode_3d=box_mode_3d, img_fields=[], bbox3d_fields=[], pts_mask_fields=[], pts_seg_fields=[], bbox_fields=[], mask_fields=[], seg_fields=[]) # camera points to image conversion if box_mode_3d == Box3DMode.CAM: data['img_info'].update(dict(cam_intrinsic=img_info['cam_intrinsic'])) data = test_pipeline(data) data = collate([data], samples_per_gpu=1) if next(model.parameters()).is_cuda: # scatter to specified GPU data = scatter(data, [device.index])[0] else: # this is a workaround to avoid the bug of MMDataParallel data['img_metas'] = data['img_metas'][0].data data['img'] = data['img'][0].data # forward the model with torch.no_grad(): result = model(return_loss=False, rescale=True, **data) return result, data def inference_segmentor(model, pcd): """Inference point cloud with the segmentor. Args: model (nn.Module): The loaded segmentor. pcd (str): Point cloud files. Returns: tuple: Predicted results and data from pipeline. 
""" cfg = model.cfg device = next(model.parameters()).device # model device # build the data pipeline test_pipeline = deepcopy(cfg.data.test.pipeline) test_pipeline = Compose(test_pipeline) data = dict( pts_filename=pcd, img_fields=[], bbox3d_fields=[], pts_mask_fields=[], pts_seg_fields=[], bbox_fields=[], mask_fields=[], seg_fields=[]) data = test_pipeline(data) data = collate([data], samples_per_gpu=1) if next(model.parameters()).is_cuda: # scatter to specified GPU data = scatter(data, [device.index])[0] else: # this is a workaround to avoid the bug of MMDataParallel data['img_metas'] = data['img_metas'][0].data data['points'] = data['points'][0].data # forward the model with torch.no_grad(): result = model(return_loss=False, rescale=True, **data) return result, data def show_det_result_meshlab(data, result, out_dir, score_thr=0.0, show=False, snapshot=False): """Show 3D detection result by meshlab.""" points = data['points'][0][0].cpu().numpy() pts_filename = data['img_metas'][0][0]['pts_filename'] file_name = osp.split(pts_filename)[-1].split('.')[0] if 'pts_bbox' in result[0].keys(): pred_bboxes = result[0]['pts_bbox']['boxes_3d'].tensor.numpy() pred_scores = result[0]['pts_bbox']['scores_3d'].numpy() else: pred_bboxes = result[0]['boxes_3d'].tensor.numpy() pred_scores = result[0]['scores_3d'].numpy() # filter out low score bboxes for visualization if score_thr > 0: inds = pred_scores > score_thr pred_bboxes = pred_bboxes[inds] # for now we convert points into depth mode box_mode = data['img_metas'][0][0]['box_mode_3d'] if box_mode != Box3DMode.DEPTH: points = Coord3DMode.convert(points, box_mode, Coord3DMode.DEPTH) show_bboxes = Box3DMode.convert(pred_bboxes, box_mode, Box3DMode.DEPTH) else: show_bboxes = deepcopy(pred_bboxes) show_result( points, None, show_bboxes, out_dir, file_name, show=show, snapshot=snapshot) return file_name def show_seg_result_meshlab(data, result, out_dir, palette, show=False, snapshot=False): """Show 3D segmentation result by meshlab.""" points = data['points'][0][0].cpu().numpy() pts_filename = data['img_metas'][0][0]['pts_filename'] file_name = osp.split(pts_filename)[-1].split('.')[0] pred_seg = result[0]['semantic_mask'].numpy() if palette is None: # generate random color map max_idx = pred_seg.max() palette = np.random.randint(0, 256, size=(max_idx + 1, 3)) palette = np.array(palette).astype(np.int) show_seg_result( points, None, pred_seg, out_dir, file_name, palette=palette, show=show, snapshot=snapshot) return file_name def show_proj_det_result_meshlab(data, result, out_dir, score_thr=0.0, show=False, snapshot=False): """Show result of projecting 3D bbox to 2D image by meshlab.""" assert 'img' in data.keys(), 'image data is not provided for visualization' img_filename = data['img_metas'][0][0]['filename'] file_name = osp.split(img_filename)[-1].split('.')[0] # read from file because img in data_dict has undergone pipeline transform img = mmcv.imread(img_filename) if 'pts_bbox' in result[0].keys(): result[0] = result[0]['pts_bbox'] elif 'img_bbox' in result[0].keys(): result[0] = result[0]['img_bbox'] pred_bboxes = result[0]['boxes_3d'].tensor.numpy() pred_scores = result[0]['scores_3d'].numpy() # filter out low score bboxes for visualization if score_thr > 0: inds = pred_scores > score_thr pred_bboxes = pred_bboxes[inds] box_mode = data['img_metas'][0][0]['box_mode_3d'] if box_mode == Box3DMode.LIDAR: if 'lidar2img' not in data['img_metas'][0][0]: raise NotImplementedError( 'LiDAR to image transformation matrix is not provided') show_bboxes = 
LiDARInstance3DBoxes(pred_bboxes, origin=(0.5, 0.5, 0)) show_multi_modality_result( img, None, show_bboxes, data['img_metas'][0][0]['lidar2img'], out_dir, file_name, box_mode='lidar', show=show) elif box_mode == Box3DMode.DEPTH: show_bboxes = DepthInstance3DBoxes(pred_bboxes, origin=(0.5, 0.5, 0)) show_multi_modality_result( img, None, show_bboxes, None, out_dir, file_name, box_mode='depth', img_metas=data['img_metas'][0][0], show=show) elif box_mode == Box3DMode.CAM: if 'cam2img' not in data['img_metas'][0][0]: raise NotImplementedError( 'camera intrinsic matrix is not provided') show_bboxes = CameraInstance3DBoxes( pred_bboxes, box_dim=pred_bboxes.shape[-1], origin=(0.5, 1.0, 0.5)) show_multi_modality_result( img, None, show_bboxes, data['img_metas'][0][0]['cam2img'], out_dir, file_name, box_mode='camera', show=show) else: raise NotImplementedError( f'visualization of {box_mode} bbox is not supported') return file_name def show_result_meshlab(data, result, out_dir, score_thr=0.0, show=False, snapshot=False, task='det', palette=None): """Show result by meshlab. Args: data (dict): Contain data from pipeline. result (dict): Predicted result from model. out_dir (str): Directory to save visualized result. score_thr (float, optional): Minimum score of bboxes to be shown. Default: 0.0 show (bool, optional): Visualize the results online. Defaults to False. snapshot (bool, optional): Whether to save the online results. Defaults to False. task (str, optional): Distinguish which task result to visualize. Currently we support 3D detection, multi-modality detection and 3D segmentation. Defaults to 'det'. palette (list[list[int]]] | np.ndarray, optional): The palette of segmentation map. If None is given, random palette will be generated. Defaults to None. """ assert task in ['det', 'multi_modality-det', 'seg', 'mono-det'], \ f'unsupported visualization task {task}' assert out_dir is not None, 'Expect out_dir, got none.' if task in ['det', 'multi_modality-det']: file_name = show_det_result_meshlab(data, result, out_dir, score_thr, show, snapshot) if task in ['seg']: file_name = show_seg_result_meshlab(data, result, out_dir, palette, show, snapshot) if task in ['multi_modality-det', 'mono-det']: file_name = show_proj_det_result_meshlab(data, result, out_dir, score_thr, show, snapshot) return out_dir, file_name ================================================ FILE: mmdet3d/apis/test.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. from os import path as osp import mmcv import torch from mmcv.image import tensor2imgs import time from mmdet3d.models import (Base3DDetector, Base3DSegmentor, SingleStageMono3DDetector) def single_gpu_test(model, data_loader, show=False, out_dir=None, show_score_thr=0.3): """Test model with single gpu. This method tests model with single gpu and gives the 'show' option. By setting ``show=True``, it saves the visualization results under ``out_dir``. Args: model (nn.Module): Model to be tested. data_loader (nn.Dataloader): Pytorch data loader. show (bool, optional): Whether to save viualization results. Default: True. out_dir (str, optional): The path to save visualization results. Default: None. Returns: list[dict]: The prediction results. 
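    Example (illustrative; ``model`` and ``data_loader`` are assumed to be
    built beforehand):
        >>> results = single_gpu_test(model, data_loader, show=False)
        >>> # results holds one prediction entry per evaluated sample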
""" model.eval() results = [] dataset = data_loader.dataset prog_bar = mmcv.ProgressBar(len(dataset)) for i, data in enumerate(data_loader): with torch.no_grad(): result = model(return_loss=False, rescale=True, **data) if show: # Visualize the results of MMDetection3D model # 'show_results' is MMdetection3D visualization API models_3d = (Base3DDetector, Base3DSegmentor, SingleStageMono3DDetector) if isinstance(model.module, models_3d): model.module.show_results( data, result, out_dir=out_dir, show=show, score_thr=show_score_thr) # Visualize the results of MMDetection model # 'show_result' is MMdetection visualization API else: batch_size = len(result) if batch_size == 1 and isinstance(data['img'][0], torch.Tensor): img_tensor = data['img'][0] else: img_tensor = data['img'][0].data[0] img_metas = data['img_metas'][0].data[0] imgs = tensor2imgs(img_tensor, **img_metas[0]['img_norm_cfg']) assert len(imgs) == len(img_metas) for i, (img, img_meta) in enumerate(zip(imgs, img_metas)): h, w, _ = img_meta['img_shape'] img_show = img[:h, :w, :] ori_h, ori_w = img_meta['ori_shape'][:-1] img_show = mmcv.imresize(img_show, (ori_w, ori_h)) if out_dir: out_file = osp.join(out_dir, img_meta['ori_filename']) else: out_file = None model.module.show_result( img_show, result[i], show=show, out_file=out_file, score_thr=show_score_thr) results.extend(result) batch_size = len(result) for _ in range(batch_size): prog_bar.update() return results # --------------------------------------------- # Copyright (c) OpenMMLab. All rights reserved. # --------------------------------------------- # Modified by Zhiqi Li # --------------------------------------------- import os.path as osp import pickle import shutil import tempfile import time import mmcv import torch import torch.distributed as dist from mmcv.image import tensor2imgs from mmcv.runner import get_dist_info from mmdet.core import encode_mask_results import mmcv import numpy as np import pycocotools.mask as mask_util def custom_encode_mask_results(mask_results): """Encode bitmap mask to RLE code. Semantic Masks only Args: mask_results (list | tuple[list]): bitmap mask results. In mask scoring rcnn, mask_results is a tuple of (segm_results, segm_cls_score). Returns: list | tuple: RLE encoded mask. """ cls_segms = mask_results num_classes = len(cls_segms) encoded_mask_results = [] for i in range(len(cls_segms)): encoded_mask_results.append( mask_util.encode( np.array( cls_segms[i][:, :, np.newaxis], order='F', dtype='uint8'))[0]) # encoded with RLE return [encoded_mask_results] def custom_multi_gpu_test(model, data_loader, tmpdir=None, gpu_collect=False): """Test model with multiple gpus. This method tests model with multiple gpus and collects the results under two different modes: gpu and cpu modes. By setting 'gpu_collect=True' it encodes results to gpu tensors and use gpu communication for results collection. On cpu mode it saves the results on different gpus to 'tmpdir' and collects them by the rank 0 worker. Args: model (nn.Module): Model to be tested. data_loader (nn.Dataloader): Pytorch data loader. tmpdir (str): Path of directory to save the temporary results from different gpus under cpu mode. gpu_collect (bool): Option to use either gpu or cpu to collect results. Returns: list: The prediction results. """ model.eval() bbox_results = [] mask_results = [] dataset = data_loader.dataset rank, world_size = get_dist_info() if rank == 0: prog_bar = mmcv.ProgressBar(len(dataset)) time.sleep(2) # This line can prevent deadlock problem in some cases. 
have_mask = False for i, data in enumerate(data_loader): with torch.no_grad(): result = model(return_loss=False, rescale=True, **data) # encode mask results if isinstance(result, dict): if 'bbox_results' in result.keys(): bbox_result = result['bbox_results'] batch_size = len(result['bbox_results']) bbox_results.extend(bbox_result) if 'mask_results' in result.keys() and result['mask_results'] is not None: mask_result = custom_encode_mask_results(result['mask_results']) mask_results.extend(mask_result) have_mask = True else: batch_size = len(result) bbox_results.extend(result) #if isinstance(result[0], tuple): # assert False, 'this code is for instance segmentation, which our code will not utilize.' # result = [(bbox_results, encode_mask_results(mask_results)) # for bbox_results, mask_results in result] if rank == 0: for _ in range(batch_size * world_size): prog_bar.update() # collect results from all ranks if gpu_collect: bbox_results = collect_results_gpu(bbox_results, len(dataset)) if have_mask: mask_results = collect_results_gpu(mask_results, len(dataset)) else: mask_results = None else: bbox_results = collect_results_cpu(bbox_results, len(dataset), tmpdir) tmpdir = tmpdir+'_mask' if tmpdir is not None else None if have_mask: mask_results = collect_results_cpu(mask_results, len(dataset), tmpdir) else: mask_results = None if mask_results is None: return bbox_results return {'bbox_results': bbox_results, 'mask_results': mask_results} def collect_results_cpu(result_part, size, tmpdir=None): rank, world_size = get_dist_info() # create a tmp dir if it is not specified tmpdir = None if tmpdir is None: MAX_LEN = 512 # 32 is whitespace dir_tensor = torch.full((MAX_LEN, ), 32, dtype=torch.uint8, device='cuda') if rank == 0: mmcv.mkdir_or_exist('.dist_test') prefix = str(time.time())[-5:] tmpdir = tempfile.mkdtemp(dir='.dist_test', prefix=prefix) tmpdir = torch.tensor( bytearray(tmpdir.encode()), dtype=torch.uint8, device='cuda') dir_tensor[:len(tmpdir)] = tmpdir dist.broadcast(dir_tensor, 0) tmpdir = dir_tensor.cpu().numpy().tobytes().decode().rstrip() else: mmcv.mkdir_or_exist(tmpdir) # dump the part result to the dir mmcv.dump(result_part, osp.join(tmpdir, f'part_{rank}.pkl')) dist.barrier() # collect all parts if rank != 0: return None else: # load results of all parts from tmp dir part_list = [] for i in range(world_size): part_file = osp.join(tmpdir, f'part_{i}.pkl') part_list.append(mmcv.load(part_file)) # sort the results ordered_results = [] ''' bacause we change the sample of the evaluation stage to make sure that each gpu will handle continuous sample, ''' #for res in zip(*part_list): for res in part_list: ordered_results.extend(list(res)) # the dataloader may pad some samples ordered_results = ordered_results #[:size] # remove tmp dir shutil.rmtree(tmpdir) return ordered_results def collect_results_gpu(result_part, size): collect_results_cpu(result_part, size) ================================================ FILE: mmdet3d/apis/train.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. 
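# Illustrative overview (added comment, not in the original file): this module
# builds training on top of mmcv runners. train_model() dispatches to
# train_segmentor() for 'EncoderDecoder3D' models and to train_detector()
# otherwise; both build dataloaders, wrap the model with MMDataParallel or
# MMDistributedDataParallel, construct a runner from cfg.runner, and register
# optimizer/checkpoint/logging hooks plus an optional evaluation hook before
# calling runner.run().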
import random import warnings import numpy as np import torch from mmcv.parallel import MMDataParallel, MMDistributedDataParallel from mmcv.runner import (HOOKS, DistSamplerSeedHook, EpochBasedRunner, Fp16OptimizerHook, OptimizerHook, build_optimizer, build_runner, get_dist_info) from mmcv.utils import build_from_cfg from torch import distributed as dist from mmdet3d.datasets import build_dataset from mmdet3d.utils import find_latest_checkpoint from mmdet.core import DistEvalHook as MMDET_DistEvalHook from mmdet.core import EvalHook as MMDET_EvalHook from mmdet3d.datasets import build_dataloader as build_mmdet_dataloader from mmdet.datasets import replace_ImageToTensor from mmdet.utils import get_root_logger as get_mmdet_root_logger from mmseg.core import DistEvalHook as MMSEG_DistEvalHook from mmseg.core import EvalHook as MMSEG_EvalHook from mmseg.datasets import build_dataloader as build_mmseg_dataloader from mmseg.utils import get_root_logger as get_mmseg_root_logger import time from mmdet3d.models.fbbev.utils import CustomDistEvalHook import os.path as osp def init_random_seed(seed=None, device='cuda'): """Initialize random seed. If the seed is not set, the seed will be automatically randomized, and then broadcast to all processes to prevent some potential bugs. Args: seed (int, optional): The seed. Default to None. device (str, optional): The device where the seed will be put on. Default to 'cuda'. Returns: int: Seed to be used. """ if seed is not None: return seed # Make sure all ranks share the same random seed to prevent # some potential bugs. Please refer to # https://github.com/open-mmlab/mmdetection/issues/6339 rank, world_size = get_dist_info() seed = np.random.randint(2**31) if world_size == 1: return seed if rank == 0: random_num = torch.tensor(seed, dtype=torch.int32, device=device) else: random_num = torch.tensor(0, dtype=torch.int32, device=device) dist.broadcast(random_num, src=0) return random_num.item() def set_random_seed(seed, deterministic=False): """Set random seed. Args: seed (int): Seed to be used. deterministic (bool): Whether to set the deterministic option for CUDNN backend, i.e., set `torch.backends.cudnn.deterministic` to True and `torch.backends.cudnn.benchmark` to False. Default: False. 
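    Example (illustrative):
        >>> set_random_seed(0, deterministic=True)
        >>> # deterministic=True disables cudnn benchmarking for reproducibility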
""" random.seed(seed) np.random.seed(seed) torch.manual_seed(seed) torch.cuda.manual_seed_all(seed) if deterministic: torch.backends.cudnn.deterministic = True torch.backends.cudnn.benchmark = False def train_segmentor(model, dataset, cfg, distributed=False, validate=False, timestamp=None, meta=None): """Launch segmentor training.""" logger = get_mmseg_root_logger(cfg.log_level) # prepare data loaders dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset] data_loaders = [ build_mmseg_dataloader( ds, cfg.data.samples_per_gpu, cfg.data.workers_per_gpu, # cfg.gpus will be ignored if distributed len(cfg.gpu_ids), dist=distributed, seed=cfg.seed, drop_last=True) for ds in dataset ] # put model on gpus if distributed: find_unused_parameters = cfg.get('find_unused_parameters', False) # Sets the `find_unused_parameters` parameter in # torch.nn.parallel.DistributedDataParallel model = MMDistributedDataParallel( model.cuda(), device_ids=[torch.cuda.current_device()], broadcast_buffers=False, find_unused_parameters=find_unused_parameters) else: model = MMDataParallel( model.cuda(cfg.gpu_ids[0]), device_ids=cfg.gpu_ids) # build runner optimizer = build_optimizer(model, cfg.optimizer) if cfg.get('runner') is None: cfg.runner = {'type': 'IterBasedRunner', 'max_iters': cfg.total_iters} warnings.warn( 'config is now expected to have a `runner` section, ' 'please set `runner` in your config.', UserWarning) runner = build_runner( cfg.runner, default_args=dict( model=model, batch_processor=None, optimizer=optimizer, work_dir=cfg.work_dir, logger=logger, meta=meta)) # register hooks runner.register_training_hooks(cfg.lr_config, cfg.optimizer_config, cfg.checkpoint_config, cfg.log_config, cfg.get('momentum_config', None)) # an ugly walkaround to make the .log and .log.json filenames the same runner.timestamp = timestamp # register eval hooks if validate: val_dataset = build_dataset(cfg.data.val, dict(test_mode=True)) val_dataloader = build_mmseg_dataloader( val_dataset, samples_per_gpu=1, workers_per_gpu=cfg.data.workers_per_gpu, dist=distributed, shuffle=False) eval_cfg = cfg.get('evaluation', {}) eval_cfg['by_epoch'] = cfg.runner['type'] != 'IterBasedRunner' eval_hook = MMSEG_DistEvalHook if distributed else MMSEG_EvalHook # In this PR (https://github.com/open-mmlab/mmcv/pull/1193), the # priority of IterTimerHook has been modified from 'NORMAL' to 'LOW'. runner.register_hook( eval_hook(val_dataloader, **eval_cfg), priority='LOW') # user-defined hooks if cfg.get('custom_hooks', None): custom_hooks = cfg.custom_hooks assert isinstance(custom_hooks, list), \ f'custom_hooks expect list type, but got {type(custom_hooks)}' for hook_cfg in cfg.custom_hooks: assert isinstance(hook_cfg, dict), \ 'Each item in custom_hooks expects dict type, but got ' \ f'{type(hook_cfg)}' hook_cfg = hook_cfg.copy() priority = hook_cfg.pop('priority', 'NORMAL') hook = build_from_cfg(hook_cfg, HOOKS) runner.register_hook(hook, priority=priority) if cfg.resume_from: runner.resume(cfg.resume_from) elif cfg.load_from: runner.load_checkpoint(cfg.load_from) runner.run(data_loaders, cfg.workflow) def train_detector(model, dataset, cfg, distributed=False, validate=False, timestamp=None, meta=None): logger = get_mmdet_root_logger(log_level=cfg.log_level) # prepare data loaders dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset] if 'imgs_per_gpu' in cfg.data: logger.warning('"imgs_per_gpu" is deprecated in MMDet V2.0. 
' 'Please use "samples_per_gpu" instead') if 'samples_per_gpu' in cfg.data: logger.warning( f'Got "imgs_per_gpu"={cfg.data.imgs_per_gpu} and ' f'"samples_per_gpu"={cfg.data.samples_per_gpu}, "imgs_per_gpu"' f'={cfg.data.imgs_per_gpu} is used in this experiments') else: logger.warning( 'Automatically set "samples_per_gpu"="imgs_per_gpu"=' f'{cfg.data.imgs_per_gpu} in this experiments') cfg.data.samples_per_gpu = cfg.data.imgs_per_gpu runner_type = 'EpochBasedRunner' if 'runner' not in cfg else cfg.runner[ 'type'] data_loaders = [ build_mmdet_dataloader( ds, cfg.data.samples_per_gpu, cfg.data.workers_per_gpu, # `num_gpus` will be ignored if distributed num_gpus=len(cfg.gpu_ids), dist=distributed, seed=cfg.seed, runner_type=runner_type, persistent_workers=cfg.data.get('persistent_workers', False)) for ds in dataset ] # put model on gpus if distributed: find_unused_parameters = cfg.get('find_unused_parameters', False) # Sets the `find_unused_parameters` parameter in # torch.nn.parallel.DistributedDataParallel model = MMDistributedDataParallel( model.cuda(), device_ids=[torch.cuda.current_device()], broadcast_buffers=False, find_unused_parameters=find_unused_parameters) else: model = MMDataParallel( model.cuda(cfg.gpu_ids[0]), device_ids=cfg.gpu_ids) # build runner optimizer = build_optimizer(model, cfg.optimizer) if 'runner' not in cfg: cfg.runner = { 'type': 'EpochBasedRunner', 'max_epochs': cfg.total_epochs } warnings.warn( 'config is now expected to have a `runner` section, ' 'please set `runner` in your config.', UserWarning) else: if 'total_epochs' in cfg: assert cfg.total_epochs == cfg.runner.max_epochs runner = build_runner( cfg.runner, default_args=dict( model=model, optimizer=optimizer, work_dir=cfg.work_dir, logger=logger, meta=meta)) # an ugly workaround to make .log and .log.json filenames the same runner.timestamp = timestamp # fp16 setting fp16_cfg = cfg.get('fp16', None) if fp16_cfg is not None: optimizer_config = Fp16OptimizerHook( **cfg.optimizer_config, **fp16_cfg, distributed=distributed) elif distributed and 'type' not in cfg.optimizer_config: optimizer_config = OptimizerHook(**cfg.optimizer_config) else: optimizer_config = cfg.optimizer_config # register hooks runner.register_training_hooks( cfg.lr_config, optimizer_config, cfg.checkpoint_config, cfg.log_config, cfg.get('momentum_config', None), custom_hooks_config=cfg.get('custom_hooks', None)) if distributed: if isinstance(runner, EpochBasedRunner): runner.register_hook(DistSamplerSeedHook()) # register eval hooks if validate: # Support batch_size > 1 in validation val_samples_per_gpu = cfg.data.val.pop('samples_per_gpu', 1) if val_samples_per_gpu > 1: # Replace 'ImageToTensor' to 'DefaultFormatBundle' cfg.data.val.pipeline = replace_ImageToTensor( cfg.data.val.pipeline) val_dataset = build_dataset(cfg.data.val, dict(test_mode=True)) val_dataloader = build_mmdet_dataloader( val_dataset, samples_per_gpu=val_samples_per_gpu, workers_per_gpu=cfg.data.workers_per_gpu, dist=distributed, val=True, runner_type=cfg.data.test_dataloader.get('runner_type', 'EpochBasedRunner'), shuffle=False) eval_cfg = cfg.get('evaluation', {}) eval_cfg['by_epoch'] = cfg.runner['type'] != 'IterBasedRunner' eval_hook = MMDET_DistEvalHook if distributed else MMDET_EvalHook # In this PR (https://github.com/open-mmlab/mmcv/pull/1193), the # priority of IterTimerHook has been modified from 'NORMAL' to 'LOW'. 
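        # Illustrative note (added comment): the evaluation hook below is
        # registered with priority='LOW' so it still runs after IterTimerHook
        # (whose priority was lowered in the mmcv PR referenced above); when
        # cfg.use_custom_eval_hook is set, CustomDistEvalHook replaces the
        # stock MMDET_DistEvalHook in distributed training.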
# eval_cfg['jsonfile_prefix'] = osp.join('val', cfg.work_dir, time.ctime().replace(' ','_').replace(':','_')) if cfg.get('use_custom_eval_hook', False): eval_hook = CustomDistEvalHook if distributed else eval_hook runner.register_hook( eval_hook(val_dataloader, work_dir=cfg.work_dir, **eval_cfg), priority='LOW') resume_from = None if cfg.resume_from is None and cfg.get('auto_resume'): resume_from = find_latest_checkpoint(cfg.work_dir) if resume_from is not None: cfg.resume_from = resume_from if cfg.resume_from: runner.resume(cfg.resume_from) elif cfg.load_from: runner.load_checkpoint(cfg.load_from) runner.run(data_loaders, cfg.workflow) def train_model(model, dataset, cfg, distributed=False, validate=False, timestamp=None, meta=None): """A function wrapper for launching model training according to cfg. Because we need different eval_hook in runner. Should be deprecated in the future. """ if cfg.model.type in ['EncoderDecoder3D']: train_segmentor( model, dataset, cfg, distributed=distributed, validate=validate, timestamp=timestamp, meta=meta) else: train_detector( model, dataset, cfg, distributed=distributed, validate=validate, timestamp=timestamp, meta=meta) ================================================ FILE: mmdet3d/core/__init__.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. from .anchor import * # noqa: F401, F403 from .bbox import * # noqa: F401, F403 from .evaluation import * # noqa: F401, F403 from .hook import * # noqa: F401, F403 from .points import * # noqa: F401, F403 from .post_processing import * # noqa: F401, F403 from .utils import * # noqa: F401, F403 from .visualizer import * # noqa: F401, F403 from .voxel import * # noqa: F401, F403 ================================================ FILE: mmdet3d/core/anchor/__init__.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. from mmdet.core.anchor import build_prior_generator from .anchor_3d_generator import (AlignedAnchor3DRangeGenerator, AlignedAnchor3DRangeGeneratorPerCls, Anchor3DRangeGenerator) __all__ = [ 'AlignedAnchor3DRangeGenerator', 'Anchor3DRangeGenerator', 'build_prior_generator', 'AlignedAnchor3DRangeGeneratorPerCls' ] ================================================ FILE: mmdet3d/core/anchor/anchor_3d_generator.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import mmcv import torch from mmdet.core.anchor import ANCHOR_GENERATORS @ANCHOR_GENERATORS.register_module() class Anchor3DRangeGenerator(object): """3D Anchor Generator by range. This anchor generator generates anchors by the given range in different feature levels. Due the convention in 3D detection, different anchor sizes are related to different ranges for different categories. However we find this setting does not effect the performance much in some datasets, e.g., nuScenes. Args: ranges (list[list[float]]): Ranges of different anchors. The ranges are the same across different feature levels. But may vary for different anchor sizes if size_per_range is True. sizes (list[list[float]], optional): 3D sizes of anchors. Defaults to [[3.9, 1.6, 1.56]]. scales (list[int], optional): Scales of anchors in different feature levels. Defaults to [1]. rotations (list[float], optional): Rotations of anchors in a feature grid. Defaults to [0, 1.5707963]. custom_values (tuple[float], optional): Customized values of that anchor. For example, in nuScenes the anchors have velocities. Defaults to (). 
reshape_out (bool, optional): Whether to reshape the output into (N x 4). Defaults to True. size_per_range (bool, optional): Whether to use separate ranges for different sizes. If size_per_range is True, the ranges should have the same length as the sizes, if not, it will be duplicated. Defaults to True. """ def __init__(self, ranges, sizes=[[3.9, 1.6, 1.56]], scales=[1], rotations=[0, 1.5707963], custom_values=(), reshape_out=True, size_per_range=True): assert mmcv.is_list_of(ranges, list) if size_per_range: if len(sizes) != len(ranges): assert len(ranges) == 1 ranges = ranges * len(sizes) assert len(ranges) == len(sizes) else: assert len(ranges) == 1 assert mmcv.is_list_of(sizes, list) assert isinstance(scales, list) self.sizes = sizes self.scales = scales self.ranges = ranges self.rotations = rotations self.custom_values = custom_values self.cached_anchors = None self.reshape_out = reshape_out self.size_per_range = size_per_range def __repr__(self): s = self.__class__.__name__ + '(' s += f'anchor_range={self.ranges},\n' s += f'scales={self.scales},\n' s += f'sizes={self.sizes},\n' s += f'rotations={self.rotations},\n' s += f'reshape_out={self.reshape_out},\n' s += f'size_per_range={self.size_per_range})' return s @property def num_base_anchors(self): """list[int]: Total number of base anchors in a feature grid.""" num_rot = len(self.rotations) num_size = torch.tensor(self.sizes).reshape(-1, 3).size(0) return num_rot * num_size @property def num_levels(self): """int: Number of feature levels that the generator is applied to.""" return len(self.scales) def grid_anchors(self, featmap_sizes, device='cuda'): """Generate grid anchors in multiple feature levels. Args: featmap_sizes (list[tuple]): List of feature map sizes in multiple feature levels. device (str, optional): Device where the anchors will be put on. Defaults to 'cuda'. Returns: list[torch.Tensor]: Anchors in multiple feature levels. The sizes of each tensor should be [N, 4], where N = width * height * num_base_anchors, width and height are the sizes of the corresponding feature level, num_base_anchors is the number of anchors for that level. """ assert self.num_levels == len(featmap_sizes) multi_level_anchors = [] for i in range(self.num_levels): anchors = self.single_level_grid_anchors( featmap_sizes[i], self.scales[i], device=device) if self.reshape_out: anchors = anchors.reshape(-1, anchors.size(-1)) multi_level_anchors.append(anchors) return multi_level_anchors def single_level_grid_anchors(self, featmap_size, scale, device='cuda'): """Generate grid anchors of a single level feature map. This function is usually called by method ``self.grid_anchors``. Args: featmap_size (tuple[int]): Size of the feature map. scale (float): Scale factor of the anchors in the current level. device (str, optional): Device the tensor will be put on. Defaults to 'cuda'. Returns: torch.Tensor: Anchors in the overall feature map. 
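        Note (illustrative): for a KITTI-style BEV feature map of size
        (200, 176) with a single anchor size and two rotations, the returned
        tensor has shape (1, 200, 176, 1, 2, 7) (see the shape comment inside
        ``anchors_single_range``); ``grid_anchors`` later flattens it to
        (-1, 7) when ``reshape_out`` is True.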
""" # We reimplement the anchor generator using torch in cuda # torch: 0.6975 s for 1000 times # numpy: 4.3345 s for 1000 times # which is ~5 times faster than the numpy implementation if not self.size_per_range: return self.anchors_single_range( featmap_size, self.ranges[0], scale, self.sizes, self.rotations, device=device) mr_anchors = [] for anchor_range, anchor_size in zip(self.ranges, self.sizes): mr_anchors.append( self.anchors_single_range( featmap_size, anchor_range, scale, anchor_size, self.rotations, device=device)) mr_anchors = torch.cat(mr_anchors, dim=-3) return mr_anchors def anchors_single_range(self, feature_size, anchor_range, scale=1, sizes=[[3.9, 1.6, 1.56]], rotations=[0, 1.5707963], device='cuda'): """Generate anchors in a single range. Args: feature_size (list[float] | tuple[float]): Feature map size. It is either a list of a tuple of [D, H, W](in order of z, y, and x). anchor_range (torch.Tensor | list[float]): Range of anchors with shape [6]. The order is consistent with that of anchors, i.e., (x_min, y_min, z_min, x_max, y_max, z_max). scale (float | int, optional): The scale factor of anchors. Defaults to 1. sizes (list[list] | np.ndarray | torch.Tensor, optional): Anchor size with shape [N, 3], in order of x, y, z. Defaults to [[3.9, 1.6, 1.56]]. rotations (list[float] | np.ndarray | torch.Tensor, optional): Rotations of anchors in a single feature grid. Defaults to [0, 1.5707963]. device (str): Devices that the anchors will be put on. Defaults to 'cuda'. Returns: torch.Tensor: Anchors with shape [*feature_size, num_sizes, num_rots, 7]. """ if len(feature_size) == 2: feature_size = [1, feature_size[0], feature_size[1]] anchor_range = torch.tensor(anchor_range, device=device) z_centers = torch.linspace( anchor_range[2], anchor_range[5], feature_size[0], device=device) y_centers = torch.linspace( anchor_range[1], anchor_range[4], feature_size[1], device=device) x_centers = torch.linspace( anchor_range[0], anchor_range[3], feature_size[2], device=device) sizes = torch.tensor(sizes, device=device).reshape(-1, 3) * scale rotations = torch.tensor(rotations, device=device) # torch.meshgrid default behavior is 'id', np's default is 'xy' rets = torch.meshgrid(x_centers, y_centers, z_centers, rotations) # torch.meshgrid returns a tuple rather than list rets = list(rets) tile_shape = [1] * 5 tile_shape[-2] = int(sizes.shape[0]) for i in range(len(rets)): rets[i] = rets[i].unsqueeze(-2).repeat(tile_shape).unsqueeze(-1) sizes = sizes.reshape([1, 1, 1, -1, 1, 3]) tile_size_shape = list(rets[0].shape) tile_size_shape[3] = 1 sizes = sizes.repeat(tile_size_shape) rets.insert(3, sizes) ret = torch.cat(rets, dim=-1).permute([2, 1, 0, 3, 4, 5]) # [1, 200, 176, N, 2, 7] for kitti after permute if len(self.custom_values) > 0: custom_ndim = len(self.custom_values) custom = ret.new_zeros([*ret.shape[:-1], custom_ndim]) # custom[:] = self.custom_values ret = torch.cat([ret, custom], dim=-1) # [1, 200, 176, N, 2, 9] for nus dataset after permute return ret @ANCHOR_GENERATORS.register_module() class AlignedAnchor3DRangeGenerator(Anchor3DRangeGenerator): """Aligned 3D Anchor Generator by range. This anchor generator uses a different manner to generate the positions of anchors' centers from :class:`Anchor3DRangeGenerator`. Note: The `align` means that the anchor's center is aligned with the voxel grid, which is also the feature grid. The previous implementation of :class:`Anchor3DRangeGenerator` does not generate the anchors' center according to the voxel grid. 
Rather, it generates the center by uniformly distributing the anchors inside the minimum and maximum anchor ranges according to the feature map sizes. However, this makes the anchors center does not match the feature grid. The :class:`AlignedAnchor3DRangeGenerator` add + 1 when using the feature map sizes to obtain the corners of the voxel grid. Then it shifts the coordinates to the center of voxel grid and use the left up corner to distribute anchors. Args: anchor_corner (bool, optional): Whether to align with the corner of the voxel grid. By default it is False and the anchor's center will be the same as the corresponding voxel's center, which is also the center of the corresponding greature grid. Defaults to False. """ def __init__(self, align_corner=False, **kwargs): super(AlignedAnchor3DRangeGenerator, self).__init__(**kwargs) self.align_corner = align_corner def anchors_single_range(self, feature_size, anchor_range, scale, sizes=[[3.9, 1.6, 1.56]], rotations=[0, 1.5707963], device='cuda'): """Generate anchors in a single range. Args: feature_size (list[float] | tuple[float]): Feature map size. It is either a list of a tuple of [D, H, W](in order of z, y, and x). anchor_range (torch.Tensor | list[float]): Range of anchors with shape [6]. The order is consistent with that of anchors, i.e., (x_min, y_min, z_min, x_max, y_max, z_max). scale (float | int): The scale factor of anchors. sizes (list[list] | np.ndarray | torch.Tensor, optional): Anchor size with shape [N, 3], in order of x, y, z. Defaults to [[3.9, 1.6, 1.56]]. rotations (list[float] | np.ndarray | torch.Tensor, optional): Rotations of anchors in a single feature grid. Defaults to [0, 1.5707963]. device (str, optional): Devices that the anchors will be put on. Defaults to 'cuda'. Returns: torch.Tensor: Anchors with shape [*feature_size, num_sizes, num_rots, 7]. 
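        Note (illustrative): unlike the base class, ``feature_size + 1`` grid
        positions are generated per axis and, unless ``align_corner=True``,
        every position is shifted by half a voxel (only the first
        ``feature_size`` entries are kept), so anchor centers coincide with
        voxel (feature-grid) centers rather than spanning the range
        end-to-end.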
""" if len(feature_size) == 2: feature_size = [1, feature_size[0], feature_size[1]] anchor_range = torch.tensor(anchor_range, device=device) z_centers = torch.linspace( anchor_range[2], anchor_range[5], feature_size[0] + 1, device=device) y_centers = torch.linspace( anchor_range[1], anchor_range[4], feature_size[1] + 1, device=device) x_centers = torch.linspace( anchor_range[0], anchor_range[3], feature_size[2] + 1, device=device) sizes = torch.tensor(sizes, device=device).reshape(-1, 3) * scale rotations = torch.tensor(rotations, device=device) # shift the anchor center if not self.align_corner: z_shift = (z_centers[1] - z_centers[0]) / 2 y_shift = (y_centers[1] - y_centers[0]) / 2 x_shift = (x_centers[1] - x_centers[0]) / 2 z_centers += z_shift y_centers += y_shift x_centers += x_shift # torch.meshgrid default behavior is 'id', np's default is 'xy' rets = torch.meshgrid(x_centers[:feature_size[2]], y_centers[:feature_size[1]], z_centers[:feature_size[0]], rotations) # torch.meshgrid returns a tuple rather than list rets = list(rets) tile_shape = [1] * 5 tile_shape[-2] = int(sizes.shape[0]) for i in range(len(rets)): rets[i] = rets[i].unsqueeze(-2).repeat(tile_shape).unsqueeze(-1) sizes = sizes.reshape([1, 1, 1, -1, 1, 3]) tile_size_shape = list(rets[0].shape) tile_size_shape[3] = 1 sizes = sizes.repeat(tile_size_shape) rets.insert(3, sizes) ret = torch.cat(rets, dim=-1).permute([2, 1, 0, 3, 4, 5]) if len(self.custom_values) > 0: custom_ndim = len(self.custom_values) custom = ret.new_zeros([*ret.shape[:-1], custom_ndim]) # TODO: check the support of custom values # custom[:] = self.custom_values ret = torch.cat([ret, custom], dim=-1) return ret @ANCHOR_GENERATORS.register_module() class AlignedAnchor3DRangeGeneratorPerCls(AlignedAnchor3DRangeGenerator): """3D Anchor Generator by range for per class. This anchor generator generates anchors by the given range for per class. Note that feature maps of different classes may be different. Args: kwargs (dict): Arguments are the same as those in :class:`AlignedAnchor3DRangeGenerator`. """ def __init__(self, **kwargs): super(AlignedAnchor3DRangeGeneratorPerCls, self).__init__(**kwargs) assert len(self.scales) == 1, 'Multi-scale feature map levels are' + \ ' not supported currently in this kind of anchor generator.' def grid_anchors(self, featmap_sizes, device='cuda'): """Generate grid anchors in multiple feature levels. Args: featmap_sizes (list[tuple]): List of feature map sizes for different classes in a single feature level. device (str, optional): Device where the anchors will be put on. Defaults to 'cuda'. Returns: list[list[torch.Tensor]]: Anchors in multiple feature levels. Note that in this anchor generator, we currently only support single feature level. The sizes of each tensor should be [num_sizes/ranges*num_rots*featmap_size, box_code_size]. """ multi_level_anchors = [] anchors = self.multi_cls_grid_anchors( featmap_sizes, self.scales[0], device=device) multi_level_anchors.append(anchors) return multi_level_anchors def multi_cls_grid_anchors(self, featmap_sizes, scale, device='cuda'): """Generate grid anchors of a single level feature map for multi-class with different feature map sizes. This function is usually called by method ``self.grid_anchors``. Args: featmap_sizes (list[tuple]): List of feature map sizes for different classes in a single feature level. scale (float): Scale factor of the anchors in the current level. device (str, optional): Device the tensor will be put on. Defaults to 'cuda'. 
Returns: torch.Tensor: Anchors in the overall feature map. """ assert len(featmap_sizes) == len(self.sizes) == len(self.ranges), \ 'The number of different feature map sizes anchor sizes and ' + \ 'ranges should be the same.' multi_cls_anchors = [] for i in range(len(featmap_sizes)): anchors = self.anchors_single_range( featmap_sizes[i], self.ranges[i], scale, self.sizes[i], self.rotations, device=device) # [*featmap_size, num_sizes/ranges, num_rots, box_code_size] ndim = len(featmap_sizes[i]) anchors = anchors.view(*featmap_sizes[i], -1, anchors.size(-1)) # [*featmap_size, num_sizes/ranges*num_rots, box_code_size] anchors = anchors.permute(ndim, *range(0, ndim), ndim + 1) # [num_sizes/ranges*num_rots, *featmap_size, box_code_size] multi_cls_anchors.append(anchors.reshape(-1, anchors.size(-1))) # [num_sizes/ranges*num_rots*featmap_size, box_code_size] return multi_cls_anchors ================================================ FILE: mmdet3d/core/bbox/__init__.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. from .assigners import AssignResult, BaseAssigner, MaxIoUAssigner from .coders import DeltaXYZWLHRBBoxCoder # from .bbox_target import bbox_target from .iou_calculators import (AxisAlignedBboxOverlaps3D, BboxOverlaps3D, BboxOverlapsNearest3D, axis_aligned_bbox_overlaps_3d, bbox_overlaps_3d, bbox_overlaps_nearest_3d) from .samplers import (BaseSampler, CombinedSampler, InstanceBalancedPosSampler, IoUBalancedNegSampler, PseudoSampler, RandomSampler, SamplingResult) from .structures import (BaseInstance3DBoxes, Box3DMode, CameraInstance3DBoxes, Coord3DMode, DepthInstance3DBoxes, LiDARInstance3DBoxes, get_box_type, limit_period, mono_cam_box2vis, points_cam2img, points_img2cam, xywhr2xyxyr, CustomBox) from .transforms import bbox3d2result, bbox3d2roi, bbox3d_mapping_back from .util import * __all__ = [ 'BaseSampler', 'AssignResult', 'BaseAssigner', 'MaxIoUAssigner', 'PseudoSampler', 'RandomSampler', 'InstanceBalancedPosSampler', 'IoUBalancedNegSampler', 'CombinedSampler', 'SamplingResult', 'DeltaXYZWLHRBBoxCoder', 'BboxOverlapsNearest3D', 'BboxOverlaps3D', 'bbox_overlaps_nearest_3d', 'bbox_overlaps_3d', 'AxisAlignedBboxOverlaps3D', 'axis_aligned_bbox_overlaps_3d', 'Box3DMode', 'LiDARInstance3DBoxes', 'CameraInstance3DBoxes', 'bbox3d2roi', 'bbox3d2result', 'DepthInstance3DBoxes', 'BaseInstance3DBoxes', 'bbox3d_mapping_back', 'xywhr2xyxyr', 'limit_period', 'points_cam2img', 'points_img2cam', 'get_box_type', 'Coord3DMode', 'mono_cam_box2vis' ] ================================================ FILE: mmdet3d/core/bbox/assigners/__init__.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. from mmdet.core.bbox import AssignResult, BaseAssigner, MaxIoUAssigner __all__ = ['BaseAssigner', 'MaxIoUAssigner', 'AssignResult'] ================================================ FILE: mmdet3d/core/bbox/box_np_ops.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. # TODO: clean the functions in this file and move the APIs into box structures # in the future # NOTICE: All functions in this file are valid for LiDAR or depth boxes only # if we use default parameters. import numba import numpy as np from .structures.utils import limit_period, points_cam2img, rotation_3d_in_axis def camera_to_lidar(points, r_rect, velo2cam): """Convert points in camera coordinate to lidar coordinate. Note: This function is for KITTI only. 
Args: points (np.ndarray, shape=[N, 3]): Points in camera coordinate. r_rect (np.ndarray, shape=[4, 4]): Matrix to project points in specific camera coordinate (e.g. CAM2) to CAM0. velo2cam (np.ndarray, shape=[4, 4]): Matrix to project points in camera coordinate to lidar coordinate. Returns: np.ndarray, shape=[N, 3]: Points in lidar coordinate. """ points_shape = list(points.shape[0:-1]) if points.shape[-1] == 3: points = np.concatenate([points, np.ones(points_shape + [1])], axis=-1) lidar_points = points @ np.linalg.inv((r_rect @ velo2cam).T) return lidar_points[..., :3] def box_camera_to_lidar(data, r_rect, velo2cam): """Convert boxes in camera coordinate to lidar coordinate. Note: This function is for KITTI only. Args: data (np.ndarray, shape=[N, 7]): Boxes in camera coordinate. r_rect (np.ndarray, shape=[4, 4]): Matrix to project points in specific camera coordinate (e.g. CAM2) to CAM0. velo2cam (np.ndarray, shape=[4, 4]): Matrix to project points in camera coordinate to lidar coordinate. Returns: np.ndarray, shape=[N, 3]: Boxes in lidar coordinate. """ xyz = data[:, 0:3] x_size, y_size, z_size = data[:, 3:4], data[:, 4:5], data[:, 5:6] r = data[:, 6:7] xyz_lidar = camera_to_lidar(xyz, r_rect, velo2cam) # yaw and dims also needs to be converted r_new = -r - np.pi / 2 r_new = limit_period(r_new, period=np.pi * 2) return np.concatenate([xyz_lidar, x_size, z_size, y_size, r_new], axis=1) def corners_nd(dims, origin=0.5): """Generate relative box corners based on length per dim and origin point. Args: dims (np.ndarray, shape=[N, ndim]): Array of length per dim origin (list or array or float, optional): origin point relate to smallest point. Defaults to 0.5 Returns: np.ndarray, shape=[N, 2 ** ndim, ndim]: Returned corners. point layout example: (2d) x0y0, x0y1, x1y0, x1y1; (3d) x0y0z0, x0y0z1, x0y1z0, x0y1z1, x1y0z0, x1y0z1, x1y1z0, x1y1z1 where x0 < x1, y0 < y1, z0 < z1. """ ndim = int(dims.shape[1]) corners_norm = np.stack( np.unravel_index(np.arange(2**ndim), [2] * ndim), axis=1).astype(dims.dtype) # now corners_norm has format: (2d) x0y0, x0y1, x1y0, x1y1 # (3d) x0y0z0, x0y0z1, x0y1z0, x0y1z1, x1y0z0, x1y0z1, x1y1z0, x1y1z1 # so need to convert to a format which is convenient to do other computing. # for 2d boxes, format is clockwise start with minimum point # for 3d boxes, please draw lines by your hand. if ndim == 2: # generate clockwise box corners corners_norm = corners_norm[[0, 1, 3, 2]] elif ndim == 3: corners_norm = corners_norm[[0, 1, 3, 2, 4, 5, 7, 6]] corners_norm = corners_norm - np.array(origin, dtype=dims.dtype) corners = dims.reshape([-1, 1, ndim]) * corners_norm.reshape( [1, 2**ndim, ndim]) return corners def center_to_corner_box2d(centers, dims, angles=None, origin=0.5): """Convert kitti locations, dimensions and angles to corners. format: center(xy), dims(xy), angles(counterclockwise when positive) Args: centers (np.ndarray): Locations in kitti label file with shape (N, 2). dims (np.ndarray): Dimensions in kitti label file with shape (N, 2). angles (np.ndarray, optional): Rotation_y in kitti label file with shape (N). Defaults to None. origin (list or array or float, optional): origin point relate to smallest point. Defaults to 0.5. Returns: np.ndarray: Corners with the shape of (N, 4, 2). """ # 'length' in kitti format is in x axis. # xyz(hwl)(kitti label file)<->xyz(lhw)(camera)<->z(-x)(-y)(wlh)(lidar) # center in kitti format is [0.5, 1.0, 0.5] in xyz. 
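    # Illustrative example (added comment, assumed values): for a single box
    # with center (0, 0), dims (4, 2) and origin 0.5, corners_nd() yields the
    # corners (-2, -1), (-2, 1), (2, 1), (2, -1); a non-zero angle would then
    # rotate these corners before the center offset is added below.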
corners = corners_nd(dims, origin=origin) # corners: [N, 4, 2] if angles is not None: corners = rotation_3d_in_axis(corners, angles) corners += centers.reshape([-1, 1, 2]) return corners @numba.jit(nopython=True) def depth_to_points(depth, trunc_pixel): """Convert depth map to points. Args: depth (np.array, shape=[H, W]): Depth map which the row of [0~`trunc_pixel`] are truncated. trunc_pixel (int): The number of truncated row. Returns: np.ndarray: Points in camera coordinates. """ num_pts = np.sum(depth[trunc_pixel:, ] > 0.1) points = np.zeros((num_pts, 3), dtype=depth.dtype) x = np.array([0, 0, 1], dtype=depth.dtype) k = 0 for i in range(trunc_pixel, depth.shape[0]): for j in range(depth.shape[1]): if depth[i, j] > 0.1: x = np.array([j, i, 1], dtype=depth.dtype) points[k] = x * depth[i, j] k += 1 return points def depth_to_lidar_points(depth, trunc_pixel, P2, r_rect, velo2cam): """Convert depth map to points in lidar coordinate. Args: depth (np.array, shape=[H, W]): Depth map which the row of [0~`trunc_pixel`] are truncated. trunc_pixel (int): The number of truncated row. P2 (p.array, shape=[4, 4]): Intrinsics of Camera2. r_rect (np.ndarray, shape=[4, 4]): Matrix to project points in specific camera coordinate (e.g. CAM2) to CAM0. velo2cam (np.ndarray, shape=[4, 4]): Matrix to project points in camera coordinate to lidar coordinate. Returns: np.ndarray: Points in lidar coordinates. """ pts = depth_to_points(depth, trunc_pixel) points_shape = list(pts.shape[0:-1]) points = np.concatenate([pts, np.ones(points_shape + [1])], axis=-1) points = points @ np.linalg.inv(P2.T) lidar_points = camera_to_lidar(points, r_rect, velo2cam) return lidar_points def center_to_corner_box3d(centers, dims, angles=None, origin=(0.5, 1.0, 0.5), axis=1): """Convert kitti locations, dimensions and angles to corners. Args: centers (np.ndarray): Locations in kitti label file with shape (N, 3). dims (np.ndarray): Dimensions in kitti label file with shape (N, 3). angles (np.ndarray, optional): Rotation_y in kitti label file with shape (N). Defaults to None. origin (list or array or float, optional): Origin point relate to smallest point. Use (0.5, 1.0, 0.5) in camera and (0.5, 0.5, 0) in lidar. Defaults to (0.5, 1.0, 0.5). axis (int, optional): Rotation axis. 1 for camera and 2 for lidar. Defaults to 1. Returns: np.ndarray: Corners with the shape of (N, 8, 3). """ # 'length' in kitti format is in x axis. # yzx(hwl)(kitti label file)<->xyz(lhw)(camera)<->z(-x)(-y)(lwh)(lidar) # center in kitti format is [0.5, 1.0, 0.5] in xyz. corners = corners_nd(dims, origin=origin) # corners: [N, 8, 3] if angles is not None: corners = rotation_3d_in_axis(corners, angles, axis=axis) corners += centers.reshape([-1, 1, 3]) return corners @numba.jit(nopython=True) def box2d_to_corner_jit(boxes): """Convert box2d to corner. Args: boxes (np.ndarray, shape=[N, 5]): Boxes2d with rotation. Returns: box_corners (np.ndarray, shape=[N, 4, 2]): Box corners. 
""" num_box = boxes.shape[0] corners_norm = np.zeros((4, 2), dtype=boxes.dtype) corners_norm[1, 1] = 1.0 corners_norm[2] = 1.0 corners_norm[3, 0] = 1.0 corners_norm -= np.array([0.5, 0.5], dtype=boxes.dtype) corners = boxes.reshape(num_box, 1, 5)[:, :, 2:4] * corners_norm.reshape( 1, 4, 2) rot_mat_T = np.zeros((2, 2), dtype=boxes.dtype) box_corners = np.zeros((num_box, 4, 2), dtype=boxes.dtype) for i in range(num_box): rot_sin = np.sin(boxes[i, -1]) rot_cos = np.cos(boxes[i, -1]) rot_mat_T[0, 0] = rot_cos rot_mat_T[0, 1] = rot_sin rot_mat_T[1, 0] = -rot_sin rot_mat_T[1, 1] = rot_cos box_corners[i] = corners[i] @ rot_mat_T + boxes[i, :2] return box_corners @numba.njit def corner_to_standup_nd_jit(boxes_corner): """Convert boxes_corner to aligned (min-max) boxes. Args: boxes_corner (np.ndarray, shape=[N, 2**dim, dim]): Boxes corners. Returns: np.ndarray, shape=[N, dim*2]: Aligned (min-max) boxes. """ num_boxes = boxes_corner.shape[0] ndim = boxes_corner.shape[-1] result = np.zeros((num_boxes, ndim * 2), dtype=boxes_corner.dtype) for i in range(num_boxes): for j in range(ndim): result[i, j] = np.min(boxes_corner[i, :, j]) for j in range(ndim): result[i, j + ndim] = np.max(boxes_corner[i, :, j]) return result @numba.jit(nopython=True) def corner_to_surfaces_3d_jit(corners): """Convert 3d box corners from corner function above to surfaces that normal vectors all direct to internal. Args: corners (np.ndarray): 3d box corners with the shape of (N, 8, 3). Returns: np.ndarray: Surfaces with the shape of (N, 6, 4, 3). """ # box_corners: [N, 8, 3], must from corner functions in this module num_boxes = corners.shape[0] surfaces = np.zeros((num_boxes, 6, 4, 3), dtype=corners.dtype) corner_idxes = np.array([ 0, 1, 2, 3, 7, 6, 5, 4, 0, 3, 7, 4, 1, 5, 6, 2, 0, 4, 5, 1, 3, 2, 6, 7 ]).reshape(6, 4) for i in range(num_boxes): for j in range(6): for k in range(4): surfaces[i, j, k] = corners[i, corner_idxes[j, k]] return surfaces def rotation_points_single_angle(points, angle, axis=0): """Rotate points with a single angle. Args: points (np.ndarray, shape=[N, 3]]): angle (np.ndarray, shape=[1]]): axis (int, optional): Axis to rotate at. Defaults to 0. Returns: np.ndarray: Rotated points. """ # points: [N, 3] rot_sin = np.sin(angle) rot_cos = np.cos(angle) if axis == 1: rot_mat_T = np.array( [[rot_cos, 0, rot_sin], [0, 1, 0], [-rot_sin, 0, rot_cos]], dtype=points.dtype) elif axis == 2 or axis == -1: rot_mat_T = np.array( [[rot_cos, rot_sin, 0], [-rot_sin, rot_cos, 0], [0, 0, 1]], dtype=points.dtype) elif axis == 0: rot_mat_T = np.array( [[1, 0, 0], [0, rot_cos, rot_sin], [0, -rot_sin, rot_cos]], dtype=points.dtype) else: raise ValueError('axis should in range') return points @ rot_mat_T, rot_mat_T def box3d_to_bbox(box3d, P2): """Convert box3d in camera coordinates to bbox in image coordinates. Args: box3d (np.ndarray, shape=[N, 7]): Boxes in camera coordinate. P2 (np.array, shape=[4, 4]): Intrinsics of Camera2. Returns: np.ndarray, shape=[N, 4]: Boxes 2d in image coordinates. """ box_corners = center_to_corner_box3d( box3d[:, :3], box3d[:, 3:6], box3d[:, 6], [0.5, 1.0, 0.5], axis=1) box_corners_in_image = points_cam2img(box_corners, P2) # box_corners_in_image: [N, 8, 2] minxy = np.min(box_corners_in_image, axis=1) maxxy = np.max(box_corners_in_image, axis=1) bbox = np.concatenate([minxy, maxxy], axis=1) return bbox def corner_to_surfaces_3d(corners): """convert 3d box corners from corner function above to surfaces that normal vectors all direct to internal. 
Args: corners (np.ndarray): 3D box corners with shape of (N, 8, 3). Returns: np.ndarray: Surfaces with the shape of (N, 6, 4, 3). """ # box_corners: [N, 8, 3], must from corner functions in this module surfaces = np.array([ [corners[:, 0], corners[:, 1], corners[:, 2], corners[:, 3]], [corners[:, 7], corners[:, 6], corners[:, 5], corners[:, 4]], [corners[:, 0], corners[:, 3], corners[:, 7], corners[:, 4]], [corners[:, 1], corners[:, 5], corners[:, 6], corners[:, 2]], [corners[:, 0], corners[:, 4], corners[:, 5], corners[:, 1]], [corners[:, 3], corners[:, 2], corners[:, 6], corners[:, 7]], ]).transpose([2, 0, 1, 3]) return surfaces def points_in_rbbox(points, rbbox, z_axis=2, origin=(0.5, 0.5, 0)): """Check points in rotated bbox and return indices. Note: This function is for counterclockwise boxes. Args: points (np.ndarray, shape=[N, 3+dim]): Points to query. rbbox (np.ndarray, shape=[M, 7]): Boxes3d with rotation. z_axis (int, optional): Indicate which axis is height. Defaults to 2. origin (tuple[int], optional): Indicate the position of box center. Defaults to (0.5, 0.5, 0). Returns: np.ndarray, shape=[N, M]: Indices of points in each box. """ # TODO: this function is different from PointCloud3D, be careful # when start to use nuscene, check the input rbbox_corners = center_to_corner_box3d( rbbox[:, :3], rbbox[:, 3:6], rbbox[:, 6], origin=origin, axis=z_axis) surfaces = corner_to_surfaces_3d(rbbox_corners) indices = points_in_convex_polygon_3d_jit(points[:, :3], surfaces) return indices def minmax_to_corner_2d(minmax_box): """Convert minmax box to corners2d. Args: minmax_box (np.ndarray, shape=[N, dims]): minmax boxes. Returns: np.ndarray: 2d corners of boxes """ ndim = minmax_box.shape[-1] // 2 center = minmax_box[..., :ndim] dims = minmax_box[..., ndim:] - center return center_to_corner_box2d(center, dims, origin=0.0) def create_anchors_3d_range(feature_size, anchor_range, sizes=((3.9, 1.6, 1.56), ), rotations=(0, np.pi / 2), dtype=np.float32): """Create anchors 3d by range. Args: feature_size (list[float] | tuple[float]): Feature map size. It is either a list of a tuple of [D, H, W](in order of z, y, and x). anchor_range (torch.Tensor | list[float]): Range of anchors with shape [6]. The order is consistent with that of anchors, i.e., (x_min, y_min, z_min, x_max, y_max, z_max). sizes (list[list] | np.ndarray | torch.Tensor, optional): Anchor size with shape [N, 3], in order of x, y, z. Defaults to ((3.9, 1.6, 1.56), ). rotations (list[float] | np.ndarray | torch.Tensor, optional): Rotations of anchors in a single feature grid. Defaults to (0, np.pi / 2). dtype (type, optional): Data type. Defaults to np.float32. Returns: np.ndarray: Range based anchors with shape of (*feature_size, num_sizes, num_rots, 7). 
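    Note (illustrative): this mirrors
    ``Anchor3DRangeGenerator.anchors_single_range`` in NumPy; the torch
    version in ``anchor_3d_generator.py`` notes it is roughly 5x faster than
    a NumPy implementation.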
""" anchor_range = np.array(anchor_range, dtype) z_centers = np.linspace( anchor_range[2], anchor_range[5], feature_size[0], dtype=dtype) y_centers = np.linspace( anchor_range[1], anchor_range[4], feature_size[1], dtype=dtype) x_centers = np.linspace( anchor_range[0], anchor_range[3], feature_size[2], dtype=dtype) sizes = np.reshape(np.array(sizes, dtype=dtype), [-1, 3]) rotations = np.array(rotations, dtype=dtype) rets = np.meshgrid( x_centers, y_centers, z_centers, rotations, indexing='ij') tile_shape = [1] * 5 tile_shape[-2] = int(sizes.shape[0]) for i in range(len(rets)): rets[i] = np.tile(rets[i][..., np.newaxis, :], tile_shape) rets[i] = rets[i][..., np.newaxis] # for concat sizes = np.reshape(sizes, [1, 1, 1, -1, 1, 3]) tile_size_shape = list(rets[0].shape) tile_size_shape[3] = 1 sizes = np.tile(sizes, tile_size_shape) rets.insert(3, sizes) ret = np.concatenate(rets, axis=-1) return np.transpose(ret, [2, 1, 0, 3, 4, 5]) def center_to_minmax_2d(centers, dims, origin=0.5): """Center to minmax. Args: centers (np.ndarray): Center points. dims (np.ndarray): Dimensions. origin (list or array or float, optional): Origin point relate to smallest point. Defaults to 0.5. Returns: np.ndarray: Minmax points. """ if origin == 0.5: return np.concatenate([centers - dims / 2, centers + dims / 2], axis=-1) corners = center_to_corner_box2d(centers, dims, origin=origin) return corners[:, [0, 2]].reshape([-1, 4]) def rbbox2d_to_near_bbox(rbboxes): """convert rotated bbox to nearest 'standing' or 'lying' bbox. Args: rbboxes (np.ndarray): Rotated bboxes with shape of (N, 5(x, y, xdim, ydim, rad)). Returns: np.ndarray: Bounding boxes with the shape of (N, 4(xmin, ymin, xmax, ymax)). """ rots = rbboxes[..., -1] rots_0_pi_div_2 = np.abs(limit_period(rots, 0.5, np.pi)) cond = (rots_0_pi_div_2 > np.pi / 4)[..., np.newaxis] bboxes_center = np.where(cond, rbboxes[:, [0, 1, 3, 2]], rbboxes[:, :4]) bboxes = center_to_minmax_2d(bboxes_center[:, :2], bboxes_center[:, 2:]) return bboxes @numba.jit(nopython=True) def iou_jit(boxes, query_boxes, mode='iou', eps=0.0): """Calculate box iou. Note that jit version runs ~10x faster than the box_overlaps function in mmdet3d.core.evaluation. Note: This function is for counterclockwise boxes. Args: boxes (np.ndarray): Input bounding boxes with shape of (N, 4). query_boxes (np.ndarray): Query boxes with shape of (K, 4). mode (str, optional): IoU mode. Defaults to 'iou'. eps (float, optional): Value added to denominator. Defaults to 0. Returns: np.ndarray: Overlap between boxes and query_boxes with the shape of [N, K]. """ N = boxes.shape[0] K = query_boxes.shape[0] overlaps = np.zeros((N, K), dtype=boxes.dtype) for k in range(K): box_area = ((query_boxes[k, 2] - query_boxes[k, 0] + eps) * (query_boxes[k, 3] - query_boxes[k, 1] + eps)) for n in range(N): iw = ( min(boxes[n, 2], query_boxes[k, 2]) - max(boxes[n, 0], query_boxes[k, 0]) + eps) if iw > 0: ih = ( min(boxes[n, 3], query_boxes[k, 3]) - max(boxes[n, 1], query_boxes[k, 1]) + eps) if ih > 0: if mode == 'iou': ua = ((boxes[n, 2] - boxes[n, 0] + eps) * (boxes[n, 3] - boxes[n, 1] + eps) + box_area - iw * ih) else: ua = ((boxes[n, 2] - boxes[n, 0] + eps) * (boxes[n, 3] - boxes[n, 1] + eps)) overlaps[n, k] = iw * ih / ua return overlaps def projection_matrix_to_CRT_kitti(proj): """Split projection matrix of KITTI. Note: This function is for KITTI only. P = C @ [R|T] C is upper triangular matrix, so we need to inverse CR and use QR stable for all kitti camera projection matrix. 
Args: proj (p.array, shape=[4, 4]): Intrinsics of camera. Returns: tuple[np.ndarray]: Splited matrix of C, R and T. """ CR = proj[0:3, 0:3] CT = proj[0:3, 3] RinvCinv = np.linalg.inv(CR) Rinv, Cinv = np.linalg.qr(RinvCinv) C = np.linalg.inv(Cinv) R = np.linalg.inv(Rinv) T = Cinv @ CT return C, R, T def remove_outside_points(points, rect, Trv2c, P2, image_shape): """Remove points which are outside of image. Note: This function is for KITTI only. Args: points (np.ndarray, shape=[N, 3+dims]): Total points. rect (np.ndarray, shape=[4, 4]): Matrix to project points in specific camera coordinate (e.g. CAM2) to CAM0. Trv2c (np.ndarray, shape=[4, 4]): Matrix to project points in camera coordinate to lidar coordinate. P2 (p.array, shape=[4, 4]): Intrinsics of Camera2. image_shape (list[int]): Shape of image. Returns: np.ndarray, shape=[N, 3+dims]: Filtered points. """ # 5x faster than remove_outside_points_v1(2ms vs 10ms) C, R, T = projection_matrix_to_CRT_kitti(P2) image_bbox = [0, 0, image_shape[1], image_shape[0]] frustum = get_frustum(image_bbox, C) frustum -= T frustum = np.linalg.inv(R) @ frustum.T frustum = camera_to_lidar(frustum.T, rect, Trv2c) frustum_surfaces = corner_to_surfaces_3d_jit(frustum[np.newaxis, ...]) indices = points_in_convex_polygon_3d_jit(points[:, :3], frustum_surfaces) points = points[indices.reshape([-1])] return points def get_frustum(bbox_image, C, near_clip=0.001, far_clip=100): """Get frustum corners in camera coordinates. Args: bbox_image (list[int]): box in image coordinates. C (np.ndarray): Intrinsics. near_clip (float, optional): Nearest distance of frustum. Defaults to 0.001. far_clip (float, optional): Farthest distance of frustum. Defaults to 100. Returns: np.ndarray, shape=[8, 3]: coordinates of frustum corners. """ fku = C[0, 0] fkv = -C[1, 1] u0v0 = C[0:2, 2] z_points = np.array( [near_clip] * 4 + [far_clip] * 4, dtype=C.dtype)[:, np.newaxis] b = bbox_image box_corners = np.array( [[b[0], b[1]], [b[0], b[3]], [b[2], b[3]], [b[2], b[1]]], dtype=C.dtype) near_box_corners = (box_corners - u0v0) / np.array( [fku / near_clip, -fkv / near_clip], dtype=C.dtype) far_box_corners = (box_corners - u0v0) / np.array( [fku / far_clip, -fkv / far_clip], dtype=C.dtype) ret_xy = np.concatenate([near_box_corners, far_box_corners], axis=0) # [8, 2] ret_xyz = np.concatenate([ret_xy, z_points], axis=1) return ret_xyz def surface_equ_3d(polygon_surfaces): """ Args: polygon_surfaces (np.ndarray): Polygon surfaces with shape of [num_polygon, max_num_surfaces, max_num_points_of_surface, 3]. All surfaces' normal vector must direct to internal. Max_num_points_of_surface must at least 3. Returns: tuple: normal vector and its direction. """ # return [a, b, c], d in ax+by+cz+d=0 # polygon_surfaces: [num_polygon, num_surfaces, num_points_of_polygon, 3] surface_vec = polygon_surfaces[:, :, :2, :] - \ polygon_surfaces[:, :, 1:3, :] # normal_vec: [..., 3] normal_vec = np.cross(surface_vec[:, :, 0, :], surface_vec[:, :, 1, :]) # print(normal_vec.shape, points[..., 0, :].shape) # d = -np.inner(normal_vec, points[..., 0, :]) d = np.einsum('aij, aij->ai', normal_vec, polygon_surfaces[:, :, 0, :]) return normal_vec, -d @numba.njit def _points_in_convex_polygon_3d_jit(points, polygon_surfaces, normal_vec, d, num_surfaces): """ Args: points (np.ndarray): Input points with shape of (num_points, 3). polygon_surfaces (np.ndarray): Polygon surfaces with shape of (num_polygon, max_num_surfaces, max_num_points_of_surface, 3). All surfaces' normal vector must direct to internal. 
Max_num_points_of_surface must be at least 3. normal_vec (np.ndarray): Normal vector of polygon_surfaces. d (np.ndarray): Offsets of the surface planes (the d in ax + by + cz + d = 0). num_surfaces (np.ndarray): Number of surfaces a polygon contains shape of (num_polygon). Returns: np.ndarray: Result matrix with the shape of [num_points, num_polygon]. """ max_num_surfaces, max_num_points_of_surface = polygon_surfaces.shape[1:3] num_points = points.shape[0] num_polygons = polygon_surfaces.shape[0] ret = np.ones((num_points, num_polygons), dtype=np.bool_) sign = 0.0 for i in range(num_points): for j in range(num_polygons): for k in range(max_num_surfaces): if k > num_surfaces[j]: break sign = ( points[i, 0] * normal_vec[j, k, 0] + points[i, 1] * normal_vec[j, k, 1] + points[i, 2] * normal_vec[j, k, 2] + d[j, k]) if sign >= 0: ret[i, j] = False break return ret
def points_in_convex_polygon_3d_jit(points, polygon_surfaces, num_surfaces=None): """Check whether points are inside 3D convex polygons. Args: points (np.ndarray): Input points with shape of (num_points, 3). polygon_surfaces (np.ndarray): Polygon surfaces with shape of (num_polygon, max_num_surfaces, max_num_points_of_surface, 3). All surfaces' normal vector must direct to internal. Max_num_points_of_surface must be at least 3. num_surfaces (np.ndarray, optional): Number of surfaces a polygon contains shape of (num_polygon). Defaults to None. Returns: np.ndarray: Result matrix with the shape of [num_points, num_polygon]. """ max_num_surfaces, max_num_points_of_surface = polygon_surfaces.shape[1:3] # num_points = points.shape[0] num_polygons = polygon_surfaces.shape[0] if num_surfaces is None: num_surfaces = np.full((num_polygons, ), 9999999, dtype=np.int64) normal_vec, d = surface_equ_3d(polygon_surfaces[:, :, :3, :]) # normal_vec: [num_polygon, max_num_surfaces, 3] # d: [num_polygon, max_num_surfaces] return _points_in_convex_polygon_3d_jit(points, polygon_surfaces, normal_vec, d, num_surfaces)
@numba.njit def points_in_convex_polygon_jit(points, polygon, clockwise=False): """Check whether points are inside 2D convex polygons. True when a point lies in the polygon. Args: points (np.ndarray): Input points with the shape of [num_points, 2]. polygon (np.ndarray): Input polygon with the shape of [num_polygon, num_points_of_polygon, 2]. clockwise (bool, optional): Indicate polygon is clockwise. Defaults to False. Returns: np.ndarray: Result matrix with the shape of [num_points, num_polygon]. """ # first convert polygon to directed lines num_points_of_polygon = polygon.shape[1] num_points = points.shape[0] num_polygons = polygon.shape[0] # vec for all the polygons if clockwise: vec1 = polygon - polygon[:, np.array([num_points_of_polygon - 1] + list( range(num_points_of_polygon - 1))), :] else: vec1 = polygon[:, np.array([num_points_of_polygon - 1] + list(range(num_points_of_polygon - 1))), :] - polygon ret = np.zeros((num_points, num_polygons), dtype=np.bool_) success = True cross = 0.0 for i in range(num_points): for j in range(num_polygons): success = True for k in range(num_points_of_polygon): vec = vec1[j, k] cross = vec[1] * (polygon[j, k, 0] - points[i, 0]) cross -= vec[0] * (polygon[j, k, 1] - points[i, 1]) if cross >= 0: success = False break ret[i, j] = success return ret
def boxes3d_to_corners3d_lidar(boxes3d, bottom_center=True): """Convert kitti center boxes to corners. (Corner layout: bottom face corners 0-1-2-3 and top face corners 4-5-6-7, with corner 4 above 0, 5 above 1, 6 above 2 and 7 above 3.) Note: This function is for LiDAR boxes only.
Args: boxes3d (np.ndarray): Boxes with shape of (N, 7) [x, y, z, x_size, y_size, z_size, ry] in LiDAR coords, see the definition of ry in KITTI dataset. bottom_center (bool, optional): Whether z is on the bottom center of object. Defaults to True. Returns: np.ndarray: Box corners with the shape of [N, 8, 3]. """ boxes_num = boxes3d.shape[0] x_size, y_size, z_size = boxes3d[:, 3], boxes3d[:, 4], boxes3d[:, 5] x_corners = np.array([ x_size / 2., -x_size / 2., -x_size / 2., x_size / 2., x_size / 2., -x_size / 2., -x_size / 2., x_size / 2. ], dtype=np.float32).T y_corners = np.array([ -y_size / 2., -y_size / 2., y_size / 2., y_size / 2., -y_size / 2., -y_size / 2., y_size / 2., y_size / 2. ], dtype=np.float32).T if bottom_center: z_corners = np.zeros((boxes_num, 8), dtype=np.float32) z_corners[:, 4:8] = z_size.reshape(boxes_num, 1).repeat( 4, axis=1) # (N, 8) else: z_corners = np.array([ -z_size / 2., -z_size / 2., -z_size / 2., -z_size / 2., z_size / 2., z_size / 2., z_size / 2., z_size / 2. ], dtype=np.float32).T ry = boxes3d[:, 6] zeros, ones = np.zeros( ry.size, dtype=np.float32), np.ones( ry.size, dtype=np.float32) rot_list = np.array([[np.cos(ry), np.sin(ry), zeros], [-np.sin(ry), np.cos(ry), zeros], [zeros, zeros, ones]]) # (3, 3, N) R_list = np.transpose(rot_list, (2, 0, 1)) # (N, 3, 3) temp_corners = np.concatenate((x_corners.reshape( -1, 8, 1), y_corners.reshape(-1, 8, 1), z_corners.reshape(-1, 8, 1)), axis=2) # (N, 8, 3) rotated_corners = np.matmul(temp_corners, R_list) # (N, 8, 3) x_corners = rotated_corners[:, :, 0] y_corners = rotated_corners[:, :, 1] z_corners = rotated_corners[:, :, 2] x_loc, y_loc, z_loc = boxes3d[:, 0], boxes3d[:, 1], boxes3d[:, 2] x = x_loc.reshape(-1, 1) + x_corners.reshape(-1, 8) y = y_loc.reshape(-1, 1) + y_corners.reshape(-1, 8) z = z_loc.reshape(-1, 1) + z_corners.reshape(-1, 8) corners = np.concatenate( (x.reshape(-1, 8, 1), y.reshape(-1, 8, 1), z.reshape(-1, 8, 1)), axis=2) return corners.astype(np.float32) ================================================ FILE: mmdet3d/core/bbox/coders/__init__.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. from mmdet.core.bbox import build_bbox_coder from .anchor_free_bbox_coder import AnchorFreeBBoxCoder from .centerpoint_bbox_coders import CenterPointBBoxCoder from .delta_xyzwhlr_bbox_coder import DeltaXYZWLHRBBoxCoder from .fcos3d_bbox_coder import FCOS3DBBoxCoder from .groupfree3d_bbox_coder import GroupFree3DBBoxCoder from .monoflex_bbox_coder import MonoFlexCoder from .partial_bin_based_bbox_coder import PartialBinBasedBBoxCoder from .pgd_bbox_coder import PGDBBoxCoder from .point_xyzwhlr_bbox_coder import PointXYZWHLRBBoxCoder from .smoke_bbox_coder import SMOKECoder __all__ = [ 'build_bbox_coder', 'DeltaXYZWLHRBBoxCoder', 'PartialBinBasedBBoxCoder', 'CenterPointBBoxCoder', 'AnchorFreeBBoxCoder', 'GroupFree3DBBoxCoder', 'PointXYZWHLRBBoxCoder', 'FCOS3DBBoxCoder', 'PGDBBoxCoder', 'SMOKECoder', 'MonoFlexCoder' ] ================================================ FILE: mmdet3d/core/bbox/coders/anchor_free_bbox_coder.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import numpy as np import torch from mmdet.core.bbox.builder import BBOX_CODERS from .partial_bin_based_bbox_coder import PartialBinBasedBBoxCoder @BBOX_CODERS.register_module() class AnchorFreeBBoxCoder(PartialBinBasedBBoxCoder): """Anchor free bbox coder for 3D boxes. Args: num_dir_bins (int): Number of bins to encode direction angle. 
with_rot (bool): Whether the bbox is with rotation. """ def __init__(self, num_dir_bins, with_rot=True): super(AnchorFreeBBoxCoder, self).__init__( num_dir_bins, 0, [], with_rot=with_rot) self.num_dir_bins = num_dir_bins self.with_rot = with_rot def encode(self, gt_bboxes_3d, gt_labels_3d): """Encode ground truth to prediction targets. Args: gt_bboxes_3d (BaseInstance3DBoxes): Ground truth bboxes with shape (n, 7). gt_labels_3d (torch.Tensor): Ground truth classes. Returns: tuple: Targets of center, size and direction. """ # generate center target center_target = gt_bboxes_3d.gravity_center # generate bbox size target size_res_target = gt_bboxes_3d.dims / 2 # generate dir target box_num = gt_labels_3d.shape[0] if self.with_rot: (dir_class_target, dir_res_target) = self.angle2class(gt_bboxes_3d.yaw) dir_res_target /= (2 * np.pi / self.num_dir_bins) else: dir_class_target = gt_labels_3d.new_zeros(box_num) dir_res_target = gt_bboxes_3d.tensor.new_zeros(box_num) return (center_target, size_res_target, dir_class_target, dir_res_target) def decode(self, bbox_out): """Decode predicted parts to bbox3d. Args: bbox_out (dict): Predictions from model, should contain keys below. - center: predicted bottom center of bboxes. - dir_class: predicted bbox direction class. - dir_res: predicted bbox direction residual. - size: predicted bbox size. Returns: torch.Tensor: Decoded bbox3d with shape (batch, n, 7). """ center = bbox_out['center'] batch_size, num_proposal = center.shape[:2] # decode heading angle if self.with_rot: dir_class = torch.argmax(bbox_out['dir_class'], -1) dir_res = torch.gather(bbox_out['dir_res'], 2, dir_class.unsqueeze(-1)) dir_res.squeeze_(2) dir_angle = self.class2angle(dir_class, dir_res).reshape( batch_size, num_proposal, 1) else: dir_angle = center.new_zeros(batch_size, num_proposal, 1) # decode bbox size bbox_size = torch.clamp(bbox_out['size'] * 2, min=0.1) bbox3d = torch.cat([center, bbox_size, dir_angle], dim=-1) return bbox3d def split_pred(self, cls_preds, reg_preds, base_xyz): """Split predicted features to specific parts. Args: cls_preds (torch.Tensor): Class predicted features to split. reg_preds (torch.Tensor): Regression predicted features to split. base_xyz (torch.Tensor): Coordinates of points. Returns: dict[str, torch.Tensor]: Split results. """ results = {} results['obj_scores'] = cls_preds start, end = 0, 0 reg_preds_trans = reg_preds.transpose(2, 1) # decode center end += 3 # (batch_size, num_proposal, 3) results['center_offset'] = reg_preds_trans[..., start:end] results['center'] = base_xyz.detach() + reg_preds_trans[..., start:end] start = end # decode center end += 3 # (batch_size, num_proposal, 3) results['size'] = reg_preds_trans[..., start:end] start = end # decode direction end += self.num_dir_bins results['dir_class'] = reg_preds_trans[..., start:end] start = end end += self.num_dir_bins dir_res_norm = reg_preds_trans[..., start:end] start = end results['dir_res_norm'] = dir_res_norm results['dir_res'] = dir_res_norm * (2 * np.pi / self.num_dir_bins) return results ================================================ FILE: mmdet3d/core/bbox/coders/centerpoint_bbox_coders.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import torch from mmdet.core.bbox import BaseBBoxCoder from mmdet.core.bbox.builder import BBOX_CODERS @BBOX_CODERS.register_module() class CenterPointBBoxCoder(BaseBBoxCoder): """Bbox coder for CenterPoint. Args: pc_range (list[float]): Range of point cloud. 
out_size_factor (int): Downsample factor of the model. voxel_size (list[float]): Size of voxel. post_center_range (list[float], optional): Limit of the center. Default: None. max_num (int, optional): Max number to be kept. Default: 100. score_threshold (float, optional): Threshold to filter boxes based on score. Default: None. code_size (int, optional): Code size of bboxes. Default: 9 """ def __init__(self, pc_range, out_size_factor, voxel_size, post_center_range=None, max_num=100, score_threshold=None, code_size=9): self.pc_range = pc_range self.out_size_factor = out_size_factor self.voxel_size = voxel_size self.post_center_range = post_center_range self.max_num = max_num self.score_threshold = score_threshold self.code_size = code_size def _gather_feat(self, feats, inds, feat_masks=None): """Given feats and indexes, returns the gathered feats. Args: feats (torch.Tensor): Features to be transposed and gathered with the shape of [B, 2, W, H]. inds (torch.Tensor): Indexes with the shape of [B, N]. feat_masks (torch.Tensor, optional): Mask of the feats. Default: None. Returns: torch.Tensor: Gathered feats. """ dim = feats.size(2) inds = inds.unsqueeze(2).expand(inds.size(0), inds.size(1), dim) feats = feats.gather(1, inds) if feat_masks is not None: feat_masks = feat_masks.unsqueeze(2).expand_as(feats) feats = feats[feat_masks] feats = feats.view(-1, dim) return feats def _topk(self, scores, K=80): """Get indexes based on scores. Args: scores (torch.Tensor): scores with the shape of [B, N, W, H]. K (int, optional): Number to be kept. Defaults to 80. Returns: tuple[torch.Tensor] torch.Tensor: Selected scores with the shape of [B, K]. torch.Tensor: Selected indexes with the shape of [B, K]. torch.Tensor: Selected classes with the shape of [B, K]. torch.Tensor: Selected y coord with the shape of [B, K]. torch.Tensor: Selected x coord with the shape of [B, K]. """ batch, cat, height, width = scores.size() topk_scores, topk_inds = torch.topk(scores.view(batch, cat, -1), K) topk_inds = topk_inds % (height * width) topk_ys = (topk_inds.float() / torch.tensor(width, dtype=torch.float)).int().float() topk_xs = (topk_inds % width).int().float() topk_score, topk_ind = torch.topk(topk_scores.view(batch, -1), K) topk_clses = (topk_ind / torch.tensor(K, dtype=torch.float)).int() topk_inds = self._gather_feat(topk_inds.view(batch, -1, 1), topk_ind).view(batch, K) topk_ys = self._gather_feat(topk_ys.view(batch, -1, 1), topk_ind).view(batch, K) topk_xs = self._gather_feat(topk_xs.view(batch, -1, 1), topk_ind).view(batch, K) return topk_score, topk_inds, topk_clses, topk_ys, topk_xs def _transpose_and_gather_feat(self, feat, ind): """Given feats and indexes, returns the transposed and gathered feats. Args: feat (torch.Tensor): Features to be transposed and gathered with the shape of [B, 2, W, H]. ind (torch.Tensor): Indexes with the shape of [B, N]. Returns: torch.Tensor: Transposed and gathered feats. """ feat = feat.permute(0, 2, 3, 1).contiguous() feat = feat.view(feat.size(0), -1, feat.size(3)) feat = self._gather_feat(feat, ind) return feat def encode(self): pass def decode(self, heat, rot_sine, rot_cosine, hei, dim, vel, reg=None, task_id=-1): """Decode bboxes. Args: heat (torch.Tensor): Heatmap with the shape of [B, N, W, H]. rot_sine (torch.Tensor): Sine of rotation with the shape of [B, 1, W, H]. rot_cosine (torch.Tensor): Cosine of rotation with the shape of [B, 1, W, H]. hei (torch.Tensor): Height of the boxes with the shape of [B, 1, W, H]. 
dim (torch.Tensor): Dim of the boxes with the shape of [B, 1, W, H]. vel (torch.Tensor): Velocity with the shape of [B, 1, W, H]. reg (torch.Tensor, optional): Regression value of the boxes in 2D with the shape of [B, 2, W, H]. Default: None. task_id (int, optional): Index of task. Default: -1. Returns: list[dict]: Decoded boxes. """ batch, cat, _, _ = heat.size() scores, inds, clses, ys, xs = self._topk(heat, K=self.max_num) if reg is not None: reg = self._transpose_and_gather_feat(reg, inds) reg = reg.view(batch, self.max_num, 2) xs = xs.view(batch, self.max_num, 1) + reg[:, :, 0:1] ys = ys.view(batch, self.max_num, 1) + reg[:, :, 1:2] else: xs = xs.view(batch, self.max_num, 1) + 0.5 ys = ys.view(batch, self.max_num, 1) + 0.5 # rotation value and direction label rot_sine = self._transpose_and_gather_feat(rot_sine, inds) rot_sine = rot_sine.view(batch, self.max_num, 1) rot_cosine = self._transpose_and_gather_feat(rot_cosine, inds) rot_cosine = rot_cosine.view(batch, self.max_num, 1) rot = torch.atan2(rot_sine, rot_cosine) # height in the bev hei = self._transpose_and_gather_feat(hei, inds) hei = hei.view(batch, self.max_num, 1) # dim of the box dim = self._transpose_and_gather_feat(dim, inds) dim = dim.view(batch, self.max_num, 3) # class label clses = clses.view(batch, self.max_num).float() scores = scores.view(batch, self.max_num) xs = xs.view( batch, self.max_num, 1) * self.out_size_factor * self.voxel_size[0] + self.pc_range[0] ys = ys.view( batch, self.max_num, 1) * self.out_size_factor * self.voxel_size[1] + self.pc_range[1] if vel is None: # KITTI FORMAT final_box_preds = torch.cat([xs, ys, hei, dim, rot], dim=2) else: # exist velocity, nuscene format vel = self._transpose_and_gather_feat(vel, inds) vel = vel.view(batch, self.max_num, 2) final_box_preds = torch.cat([xs, ys, hei, dim, rot, vel], dim=2) final_scores = scores final_preds = clses # use score threshold if self.score_threshold is not None: thresh_mask = final_scores > self.score_threshold if self.post_center_range is not None: self.post_center_range = torch.tensor( self.post_center_range, device=heat.device) mask = (final_box_preds[..., :3] >= self.post_center_range[:3]).all(2) mask &= (final_box_preds[..., :3] <= self.post_center_range[3:]).all(2) predictions_dicts = [] for i in range(batch): cmask = mask[i, :] if self.score_threshold: cmask &= thresh_mask[i] boxes3d = final_box_preds[i, cmask] scores = final_scores[i, cmask] labels = final_preds[i, cmask] predictions_dict = { 'bboxes': boxes3d, 'scores': scores, 'labels': labels } predictions_dicts.append(predictions_dict) else: raise NotImplementedError( 'Need to reorganize output as a batch, only ' 'support post_center_range is not None for now!') return predictions_dicts ================================================ FILE: mmdet3d/core/bbox/coders/delta_xyzwhlr_bbox_coder.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import torch from mmdet.core.bbox import BaseBBoxCoder from mmdet.core.bbox.builder import BBOX_CODERS @BBOX_CODERS.register_module() class DeltaXYZWLHRBBoxCoder(BaseBBoxCoder): """Bbox Coder for 3D boxes. Args: code_size (int): The dimension of boxes to be encoded. """ def __init__(self, code_size=7): super(DeltaXYZWLHRBBoxCoder, self).__init__() self.code_size = code_size @staticmethod def encode(src_boxes, dst_boxes): """Get box regression transformation deltas (dx, dy, dz, dx_size, dy_size, dz_size, dr, dv*) that can be used to transform the `src_boxes` into the `target_boxes`. 
Args: src_boxes (torch.Tensor): source boxes, e.g., object proposals. dst_boxes (torch.Tensor): target of the transformation, e.g., ground-truth boxes. Returns: torch.Tensor: Box transformation deltas. """ box_ndim = src_boxes.shape[-1] cas, cgs, cts = [], [], [] if box_ndim > 7: xa, ya, za, wa, la, ha, ra, *cas = torch.split( src_boxes, 1, dim=-1) xg, yg, zg, wg, lg, hg, rg, *cgs = torch.split( dst_boxes, 1, dim=-1) cts = [g - a for g, a in zip(cgs, cas)] else: xa, ya, za, wa, la, ha, ra = torch.split(src_boxes, 1, dim=-1) xg, yg, zg, wg, lg, hg, rg = torch.split(dst_boxes, 1, dim=-1) za = za + ha / 2 zg = zg + hg / 2 diagonal = torch.sqrt(la**2 + wa**2) xt = (xg - xa) / diagonal yt = (yg - ya) / diagonal zt = (zg - za) / ha lt = torch.log(lg / la) wt = torch.log(wg / wa) ht = torch.log(hg / ha) rt = rg - ra return torch.cat([xt, yt, zt, wt, lt, ht, rt, *cts], dim=-1) @staticmethod def decode(anchors, deltas): """Apply transformation `deltas` (dx, dy, dz, dx_size, dy_size, dz_size, dr, dv*) to `boxes`. Args: anchors (torch.Tensor): Parameters of anchors with shape (N, 7). deltas (torch.Tensor): Encoded boxes with shape (N, 7+n) [x, y, z, x_size, y_size, z_size, r, velo*]. Returns: torch.Tensor: Decoded boxes. """ cas, cts = [], [] box_ndim = anchors.shape[-1] if box_ndim > 7: xa, ya, za, wa, la, ha, ra, *cas = torch.split(anchors, 1, dim=-1) xt, yt, zt, wt, lt, ht, rt, *cts = torch.split(deltas, 1, dim=-1) else: xa, ya, za, wa, la, ha, ra = torch.split(anchors, 1, dim=-1) xt, yt, zt, wt, lt, ht, rt = torch.split(deltas, 1, dim=-1) za = za + ha / 2 diagonal = torch.sqrt(la**2 + wa**2) xg = xt * diagonal + xa yg = yt * diagonal + ya zg = zt * ha + za lg = torch.exp(lt) * la wg = torch.exp(wt) * wa hg = torch.exp(ht) * ha rg = rt + ra zg = zg - hg / 2 cgs = [t + a for t, a in zip(cts, cas)] return torch.cat([xg, yg, zg, wg, lg, hg, rg, *cgs], dim=-1) ================================================ FILE: mmdet3d/core/bbox/coders/fcos3d_bbox_coder.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import numpy as np import torch from mmdet.core.bbox import BaseBBoxCoder from mmdet.core.bbox.builder import BBOX_CODERS from ..structures import limit_period @BBOX_CODERS.register_module() class FCOS3DBBoxCoder(BaseBBoxCoder): """Bounding box coder for FCOS3D. Args: base_depths (tuple[tuple[float]]): Depth references for decode box depth. Defaults to None. base_dims (tuple[tuple[float]]): Dimension references for decode box dimension. Defaults to None. code_size (int): The dimension of boxes to be encoded. Defaults to 7. norm_on_bbox (bool): Whether to apply normalization on the bounding box 2D attributes. Defaults to True. """ def __init__(self, base_depths=None, base_dims=None, code_size=7, norm_on_bbox=True): super(FCOS3DBBoxCoder, self).__init__() self.base_depths = base_depths self.base_dims = base_dims self.bbox_code_size = code_size self.norm_on_bbox = norm_on_bbox def encode(self, gt_bboxes_3d, gt_labels_3d, gt_bboxes, gt_labels): # TODO: refactor the encoder in the FCOS3D and PGD head pass def decode(self, bbox, scale, stride, training, cls_score=None): """Decode regressed results into 3D predictions. Note that offsets are not transformed to the projected 3D centers. Args: bbox (torch.Tensor): Raw bounding box predictions in shape [N, C, H, W]. scale (tuple[`Scale`]): Learnable scale parameters. stride (int): Stride for a specific feature level. training (bool): Whether the decoding is in the training procedure. 
cls_score (torch.Tensor): Classification score map for deciding which base depth or dim is used. Defaults to None. Returns: torch.Tensor: Decoded boxes. """ # scale the bbox of different level # only apply to offset, depth and size prediction scale_offset, scale_depth, scale_size = scale[0:3] clone_bbox = bbox.clone() bbox[:, :2] = scale_offset(clone_bbox[:, :2]).float() bbox[:, 2] = scale_depth(clone_bbox[:, 2]).float() bbox[:, 3:6] = scale_size(clone_bbox[:, 3:6]).float() if self.base_depths is None: bbox[:, 2] = bbox[:, 2].exp() elif len(self.base_depths) == 1: # only single prior mean = self.base_depths[0][0] std = self.base_depths[0][1] bbox[:, 2] = mean + bbox.clone()[:, 2] * std else: # multi-class priors assert len(self.base_depths) == cls_score.shape[1], \ 'The number of multi-class depth priors should be equal to ' \ 'the number of categories.' indices = cls_score.max(dim=1)[1] depth_priors = cls_score.new_tensor( self.base_depths)[indices, :].permute(0, 3, 1, 2) mean = depth_priors[:, 0] std = depth_priors[:, 1] bbox[:, 2] = mean + bbox.clone()[:, 2] * std bbox[:, 3:6] = bbox[:, 3:6].exp() if self.base_dims is not None: assert len(self.base_dims) == cls_score.shape[1], \ 'The number of anchor sizes should be equal to the number ' \ 'of categories.' indices = cls_score.max(dim=1)[1] size_priors = cls_score.new_tensor( self.base_dims)[indices, :].permute(0, 3, 1, 2) bbox[:, 3:6] = size_priors * bbox.clone()[:, 3:6] assert self.norm_on_bbox is True, 'Setting norm_on_bbox to False '\ 'has not been thoroughly tested for FCOS3D.' if self.norm_on_bbox: if not training: # Note that this line is conducted only when testing bbox[:, :2] *= stride return bbox @staticmethod def decode_yaw(bbox, centers2d, dir_cls, dir_offset, cam2img): """Decode yaw angle and change it from local to global.i. Args: bbox (torch.Tensor): Bounding box predictions in shape [N, C] with yaws to be decoded. centers2d (torch.Tensor): Projected 3D-center on the image planes corresponding to the box predictions. dir_cls (torch.Tensor): Predicted direction classes. dir_offset (float): Direction offset before dividing all the directions into several classes. cam2img (torch.Tensor): Camera intrinsic matrix in shape [4, 4]. Returns: torch.Tensor: Bounding boxes with decoded yaws. """ if bbox.shape[0] > 0: dir_rot = limit_period(bbox[..., 6] - dir_offset, 0, np.pi) bbox[..., 6] = \ dir_rot + dir_offset + np.pi * dir_cls.to(bbox.dtype) bbox[:, 6] = torch.atan2(centers2d[:, 0] - cam2img[0, 2], cam2img[0, 0]) + bbox[:, 6] return bbox ================================================ FILE: mmdet3d/core/bbox/coders/groupfree3d_bbox_coder.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import numpy as np import torch from mmdet.core.bbox.builder import BBOX_CODERS from .partial_bin_based_bbox_coder import PartialBinBasedBBoxCoder @BBOX_CODERS.register_module() class GroupFree3DBBoxCoder(PartialBinBasedBBoxCoder): """Modified partial bin based bbox coder for GroupFree3D. Args: num_dir_bins (int): Number of bins to encode direction angle. num_sizes (int): Number of size clusters. mean_sizes (list[list[int]]): Mean size of bboxes in each class. with_rot (bool, optional): Whether the bbox is with rotation. Defaults to True. size_cls_agnostic (bool, optional): Whether the predicted size is class-agnostic. Defaults to True. 
""" def __init__(self, num_dir_bins, num_sizes, mean_sizes, with_rot=True, size_cls_agnostic=True): super(GroupFree3DBBoxCoder, self).__init__( num_dir_bins=num_dir_bins, num_sizes=num_sizes, mean_sizes=mean_sizes, with_rot=with_rot) self.size_cls_agnostic = size_cls_agnostic def encode(self, gt_bboxes_3d, gt_labels_3d): """Encode ground truth to prediction targets. Args: gt_bboxes_3d (BaseInstance3DBoxes): Ground truth bboxes with shape (n, 7). gt_labels_3d (torch.Tensor): Ground truth classes. Returns: tuple: Targets of center, size and direction. """ # generate center target center_target = gt_bboxes_3d.gravity_center # generate bbox size target size_target = gt_bboxes_3d.dims size_class_target = gt_labels_3d size_res_target = gt_bboxes_3d.dims - gt_bboxes_3d.tensor.new_tensor( self.mean_sizes)[size_class_target] # generate dir target box_num = gt_labels_3d.shape[0] if self.with_rot: (dir_class_target, dir_res_target) = self.angle2class(gt_bboxes_3d.yaw) else: dir_class_target = gt_labels_3d.new_zeros(box_num) dir_res_target = gt_bboxes_3d.tensor.new_zeros(box_num) return (center_target, size_target, size_class_target, size_res_target, dir_class_target, dir_res_target) def decode(self, bbox_out, prefix=''): """Decode predicted parts to bbox3d. Args: bbox_out (dict): Predictions from model, should contain keys below. - center: predicted bottom center of bboxes. - dir_class: predicted bbox direction class. - dir_res: predicted bbox direction residual. - size_class: predicted bbox size class. - size_res: predicted bbox size residual. - size: predicted class-agnostic bbox size prefix (str, optional): Decode predictions with specific prefix. Defaults to ''. Returns: torch.Tensor: Decoded bbox3d with shape (batch, n, 7). """ center = bbox_out[f'{prefix}center'] batch_size, num_proposal = center.shape[:2] # decode heading angle if self.with_rot: dir_class = torch.argmax(bbox_out[f'{prefix}dir_class'], -1) dir_res = torch.gather(bbox_out[f'{prefix}dir_res'], 2, dir_class.unsqueeze(-1)) dir_res.squeeze_(2) dir_angle = self.class2angle(dir_class, dir_res).reshape( batch_size, num_proposal, 1) else: dir_angle = center.new_zeros(batch_size, num_proposal, 1) # decode bbox size if self.size_cls_agnostic: bbox_size = bbox_out[f'{prefix}size'].reshape( batch_size, num_proposal, 3) else: size_class = torch.argmax( bbox_out[f'{prefix}size_class'], -1, keepdim=True) size_res = torch.gather( bbox_out[f'{prefix}size_res'], 2, size_class.unsqueeze(-1).repeat(1, 1, 1, 3)) mean_sizes = center.new_tensor(self.mean_sizes) size_base = torch.index_select(mean_sizes, 0, size_class.reshape(-1)) bbox_size = size_base.reshape(batch_size, num_proposal, -1) + size_res.squeeze(2) bbox3d = torch.cat([center, bbox_size, dir_angle], dim=-1) return bbox3d def split_pred(self, cls_preds, reg_preds, base_xyz, prefix=''): """Split predicted features to specific parts. Args: cls_preds (torch.Tensor): Class predicted features to split. reg_preds (torch.Tensor): Regression predicted features to split. base_xyz (torch.Tensor): Coordinates of points. prefix (str, optional): Decode predictions with specific prefix. Defaults to ''. Returns: dict[str, torch.Tensor]: Split results. 
""" results = {} start, end = 0, 0 cls_preds_trans = cls_preds.transpose(2, 1) reg_preds_trans = reg_preds.transpose(2, 1) # decode center end += 3 # (batch_size, num_proposal, 3) results[f'{prefix}center_residual'] = \ reg_preds_trans[..., start:end].contiguous() results[f'{prefix}center'] = base_xyz + \ reg_preds_trans[..., start:end].contiguous() start = end # decode direction end += self.num_dir_bins results[f'{prefix}dir_class'] = \ reg_preds_trans[..., start:end].contiguous() start = end end += self.num_dir_bins dir_res_norm = reg_preds_trans[..., start:end].contiguous() start = end results[f'{prefix}dir_res_norm'] = dir_res_norm results[f'{prefix}dir_res'] = dir_res_norm * ( np.pi / self.num_dir_bins) # decode size if self.size_cls_agnostic: end += 3 results[f'{prefix}size'] = \ reg_preds_trans[..., start:end].contiguous() else: end += self.num_sizes results[f'{prefix}size_class'] = reg_preds_trans[ ..., start:end].contiguous() start = end end += self.num_sizes * 3 size_res_norm = reg_preds_trans[..., start:end] batch_size, num_proposal = reg_preds_trans.shape[:2] size_res_norm = size_res_norm.view( [batch_size, num_proposal, self.num_sizes, 3]) start = end results[f'{prefix}size_res_norm'] = size_res_norm.contiguous() mean_sizes = reg_preds.new_tensor(self.mean_sizes) results[f'{prefix}size_res'] = ( size_res_norm * mean_sizes.unsqueeze(0).unsqueeze(0)) # decode objectness score # Group-Free-3D objectness output shape (batch, proposal, 1) results[f'{prefix}obj_scores'] = cls_preds_trans[..., :1].contiguous() # decode semantic score results[f'{prefix}sem_scores'] = cls_preds_trans[..., 1:].contiguous() return results ================================================ FILE: mmdet3d/core/bbox/coders/monoflex_bbox_coder.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import numpy as np import torch from torch.nn import functional as F from mmdet.core.bbox import BaseBBoxCoder from mmdet.core.bbox.builder import BBOX_CODERS @BBOX_CODERS.register_module() class MonoFlexCoder(BaseBBoxCoder): """Bbox Coder for MonoFlex. Args: depth_mode (str): The mode for depth calculation. Available options are "linear", "inv_sigmoid", and "exp". base_depth (tuple[float]): References for decoding box depth. depth_range (list): Depth range of predicted depth. combine_depth (bool): Whether to use combined depth (direct depth and depth from keypoints) or use direct depth only. uncertainty_range (list): Uncertainty range of predicted depth. base_dims (tuple[tuple[float]]): Dimensions mean and std of decode bbox dimensions [l, h, w] for each category. dims_mode (str): The mode for dimension calculation. Available options are "linear" and "exp". multibin (bool): Whether to use multibin representation. num_dir_bins (int): Number of Number of bins to encode direction angle. bin_centers (list[float]): Local yaw centers while using multibin representations. bin_margin (float): Margin of multibin representations. code_size (int): The dimension of boxes to be encoded. eps (float, optional): A value added to the denominator for numerical stability. Default 1e-3. 
""" def __init__(self, depth_mode, base_depth, depth_range, combine_depth, uncertainty_range, base_dims, dims_mode, multibin, num_dir_bins, bin_centers, bin_margin, code_size, eps=1e-3): super(MonoFlexCoder, self).__init__() # depth related self.depth_mode = depth_mode self.base_depth = base_depth self.depth_range = depth_range self.combine_depth = combine_depth self.uncertainty_range = uncertainty_range # dimensions related self.base_dims = base_dims self.dims_mode = dims_mode # orientation related self.multibin = multibin self.num_dir_bins = num_dir_bins self.bin_centers = bin_centers self.bin_margin = bin_margin # output related self.bbox_code_size = code_size self.eps = eps def encode(self, gt_bboxes_3d): """Encode ground truth to prediction targets. Args: gt_bboxes_3d (`BaseInstance3DBoxes`): Ground truth 3D bboxes. shape: (N, 7). Returns: torch.Tensor: Targets of orientations. """ local_yaw = gt_bboxes_3d.local_yaw # encode local yaw (-pi ~ pi) to multibin format encode_local_yaw = local_yaw.new_zeros( [local_yaw.shape[0], self.num_dir_bins * 2]) bin_size = 2 * np.pi / self.num_dir_bins margin_size = bin_size * self.bin_margin bin_centers = local_yaw.new_tensor(self.bin_centers) range_size = bin_size / 2 + margin_size offsets = local_yaw.unsqueeze(1) - bin_centers.unsqueeze(0) offsets[offsets > np.pi] = offsets[offsets > np.pi] - 2 * np.pi offsets[offsets < -np.pi] = offsets[offsets < -np.pi] + 2 * np.pi for i in range(self.num_dir_bins): offset = offsets[:, i] inds = abs(offset) < range_size encode_local_yaw[inds, i] = 1 encode_local_yaw[inds, i + self.num_dir_bins] = offset[inds] orientation_target = encode_local_yaw return orientation_target def decode(self, bbox, base_centers2d, labels, downsample_ratio, cam2imgs): """Decode bounding box regression into 3D predictions. Args: bbox (Tensor): Raw bounding box predictions for each predict center2d point. shape: (N, C) base_centers2d (torch.Tensor): Base centers2d for 3D bboxes. shape: (N, 2). labels (Tensor): Batch predict class label for each predict center2d point. shape: (N, ) downsample_ratio (int): The stride of feature map. cam2imgs (Tensor): Batch images' camera intrinsic matrix. shape: kitti (N, 4, 4) nuscenes (N, 3, 3) Return: dict: The 3D prediction dict decoded from regression map. the dict has components below: - bboxes2d (torch.Tensor): Decoded [x1, y1, x2, y2] format 2D bboxes. - dimensions (torch.Tensor): Decoded dimensions for each object. - offsets2d (torch.Tenosr): Offsets between base centers2d and real centers2d. - direct_depth (torch.Tensor): Decoded directly regressed depth. - keypoints2d (torch.Tensor): Keypoints of each projected 3D box on image. - keypoints_depth (torch.Tensor): Decoded depth from keypoints. - combined_depth (torch.Tensor): Combined depth using direct depth and keypoints depth with depth uncertainty. - orientations (torch.Tensor): Multibin format orientations (local yaw) for each objects. """ # 4 dimensions for FCOS style regression pred_bboxes2d = bbox[:, 0:4] # change FCOS style to [x1, y1, x2, y2] format for IOU Loss pred_bboxes2d = self.decode_bboxes2d(pred_bboxes2d, base_centers2d) # 2 dimensions for projected centers2d offsets pred_offsets2d = bbox[:, 4:6] # 3 dimensions for 3D bbox dimensions offsets pred_dimensions_offsets3d = bbox[:, 29:32] # the first 8 dimensions are for orientation bin classification # and the second 8 dimensions are for orientation offsets. 
pred_orientations = torch.cat((bbox[:, 32:40], bbox[:, 40:48]), dim=1) # 3 dimensions for the uncertainties of the solved depths from # groups of keypoints pred_keypoints_depth_uncertainty = bbox[:, 26:29] # 1 dimension for the uncertainty of directly regressed depth pred_direct_depth_uncertainty = bbox[:, 49:50].squeeze(-1) # 2 dimension of offsets x keypoints (8 corners + top/bottom center) pred_keypoints2d = bbox[:, 6:26].reshape(-1, 10, 2) # 1 dimension for depth offsets pred_direct_depth_offsets = bbox[:, 48:49].squeeze(-1) # decode the pred residual dimensions to real dimensions pred_dimensions = self.decode_dims(labels, pred_dimensions_offsets3d) pred_direct_depth = self.decode_direct_depth(pred_direct_depth_offsets) pred_keypoints_depth = self.keypoints2depth(pred_keypoints2d, pred_dimensions, cam2imgs, downsample_ratio) pred_direct_depth_uncertainty = torch.clamp( pred_direct_depth_uncertainty, self.uncertainty_range[0], self.uncertainty_range[1]) pred_keypoints_depth_uncertainty = torch.clamp( pred_keypoints_depth_uncertainty, self.uncertainty_range[0], self.uncertainty_range[1]) if self.combine_depth: pred_depth_uncertainty = torch.cat( (pred_direct_depth_uncertainty.unsqueeze(-1), pred_keypoints_depth_uncertainty), dim=1).exp() pred_depth = torch.cat( (pred_direct_depth.unsqueeze(-1), pred_keypoints_depth), dim=1) pred_combined_depth = \ self.combine_depths(pred_depth, pred_depth_uncertainty) else: pred_combined_depth = None preds = dict( bboxes2d=pred_bboxes2d, dimensions=pred_dimensions, offsets2d=pred_offsets2d, keypoints2d=pred_keypoints2d, orientations=pred_orientations, direct_depth=pred_direct_depth, keypoints_depth=pred_keypoints_depth, combined_depth=pred_combined_depth, direct_depth_uncertainty=pred_direct_depth_uncertainty, keypoints_depth_uncertainty=pred_keypoints_depth_uncertainty, ) return preds def decode_direct_depth(self, depth_offsets): """Transform depth offset to directly regressed depth. Args: depth_offsets (torch.Tensor): Predicted depth offsets. shape: (N, ) Return: torch.Tensor: Directly regressed depth. shape: (N, ) """ if self.depth_mode == 'exp': direct_depth = depth_offsets.exp() elif self.depth_mode == 'linear': base_depth = depth_offsets.new_tensor(self.base_depth) direct_depth = depth_offsets * base_depth[1] + base_depth[0] elif self.depth_mode == 'inv_sigmoid': direct_depth = 1 / torch.sigmoid(depth_offsets) - 1 else: raise ValueError if self.depth_range is not None: direct_depth = torch.clamp( direct_depth, min=self.depth_range[0], max=self.depth_range[1]) return direct_depth def decode_location(self, base_centers2d, offsets2d, depths, cam2imgs, downsample_ratio, pad_mode='default'): """Retrieve object location. Args: base_centers2d (torch.Tensor): predicted base centers2d. shape: (N, 2) offsets2d (torch.Tensor): The offsets between real centers2d and base centers2d. shape: (N , 2) depths (torch.Tensor): Depths of objects. shape: (N, ) cam2imgs (torch.Tensor): Batch images' camera intrinsic matrix. shape: kitti (N, 4, 4) nuscenes (N, 3, 3) downsample_ratio (int): The stride of feature map. pad_mode (str, optional): Padding mode used in training data augmentation. Return: tuple(torch.Tensor): Centers of 3D boxes. 
shape: (N, 3) """ N = cam2imgs.shape[0] # (N, 4, 4) cam2imgs_inv = cam2imgs.inverse() if pad_mode == 'default': centers2d_img = (base_centers2d + offsets2d) * downsample_ratio else: raise NotImplementedError # (N, 3) centers2d_img = \ torch.cat((centers2d_img, depths.unsqueeze(-1)), dim=1) # (N, 4, 1) centers2d_extend = \ torch.cat((centers2d_img, centers2d_img.new_ones(N, 1)), dim=1).unsqueeze(-1) locations = torch.matmul(cam2imgs_inv, centers2d_extend).squeeze(-1) return locations[:, :3]
def keypoints2depth(self, keypoints2d, dimensions, cam2imgs, downsample_ratio=4, group0_index=[(7, 3), (0, 4)], group1_index=[(2, 6), (1, 5)]): """Decode depth from three groups of keypoints and the geometric projection model. 2D keypoints including the 8 corners and the top/bottom centers are divided into three groups, which are used to calculate three depth estimates of the object. (Keypoint groups: the center group uses the top and bottom center keypoints; group 0 uses corner pairs (7, 3) and (0, 4); group 1 uses corner pairs (2, 6) and (1, 5).) Args: keypoints2d (torch.Tensor): Keypoints of objects. 8 vertices + top/bottom center. shape: (N, 10, 2) dimensions (torch.Tensor): Dimensions of objects. shape: (N, 3) cam2imgs (torch.Tensor): Batch images' camera intrinsic matrix. shape: kitti (N, 4, 4) nuscenes (N, 3, 3) downsample_ratio (int, optional): The stride of feature map. Defaults to 4. group0_index (list[tuple[int]], optional): Keypoint index pairs of group 0 used to calculate the depth. Defaults to [(7, 3), (0, 4)]. group1_index (list[tuple[int]], optional): Keypoint index pairs of group 1 used to calculate the depth. Defaults to [(2, 6), (1, 5)]. Return: tuple(torch.Tensor): Depth computed from three groups of keypoints (top/bottom, group0, group1) shape: (N, 3) """ pred_height_3d = dimensions[:, 1].clone() f_u = cam2imgs[:, 0, 0] center_height = keypoints2d[:, -2, 1] - keypoints2d[:, -1, 1] corner_group0_height = keypoints2d[:, group0_index[0], 1] \ - keypoints2d[:, group0_index[1], 1] corner_group1_height = keypoints2d[:, group1_index[0], 1] \ - keypoints2d[:, group1_index[1], 1] center_depth = f_u * pred_height_3d / ( F.relu(center_height) * downsample_ratio + self.eps) corner_group0_depth = (f_u * pred_height_3d).unsqueeze(-1) / ( F.relu(corner_group0_height) * downsample_ratio + self.eps) corner_group1_depth = (f_u * pred_height_3d).unsqueeze(-1) / ( F.relu(corner_group1_height) * downsample_ratio + self.eps) corner_group0_depth = corner_group0_depth.mean(dim=1) corner_group1_depth = corner_group1_depth.mean(dim=1) keypoints_depth = torch.stack( (center_depth, corner_group0_depth, corner_group1_depth), dim=1) keypoints_depth = torch.clamp( keypoints_depth, min=self.depth_range[0], max=self.depth_range[1]) return keypoints_depth
def decode_dims(self, labels, dims_offset): """Retrieve object dimensions. Args: labels (torch.Tensor): Each points' category id. shape: (N, K) dims_offset (torch.Tensor): Dimension offsets.
shape: (N, 3) Returns: torch.Tensor: Shape (N, 3) """ if self.dims_mode == 'exp': dims_offset = dims_offset.exp() elif self.dims_mode == 'linear': labels = labels.long() base_dims = dims_offset.new_tensor(self.base_dims) dims_mean = base_dims[:, :3] dims_std = base_dims[:, 3:6] cls_dimension_mean = dims_mean[labels, :] cls_dimension_std = dims_std[labels, :] dimensions = dims_offset * cls_dimension_mean + cls_dimension_std else: raise ValueError return dimensions
def decode_orientation(self, ori_vector, locations): """Retrieve object orientation. Args: ori_vector (torch.Tensor): Local orientation vector in [axis_cls, head_cls, sin, cos] format. shape: (N, num_dir_bins * 4) locations (torch.Tensor): Object location. shape: (N, 3) Returns: tuple[torch.Tensor]: yaws and local yaws of 3d bboxes. """ if self.multibin: pred_bin_cls = ori_vector[:, :self.num_dir_bins * 2].view( -1, self.num_dir_bins, 2) pred_bin_cls = pred_bin_cls.softmax(dim=2)[..., 1] orientations = ori_vector.new_zeros(ori_vector.shape[0]) for i in range(self.num_dir_bins): mask_i = (pred_bin_cls.argmax(dim=1) == i) start_bin = self.num_dir_bins * 2 + i * 2 end_bin = start_bin + 2 pred_bin_offset = ori_vector[mask_i, start_bin:end_bin] orientations[mask_i] = pred_bin_offset[:, 0].atan2( pred_bin_offset[:, 1]) + self.bin_centers[i] else: axis_cls = ori_vector[:, :2].softmax(dim=1) axis_cls = axis_cls[:, 0] < axis_cls[:, 1] head_cls = ori_vector[:, 2:4].softmax(dim=1) head_cls = head_cls[:, 0] < head_cls[:, 1] # cls axis orientations = self.bin_centers[axis_cls + head_cls * 2] sin_cos_offset = F.normalize(ori_vector[:, 4:]) # use atan2 so the offset angle falls in the correct quadrant orientations += sin_cos_offset[:, 0].atan2(sin_cos_offset[:, 1]) locations = locations.view(-1, 3) rays = locations[:, 0].atan2(locations[:, 2]) local_yaws = orientations yaws = local_yaws + rays larger_idx = (yaws > np.pi).nonzero(as_tuple=False) small_idx = (yaws < -np.pi).nonzero(as_tuple=False) if len(larger_idx) != 0: yaws[larger_idx] -= 2 * np.pi if len(small_idx) != 0: yaws[small_idx] += 2 * np.pi larger_idx = (local_yaws > np.pi).nonzero(as_tuple=False) small_idx = (local_yaws < -np.pi).nonzero(as_tuple=False) if len(larger_idx) != 0: local_yaws[larger_idx] -= 2 * np.pi if len(small_idx) != 0: local_yaws[small_idx] += 2 * np.pi return yaws, local_yaws
def decode_bboxes2d(self, reg_bboxes2d, base_centers2d): """Retrieve [x1, y1, x2, y2] format 2D bboxes. Args: reg_bboxes2d (torch.Tensor): Predicted FCOS style 2D bboxes. shape: (N, 4) base_centers2d (torch.Tensor): predicted base centers2d. shape: (N, 2) Returns: torch.Tensor: [x1, y1, x2, y2] format 2D bboxes. """ centers_x = base_centers2d[:, 0] centers_y = base_centers2d[:, 1] xs_min = centers_x - reg_bboxes2d[..., 0] ys_min = centers_y - reg_bboxes2d[..., 1] xs_max = centers_x + reg_bboxes2d[..., 2] ys_max = centers_y + reg_bboxes2d[..., 3] bboxes2d = torch.stack([xs_min, ys_min, xs_max, ys_max], dim=-1) return bboxes2d
def combine_depths(self, depth, depth_uncertainty): """Combine all the predicted depths with depth uncertainty. Args: depth (torch.Tensor): Predicted depths of each object. shape: (N, 4) depth_uncertainty (torch.Tensor): Depth uncertainty for each depth of each object. shape: (N, 4) Returns: torch.Tensor: combined depth.
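        Example:
            Numeric sketch with made-up inputs: for
            depth = tensor([[10., 20., 30., 40.]]) and
            depth_uncertainty = tensor([[2., 2., 2., 2.]]) every weight is
            0.25, so the combined depth is 25.0.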
""" uncertainty_weights = 1 / depth_uncertainty uncertainty_weights = \ uncertainty_weights / \ uncertainty_weights.sum(dim=1, keepdim=True) combined_depth = torch.sum(depth * uncertainty_weights, dim=1) return combined_depth ================================================ FILE: mmdet3d/core/bbox/coders/partial_bin_based_bbox_coder.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import numpy as np import torch from mmdet.core.bbox import BaseBBoxCoder from mmdet.core.bbox.builder import BBOX_CODERS @BBOX_CODERS.register_module() class PartialBinBasedBBoxCoder(BaseBBoxCoder): """Partial bin based bbox coder. Args: num_dir_bins (int): Number of bins to encode direction angle. num_sizes (int): Number of size clusters. mean_sizes (list[list[int]]): Mean size of bboxes in each class. with_rot (bool): Whether the bbox is with rotation. """ def __init__(self, num_dir_bins, num_sizes, mean_sizes, with_rot=True): super(PartialBinBasedBBoxCoder, self).__init__() assert len(mean_sizes) == num_sizes self.num_dir_bins = num_dir_bins self.num_sizes = num_sizes self.mean_sizes = mean_sizes self.with_rot = with_rot def encode(self, gt_bboxes_3d, gt_labels_3d): """Encode ground truth to prediction targets. Args: gt_bboxes_3d (BaseInstance3DBoxes): Ground truth bboxes with shape (n, 7). gt_labels_3d (torch.Tensor): Ground truth classes. Returns: tuple: Targets of center, size and direction. """ # generate center target center_target = gt_bboxes_3d.gravity_center # generate bbox size target size_class_target = gt_labels_3d size_res_target = gt_bboxes_3d.dims - gt_bboxes_3d.tensor.new_tensor( self.mean_sizes)[size_class_target] # generate dir target box_num = gt_labels_3d.shape[0] if self.with_rot: (dir_class_target, dir_res_target) = self.angle2class(gt_bboxes_3d.yaw) else: dir_class_target = gt_labels_3d.new_zeros(box_num) dir_res_target = gt_bboxes_3d.tensor.new_zeros(box_num) return (center_target, size_class_target, size_res_target, dir_class_target, dir_res_target) def decode(self, bbox_out, suffix=''): """Decode predicted parts to bbox3d. Args: bbox_out (dict): Predictions from model, should contain keys below. - center: predicted bottom center of bboxes. - dir_class: predicted bbox direction class. - dir_res: predicted bbox direction residual. - size_class: predicted bbox size class. - size_res: predicted bbox size residual. suffix (str): Decode predictions with specific suffix. Returns: torch.Tensor: Decoded bbox3d with shape (batch, n, 7). """ center = bbox_out['center' + suffix] batch_size, num_proposal = center.shape[:2] # decode heading angle if self.with_rot: dir_class = torch.argmax(bbox_out['dir_class' + suffix], -1) dir_res = torch.gather(bbox_out['dir_res' + suffix], 2, dir_class.unsqueeze(-1)) dir_res.squeeze_(2) dir_angle = self.class2angle(dir_class, dir_res).reshape( batch_size, num_proposal, 1) else: dir_angle = center.new_zeros(batch_size, num_proposal, 1) # decode bbox size size_class = torch.argmax( bbox_out['size_class' + suffix], -1, keepdim=True) size_res = torch.gather(bbox_out['size_res' + suffix], 2, size_class.unsqueeze(-1).repeat(1, 1, 1, 3)) mean_sizes = center.new_tensor(self.mean_sizes) size_base = torch.index_select(mean_sizes, 0, size_class.reshape(-1)) bbox_size = size_base.reshape(batch_size, num_proposal, -1) + size_res.squeeze(2) bbox3d = torch.cat([center, bbox_size, dir_angle], dim=-1) return bbox3d def decode_corners(self, center, size_res, size_class): """Decode center, size residuals and class to corners. 
Only useful for axis-aligned bounding boxes, so angle isn't considered. Args: center (torch.Tensor): Shape [B, N, 3] size_res (torch.Tensor): Shape [B, N, 3] or [B, N, C, 3] size_class (torch.Tensor): Shape: [B, N] or [B, N, 1] or [B, N, C, 3] Returns: torch.Tensor: Corners with shape [B, N, 6] """ if len(size_class.shape) == 2 or size_class.shape[-1] == 1: batch_size, proposal_num = size_class.shape[:2] one_hot_size_class = size_res.new_zeros( (batch_size, proposal_num, self.num_sizes)) if len(size_class.shape) == 2: size_class = size_class.unsqueeze(-1) one_hot_size_class.scatter_(2, size_class, 1) one_hot_size_class_expand = one_hot_size_class.unsqueeze( -1).repeat(1, 1, 1, 3).contiguous() else: one_hot_size_class_expand = size_class if len(size_res.shape) == 4: size_res = torch.sum(size_res * one_hot_size_class_expand, 2) mean_sizes = size_res.new_tensor(self.mean_sizes) mean_sizes = torch.sum(mean_sizes * one_hot_size_class_expand, 2) size_full = (size_res + 1) * mean_sizes size_full = torch.clamp(size_full, 0) half_size_full = size_full / 2 corner1 = center - half_size_full corner2 = center + half_size_full corners = torch.cat([corner1, corner2], dim=-1) return corners def split_pred(self, cls_preds, reg_preds, base_xyz): """Split predicted features to specific parts. Args: cls_preds (torch.Tensor): Class predicted features to split. reg_preds (torch.Tensor): Regression predicted features to split. base_xyz (torch.Tensor): Coordinates of points. Returns: dict[str, torch.Tensor]: Split results. """ results = {} start, end = 0, 0 cls_preds_trans = cls_preds.transpose(2, 1) reg_preds_trans = reg_preds.transpose(2, 1) # decode center end += 3 # (batch_size, num_proposal, 3) results['center'] = base_xyz + \ reg_preds_trans[..., start:end].contiguous() start = end # decode direction end += self.num_dir_bins results['dir_class'] = reg_preds_trans[..., start:end].contiguous() start = end end += self.num_dir_bins dir_res_norm = reg_preds_trans[..., start:end].contiguous() start = end results['dir_res_norm'] = dir_res_norm results['dir_res'] = dir_res_norm * (np.pi / self.num_dir_bins) # decode size end += self.num_sizes results['size_class'] = reg_preds_trans[..., start:end].contiguous() start = end end += self.num_sizes * 3 size_res_norm = reg_preds_trans[..., start:end] batch_size, num_proposal = reg_preds_trans.shape[:2] size_res_norm = size_res_norm.view( [batch_size, num_proposal, self.num_sizes, 3]) start = end results['size_res_norm'] = size_res_norm.contiguous() mean_sizes = reg_preds.new_tensor(self.mean_sizes) results['size_res'] = ( size_res_norm * mean_sizes.unsqueeze(0).unsqueeze(0)) # decode objectness score start = 0 end = 2 results['obj_scores'] = cls_preds_trans[..., start:end].contiguous() start = end # decode semantic score results['sem_scores'] = cls_preds_trans[..., start:].contiguous() return results def angle2class(self, angle): """Convert continuous angle to a discrete class and a residual. Convert continuous angle to a discrete class and a small regression number from class center angle to current angle. Args: angle (torch.Tensor): Angle is from 0-2pi (or -pi~pi), class center at 0, 1*(2pi/N), 2*(2pi/N) ... (N-1)*(2pi/N). Returns: tuple: Encoded discrete class and residual. 
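        Example:
            Worked case added for clarity (num_dir_bins=12 is an assumed
            value): the bin width is 2 * pi / 12, so an angle of 0.1 rad
            falls into class 0 with residual 0.1, while an angle of pi
            falls into class 6 with residual 0.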
""" angle = angle % (2 * np.pi) angle_per_class = 2 * np.pi / float(self.num_dir_bins) shifted_angle = (angle + angle_per_class / 2) % (2 * np.pi) angle_cls = shifted_angle // angle_per_class angle_res = shifted_angle - ( angle_cls * angle_per_class + angle_per_class / 2) return angle_cls.long(), angle_res def class2angle(self, angle_cls, angle_res, limit_period=True): """Inverse function to angle2class. Args: angle_cls (torch.Tensor): Angle class to decode. angle_res (torch.Tensor): Angle residual to decode. limit_period (bool): Whether to limit angle to [-pi, pi]. Returns: torch.Tensor: Angle decoded from angle_cls and angle_res. """ angle_per_class = 2 * np.pi / float(self.num_dir_bins) angle_center = angle_cls.float() * angle_per_class angle = angle_center + angle_res if limit_period: angle[angle > np.pi] -= 2 * np.pi return angle ================================================ FILE: mmdet3d/core/bbox/coders/pgd_bbox_coder.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import numpy as np import torch from torch.nn import functional as F from mmdet.core.bbox.builder import BBOX_CODERS from .fcos3d_bbox_coder import FCOS3DBBoxCoder @BBOX_CODERS.register_module() class PGDBBoxCoder(FCOS3DBBoxCoder): """Bounding box coder for PGD.""" def encode(self, gt_bboxes_3d, gt_labels_3d, gt_bboxes, gt_labels): # TODO: refactor the encoder codes in the FCOS3D and PGD head pass def decode_2d(self, bbox, scale, stride, max_regress_range, training, pred_keypoints=False, pred_bbox2d=True): """Decode regressed 2D attributes. Args: bbox (torch.Tensor): Raw bounding box predictions in shape [N, C, H, W]. scale (tuple[`Scale`]): Learnable scale parameters. stride (int): Stride for a specific feature level. max_regress_range (int): Maximum regression range for a specific feature level. training (bool): Whether the decoding is in the training procedure. pred_keypoints (bool, optional): Whether to predict keypoints. Defaults to False. pred_bbox2d (bool, optional): Whether to predict 2D bounding boxes. Defaults to False. Returns: torch.Tensor: Decoded boxes. """ clone_bbox = bbox.clone() if pred_keypoints: scale_kpts = scale[3] # 2 dimension of offsets x 8 corners of a 3D bbox bbox[:, self.bbox_code_size:self.bbox_code_size + 16] = \ torch.tanh(scale_kpts(clone_bbox[ :, self.bbox_code_size:self.bbox_code_size + 16]).float()) if pred_bbox2d: scale_bbox2d = scale[-1] # The last four dimensions are offsets to four sides of a 2D bbox bbox[:, -4:] = scale_bbox2d(clone_bbox[:, -4:]).float() if self.norm_on_bbox: if pred_bbox2d: bbox[:, -4:] = F.relu(bbox.clone()[:, -4:]) if not training: if pred_keypoints: bbox[ :, self.bbox_code_size:self.bbox_code_size + 16] *= \ max_regress_range if pred_bbox2d: bbox[:, -4:] *= stride else: if pred_bbox2d: bbox[:, -4:] = bbox.clone()[:, -4:].exp() return bbox def decode_prob_depth(self, depth_cls_preds, depth_range, depth_unit, division, num_depth_cls): """Decode probabilistic depth map. Args: depth_cls_preds (torch.Tensor): Depth probabilistic map in shape [..., self.num_depth_cls] (raw output before softmax). depth_range (tuple[float]): Range of depth estimation. depth_unit (int): Unit of depth range division. division (str): Depth division method. Options include 'uniform', 'linear', 'log', 'loguniform'. num_depth_cls (int): Number of depth classes. Returns: torch.Tensor: Decoded probabilistic depth estimation. 
""" if division == 'uniform': depth_multiplier = depth_unit * \ depth_cls_preds.new_tensor( list(range(num_depth_cls))).reshape([1, -1]) prob_depth_preds = (F.softmax(depth_cls_preds.clone(), dim=-1) * depth_multiplier).sum(dim=-1) return prob_depth_preds elif division == 'linear': split_pts = depth_cls_preds.new_tensor(list( range(num_depth_cls))).reshape([1, -1]) depth_multiplier = depth_range[0] + ( depth_range[1] - depth_range[0]) / \ (num_depth_cls * (num_depth_cls - 1)) * \ (split_pts * (split_pts+1)) prob_depth_preds = (F.softmax(depth_cls_preds.clone(), dim=-1) * depth_multiplier).sum(dim=-1) return prob_depth_preds elif division == 'log': split_pts = depth_cls_preds.new_tensor(list( range(num_depth_cls))).reshape([1, -1]) start = max(depth_range[0], 1) end = depth_range[1] depth_multiplier = (np.log(start) + split_pts * np.log(end / start) / (num_depth_cls - 1)).exp() prob_depth_preds = (F.softmax(depth_cls_preds.clone(), dim=-1) * depth_multiplier).sum(dim=-1) return prob_depth_preds elif division == 'loguniform': split_pts = depth_cls_preds.new_tensor(list( range(num_depth_cls))).reshape([1, -1]) start = max(depth_range[0], 1) end = depth_range[1] log_multiplier = np.log(start) + \ split_pts * np.log(end / start) / (num_depth_cls - 1) prob_depth_preds = (F.softmax(depth_cls_preds.clone(), dim=-1) * log_multiplier).sum(dim=-1).exp() return prob_depth_preds else: raise NotImplementedError ================================================ FILE: mmdet3d/core/bbox/coders/point_xyzwhlr_bbox_coder.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import numpy as np import torch from mmdet.core.bbox import BaseBBoxCoder from mmdet.core.bbox.builder import BBOX_CODERS @BBOX_CODERS.register_module() class PointXYZWHLRBBoxCoder(BaseBBoxCoder): """Point based bbox coder for 3D boxes. Args: code_size (int): The dimension of boxes to be encoded. use_mean_size (bool, optional): Whether using anchors based on class. Defaults to True. mean_size (list[list[float]], optional): Mean size of bboxes in each class. Defaults to None. """ def __init__(self, code_size=7, use_mean_size=True, mean_size=None): super(PointXYZWHLRBBoxCoder, self).__init__() self.code_size = code_size self.use_mean_size = use_mean_size if self.use_mean_size: self.mean_size = torch.from_numpy(np.array(mean_size)).float() assert self.mean_size.min() > 0, \ f'The min of mean_size should > 0, however currently it is '\ f'{self.mean_size.min()}, please check it in your config.' def encode(self, gt_bboxes_3d, points, gt_labels_3d=None): """Encode ground truth to prediction targets. Args: gt_bboxes_3d (:obj:`BaseInstance3DBoxes`): Ground truth bboxes with shape (N, 7 + C). points (torch.Tensor): Point cloud with shape (N, 3). gt_labels_3d (torch.Tensor, optional): Ground truth classes. Defaults to None. Returns: torch.Tensor: Encoded boxes with shape (N, 8 + C). """ gt_bboxes_3d[:, 3:6] = torch.clamp_min(gt_bboxes_3d[:, 3:6], min=1e-5) xg, yg, zg, dxg, dyg, dzg, rg, *cgs = torch.split( gt_bboxes_3d, 1, dim=-1) xa, ya, za = torch.split(points, 1, dim=-1) if self.use_mean_size: assert gt_labels_3d.max() <= self.mean_size.shape[0] - 1, \ f'the max gt label {gt_labels_3d.max()} is bigger than' \ f'anchor types {self.mean_size.shape[0] - 1}.' 
self.mean_size = self.mean_size.to(gt_labels_3d.device) point_anchor_size = self.mean_size[gt_labels_3d] dxa, dya, dza = torch.split(point_anchor_size, 1, dim=-1) diagonal = torch.sqrt(dxa**2 + dya**2) xt = (xg - xa) / diagonal yt = (yg - ya) / diagonal zt = (zg - za) / dza dxt = torch.log(dxg / dxa) dyt = torch.log(dyg / dya) dzt = torch.log(dzg / dza) else: xt = (xg - xa) yt = (yg - ya) zt = (zg - za) dxt = torch.log(dxg) dyt = torch.log(dyg) dzt = torch.log(dzg) return torch.cat( [xt, yt, zt, dxt, dyt, dzt, torch.cos(rg), torch.sin(rg), *cgs], dim=-1) def decode(self, box_encodings, points, pred_labels_3d=None): """Decode predicted parts and points to bbox3d. Args: box_encodings (torch.Tensor): Encoded boxes with shape (N, 8 + C). points (torch.Tensor): Point cloud with shape (N, 3). pred_labels_3d (torch.Tensor): Bbox predicted labels (N, M). Returns: torch.Tensor: Decoded boxes with shape (N, 7 + C) """ xt, yt, zt, dxt, dyt, dzt, cost, sint, *cts = torch.split( box_encodings, 1, dim=-1) xa, ya, za = torch.split(points, 1, dim=-1) if self.use_mean_size: assert pred_labels_3d.max() <= self.mean_size.shape[0] - 1, \ f'The max pred label {pred_labels_3d.max()} is bigger than' \ f'anchor types {self.mean_size.shape[0] - 1}.' self.mean_size = self.mean_size.to(pred_labels_3d.device) point_anchor_size = self.mean_size[pred_labels_3d] dxa, dya, dza = torch.split(point_anchor_size, 1, dim=-1) diagonal = torch.sqrt(dxa**2 + dya**2) xg = xt * diagonal + xa yg = yt * diagonal + ya zg = zt * dza + za dxg = torch.exp(dxt) * dxa dyg = torch.exp(dyt) * dya dzg = torch.exp(dzt) * dza else: xg = xt + xa yg = yt + ya zg = zt + za dxg, dyg, dzg = torch.split( torch.exp(box_encodings[..., 3:6]), 1, dim=-1) rg = torch.atan2(sint, cost) return torch.cat([xg, yg, zg, dxg, dyg, dzg, rg, *cts], dim=-1) ================================================ FILE: mmdet3d/core/bbox/coders/smoke_bbox_coder.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import numpy as np import torch from mmdet.core.bbox import BaseBBoxCoder from mmdet.core.bbox.builder import BBOX_CODERS @BBOX_CODERS.register_module() class SMOKECoder(BaseBBoxCoder): """Bbox Coder for SMOKE. Args: base_depth (tuple[float]): Depth references for decode box depth. base_dims (tuple[tuple[float]]): Dimension references [l, h, w] for decode box dimension for each category. code_size (int): The dimension of boxes to be encoded. """ def __init__(self, base_depth, base_dims, code_size): super(SMOKECoder, self).__init__() self.base_depth = base_depth self.base_dims = base_dims self.bbox_code_size = code_size def encode(self, locations, dimensions, orientations, input_metas): """Encode CameraInstance3DBoxes by locations, dimensions, orientations. Args: locations (Tensor): Center location for 3D boxes. (N, 3) dimensions (Tensor): Dimensions for 3D boxes. shape (N, 3) orientations (Tensor): Orientations for 3D boxes. shape (N, 1) input_metas (list[dict]): Meta information of each image, e.g., image size, scaling factor, etc. Return: :obj:`CameraInstance3DBoxes`: 3D bboxes of batch images, shape (N, bbox_code_size). """ bboxes = torch.cat((locations, dimensions, orientations), dim=1) assert bboxes.shape[1] == self.bbox_code_size, 'bboxes shape dose not'\ 'match the bbox_code_size.' 
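# Note (illustrative): the concatenated rows are the 3D center, the 3D
# dimensions and the yaw angle (7 values for the default code size), and
# `box_type_3d` is typically :obj:`CameraInstance3DBoxes` for monocular
# SMOKE, so the tensor can be wrapped directly below.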
batch_bboxes = input_metas[0]['box_type_3d']( bboxes, box_dim=self.bbox_code_size) return batch_bboxes def decode(self, reg, points, labels, cam2imgs, trans_mats, locations=None): """Decode regression into locations, dimensions, orientations. Args: reg (Tensor): Batch regression for each predict center2d point. shape: (batch * K (max_objs), C) points(Tensor): Batch projected bbox centers on image plane. shape: (batch * K (max_objs) , 2) labels (Tensor): Batch predict class label for each predict center2d point. shape: (batch, K (max_objs)) cam2imgs (Tensor): Batch images' camera intrinsic matrix. shape: kitti (batch, 4, 4) nuscenes (batch, 3, 3) trans_mats (Tensor): transformation matrix from original image to feature map. shape: (batch, 3, 3) locations (None | Tensor): if locations is None, this function is used to decode while inference, otherwise, it's used while training using the ground truth 3d bbox locations. shape: (batch * K (max_objs), 3) Return: tuple(Tensor): The tuple has components below: - locations (Tensor): Centers of 3D boxes. shape: (batch * K (max_objs), 3) - dimensions (Tensor): Dimensions of 3D boxes. shape: (batch * K (max_objs), 3) - orientations (Tensor): Orientations of 3D boxes. shape: (batch * K (max_objs), 1) """ depth_offsets = reg[:, 0] centers2d_offsets = reg[:, 1:3] dimensions_offsets = reg[:, 3:6] orientations = reg[:, 6:8] depths = self._decode_depth(depth_offsets) # get the 3D Bounding box's center location. pred_locations = self._decode_location(points, centers2d_offsets, depths, cam2imgs, trans_mats) pred_dimensions = self._decode_dimension(labels, dimensions_offsets) if locations is None: pred_orientations = self._decode_orientation( orientations, pred_locations) else: pred_orientations = self._decode_orientation( orientations, locations) return pred_locations, pred_dimensions, pred_orientations def _decode_depth(self, depth_offsets): """Transform depth offset to depth.""" base_depth = depth_offsets.new_tensor(self.base_depth) depths = depth_offsets * base_depth[1] + base_depth[0] return depths def _decode_location(self, points, centers2d_offsets, depths, cam2imgs, trans_mats): """Retrieve objects location in camera coordinate based on projected points. Args: points (Tensor): Projected points on feature map in (x, y) shape: (batch * K, 2) centers2d_offset (Tensor): Project points offset in (delta_x, delta_y). shape: (batch * K, 2) depths (Tensor): Object depth z. shape: (batch * K) cam2imgs (Tensor): Batch camera intrinsics matrix. shape: kitti (batch, 4, 4) nuscenes (batch, 3, 3) trans_mats (Tensor): transformation matrix from original image to feature map. 
shape: (batch, 3, 3) """ # number of points N = centers2d_offsets.shape[0] # batch_size N_batch = cam2imgs.shape[0] batch_id = torch.arange(N_batch).unsqueeze(1) obj_id = batch_id.repeat(1, N // N_batch).flatten() trans_mats_inv = trans_mats.inverse()[obj_id] cam2imgs_inv = cam2imgs.inverse()[obj_id] centers2d = points + centers2d_offsets centers2d_extend = torch.cat((centers2d, centers2d.new_ones(N, 1)), dim=1) # expand project points as [N, 3, 1] centers2d_extend = centers2d_extend.unsqueeze(-1) # transform project points back on original image centers2d_img = torch.matmul(trans_mats_inv, centers2d_extend) centers2d_img = centers2d_img * depths.view(N, -1, 1) if cam2imgs.shape[1] == 4: centers2d_img = torch.cat( (centers2d_img, centers2d.new_ones(N, 1, 1)), dim=1) locations = torch.matmul(cam2imgs_inv, centers2d_img).squeeze(2) return locations[:, :3] def _decode_dimension(self, labels, dims_offset): """Transform dimension offsets to dimension according to its category. Args: labels (Tensor): Each points' category id. shape: (N, K) dims_offset (Tensor): Dimension offsets. shape: (N, 3) """ labels = labels.flatten().long() base_dims = dims_offset.new_tensor(self.base_dims) dims_select = base_dims[labels, :] dimensions = dims_offset.exp() * dims_select return dimensions def _decode_orientation(self, ori_vector, locations): """Retrieve object orientation. Args: ori_vector (Tensor): Local orientation in [sin, cos] format. shape: (N, 2) locations (Tensor): Object location. shape: (N, 3) Return: Tensor: yaw(Orientation). Notice that the yaw's range is [-np.pi, np.pi]. shape:(N, 1) """ assert len(ori_vector) == len(locations) locations = locations.view(-1, 3) rays = torch.atan(locations[:, 0] / (locations[:, 2] + 1e-7)) alphas = torch.atan(ori_vector[:, 0] / (ori_vector[:, 1] + 1e-7)) # get cosine value positive and negative index. cos_pos_inds = (ori_vector[:, 1] >= 0).nonzero(as_tuple=False) cos_neg_inds = (ori_vector[:, 1] < 0).nonzero(as_tuple=False) alphas[cos_pos_inds] -= np.pi / 2 alphas[cos_neg_inds] += np.pi / 2 # retrieve object rotation y angle. yaws = alphas + rays larger_inds = (yaws > np.pi).nonzero(as_tuple=False) small_inds = (yaws < -np.pi).nonzero(as_tuple=False) if len(larger_inds) != 0: yaws[larger_inds] -= 2 * np.pi if len(small_inds) != 0: yaws[small_inds] += 2 * np.pi yaws = yaws.unsqueeze(-1) return yaws ================================================ FILE: mmdet3d/core/bbox/iou_calculators/__init__.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. from .iou3d_calculator import (AxisAlignedBboxOverlaps3D, BboxOverlaps3D, BboxOverlapsNearest3D, axis_aligned_bbox_overlaps_3d, bbox_overlaps_3d, bbox_overlaps_nearest_3d) __all__ = [ 'BboxOverlapsNearest3D', 'BboxOverlaps3D', 'bbox_overlaps_nearest_3d', 'bbox_overlaps_3d', 'AxisAlignedBboxOverlaps3D', 'axis_aligned_bbox_overlaps_3d' ] ================================================ FILE: mmdet3d/core/bbox/iou_calculators/iou3d_calculator.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import torch from mmdet.core.bbox import bbox_overlaps from mmdet.core.bbox.iou_calculators.builder import IOU_CALCULATORS from ..structures import get_box_type @IOU_CALCULATORS.register_module() class BboxOverlapsNearest3D(object): """Nearest 3D IoU Calculator. Note: This IoU calculator first finds the nearest 2D boxes in bird eye view (BEV), and then calculates the 2D IoU using :meth:`bbox_overlaps`. 
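Example (an illustrative usage sketch; the box layout follows the Args below):
    >>> calc = BboxOverlapsNearest3D(coordinate='lidar')
    >>> boxes = torch.tensor([[0., 0., 0., 2., 2., 2., 0.]])
    >>> ious = calc(boxes, boxes)  # a box compared with itself gives a BEV IoU of 1.0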
Args: coordinate (str): 'camera', 'lidar', or 'depth' coordinate system. """ def __init__(self, coordinate='lidar'): assert coordinate in ['camera', 'lidar', 'depth'] self.coordinate = coordinate def __call__(self, bboxes1, bboxes2, mode='iou', is_aligned=False): """Calculate nearest 3D IoU. Note: If ``is_aligned`` is ``False``, then it calculates the ious between each bbox of bboxes1 and bboxes2, otherwise it calculates the ious between each aligned pair of bboxes1 and bboxes2. Args: bboxes1 (torch.Tensor): shape (N, 7+N) [x, y, z, x_size, y_size, z_size, ry, v]. bboxes2 (torch.Tensor): shape (M, 7+N) [x, y, z, x_size, y_size, z_size, ry, v]. mode (str): "iou" (intersection over union) or iof (intersection over foreground). is_aligned (bool): Whether the calculation is aligned. Return: torch.Tensor: If ``is_aligned`` is ``True``, return ious between bboxes1 and bboxes2 with shape (M, N). If ``is_aligned`` is ``False``, return shape is M. """ return bbox_overlaps_nearest_3d(bboxes1, bboxes2, mode, is_aligned, self.coordinate) def __repr__(self): """str: Return a string that describes the module.""" repr_str = self.__class__.__name__ repr_str += f'(coordinate={self.coordinate}' return repr_str @IOU_CALCULATORS.register_module() class BboxOverlaps3D(object): """3D IoU Calculator. Args: coordinate (str): The coordinate system, valid options are 'camera', 'lidar', and 'depth'. """ def __init__(self, coordinate): assert coordinate in ['camera', 'lidar', 'depth'] self.coordinate = coordinate def __call__(self, bboxes1, bboxes2, mode='iou'): """Calculate 3D IoU using cuda implementation. Note: This function calculate the IoU of 3D boxes based on their volumes. IoU calculator ``:class:BboxOverlaps3D`` uses this function to calculate the actual 3D IoUs of boxes. Args: bboxes1 (torch.Tensor): with shape (N, 7+C), (x, y, z, x_size, y_size, z_size, ry, v*). bboxes2 (torch.Tensor): with shape (M, 7+C), (x, y, z, x_size, y_size, z_size, ry, v*). mode (str): "iou" (intersection over union) or iof (intersection over foreground). Return: torch.Tensor: Bbox overlaps results of bboxes1 and bboxes2 with shape (M, N) (aligned mode is not supported currently). """ return bbox_overlaps_3d(bboxes1, bboxes2, mode, self.coordinate) def __repr__(self): """str: return a string that describes the module""" repr_str = self.__class__.__name__ repr_str += f'(coordinate={self.coordinate}' return repr_str def bbox_overlaps_nearest_3d(bboxes1, bboxes2, mode='iou', is_aligned=False, coordinate='lidar'): """Calculate nearest 3D IoU. Note: This function first finds the nearest 2D boxes in bird eye view (BEV), and then calculates the 2D IoU using :meth:`bbox_overlaps`. This IoU calculator :class:`BboxOverlapsNearest3D` uses this function to calculate IoUs of boxes. If ``is_aligned`` is ``False``, then it calculates the ious between each bbox of bboxes1 and bboxes2, otherwise the ious between each aligned pair of bboxes1 and bboxes2. Args: bboxes1 (torch.Tensor): with shape (N, 7+C), (x, y, z, x_size, y_size, z_size, ry, v*). bboxes2 (torch.Tensor): with shape (M, 7+C), (x, y, z, x_size, y_size, z_size, ry, v*). mode (str): "iou" (intersection over union) or iof (intersection over foreground). is_aligned (bool): Whether the calculation is aligned Return: torch.Tensor: If ``is_aligned`` is ``True``, return ious between bboxes1 and bboxes2 with shape (M, N). If ``is_aligned`` is ``False``, return shape is M. 
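Example:
    As an illustrative shape check, for ``bboxes1`` of shape (2, 7) and
    ``bboxes2`` of shape (3, 7), ``is_aligned=False`` yields the pairwise
    BEV IoUs as a (2, 3) tensor, while ``is_aligned=True`` requires both
    inputs to hold the same number of boxes and yields one IoU per pair.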
""" assert bboxes1.size(-1) == bboxes2.size(-1) >= 7 box_type, _ = get_box_type(coordinate) bboxes1 = box_type(bboxes1, box_dim=bboxes1.shape[-1]) bboxes2 = box_type(bboxes2, box_dim=bboxes2.shape[-1]) # Change the bboxes to bev # box conversion and iou calculation in torch version on CUDA # is 10x faster than that in numpy version bboxes1_bev = bboxes1.nearest_bev bboxes2_bev = bboxes2.nearest_bev ret = bbox_overlaps( bboxes1_bev, bboxes2_bev, mode=mode, is_aligned=is_aligned) return ret def bbox_overlaps_3d(bboxes1, bboxes2, mode='iou', coordinate='camera'): """Calculate 3D IoU using cuda implementation. Note: This function calculates the IoU of 3D boxes based on their volumes. IoU calculator :class:`BboxOverlaps3D` uses this function to calculate the actual IoUs of boxes. Args: bboxes1 (torch.Tensor): with shape (N, 7+C), (x, y, z, x_size, y_size, z_size, ry, v*). bboxes2 (torch.Tensor): with shape (M, 7+C), (x, y, z, x_size, y_size, z_size, ry, v*). mode (str): "iou" (intersection over union) or iof (intersection over foreground). coordinate (str): 'camera' or 'lidar' coordinate system. Return: torch.Tensor: Bbox overlaps results of bboxes1 and bboxes2 with shape (M, N) (aligned mode is not supported currently). """ assert bboxes1.size(-1) == bboxes2.size(-1) >= 7 box_type, _ = get_box_type(coordinate) bboxes1 = box_type(bboxes1, box_dim=bboxes1.shape[-1]) bboxes2 = box_type(bboxes2, box_dim=bboxes2.shape[-1]) return bboxes1.overlaps(bboxes1, bboxes2, mode=mode) @IOU_CALCULATORS.register_module() class AxisAlignedBboxOverlaps3D(object): """Axis-aligned 3D Overlaps (IoU) Calculator.""" def __call__(self, bboxes1, bboxes2, mode='iou', is_aligned=False): """Calculate IoU between 2D bboxes. Args: bboxes1 (Tensor): shape (B, m, 6) in format or empty. bboxes2 (Tensor): shape (B, n, 6) in format or empty. B indicates the batch dim, in shape (B1, B2, ..., Bn). If ``is_aligned`` is ``True``, then m and n must be equal. mode (str): "iou" (intersection over union) or "giou" (generalized intersection over union). is_aligned (bool, optional): If True, then m and n must be equal. Defaults to False. Returns: Tensor: shape (m, n) if ``is_aligned`` is False else shape (m,) """ assert bboxes1.size(-1) == bboxes2.size(-1) == 6 return axis_aligned_bbox_overlaps_3d(bboxes1, bboxes2, mode, is_aligned) def __repr__(self): """str: a string describing the module""" repr_str = self.__class__.__name__ + '()' return repr_str def axis_aligned_bbox_overlaps_3d(bboxes1, bboxes2, mode='iou', is_aligned=False, eps=1e-6): """Calculate overlap between two set of axis aligned 3D bboxes. If ``is_aligned`` is ``False``, then calculate the overlaps between each bbox of bboxes1 and bboxes2, otherwise the overlaps between each aligned pair of bboxes1 and bboxes2. Args: bboxes1 (Tensor): shape (B, m, 6) in format or empty. bboxes2 (Tensor): shape (B, n, 6) in format or empty. B indicates the batch dim, in shape (B1, B2, ..., Bn). If ``is_aligned`` is ``True``, then m and n must be equal. mode (str): "iou" (intersection over union) or "giou" (generalized intersection over union). is_aligned (bool, optional): If True, then m and n must be equal. Defaults to False. eps (float, optional): A value added to the denominator for numerical stability. Defaults to 1e-6. 
Returns: Tensor: shape (m, n) if ``is_aligned`` is False else shape (m,) Example: >>> bboxes1 = torch.FloatTensor([ >>> [0, 0, 0, 10, 10, 10], >>> [10, 10, 10, 20, 20, 20], >>> [32, 32, 32, 38, 40, 42], >>> ]) >>> bboxes2 = torch.FloatTensor([ >>> [0, 0, 0, 10, 20, 20], >>> [0, 10, 10, 10, 19, 20], >>> [10, 10, 10, 20, 20, 20], >>> ]) >>> overlaps = axis_aligned_bbox_overlaps_3d(bboxes1, bboxes2) >>> assert overlaps.shape == (3, 3) >>> overlaps = bbox_overlaps(bboxes1, bboxes2, is_aligned=True) >>> assert overlaps.shape == (3, ) Example: >>> empty = torch.empty(0, 6) >>> nonempty = torch.FloatTensor([[0, 0, 0, 10, 9, 10]]) >>> assert tuple(bbox_overlaps(empty, nonempty).shape) == (0, 1) >>> assert tuple(bbox_overlaps(nonempty, empty).shape) == (1, 0) >>> assert tuple(bbox_overlaps(empty, empty).shape) == (0, 0) """ assert mode in ['iou', 'giou'], f'Unsupported mode {mode}' # Either the boxes are empty or the length of boxes's last dimension is 6 assert (bboxes1.size(-1) == 6 or bboxes1.size(0) == 0) assert (bboxes2.size(-1) == 6 or bboxes2.size(0) == 0) # Batch dim must be the same # Batch dim: (B1, B2, ... Bn) assert bboxes1.shape[:-2] == bboxes2.shape[:-2] batch_shape = bboxes1.shape[:-2] rows = bboxes1.size(-2) cols = bboxes2.size(-2) if is_aligned: assert rows == cols if rows * cols == 0: if is_aligned: return bboxes1.new(batch_shape + (rows, )) else: return bboxes1.new(batch_shape + (rows, cols)) area1 = (bboxes1[..., 3] - bboxes1[..., 0]) * (bboxes1[..., 4] - bboxes1[..., 1]) * ( bboxes1[..., 5] - bboxes1[..., 2]) area2 = (bboxes2[..., 3] - bboxes2[..., 0]) * (bboxes2[..., 4] - bboxes2[..., 1]) * ( bboxes2[..., 5] - bboxes2[..., 2]) if is_aligned: lt = torch.max(bboxes1[..., :3], bboxes2[..., :3]) # [B, rows, 3] rb = torch.min(bboxes1[..., 3:], bboxes2[..., 3:]) # [B, rows, 3] wh = (rb - lt).clamp(min=0) # [B, rows, 2] overlap = wh[..., 0] * wh[..., 1] * wh[..., 2] if mode in ['iou', 'giou']: union = area1 + area2 - overlap else: union = area1 if mode == 'giou': enclosed_lt = torch.min(bboxes1[..., :3], bboxes2[..., :3]) enclosed_rb = torch.max(bboxes1[..., 3:], bboxes2[..., 3:]) else: lt = torch.max(bboxes1[..., :, None, :3], bboxes2[..., None, :, :3]) # [B, rows, cols, 3] rb = torch.min(bboxes1[..., :, None, 3:], bboxes2[..., None, :, 3:]) # [B, rows, cols, 3] wh = (rb - lt).clamp(min=0) # [B, rows, cols, 3] overlap = wh[..., 0] * wh[..., 1] * wh[..., 2] if mode in ['iou', 'giou']: union = area1[..., None] + area2[..., None, :] - overlap if mode == 'giou': enclosed_lt = torch.min(bboxes1[..., :, None, :3], bboxes2[..., None, :, :3]) enclosed_rb = torch.max(bboxes1[..., :, None, 3:], bboxes2[..., None, :, 3:]) eps = union.new_tensor([eps]) union = torch.max(union, eps) ious = overlap / union if mode in ['iou']: return ious # calculate gious enclose_wh = (enclosed_rb - enclosed_lt).clamp(min=0) enclose_area = enclose_wh[..., 0] * enclose_wh[..., 1] * enclose_wh[..., 2] enclose_area = torch.max(enclose_area, eps) gious = ious - (enclose_area - union) / enclose_area return gious ================================================ FILE: mmdet3d/core/bbox/samplers/__init__.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. 
from mmdet.core.bbox.samplers import (BaseSampler, CombinedSampler, InstanceBalancedPosSampler, IoUBalancedNegSampler, OHEMSampler, PseudoSampler, RandomSampler, SamplingResult) from .iou_neg_piecewise_sampler import IoUNegPiecewiseSampler __all__ = [ 'BaseSampler', 'PseudoSampler', 'RandomSampler', 'InstanceBalancedPosSampler', 'IoUBalancedNegSampler', 'CombinedSampler', 'OHEMSampler', 'SamplingResult', 'IoUNegPiecewiseSampler' ] ================================================ FILE: mmdet3d/core/bbox/samplers/iou_neg_piecewise_sampler.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import torch from mmdet.core.bbox.builder import BBOX_SAMPLERS from . import RandomSampler, SamplingResult @BBOX_SAMPLERS.register_module() class IoUNegPiecewiseSampler(RandomSampler): """IoU Piece-wise Sampling. Sampling negative proposals according to a list of IoU thresholds. The negative proposals are divided into several pieces according to `neg_iou_piece_thrs`. And the ratio of each piece is indicated by `neg_piece_fractions`. Args: num (int): Number of proposals. pos_fraction (float): The fraction of positive proposals. neg_piece_fractions (list): A list contains fractions that indicates the ratio of each piece of total negative samplers. neg_iou_piece_thrs (list): A list contains IoU thresholds that indicate the upper bound of this piece. neg_pos_ub (float): The total ratio to limit the upper bound number of negative samples. add_gt_as_proposals (bool): Whether to add gt as proposals. """ def __init__(self, num, pos_fraction=None, neg_piece_fractions=None, neg_iou_piece_thrs=None, neg_pos_ub=-1, add_gt_as_proposals=False, return_iou=False): super(IoUNegPiecewiseSampler, self).__init__(num, pos_fraction, neg_pos_ub, add_gt_as_proposals) assert isinstance(neg_piece_fractions, list) assert len(neg_piece_fractions) == len(neg_iou_piece_thrs) self.neg_piece_fractions = neg_piece_fractions self.neg_iou_thr = neg_iou_piece_thrs self.return_iou = return_iou self.neg_piece_num = len(self.neg_piece_fractions) def _sample_pos(self, assign_result, num_expected, **kwargs): """Randomly sample some positive samples.""" pos_inds = torch.nonzero(assign_result.gt_inds > 0, as_tuple=False) if pos_inds.numel() != 0: pos_inds = pos_inds.squeeze(1) if pos_inds.numel() <= num_expected: return pos_inds else: return self.random_choice(pos_inds, num_expected) def _sample_neg(self, assign_result, num_expected, **kwargs): """Randomly sample some negative samples.""" neg_inds = torch.nonzero(assign_result.gt_inds == 0, as_tuple=False) if neg_inds.numel() != 0: neg_inds = neg_inds.squeeze(1) if len(neg_inds) <= 0: return neg_inds.squeeze(1) else: neg_inds_choice = neg_inds.new_zeros([0]) extend_num = 0 max_overlaps = assign_result.max_overlaps[neg_inds] for piece_inds in range(self.neg_piece_num): if piece_inds == self.neg_piece_num - 1: # for the last piece piece_expected_num = num_expected - len(neg_inds_choice) min_iou_thr = 0 else: # if the numbers of negative samplers in previous # pieces are less than the expected number, extend # the same number in the current piece. 
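# Illustrative example (assumed config, not taken from this file): with
# num_expected=100, neg_piece_fractions=[0.8, 0.2] and
# neg_iou_piece_thrs=[0.55, 0.1], the first piece draws ~80 negatives with
# IoU in [0.1, 0.55) and the last piece draws the remaining ~20 from
# [0, 0.1); any shortfall in a piece is carried over via `extend_num`.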
piece_expected_num = int( num_expected * self.neg_piece_fractions[piece_inds]) + extend_num min_iou_thr = self.neg_iou_thr[piece_inds + 1] max_iou_thr = self.neg_iou_thr[piece_inds] piece_neg_inds = torch.nonzero( (max_overlaps >= min_iou_thr) & (max_overlaps < max_iou_thr), as_tuple=False).view(-1) if len(piece_neg_inds) < piece_expected_num: neg_inds_choice = torch.cat( [neg_inds_choice, neg_inds[piece_neg_inds]], dim=0) extend_num += piece_expected_num - len(piece_neg_inds) # for the last piece if piece_inds == self.neg_piece_num - 1: extend_neg_num = num_expected - len(neg_inds_choice) # if the numbers of nagetive samples > 0, we will # randomly select num_expected samples in last piece if piece_neg_inds.numel() > 0: rand_idx = torch.randint( low=0, high=piece_neg_inds.numel(), size=(extend_neg_num, )).long() neg_inds_choice = torch.cat( [neg_inds_choice, piece_neg_inds[rand_idx]], dim=0) # if the numbers of nagetive samples == 0, we will # randomly select num_expected samples in all # previous pieces else: rand_idx = torch.randint( low=0, high=neg_inds_choice.numel(), size=(extend_neg_num, )).long() neg_inds_choice = torch.cat( [neg_inds_choice, neg_inds_choice[rand_idx]], dim=0) else: piece_choice = self.random_choice(piece_neg_inds, piece_expected_num) neg_inds_choice = torch.cat( [neg_inds_choice, neg_inds[piece_choice]], dim=0) extend_num = 0 assert len(neg_inds_choice) == num_expected return neg_inds_choice def sample(self, assign_result, bboxes, gt_bboxes, gt_labels=None, **kwargs): """Sample positive and negative bboxes. This is a simple implementation of bbox sampling given candidates, assigning results and ground truth bboxes. Args: assign_result (:obj:`AssignResult`): Bbox assigning results. bboxes (torch.Tensor): Boxes to be sampled from. gt_bboxes (torch.Tensor): Ground truth bboxes. gt_labels (torch.Tensor, optional): Class labels of ground truth bboxes. Returns: :obj:`SamplingResult`: Sampling result. """ if len(bboxes.shape) < 2: bboxes = bboxes[None, :] gt_flags = bboxes.new_zeros((bboxes.shape[0], ), dtype=torch.bool) if self.add_gt_as_proposals and len(gt_bboxes) > 0: if gt_labels is None: raise ValueError( 'gt_labels must be given when add_gt_as_proposals is True') bboxes = torch.cat([gt_bboxes, bboxes], dim=0) assign_result.add_gt_(gt_labels) gt_ones = bboxes.new_ones(gt_bboxes.shape[0], dtype=torch.bool) gt_flags = torch.cat([gt_ones, gt_flags]) num_expected_pos = int(self.num * self.pos_fraction) pos_inds = self.pos_sampler._sample_pos( assign_result, num_expected_pos, bboxes=bboxes, **kwargs) # We found that sampled indices have duplicated items occasionally. # (may be a bug of PyTorch) pos_inds = pos_inds.unique() num_sampled_pos = pos_inds.numel() num_expected_neg = self.num - num_sampled_pos if self.neg_pos_ub >= 0: _pos = max(1, num_sampled_pos) neg_upper_bound = int(self.neg_pos_ub * _pos) if num_expected_neg > neg_upper_bound: num_expected_neg = neg_upper_bound neg_inds = self.neg_sampler._sample_neg( assign_result, num_expected_neg, bboxes=bboxes, **kwargs) sampling_result = SamplingResult(pos_inds, neg_inds, bboxes, gt_bboxes, assign_result, gt_flags) if self.return_iou: # PartA2 needs iou score to regression. sampling_result.iou = assign_result.max_overlaps[torch.cat( [pos_inds, neg_inds])] sampling_result.iou.detach_() return sampling_result ================================================ FILE: mmdet3d/core/bbox/structures/__init__.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. 
from .base_box3d import BaseInstance3DBoxes from .box_3d_mode import Box3DMode from .cam_box3d import CameraInstance3DBoxes from .coord_3d_mode import Coord3DMode from .depth_box3d import DepthInstance3DBoxes from .lidar_box3d import LiDARInstance3DBoxes from .custom_box import CustomBox from .utils import (get_box_type, get_proj_mat_by_coord_type, limit_period, mono_cam_box2vis, points_cam2img, points_img2cam, rotation_3d_in_axis, xywhr2xyxyr) __all__ = [ 'Box3DMode', 'BaseInstance3DBoxes', 'LiDARInstance3DBoxes', 'CameraInstance3DBoxes', 'DepthInstance3DBoxes', 'xywhr2xyxyr', 'get_box_type', 'rotation_3d_in_axis', 'limit_period', 'points_cam2img', 'points_img2cam', 'Coord3DMode', 'mono_cam_box2vis', 'get_proj_mat_by_coord_type' ] ================================================ FILE: mmdet3d/core/bbox/structures/base_box3d.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import warnings from abc import abstractmethod import numpy as np import torch from mmcv.ops import box_iou_rotated, points_in_boxes_all, points_in_boxes_part from .utils import limit_period class BaseInstance3DBoxes(object): """Base class for 3D Boxes. Note: The box is bottom centered, i.e. the relative position of origin in the box is (0.5, 0.5, 0). Args: tensor (torch.Tensor | np.ndarray | list): a N x box_dim matrix. box_dim (int): Number of the dimension of a box. Each row is (x, y, z, x_size, y_size, z_size, yaw). Defaults to 7. with_yaw (bool): Whether the box is with yaw rotation. If False, the value of yaw will be set to 0 as minmax boxes. Defaults to True. origin (tuple[float], optional): Relative position of the box origin. Defaults to (0.5, 0.5, 0). This will guide the box be converted to (0.5, 0.5, 0) mode. Attributes: tensor (torch.Tensor): Float matrix of N x box_dim. box_dim (int): Integer indicating the dimension of a box. Each row is (x, y, z, x_size, y_size, z_size, yaw, ...). with_yaw (bool): If True, the value of yaw will be set to 0 as minmax boxes. """ def __init__(self, tensor, box_dim=7, with_yaw=True, origin=(0.5, 0.5, 0)): if isinstance(tensor, torch.Tensor): device = tensor.device else: device = torch.device('cpu') tensor = torch.as_tensor(tensor, dtype=torch.float32, device=device) if tensor.numel() == 0: # Use reshape, so we don't end up creating a new tensor that # does not depend on the inputs (and consequently confuses jit) tensor = tensor.reshape((0, box_dim)).to( dtype=torch.float32, device=device) assert tensor.dim() == 2 and tensor.size(-1) == box_dim, tensor.size() if tensor.shape[-1] == 6: # If the dimension of boxes is 6, we expand box_dim by padding # 0 as a fake yaw and set with_yaw to False. 
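# For example, an (N, 6) input of (x, y, z, x_size, y_size, z_size) becomes
# an (N, 7) tensor with a zero yaw appended, and with_yaw is set to False so
# the boxes are treated as axis-aligned (minmax) boxes downstream.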
assert box_dim == 6 fake_rot = tensor.new_zeros(tensor.shape[0], 1) tensor = torch.cat((tensor, fake_rot), dim=-1) self.box_dim = box_dim + 1 self.with_yaw = False else: self.box_dim = box_dim self.with_yaw = with_yaw self.tensor = tensor.clone() if origin != (0.5, 0.5, 0): dst = self.tensor.new_tensor((0.5, 0.5, 0)) src = self.tensor.new_tensor(origin) self.tensor[:, :3] += self.tensor[:, 3:6] * (dst - src) @property def volume(self): """torch.Tensor: A vector with volume of each box.""" return self.tensor[:, 3] * self.tensor[:, 4] * self.tensor[:, 5] @property def dims(self): """torch.Tensor: Size dimensions of each box in shape (N, 3).""" return self.tensor[:, 3:6] @property def yaw(self): """torch.Tensor: A vector with yaw of each box in shape (N, ).""" return self.tensor[:, 6] @property def height(self): """torch.Tensor: A vector with height of each box in shape (N, ).""" return self.tensor[:, 5] @property def top_height(self): """torch.Tensor: A vector with the top height of each box in shape (N, ).""" return self.bottom_height + self.height @property def bottom_height(self): """torch.Tensor: A vector with bottom's height of each box in shape (N, ).""" return self.tensor[:, 2] @property def center(self): """Calculate the center of all the boxes. Note: In MMDetection3D's convention, the bottom center is usually taken as the default center. The relative position of the centers in different kinds of boxes are different, e.g., the relative center of a boxes is (0.5, 1.0, 0.5) in camera and (0.5, 0.5, 0) in lidar. It is recommended to use ``bottom_center`` or ``gravity_center`` for clearer usage. Returns: torch.Tensor: A tensor with center of each box in shape (N, 3). """ return self.bottom_center @property def bottom_center(self): """torch.Tensor: A tensor with center of each box in shape (N, 3).""" return self.tensor[:, :3] @property def gravity_center(self): """torch.Tensor: A tensor with center of each box in shape (N, 3).""" pass @property def corners(self): """torch.Tensor: a tensor with 8 corners of each box in shape (N, 8, 3).""" pass @property def bev(self): """torch.Tensor: 2D BEV box of each box with rotation in XYWHR format, in shape (N, 5).""" return self.tensor[:, [0, 1, 3, 4, 6]] @property def nearest_bev(self): """torch.Tensor: A tensor of 2D BEV box of each box without rotation.""" # Obtain BEV boxes with rotation in XYWHR format bev_rotated_boxes = self.bev # convert the rotation to a valid range rotations = bev_rotated_boxes[:, -1] normed_rotations = torch.abs(limit_period(rotations, 0.5, np.pi)) # find the center of boxes conditions = (normed_rotations > np.pi / 4)[..., None] bboxes_xywh = torch.where(conditions, bev_rotated_boxes[:, [0, 1, 3, 2]], bev_rotated_boxes[:, :4]) centers = bboxes_xywh[:, :2] dims = bboxes_xywh[:, 2:] bev_boxes = torch.cat([centers - dims / 2, centers + dims / 2], dim=-1) return bev_boxes def in_range_bev(self, box_range): """Check whether the boxes are in the given range. Args: box_range (list | torch.Tensor): the range of box (x_min, y_min, x_max, y_max) Note: The original implementation of SECOND checks whether boxes in a range by checking whether the points are in a convex polygon, we reduce the burden for simpler cases. Returns: torch.Tensor: Whether each box is inside the reference range. 
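Example:
    ``boxes.in_range_bev([-50, -50, 50, 50])`` (illustrative range) returns
    a boolean mask that is True for boxes whose BEV center (x, y) lies
    strictly inside the 100 m x 100 m square around the origin.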
""" in_range_flags = ((self.bev[:, 0] > box_range[0]) & (self.bev[:, 1] > box_range[1]) & (self.bev[:, 0] < box_range[2]) & (self.bev[:, 1] < box_range[3])) return in_range_flags @abstractmethod def rotate(self, angle, points=None): """Rotate boxes with points (optional) with the given angle or rotation matrix. Args: angle (float | torch.Tensor | np.ndarray): Rotation angle or rotation matrix. points (torch.Tensor | numpy.ndarray | :obj:`BasePoints`, optional): Points to rotate. Defaults to None. """ pass @abstractmethod def flip(self, bev_direction='horizontal'): """Flip the boxes in BEV along given BEV direction. Args: bev_direction (str, optional): Direction by which to flip. Can be chosen from 'horizontal' and 'vertical'. Defaults to 'horizontal'. """ pass def translate(self, trans_vector): """Translate boxes with the given translation vector. Args: trans_vector (torch.Tensor): Translation vector of size (1, 3). """ if not isinstance(trans_vector, torch.Tensor): trans_vector = self.tensor.new_tensor(trans_vector) self.tensor[:, :3] += trans_vector def in_range_3d(self, box_range): """Check whether the boxes are in the given range. Args: box_range (list | torch.Tensor): The range of box (x_min, y_min, z_min, x_max, y_max, z_max) Note: In the original implementation of SECOND, checking whether a box in the range checks whether the points are in a convex polygon, we try to reduce the burden for simpler cases. Returns: torch.Tensor: A binary vector indicating whether each box is inside the reference range. """ in_range_flags = ((self.tensor[:, 0] > box_range[0]) & (self.tensor[:, 1] > box_range[1]) & (self.tensor[:, 2] > box_range[2]) & (self.tensor[:, 0] < box_range[3]) & (self.tensor[:, 1] < box_range[4]) & (self.tensor[:, 2] < box_range[5])) return in_range_flags @abstractmethod def convert_to(self, dst, rt_mat=None): """Convert self to ``dst`` mode. Args: dst (:obj:`Box3DMode`): The target Box mode. rt_mat (np.ndarray | torch.Tensor, optional): The rotation and translation matrix between different coordinates. Defaults to None. The conversion from `src` coordinates to `dst` coordinates usually comes along the change of sensors, e.g., from camera to LiDAR. This requires a transformation matrix. Returns: :obj:`BaseInstance3DBoxes`: The converted box of the same type in the `dst` mode. """ pass def scale(self, scale_factor): """Scale the box with horizontal and vertical scaling factors. Args: scale_factors (float): Scale factors to scale the boxes. """ self.tensor[:, :6] *= scale_factor self.tensor[:, 7:] *= scale_factor # velocity def limit_yaw(self, offset=0.5, period=np.pi): """Limit the yaw to a given period and offset. Args: offset (float, optional): The offset of the yaw. Defaults to 0.5. period (float, optional): The expected period. Defaults to np.pi. """ self.tensor[:, 6] = limit_period(self.tensor[:, 6], offset, period) def nonempty(self, threshold=0.0): """Find boxes that are non-empty. A box is considered empty, if either of its side is no larger than threshold. Args: threshold (float, optional): The threshold of minimal sizes. Defaults to 0.0. Returns: torch.Tensor: A binary vector which represents whether each box is empty (False) or non-empty (True). """ box = self.tensor size_x = box[..., 3] size_y = box[..., 4] size_z = box[..., 5] keep = ((size_x > threshold) & (size_y > threshold) & (size_z > threshold)) return keep def __getitem__(self, item): """ Note: The following usage are allowed: 1. `new_boxes = boxes[3]`: return a `Boxes` that contains only one box. 2. 
`new_boxes = boxes[2:10]`: return a slice of boxes. 3. `new_boxes = boxes[vector]`: where vector is a torch.BoolTensor with `length = len(boxes)`. Nonzero elements in the vector will be selected. Note that the returned Boxes might share storage with this Boxes, subject to Pytorch's indexing semantics. Returns: :obj:`BaseInstance3DBoxes`: A new object of :class:`BaseInstance3DBoxes` after indexing. """ original_type = type(self) if isinstance(item, int): return original_type( self.tensor[item].view(1, -1), box_dim=self.box_dim, with_yaw=self.with_yaw) b = self.tensor[item] assert b.dim() == 2, \ f'Indexing on Boxes with {item} failed to return a matrix!' return original_type(b, box_dim=self.box_dim, with_yaw=self.with_yaw) def __len__(self): """int: Number of boxes in the current object.""" return self.tensor.shape[0] def __repr__(self): """str: Return a strings that describes the object.""" return self.__class__.__name__ + '(\n ' + str(self.tensor) + ')' @classmethod def cat(cls, boxes_list): """Concatenate a list of Boxes into a single Boxes. Args: boxes_list (list[:obj:`BaseInstance3DBoxes`]): List of boxes. Returns: :obj:`BaseInstance3DBoxes`: The concatenated Boxes. """ assert isinstance(boxes_list, (list, tuple)) if len(boxes_list) == 0: return cls(torch.empty(0)) assert all(isinstance(box, cls) for box in boxes_list) # use torch.cat (v.s. layers.cat) # so the returned boxes never share storage with input cat_boxes = cls( torch.cat([b.tensor for b in boxes_list], dim=0), box_dim=boxes_list[0].tensor.shape[1], with_yaw=boxes_list[0].with_yaw) return cat_boxes def to(self, device): """Convert current boxes to a specific device. Args: device (str | :obj:`torch.device`): The name of the device. Returns: :obj:`BaseInstance3DBoxes`: A new boxes object on the specific device. """ original_type = type(self) return original_type( self.tensor.to(device), box_dim=self.box_dim, with_yaw=self.with_yaw) def clone(self): """Clone the Boxes. Returns: :obj:`BaseInstance3DBoxes`: Box object with the same properties as self. """ original_type = type(self) return original_type( self.tensor.clone(), box_dim=self.box_dim, with_yaw=self.with_yaw) @property def device(self): """str: The device of the boxes are on.""" return self.tensor.device def __iter__(self): """Yield a box as a Tensor of shape (4,) at a time. Returns: torch.Tensor: A box of shape (4,). """ yield from self.tensor @classmethod def height_overlaps(cls, boxes1, boxes2, mode='iou'): """Calculate height overlaps of two boxes. Note: This function calculates the height overlaps between boxes1 and boxes2, boxes1 and boxes2 should be in the same type. Args: boxes1 (:obj:`BaseInstance3DBoxes`): Boxes 1 contain N boxes. boxes2 (:obj:`BaseInstance3DBoxes`): Boxes 2 contain M boxes. mode (str, optional): Mode of IoU calculation. Defaults to 'iou'. Returns: torch.Tensor: Calculated iou of boxes. """ assert isinstance(boxes1, BaseInstance3DBoxes) assert isinstance(boxes2, BaseInstance3DBoxes) assert type(boxes1) == type(boxes2), '"boxes1" and "boxes2" should' \ f'be in the same type, got {type(boxes1)} and {type(boxes2)}.' 
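# Illustrative note: the overlap computed below is
# clamp(min(top1, top2) - max(bottom1, bottom2), 0); e.g. boxes spanning
# z in [0, 2] and [1, 3] overlap by 1.0, while disjoint height ranges give 0.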
boxes1_top_height = boxes1.top_height.view(-1, 1) boxes1_bottom_height = boxes1.bottom_height.view(-1, 1) boxes2_top_height = boxes2.top_height.view(1, -1) boxes2_bottom_height = boxes2.bottom_height.view(1, -1) heighest_of_bottom = torch.max(boxes1_bottom_height, boxes2_bottom_height) lowest_of_top = torch.min(boxes1_top_height, boxes2_top_height) overlaps_h = torch.clamp(lowest_of_top - heighest_of_bottom, min=0) return overlaps_h @classmethod def overlaps(cls, boxes1, boxes2, mode='iou'): """Calculate 3D overlaps of two boxes. Note: This function calculates the overlaps between ``boxes1`` and ``boxes2``, ``boxes1`` and ``boxes2`` should be in the same type. Args: boxes1 (:obj:`BaseInstance3DBoxes`): Boxes 1 contain N boxes. boxes2 (:obj:`BaseInstance3DBoxes`): Boxes 2 contain M boxes. mode (str, optional): Mode of iou calculation. Defaults to 'iou'. Returns: torch.Tensor: Calculated 3D overlaps of the boxes. """ assert isinstance(boxes1, BaseInstance3DBoxes) assert isinstance(boxes2, BaseInstance3DBoxes) assert type(boxes1) == type(boxes2), '"boxes1" and "boxes2" should' \ f'be in the same type, got {type(boxes1)} and {type(boxes2)}.' assert mode in ['iou', 'iof'] rows = len(boxes1) cols = len(boxes2) if rows * cols == 0: return boxes1.tensor.new(rows, cols) # height overlap overlaps_h = cls.height_overlaps(boxes1, boxes2) # bev overlap iou2d = box_iou_rotated(boxes1.bev, boxes2.bev) areas1 = (boxes1.bev[:, 2] * boxes1.bev[:, 3]).unsqueeze(1).expand( rows, cols) areas2 = (boxes2.bev[:, 2] * boxes2.bev[:, 3]).unsqueeze(0).expand( rows, cols) overlaps_bev = iou2d * (areas1 + areas2) / (1 + iou2d) # 3d overlaps overlaps_3d = overlaps_bev.to(boxes1.device) * overlaps_h volume1 = boxes1.volume.view(-1, 1) volume2 = boxes2.volume.view(1, -1) if mode == 'iou': # the clamp func is used to avoid division of 0 iou3d = overlaps_3d / torch.clamp( volume1 + volume2 - overlaps_3d, min=1e-8) else: iou3d = overlaps_3d / torch.clamp(volume1, min=1e-8) return iou3d def new_box(self, data): """Create a new box object with data. The new box and its tensor has the similar properties as self and self.tensor, respectively. Args: data (torch.Tensor | numpy.array | list): Data to be copied. Returns: :obj:`BaseInstance3DBoxes`: A new bbox object with ``data``, the object's other properties are similar to ``self``. """ new_tensor = self.tensor.new_tensor(data) \ if not isinstance(data, torch.Tensor) else data.to(self.device) original_type = type(self) return original_type( new_tensor, box_dim=self.box_dim, with_yaw=self.with_yaw) def points_in_boxes_part(self, points, boxes_override=None): """Find the box in which each point is. Args: points (torch.Tensor): Points in shape (1, M, 3) or (M, 3), 3 dimensions are (x, y, z) in LiDAR or depth coordinate. boxes_override (torch.Tensor, optional): Boxes to override `self.tensor`. Defaults to None. Returns: torch.Tensor: The index of the first box that each point is in, in shape (M, ). Default value is -1 (if the point is not enclosed by any box). Note: If a point is enclosed by multiple boxes, the index of the first box will be returned. """ if boxes_override is not None: boxes = boxes_override else: boxes = self.tensor if points.dim() == 2: points = points.unsqueeze(0) box_idx = points_in_boxes_part(points, boxes.unsqueeze(0).to( points.device)).squeeze(0) return box_idx def points_in_boxes_all(self, points, boxes_override=None): """Find all boxes in which each point is. 
Args: points (torch.Tensor): Points in shape (1, M, 3) or (M, 3), 3 dimensions are (x, y, z) in LiDAR or depth coordinate. boxes_override (torch.Tensor, optional): Boxes to override `self.tensor`. Defaults to None. Returns: torch.Tensor: A tensor indicating whether a point is in a box, in shape (M, T). T is the number of boxes. Denote this tensor as A, if the m^th point is in the t^th box, then `A[m, t] == 1`, elsewise `A[m, t] == 0`. """ if boxes_override is not None: boxes = boxes_override else: boxes = self.tensor points_clone = points.clone()[..., :3] if points_clone.dim() == 2: points_clone = points_clone.unsqueeze(0) else: assert points_clone.dim() == 3 and points_clone.shape[0] == 1 boxes = boxes.to(points_clone.device).unsqueeze(0) box_idxs_of_pts = points_in_boxes_all(points_clone, boxes) return box_idxs_of_pts.squeeze(0) def points_in_boxes(self, points, boxes_override=None): warnings.warn('DeprecationWarning: points_in_boxes is a ' 'deprecated method, please consider using ' 'points_in_boxes_part.') return self.points_in_boxes_part(points, boxes_override) def points_in_boxes_batch(self, points, boxes_override=None): warnings.warn('DeprecationWarning: points_in_boxes_batch is a ' 'deprecated method, please consider using ' 'points_in_boxes_all.') return self.points_in_boxes_all(points, boxes_override) ================================================ FILE: mmdet3d/core/bbox/structures/box_3d_mode.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. from enum import IntEnum, unique import numpy as np import torch from .base_box3d import BaseInstance3DBoxes from .cam_box3d import CameraInstance3DBoxes from .depth_box3d import DepthInstance3DBoxes from .lidar_box3d import LiDARInstance3DBoxes from .utils import limit_period @unique class Box3DMode(IntEnum): r"""Enum of different ways to represent a box. Coordinates in LiDAR: .. code-block:: none up z ^ x front | / | / left y <------ 0 The relative coordinate of bottom center in a LiDAR box is (0.5, 0.5, 0), and the yaw is around the z axis, thus the rotation axis=2. Coordinates in camera: .. code-block:: none z front / / 0 ------> x right | | v down y The relative coordinate of bottom center in a CAM box is [0.5, 1.0, 0.5], and the yaw is around the y axis, thus the rotation axis=1. Coordinates in Depth mode: .. code-block:: none up z ^ y front | / | / 0 ------> x right The relative coordinate of bottom center in a DEPTH box is (0.5, 0.5, 0), and the yaw is around the z axis, thus the rotation axis=2. """ LIDAR = 0 CAM = 1 DEPTH = 2 @staticmethod def convert(box, src, dst, rt_mat=None, with_yaw=True): """Convert boxes from `src` mode to `dst` mode. Args: box (tuple | list | np.ndarray | torch.Tensor | :obj:`BaseInstance3DBoxes`): Can be a k-tuple, k-list or an Nxk array/tensor, where k = 7. src (:obj:`Box3DMode`): The src Box mode. dst (:obj:`Box3DMode`): The target Box mode. rt_mat (np.ndarray | torch.Tensor, optional): The rotation and translation matrix between different coordinates. Defaults to None. The conversion from `src` coordinates to `dst` coordinates usually comes along the change of sensors, e.g., from camera to LiDAR. This requires a transformation matrix. with_yaw (bool, optional): If `box` is an instance of :obj:`BaseInstance3DBoxes`, whether or not it has a yaw angle. Defaults to True. Returns: (tuple | list | np.ndarray | torch.Tensor | :obj:`BaseInstance3DBoxes`): The converted box of the same type. 
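Example:
    An illustrative usage sketch (a single LiDAR box given as
    (x, y, z, x_size, y_size, z_size, yaw)):

    >>> lidar_boxes = LiDARInstance3DBoxes(
    ...     torch.tensor([[1.0, 2.0, 0.0, 4.0, 2.0, 1.5, 0.3]]))
    >>> cam_boxes = Box3DMode.convert(
    ...     lidar_boxes, Box3DMode.LIDAR, Box3DMode.CAM)

    With no ``rt_mat`` given, the default LiDAR-to-camera rotation is used,
    the size order becomes (x_size, z_size, y_size) and the yaw becomes
    ``-yaw - pi / 2`` (wrapped by ``limit_period``).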
""" if src == dst: return box is_numpy = isinstance(box, np.ndarray) is_Instance3DBoxes = isinstance(box, BaseInstance3DBoxes) single_box = isinstance(box, (list, tuple)) if single_box: assert len(box) >= 7, ( 'Box3DMode.convert takes either a k-tuple/list or ' 'an Nxk array/tensor, where k >= 7') arr = torch.tensor(box)[None, :] else: # avoid modifying the input box if is_numpy: arr = torch.from_numpy(np.asarray(box)).clone() elif is_Instance3DBoxes: arr = box.tensor.clone() else: arr = box.clone() if is_Instance3DBoxes: with_yaw = box.with_yaw # convert box from `src` mode to `dst` mode. x_size, y_size, z_size = arr[..., 3:4], arr[..., 4:5], arr[..., 5:6] if with_yaw: yaw = arr[..., 6:7] if src == Box3DMode.LIDAR and dst == Box3DMode.CAM: if rt_mat is None: rt_mat = arr.new_tensor([[0, -1, 0], [0, 0, -1], [1, 0, 0]]) xyz_size = torch.cat([x_size, z_size, y_size], dim=-1) if with_yaw: yaw = -yaw - np.pi / 2 yaw = limit_period(yaw, period=np.pi * 2) elif src == Box3DMode.CAM and dst == Box3DMode.LIDAR: if rt_mat is None: rt_mat = arr.new_tensor([[0, 0, 1], [-1, 0, 0], [0, -1, 0]]) xyz_size = torch.cat([x_size, z_size, y_size], dim=-1) if with_yaw: yaw = -yaw - np.pi / 2 yaw = limit_period(yaw, period=np.pi * 2) elif src == Box3DMode.DEPTH and dst == Box3DMode.CAM: if rt_mat is None: rt_mat = arr.new_tensor([[1, 0, 0], [0, 0, -1], [0, 1, 0]]) xyz_size = torch.cat([x_size, z_size, y_size], dim=-1) if with_yaw: yaw = -yaw elif src == Box3DMode.CAM and dst == Box3DMode.DEPTH: if rt_mat is None: rt_mat = arr.new_tensor([[1, 0, 0], [0, 0, 1], [0, -1, 0]]) xyz_size = torch.cat([x_size, z_size, y_size], dim=-1) if with_yaw: yaw = -yaw elif src == Box3DMode.LIDAR and dst == Box3DMode.DEPTH: if rt_mat is None: rt_mat = arr.new_tensor([[0, -1, 0], [1, 0, 0], [0, 0, 1]]) xyz_size = torch.cat([x_size, y_size, z_size], dim=-1) if with_yaw: yaw = yaw + np.pi / 2 yaw = limit_period(yaw, period=np.pi * 2) elif src == Box3DMode.DEPTH and dst == Box3DMode.LIDAR: if rt_mat is None: rt_mat = arr.new_tensor([[0, 1, 0], [-1, 0, 0], [0, 0, 1]]) xyz_size = torch.cat([x_size, y_size, z_size], dim=-1) if with_yaw: yaw = yaw - np.pi / 2 yaw = limit_period(yaw, period=np.pi * 2) else: raise NotImplementedError( f'Conversion from Box3DMode {src} to {dst} ' 'is not supported yet') if not isinstance(rt_mat, torch.Tensor): rt_mat = arr.new_tensor(rt_mat) if rt_mat.size(1) == 4: extended_xyz = torch.cat( [arr[..., :3], arr.new_ones(arr.size(0), 1)], dim=-1) xyz = extended_xyz @ rt_mat.t() else: xyz = arr[..., :3] @ rt_mat.t() if with_yaw: remains = arr[..., 7:] arr = torch.cat([xyz[..., :3], xyz_size, yaw, remains], dim=-1) else: remains = arr[..., 6:] arr = torch.cat([xyz[..., :3], xyz_size, remains], dim=-1) # convert arr to the original type original_type = type(box) if single_box: return original_type(arr.flatten().tolist()) if is_numpy: return arr.numpy() elif is_Instance3DBoxes: if dst == Box3DMode.CAM: target_type = CameraInstance3DBoxes elif dst == Box3DMode.LIDAR: target_type = LiDARInstance3DBoxes elif dst == Box3DMode.DEPTH: target_type = DepthInstance3DBoxes else: raise NotImplementedError( f'Conversion to {dst} through {original_type}' ' is not supported yet') return target_type(arr, box_dim=arr.size(-1), with_yaw=with_yaw) else: return arr ================================================ FILE: mmdet3d/core/bbox/structures/cam_box3d.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. 
import numpy as np import torch from ...points import BasePoints from .base_box3d import BaseInstance3DBoxes from .utils import rotation_3d_in_axis, yaw2local class CameraInstance3DBoxes(BaseInstance3DBoxes): """3D boxes of instances in CAM coordinates. Coordinates in camera: .. code-block:: none z front (yaw=-0.5*pi) / / 0 ------> x right (yaw=0) | | v down y The relative coordinate of bottom center in a CAM box is (0.5, 1.0, 0.5), and the yaw is around the y axis, thus the rotation axis=1. The yaw is 0 at the positive direction of x axis, and decreases from the positive direction of x to the positive direction of z. Attributes: tensor (torch.Tensor): Float matrix in shape (N, box_dim). box_dim (int): Integer indicating the dimension of a box Each row is (x, y, z, x_size, y_size, z_size, yaw, ...). with_yaw (bool): If True, the value of yaw will be set to 0 as axis-aligned boxes tightly enclosing the original boxes. """ YAW_AXIS = 1 def __init__(self, tensor, box_dim=7, with_yaw=True, origin=(0.5, 1.0, 0.5)): if isinstance(tensor, torch.Tensor): device = tensor.device else: device = torch.device('cpu') tensor = torch.as_tensor(tensor, dtype=torch.float32, device=device) if tensor.numel() == 0: # Use reshape, so we don't end up creating a new tensor that # does not depend on the inputs (and consequently confuses jit) tensor = tensor.reshape((0, box_dim)).to( dtype=torch.float32, device=device) assert tensor.dim() == 2 and tensor.size(-1) == box_dim, tensor.size() if tensor.shape[-1] == 6: # If the dimension of boxes is 6, we expand box_dim by padding # 0 as a fake yaw and set with_yaw to False. assert box_dim == 6 fake_rot = tensor.new_zeros(tensor.shape[0], 1) tensor = torch.cat((tensor, fake_rot), dim=-1) self.box_dim = box_dim + 1 self.with_yaw = False else: self.box_dim = box_dim self.with_yaw = with_yaw self.tensor = tensor.clone() if origin != (0.5, 1.0, 0.5): dst = self.tensor.new_tensor((0.5, 1.0, 0.5)) src = self.tensor.new_tensor(origin) self.tensor[:, :3] += self.tensor[:, 3:6] * (dst - src) @property def height(self): """torch.Tensor: A vector with height of each box in shape (N, ).""" return self.tensor[:, 4] @property def top_height(self): """torch.Tensor: A vector with the top height of each box in shape (N, ).""" # the positive direction is down rather than up return self.bottom_height - self.height @property def bottom_height(self): """torch.Tensor: A vector with bottom's height of each box in shape (N, ).""" return self.tensor[:, 1] @property def local_yaw(self): """torch.Tensor: A vector with local yaw of each box in shape (N, ). local_yaw equals to alpha in kitti, which is commonly used in monocular 3D object detection task, so only :obj:`CameraInstance3DBoxes` has the property. """ yaw = self.yaw loc = self.gravity_center local_yaw = yaw2local(yaw, loc) return local_yaw @property def gravity_center(self): """torch.Tensor: A tensor with center of each box in shape (N, 3).""" bottom_center = self.bottom_center gravity_center = torch.zeros_like(bottom_center) gravity_center[:, [0, 2]] = bottom_center[:, [0, 2]] gravity_center[:, 1] = bottom_center[:, 1] - self.tensor[:, 4] * 0.5 return gravity_center @property def corners(self): """torch.Tensor: Coordinates of corners of all the boxes in shape (N, 8, 3). Convert the boxes to in clockwise order, in the form of (x0y0z0, x0y0z1, x0y1z1, x0y1z0, x1y0z0, x1y0z1, x1y1z1, x1y1z0) .. code-block:: none front z / / (x0, y0, z1) + ----------- + (x1, y0, z1) /| / | / | / | (x0, y0, z0) + ----------- + + (x1, y1, z1) | / . 
| / | / origin | / (x0, y1, z0) + ----------- + -------> x right | (x1, y1, z0) | v down y """ if self.tensor.numel() == 0: return torch.empty([0, 8, 3], device=self.tensor.device) dims = self.dims corners_norm = torch.from_numpy( np.stack(np.unravel_index(np.arange(8), [2] * 3), axis=1)).to( device=dims.device, dtype=dims.dtype) corners_norm = corners_norm[[0, 1, 3, 2, 4, 5, 7, 6]] # use relative origin [0.5, 1, 0.5] corners_norm = corners_norm - dims.new_tensor([0.5, 1, 0.5]) corners = dims.view([-1, 1, 3]) * corners_norm.reshape([1, 8, 3]) corners = rotation_3d_in_axis( corners, self.tensor[:, 6], axis=self.YAW_AXIS) corners += self.tensor[:, :3].view(-1, 1, 3) return corners @property def bev(self): """torch.Tensor: 2D BEV box of each box with rotation in XYWHR format, in shape (N, 5).""" bev = self.tensor[:, [0, 2, 3, 5, 6]].clone() # positive direction of the gravity axis # in cam coord system points to the earth # so the bev yaw angle needs to be reversed bev[:, -1] = -bev[:, -1] return bev def rotate(self, angle, points=None): """Rotate boxes with points (optional) with the given angle or rotation matrix. Args: angle (float | torch.Tensor | np.ndarray): Rotation angle or rotation matrix. points (torch.Tensor | np.ndarray | :obj:`BasePoints`, optional): Points to rotate. Defaults to None. Returns: tuple or None: When ``points`` is None, the function returns None, otherwise it returns the rotated points and the rotation matrix ``rot_mat_T``. """ if not isinstance(angle, torch.Tensor): angle = self.tensor.new_tensor(angle) assert angle.shape == torch.Size([3, 3]) or angle.numel() == 1, \ f'invalid rotation angle shape {angle.shape}' if angle.numel() == 1: self.tensor[:, 0:3], rot_mat_T = rotation_3d_in_axis( self.tensor[:, 0:3], angle, axis=self.YAW_AXIS, return_mat=True) else: rot_mat_T = angle rot_sin = rot_mat_T[2, 0] rot_cos = rot_mat_T[0, 0] angle = np.arctan2(rot_sin, rot_cos) self.tensor[:, 0:3] = self.tensor[:, 0:3] @ rot_mat_T self.tensor[:, 6] += angle if points is not None: if isinstance(points, torch.Tensor): points[:, :3] = points[:, :3] @ rot_mat_T elif isinstance(points, np.ndarray): rot_mat_T = rot_mat_T.cpu().numpy() points[:, :3] = np.dot(points[:, :3], rot_mat_T) elif isinstance(points, BasePoints): points.rotate(rot_mat_T) else: raise ValueError return points, rot_mat_T def flip(self, bev_direction='horizontal', points=None): """Flip the boxes in BEV along given BEV direction. In CAM coordinates, it flips the x (horizontal) or z (vertical) axis. Args: bev_direction (str): Flip direction (horizontal or vertical). points (torch.Tensor | np.ndarray | :obj:`BasePoints`, optional): Points to flip. Defaults to None. Returns: torch.Tensor, numpy.ndarray or None: Flipped points. """ assert bev_direction in ('horizontal', 'vertical') if bev_direction == 'horizontal': self.tensor[:, 0::7] = -self.tensor[:, 0::7] if self.with_yaw: self.tensor[:, 6] = -self.tensor[:, 6] + np.pi elif bev_direction == 'vertical': self.tensor[:, 2::7] = -self.tensor[:, 2::7] if self.with_yaw: self.tensor[:, 6] = -self.tensor[:, 6] if points is not None: assert isinstance(points, (torch.Tensor, np.ndarray, BasePoints)) if isinstance(points, (torch.Tensor, np.ndarray)): if bev_direction == 'horizontal': points[:, 0] = -points[:, 0] elif bev_direction == 'vertical': points[:, 2] = -points[:, 2] elif isinstance(points, BasePoints): points.flip(bev_direction) return points @classmethod def height_overlaps(cls, boxes1, boxes2, mode='iou'): """Calculate height overlaps of two boxes. 
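Note that in camera coordinates the gravity axis (y) points downwards, so a
box's top height is numerically smaller than its bottom height and the
min/max comparison here is the reverse of the one in
:class:`BaseInstance3DBoxes`.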
This function calculates the height overlaps between ``boxes1`` and ``boxes2``, where ``boxes1`` and ``boxes2`` should be in the same type. Args: boxes1 (:obj:`CameraInstance3DBoxes`): Boxes 1 contain N boxes. boxes2 (:obj:`CameraInstance3DBoxes`): Boxes 2 contain M boxes. mode (str, optional): Mode of iou calculation. Defaults to 'iou'. Returns: torch.Tensor: Calculated iou of boxes' heights. """ assert isinstance(boxes1, CameraInstance3DBoxes) assert isinstance(boxes2, CameraInstance3DBoxes) boxes1_top_height = boxes1.top_height.view(-1, 1) boxes1_bottom_height = boxes1.bottom_height.view(-1, 1) boxes2_top_height = boxes2.top_height.view(1, -1) boxes2_bottom_height = boxes2.bottom_height.view(1, -1) # positive direction of the gravity axis # in cam coord system points to the earth heighest_of_bottom = torch.min(boxes1_bottom_height, boxes2_bottom_height) lowest_of_top = torch.max(boxes1_top_height, boxes2_top_height) overlaps_h = torch.clamp(heighest_of_bottom - lowest_of_top, min=0) return overlaps_h def convert_to(self, dst, rt_mat=None): """Convert self to ``dst`` mode. Args: dst (:obj:`Box3DMode`): The target Box mode. rt_mat (np.ndarray | torch.Tensor, optional): The rotation and translation matrix between different coordinates. Defaults to None. The conversion from ``src`` coordinates to ``dst`` coordinates usually comes along the change of sensors, e.g., from camera to LiDAR. This requires a transformation matrix. Returns: :obj:`BaseInstance3DBoxes`: The converted box of the same type in the ``dst`` mode. """ from .box_3d_mode import Box3DMode return Box3DMode.convert( box=self, src=Box3DMode.CAM, dst=dst, rt_mat=rt_mat) def points_in_boxes_part(self, points, boxes_override=None): """Find the box in which each point is. Args: points (torch.Tensor): Points in shape (1, M, 3) or (M, 3), 3 dimensions are (x, y, z) in LiDAR or depth coordinate. boxes_override (torch.Tensor, optional): Boxes to override `self.tensor `. Defaults to None. Returns: torch.Tensor: The index of the box in which each point is, in shape (M, ). Default value is -1 (if the point is not enclosed by any box). """ from .coord_3d_mode import Coord3DMode points_lidar = Coord3DMode.convert(points, Coord3DMode.CAM, Coord3DMode.LIDAR) if boxes_override is not None: boxes_lidar = boxes_override else: boxes_lidar = Coord3DMode.convert(self.tensor, Coord3DMode.CAM, Coord3DMode.LIDAR) box_idx = super().points_in_boxes_part(points_lidar, boxes_lidar) return box_idx def points_in_boxes_all(self, points, boxes_override=None): """Find all boxes in which each point is. Args: points (torch.Tensor): Points in shape (1, M, 3) or (M, 3), 3 dimensions are (x, y, z) in LiDAR or depth coordinate. boxes_override (torch.Tensor, optional): Boxes to override `self.tensor `. Defaults to None. Returns: torch.Tensor: The index of all boxes in which each point is, in shape (B, M, T). """ from .coord_3d_mode import Coord3DMode points_lidar = Coord3DMode.convert(points, Coord3DMode.CAM, Coord3DMode.LIDAR) if boxes_override is not None: boxes_lidar = boxes_override else: boxes_lidar = Coord3DMode.convert(self.tensor, Coord3DMode.CAM, Coord3DMode.LIDAR) box_idx = super().points_in_boxes_all(points_lidar, boxes_lidar) return box_idx ================================================ FILE: mmdet3d/core/bbox/structures/coord_3d_mode.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. 
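# --- Editor's usage sketch (not part of the original file) ------------------
# A minimal, hedged example of constructing and querying the
# CameraInstance3DBoxes class shown above. It assumes mmdet3d is installed
# and that the class is importable from the cam_box3d module listed in this
# dump; the tensor values are illustrative only.
def _camera_box_usage_sketch():
    import torch
    from mmdet3d.core.bbox.structures.cam_box3d import CameraInstance3DBoxes

    # One box as (x, y, z, x_size, y_size, z_size, yaw) in camera
    # coordinates; y points down, so the bottom center sits at y = 1.5.
    boxes = CameraInstance3DBoxes(
        torch.tensor([[0.0, 1.5, 10.0, 1.8, 1.5, 4.0, 0.3]]))

    print(boxes.gravity_center)  # (N, 3): bottom center shifted up by half the height (y decreases)
    print(boxes.corners.shape)   # (N, 8, 3): corners rotated about the y axis (YAW_AXIS = 1)
    print(boxes.bev)             # (N, 5): (x, z, x_size, z_size, yaw) with the yaw sign flipped
    print(boxes.local_yaw)       # (N,): alpha, i.e. yaw relative to the viewing ray
# ----------------------------------------------------------------------------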
from enum import IntEnum, unique import numpy as np import torch from ...points import BasePoints, CameraPoints, DepthPoints, LiDARPoints from .base_box3d import BaseInstance3DBoxes from .box_3d_mode import Box3DMode @unique class Coord3DMode(IntEnum): r"""Enum of different ways to represent a box and point cloud. Coordinates in LiDAR: .. code-block:: none up z ^ x front | / | / left y <------ 0 The relative coordinate of bottom center in a LiDAR box is (0.5, 0.5, 0), and the yaw is around the z axis, thus the rotation axis=2. Coordinates in camera: .. code-block:: none z front / / 0 ------> x right | | v down y The relative coordinate of bottom center in a CAM box is [0.5, 1.0, 0.5], and the yaw is around the y axis, thus the rotation axis=1. Coordinates in Depth mode: .. code-block:: none up z ^ y front | / | / 0 ------> x right The relative coordinate of bottom center in a DEPTH box is (0.5, 0.5, 0), and the yaw is around the z axis, thus the rotation axis=2. """ LIDAR = 0 CAM = 1 DEPTH = 2 @staticmethod def convert(input, src, dst, rt_mat=None, with_yaw=True, is_point=True): """Convert boxes or points from `src` mode to `dst` mode. Args: input (tuple | list | np.ndarray | torch.Tensor | :obj:`BaseInstance3DBoxes` | :obj:`BasePoints`): Can be a k-tuple, k-list or an Nxk array/tensor, where k = 7. src (:obj:`Box3DMode` | :obj:`Coord3DMode`): The source mode. dst (:obj:`Box3DMode` | :obj:`Coord3DMode`): The target mode. rt_mat (np.ndarray | torch.Tensor, optional): The rotation and translation matrix between different coordinates. Defaults to None. The conversion from `src` coordinates to `dst` coordinates usually comes along the change of sensors, e.g., from camera to LiDAR. This requires a transformation matrix. with_yaw (bool): If `box` is an instance of :obj:`BaseInstance3DBoxes`, whether or not it has a yaw angle. Defaults to True. is_point (bool): If `input` is neither an instance of :obj:`BaseInstance3DBoxes` nor an instance of :obj:`BasePoints`, whether or not it is point data. Defaults to True. Returns: (tuple | list | np.ndarray | torch.Tensor | :obj:`BaseInstance3DBoxes` | :obj:`BasePoints`): The converted box of the same type. """ if isinstance(input, BaseInstance3DBoxes): return Coord3DMode.convert_box( input, src, dst, rt_mat=rt_mat, with_yaw=with_yaw) elif isinstance(input, BasePoints): return Coord3DMode.convert_point(input, src, dst, rt_mat=rt_mat) elif isinstance(input, (tuple, list, np.ndarray, torch.Tensor)): if is_point: return Coord3DMode.convert_point( input, src, dst, rt_mat=rt_mat) else: return Coord3DMode.convert_box( input, src, dst, rt_mat=rt_mat, with_yaw=with_yaw) else: raise NotImplementedError @staticmethod def convert_box(box, src, dst, rt_mat=None, with_yaw=True): """Convert boxes from `src` mode to `dst` mode. Args: box (tuple | list | np.ndarray | torch.Tensor | :obj:`BaseInstance3DBoxes`): Can be a k-tuple, k-list or an Nxk array/tensor, where k = 7. src (:obj:`Box3DMode`): The src Box mode. dst (:obj:`Box3DMode`): The target Box mode. rt_mat (np.ndarray | torch.Tensor, optional): The rotation and translation matrix between different coordinates. Defaults to None. The conversion from `src` coordinates to `dst` coordinates usually comes along the change of sensors, e.g., from camera to LiDAR. This requires a transformation matrix. with_yaw (bool): If `box` is an instance of :obj:`BaseInstance3DBoxes`, whether or not it has a yaw angle. Defaults to True. 
Returns: (tuple | list | np.ndarray | torch.Tensor | :obj:`BaseInstance3DBoxes`): The converted box of the same type. """ return Box3DMode.convert(box, src, dst, rt_mat=rt_mat) @staticmethod def convert_point(point, src, dst, rt_mat=None): """Convert points from `src` mode to `dst` mode. Args: point (tuple | list | np.ndarray | torch.Tensor | :obj:`BasePoints`): Can be a k-tuple, k-list or an Nxk array/tensor. src (:obj:`CoordMode`): The src Point mode. dst (:obj:`CoordMode`): The target Point mode. rt_mat (np.ndarray | torch.Tensor, optional): The rotation and translation matrix between different coordinates. Defaults to None. The conversion from `src` coordinates to `dst` coordinates usually comes along the change of sensors, e.g., from camera to LiDAR. This requires a transformation matrix. Returns: (tuple | list | np.ndarray | torch.Tensor | :obj:`BasePoints`): The converted point of the same type. """ if src == dst: return point is_numpy = isinstance(point, np.ndarray) is_InstancePoints = isinstance(point, BasePoints) single_point = isinstance(point, (list, tuple)) if single_point: assert len(point) >= 3, ( 'CoordMode.convert takes either a k-tuple/list or ' 'an Nxk array/tensor, where k >= 3') arr = torch.tensor(point)[None, :] else: # avoid modifying the input point if is_numpy: arr = torch.from_numpy(np.asarray(point)).clone() elif is_InstancePoints: arr = point.tensor.clone() else: arr = point.clone() # convert point from `src` mode to `dst` mode. if src == Coord3DMode.LIDAR and dst == Coord3DMode.CAM: if rt_mat is None: rt_mat = arr.new_tensor([[0, -1, 0], [0, 0, -1], [1, 0, 0]]) elif src == Coord3DMode.CAM and dst == Coord3DMode.LIDAR: if rt_mat is None: rt_mat = arr.new_tensor([[0, 0, 1], [-1, 0, 0], [0, -1, 0]]) elif src == Coord3DMode.DEPTH and dst == Coord3DMode.CAM: if rt_mat is None: rt_mat = arr.new_tensor([[1, 0, 0], [0, 0, -1], [0, 1, 0]]) elif src == Coord3DMode.CAM and dst == Coord3DMode.DEPTH: if rt_mat is None: rt_mat = arr.new_tensor([[1, 0, 0], [0, 0, 1], [0, -1, 0]]) elif src == Coord3DMode.LIDAR and dst == Coord3DMode.DEPTH: if rt_mat is None: rt_mat = arr.new_tensor([[0, -1, 0], [1, 0, 0], [0, 0, 1]]) elif src == Coord3DMode.DEPTH and dst == Coord3DMode.LIDAR: if rt_mat is None: rt_mat = arr.new_tensor([[0, 1, 0], [-1, 0, 0], [0, 0, 1]]) else: raise NotImplementedError( f'Conversion from Coord3DMode {src} to {dst} ' 'is not supported yet') if not isinstance(rt_mat, torch.Tensor): rt_mat = arr.new_tensor(rt_mat) if rt_mat.size(1) == 4: extended_xyz = torch.cat( [arr[..., :3], arr.new_ones(arr.size(0), 1)], dim=-1) xyz = extended_xyz @ rt_mat.t() else: xyz = arr[..., :3] @ rt_mat.t() remains = arr[..., 3:] arr = torch.cat([xyz[..., :3], remains], dim=-1) # convert arr to the original type original_type = type(point) if single_point: return original_type(arr.flatten().tolist()) if is_numpy: return arr.numpy() elif is_InstancePoints: if dst == Coord3DMode.CAM: target_type = CameraPoints elif dst == Coord3DMode.LIDAR: target_type = LiDARPoints elif dst == Coord3DMode.DEPTH: target_type = DepthPoints else: raise NotImplementedError( f'Conversion to {dst} through {original_type}' ' is not supported yet') return target_type( arr, points_dim=arr.size(-1), attribute_dims=point.attribute_dims) else: return arr ================================================ FILE: mmdet3d/core/bbox/structures/custom_box.py ================================================ # nuScenes dev-kit. # Code written by Oscar Beijbom, 2018. 
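# --- Editor's usage sketch (not part of the original file) ------------------
# A short, hedged example of the Coord3DMode point conversion defined above:
# a point given in camera coordinates (x right, y down, z front) is mapped to
# LiDAR coordinates (x front, y left, z up) using the default rt_mat. Assumes
# mmdet3d is importable; the coordinates are illustrative only.
def _coord_mode_usage_sketch():
    import torch
    from mmdet3d.core.bbox.structures.coord_3d_mode import Coord3DMode

    # One point 10 m in front of the camera, 2 m to the right, 1 m below it.
    pts_cam = torch.tensor([[2.0, 1.0, 10.0]])
    pts_lidar = Coord3DMode.convert_point(pts_cam, Coord3DMode.CAM,
                                          Coord3DMode.LIDAR)
    # With the default rotation this yields (z, -x, -y) = (10, -2, -1):
    # 10 m ahead, 2 m to the right (negative "left" axis), 1 m below ground.
    print(pts_lidar)
# ----------------------------------------------------------------------------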
import copy import os.path as osp import struct from abc import ABC, abstractmethod from functools import reduce from typing import Tuple, List, Dict import cv2 import numpy as np from matplotlib.axes import Axes from pyquaternion import Quaternion from nuscenes.lidarseg.lidarseg_utils import colormap_to_colors, create_lidarseg_legend from nuscenes.utils.data_io import load_bin_file from nuscenes.utils.geometry_utils import view_points, transform_matrix from nuscenes.utils.data_classes import Box class CustomBox(Box): """ Simple data class representing a 3d box including, label, score and velocity. """ def __init__(self, center: List[float], size: List[float], orientation: Quaternion, label: int = np.nan, score: float = np.nan, velocity: Tuple = (np.nan, np.nan, np.nan), name: str = None, token: str = None): """ :param center: Center of box given as x, y, z. :param size: Size of box in width, length, height. :param orientation: Box orientation. :param label: Integer label, optional. :param score: Classification score, optional. :param velocity: Box velocity in x, y, z direction. :param name: Box name, optional. Can be used e.g. for denote category name. :param token: Unique string identifier from DB. """ # assert not np.any(np.isnan(center)) # assert not np.any(np.isnan(size)) # assert len(center) == 3 # assert len(size) == 3 # assert type(orientation) == Quaternion # self.center = np.array(center) # self.wlh = np.array(size) # self.orientation = orientation # self.label = int(label) if not np.isnan(label) else label # self.score = float(score) if not np.isnan(score) else score # self.velocity = np.array(velocity) # self.name = name # self.token = token super().__init__( center=center, size=size, orientation=orientation, label=label, score = score, velocity = velocity, name = name, token = token ) def render(self, axis: Axes, view: np.ndarray = np.eye(3), normalize: bool = False, colors: Tuple = ('b', 'r', 'k'), linewidth: float = 2) -> None: """ Renders the box in the provided Matplotlib axis. :param axis: Axis onto which the box should be drawn. :param view: . Define a projection in needed (e.g. for drawing projection in an image). :param normalize: Whether to normalize the remaining coordinate. :param colors: (: 3). Valid Matplotlib colors ( or normalized RGB tuple) for front, back and sides. :param linewidth: Width in pixel of the box sides. 
""" corners = view_points(self.corners(), view, normalize=normalize)[:2, :] def draw_rect(selected_corners, color): prev = selected_corners[-1] for corner in selected_corners: axis.plot([prev[0], corner[0]], [prev[1], corner[1]], color=color, linewidth=linewidth) prev = corner # Draw the sides for i in range(4): axis.plot([corners.T[i][0], corners.T[i + 4][0]], [corners.T[i][1], corners.T[i + 4][1]], color=colors[2], linewidth=linewidth) # Draw front (first 4 corners) and rear (last 4 corners) rectangles(3d)/lines(2d) draw_rect(corners.T[:4], colors[0]) draw_rect(corners.T[4:], colors[1]) # Draw line indicating the front center_bottom_forward = np.mean(corners.T[2:4], axis=0) center_bottom = np.mean(corners.T[[2, 3, 7, 6]], axis=0) axis.plot([center_bottom[0], center_bottom_forward[0]], [center_bottom[1], center_bottom_forward[1]], color=colors[0], linewidth=linewidth) # from IPython import embed # embed() # exit() # # In [1]: corners.T # Out[1]: # array([[135.10084664, 217.64073984], # [145.04250652, 217.12554824], # [145.04250652, 217.12554824], # [135.10084664, 217.64073984], # [134.95749348, 214.87445176], # [144.89915336, 214.35926016], # [144.89915336, 214.35926016], # [134.95749348, 214.87445176]]) x_coords, y_coords = zip(*corners.T[[0,1,6,7]]) # axis.fill(x_coords, y_coords, colors[0], alpha=0.8) ================================================ FILE: mmdet3d/core/bbox/structures/depth_box3d.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import numpy as np import torch from mmdet3d.core.points import BasePoints from .base_box3d import BaseInstance3DBoxes from .utils import rotation_3d_in_axis class DepthInstance3DBoxes(BaseInstance3DBoxes): """3D boxes of instances in Depth coordinates. Coordinates in Depth: .. code-block:: none up z y front (yaw=-0.5*pi) ^ ^ | / | / 0 ------> x right (yaw=0) The relative coordinate of bottom center in a Depth box is (0.5, 0.5, 0), and the yaw is around the z axis, thus the rotation axis=2. The yaw is 0 at the positive direction of x axis, and decreases from the positive direction of x to the positive direction of y. Also note that rotation of DepthInstance3DBoxes is counterclockwise, which is reverse to the definition of the yaw angle (clockwise). A refactor is ongoing to make the three coordinate systems easier to understand and convert between each other. Attributes: tensor (torch.Tensor): Float matrix of N x box_dim. box_dim (int): Integer indicates the dimension of a box Each row is (x, y, z, x_size, y_size, z_size, yaw, ...). with_yaw (bool): If True, the value of yaw will be set to 0 as minmax boxes. """ YAW_AXIS = 2 @property def gravity_center(self): """torch.Tensor: A tensor with center of each box in shape (N, 3).""" bottom_center = self.bottom_center gravity_center = torch.zeros_like(bottom_center) gravity_center[:, :2] = bottom_center[:, :2] gravity_center[:, 2] = bottom_center[:, 2] + self.tensor[:, 5] * 0.5 return gravity_center @property def corners(self): """torch.Tensor: Coordinates of corners of all the boxes in shape (N, 8, 3). Convert the boxes to corners in clockwise order, in form of ``(x0y0z0, x0y0z1, x0y1z1, x0y1z0, x1y0z0, x1y0z1, x1y1z1, x1y1z0)`` .. code-block:: none up z front y ^ / | / | (x0, y1, z1) + ----------- + (x1, y1, z1) /| / | / | / | (x0, y0, z1) + ----------- + + (x1, y1, z0) | / . 
| / | / origin | / (x0, y0, z0) + ----------- + --------> right x (x1, y0, z0) """ if self.tensor.numel() == 0: return torch.empty([0, 8, 3], device=self.tensor.device) dims = self.dims corners_norm = torch.from_numpy( np.stack(np.unravel_index(np.arange(8), [2] * 3), axis=1)).to( device=dims.device, dtype=dims.dtype) corners_norm = corners_norm[[0, 1, 3, 2, 4, 5, 7, 6]] # use relative origin (0.5, 0.5, 0) corners_norm = corners_norm - dims.new_tensor([0.5, 0.5, 0]) corners = dims.view([-1, 1, 3]) * corners_norm.reshape([1, 8, 3]) # rotate around z axis corners = rotation_3d_in_axis( corners, self.tensor[:, 6], axis=self.YAW_AXIS) corners += self.tensor[:, :3].view(-1, 1, 3) return corners def rotate(self, angle, points=None): """Rotate boxes with points (optional) with the given angle or rotation matrix. Args: angle (float | torch.Tensor | np.ndarray): Rotation angle or rotation matrix. points (torch.Tensor | np.ndarray | :obj:`BasePoints`, optional): Points to rotate. Defaults to None. Returns: tuple or None: When ``points`` is None, the function returns None, otherwise it returns the rotated points and the rotation matrix ``rot_mat_T``. """ if not isinstance(angle, torch.Tensor): angle = self.tensor.new_tensor(angle) assert angle.shape == torch.Size([3, 3]) or angle.numel() == 1, \ f'invalid rotation angle shape {angle.shape}' if angle.numel() == 1: self.tensor[:, 0:3], rot_mat_T = rotation_3d_in_axis( self.tensor[:, 0:3], angle, axis=self.YAW_AXIS, return_mat=True) else: rot_mat_T = angle rot_sin = rot_mat_T[0, 1] rot_cos = rot_mat_T[0, 0] angle = np.arctan2(rot_sin, rot_cos) self.tensor[:, 0:3] = self.tensor[:, 0:3] @ rot_mat_T if self.with_yaw: self.tensor[:, 6] += angle else: # for axis-aligned boxes, we take the new # enclosing axis-aligned boxes after rotation corners_rot = self.corners @ rot_mat_T new_x_size = corners_rot[..., 0].max( dim=1, keepdim=True)[0] - corners_rot[..., 0].min( dim=1, keepdim=True)[0] new_y_size = corners_rot[..., 1].max( dim=1, keepdim=True)[0] - corners_rot[..., 1].min( dim=1, keepdim=True)[0] self.tensor[:, 3:5] = torch.cat((new_x_size, new_y_size), dim=-1) if points is not None: if isinstance(points, torch.Tensor): points[:, :3] = points[:, :3] @ rot_mat_T elif isinstance(points, np.ndarray): rot_mat_T = rot_mat_T.cpu().numpy() points[:, :3] = np.dot(points[:, :3], rot_mat_T) elif isinstance(points, BasePoints): points.rotate(rot_mat_T) else: raise ValueError return points, rot_mat_T def flip(self, bev_direction='horizontal', points=None): """Flip the boxes in BEV along given BEV direction. In Depth coordinates, it flips x (horizontal) or y (vertical) axis. Args: bev_direction (str, optional): Flip direction (horizontal or vertical). Defaults to 'horizontal'. points (torch.Tensor | np.ndarray | :obj:`BasePoints`, optional): Points to flip. Defaults to None. Returns: torch.Tensor, numpy.ndarray or None: Flipped points. 
""" assert bev_direction in ('horizontal', 'vertical') if bev_direction == 'horizontal': self.tensor[:, 0::7] = -self.tensor[:, 0::7] if self.with_yaw: self.tensor[:, 6] = -self.tensor[:, 6] + np.pi elif bev_direction == 'vertical': self.tensor[:, 1::7] = -self.tensor[:, 1::7] if self.with_yaw: self.tensor[:, 6] = -self.tensor[:, 6] if points is not None: assert isinstance(points, (torch.Tensor, np.ndarray, BasePoints)) if isinstance(points, (torch.Tensor, np.ndarray)): if bev_direction == 'horizontal': points[:, 0] = -points[:, 0] elif bev_direction == 'vertical': points[:, 1] = -points[:, 1] elif isinstance(points, BasePoints): points.flip(bev_direction) return points def convert_to(self, dst, rt_mat=None): """Convert self to ``dst`` mode. Args: dst (:obj:`Box3DMode`): The target Box mode. rt_mat (np.ndarray | torch.Tensor, optional): The rotation and translation matrix between different coordinates. Defaults to None. The conversion from ``src`` coordinates to ``dst`` coordinates usually comes along the change of sensors, e.g., from camera to LiDAR. This requires a transformation matrix. Returns: :obj:`DepthInstance3DBoxes`: The converted box of the same type in the ``dst`` mode. """ from .box_3d_mode import Box3DMode return Box3DMode.convert( box=self, src=Box3DMode.DEPTH, dst=dst, rt_mat=rt_mat) def enlarged_box(self, extra_width): """Enlarge the length, width and height boxes. Args: extra_width (float | torch.Tensor): Extra width to enlarge the box. Returns: :obj:`DepthInstance3DBoxes`: Enlarged boxes. """ enlarged_boxes = self.tensor.clone() enlarged_boxes[:, 3:6] += extra_width * 2 # bottom center z minus extra_width enlarged_boxes[:, 2] -= extra_width return self.new_box(enlarged_boxes) def get_surface_line_center(self): """Compute surface and line center of bounding boxes. Returns: torch.Tensor: Surface and line center of bounding boxes. """ obj_size = self.dims center = self.gravity_center.view(-1, 1, 3) batch_size = center.shape[0] rot_sin = torch.sin(-self.yaw) rot_cos = torch.cos(-self.yaw) rot_mat_T = self.yaw.new_zeros(tuple(list(self.yaw.shape) + [3, 3])) rot_mat_T[..., 0, 0] = rot_cos rot_mat_T[..., 0, 1] = -rot_sin rot_mat_T[..., 1, 0] = rot_sin rot_mat_T[..., 1, 1] = rot_cos rot_mat_T[..., 2, 2] = 1 # Get the object surface center offset = obj_size.new_tensor([[0, 0, 1], [0, 0, -1], [0, 1, 0], [0, -1, 0], [1, 0, 0], [-1, 0, 0]]) offset = offset.view(1, 6, 3) / 2 surface_3d = (offset * obj_size.view(batch_size, 1, 3).repeat(1, 6, 1)).reshape( -1, 3) # Get the object line center offset = obj_size.new_tensor([[1, 0, 1], [-1, 0, 1], [0, 1, 1], [0, -1, 1], [1, 0, -1], [-1, 0, -1], [0, 1, -1], [0, -1, -1], [1, 1, 0], [1, -1, 0], [-1, 1, 0], [-1, -1, 0]]) offset = offset.view(1, 12, 3) / 2 line_3d = (offset * obj_size.view(batch_size, 1, 3).repeat(1, 12, 1)).reshape( -1, 3) surface_rot = rot_mat_T.repeat(6, 1, 1) surface_3d = torch.matmul(surface_3d.unsqueeze(-2), surface_rot).squeeze(-2) surface_center = center.repeat(1, 6, 1).reshape(-1, 3) + surface_3d line_rot = rot_mat_T.repeat(12, 1, 1) line_3d = torch.matmul(line_3d.unsqueeze(-2), line_rot).squeeze(-2) line_center = center.repeat(1, 12, 1).reshape(-1, 3) + line_3d return surface_center, line_center ================================================ FILE: mmdet3d/core/bbox/structures/lidar_box3d.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. 
import numpy as np import torch from mmdet3d.core.points import BasePoints from .base_box3d import BaseInstance3DBoxes from .utils import rotation_3d_in_axis class LiDARInstance3DBoxes(BaseInstance3DBoxes): """3D boxes of instances in LIDAR coordinates. Coordinates in LiDAR: .. code-block:: none up z x front (yaw=0) ^ ^ | / | / (yaw=0.5*pi) left y <------ 0 The relative coordinate of bottom center in a LiDAR box is (0.5, 0.5, 0), and the yaw is around the z axis, thus the rotation axis=2. The yaw is 0 at the positive direction of x axis, and increases from the positive direction of x to the positive direction of y. A refactor is ongoing to make the three coordinate systems easier to understand and convert between each other. Attributes: tensor (torch.Tensor): Float matrix of N x box_dim. box_dim (int): Integer indicating the dimension of a box. Each row is (x, y, z, x_size, y_size, z_size, yaw, ...). with_yaw (bool): If True, the value of yaw will be set to 0 as minmax boxes. """ YAW_AXIS = 2 @property def gravity_center(self): """torch.Tensor: A tensor with center of each box in shape (N, 3).""" bottom_center = self.bottom_center gravity_center = torch.zeros_like(bottom_center) gravity_center[:, :2] = bottom_center[:, :2] gravity_center[:, 2] = bottom_center[:, 2] + self.tensor[:, 5] * 0.5 return gravity_center @property def corners(self): """torch.Tensor: Coordinates of corners of all the boxes in shape (N, 8, 3). Convert the boxes to corners in clockwise order, in form of ``(x0y0z0, x0y0z1, x0y1z1, x0y1z0, x1y0z0, x1y0z1, x1y1z1, x1y1z0)`` .. code-block:: none up z front x ^ / | / | (x1, y0, z1) + ----------- + (x1, y1, z1) /| / | / | / | (x0, y0, z1) + ----------- + + (x1, y1, z0) | / . | / | / origin | / left y<-------- + ----------- + (x0, y1, z0) (x0, y0, z0) """ if self.tensor.numel() == 0: return torch.empty([0, 8, 3], device=self.tensor.device) dims = self.dims corners_norm = torch.from_numpy( np.stack(np.unravel_index(np.arange(8), [2] * 3), axis=1)).to( device=dims.device, dtype=dims.dtype) corners_norm = corners_norm[[0, 1, 3, 2, 4, 5, 7, 6]] # use relative origin [0.5, 0.5, 0] corners_norm = corners_norm - dims.new_tensor([0.5, 0.5, 0]) corners = dims.view([-1, 1, 3]) * corners_norm.reshape([1, 8, 3]) # rotate around z axis corners = rotation_3d_in_axis( corners, self.tensor[:, 6], axis=self.YAW_AXIS) corners += self.tensor[:, :3].view(-1, 1, 3) return corners def rotate(self, angle, points=None): """Rotate boxes with points (optional) with the given angle or rotation matrix. Args: angles (float | torch.Tensor | np.ndarray): Rotation angle or rotation matrix. points (torch.Tensor | np.ndarray | :obj:`BasePoints`, optional): Points to rotate. Defaults to None. Returns: tuple or None: When ``points`` is None, the function returns None, otherwise it returns the rotated points and the rotation matrix ``rot_mat_T``. 
""" if not isinstance(angle, torch.Tensor): angle = self.tensor.new_tensor(angle) assert angle.shape == torch.Size([3, 3]) or angle.numel() == 1, \ f'invalid rotation angle shape {angle.shape}' if angle.numel() == 1: self.tensor[:, 0:3], rot_mat_T = rotation_3d_in_axis( self.tensor[:, 0:3], angle, axis=self.YAW_AXIS, return_mat=True) else: rot_mat_T = angle rot_sin = rot_mat_T[0, 1] rot_cos = rot_mat_T[0, 0] angle = np.arctan2(rot_sin, rot_cos) self.tensor[:, 0:3] = self.tensor[:, 0:3] @ rot_mat_T self.tensor[:, 6] += angle if self.tensor.shape[1] == 9: # rotate velo vector self.tensor[:, 7:9] = self.tensor[:, 7:9] @ rot_mat_T[:2, :2] if points is not None: if isinstance(points, torch.Tensor): points[:, :3] = points[:, :3] @ rot_mat_T elif isinstance(points, np.ndarray): rot_mat_T = rot_mat_T.cpu().numpy() points[:, :3] = np.dot(points[:, :3], rot_mat_T) elif isinstance(points, BasePoints): points.rotate(rot_mat_T) else: raise ValueError return points, rot_mat_T def flip(self, bev_direction='horizontal', points=None): """Flip the boxes in BEV along given BEV direction. In LIDAR coordinates, it flips the y (horizontal) or x (vertical) axis. Args: bev_direction (str): Flip direction (horizontal or vertical). points (torch.Tensor | np.ndarray | :obj:`BasePoints`, optional): Points to flip. Defaults to None. Returns: torch.Tensor, numpy.ndarray or None: Flipped points. """ assert bev_direction in ('horizontal', 'vertical') if bev_direction == 'horizontal': self.tensor[:, 1::7] = -self.tensor[:, 1::7] if self.with_yaw: self.tensor[:, 6] = -self.tensor[:, 6] elif bev_direction == 'vertical': self.tensor[:, 0::7] = -self.tensor[:, 0::7] if self.with_yaw: self.tensor[:, 6] = -self.tensor[:, 6] + np.pi if points is not None: assert isinstance(points, (torch.Tensor, np.ndarray, BasePoints)) if isinstance(points, (torch.Tensor, np.ndarray)): if bev_direction == 'horizontal': points[:, 1] = -points[:, 1] elif bev_direction == 'vertical': points[:, 0] = -points[:, 0] elif isinstance(points, BasePoints): points.flip(bev_direction) return points def convert_to(self, dst, rt_mat=None): """Convert self to ``dst`` mode. Args: dst (:obj:`Box3DMode`): the target Box mode rt_mat (np.ndarray | torch.Tensor, optional): The rotation and translation matrix between different coordinates. Defaults to None. The conversion from ``src`` coordinates to ``dst`` coordinates usually comes along the change of sensors, e.g., from camera to LiDAR. This requires a transformation matrix. Returns: :obj:`BaseInstance3DBoxes`: The converted box of the same type in the ``dst`` mode. """ from .box_3d_mode import Box3DMode return Box3DMode.convert( box=self, src=Box3DMode.LIDAR, dst=dst, rt_mat=rt_mat) def enlarged_box(self, extra_width): """Enlarge the length, width and height boxes. Args: extra_width (float | torch.Tensor): Extra width to enlarge the box. Returns: :obj:`LiDARInstance3DBoxes`: Enlarged boxes. """ enlarged_boxes = self.tensor.clone() enlarged_boxes[:, 3:6] += extra_width * 2 # bottom center z minus extra_width enlarged_boxes[:, 2] -= extra_width return self.new_box(enlarged_boxes) ================================================ FILE: mmdet3d/core/bbox/structures/utils.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. from logging import warning import numpy as np import torch from mmdet3d.core.utils import array_converter @array_converter(apply_to=('val', )) def limit_period(val, offset=0.5, period=np.pi): """Limit the value into a period for periodic function. 
Args: val (torch.Tensor | np.ndarray): The value to be converted. offset (float, optional): Offset to set the value range. Defaults to 0.5. period ([type], optional): Period of the value. Defaults to np.pi. Returns: (torch.Tensor | np.ndarray): Value in the range of [-offset * period, (1-offset) * period] """ limited_val = val - torch.floor(val / period + offset) * period return limited_val @array_converter(apply_to=('points', 'angles')) def rotation_3d_in_axis(points, angles, axis=0, return_mat=False, clockwise=False): """Rotate points by angles according to axis. Args: points (np.ndarray | torch.Tensor | list | tuple ): Points of shape (N, M, 3). angles (np.ndarray | torch.Tensor | list | tuple | float): Vector of angles in shape (N,) axis (int, optional): The axis to be rotated. Defaults to 0. return_mat: Whether or not return the rotation matrix (transposed). Defaults to False. clockwise: Whether the rotation is clockwise. Defaults to False. Raises: ValueError: when the axis is not in range [0, 1, 2], it will raise value error. Returns: (torch.Tensor | np.ndarray): Rotated points in shape (N, M, 3). """ batch_free = len(points.shape) == 2 if batch_free: points = points[None] if isinstance(angles, float) or len(angles.shape) == 0: angles = torch.full(points.shape[:1], angles) assert len(points.shape) == 3 and len(angles.shape) == 1 \ and points.shape[0] == angles.shape[0], f'Incorrect shape of points ' \ f'angles: {points.shape}, {angles.shape}' assert points.shape[-1] in [2, 3], \ f'Points size should be 2 or 3 instead of {points.shape[-1]}' rot_sin = torch.sin(angles) rot_cos = torch.cos(angles) ones = torch.ones_like(rot_cos) zeros = torch.zeros_like(rot_cos) if points.shape[-1] == 3: if axis == 1 or axis == -2: rot_mat_T = torch.stack([ torch.stack([rot_cos, zeros, -rot_sin]), torch.stack([zeros, ones, zeros]), torch.stack([rot_sin, zeros, rot_cos]) ]) elif axis == 2 or axis == -1: rot_mat_T = torch.stack([ torch.stack([rot_cos, rot_sin, zeros]), torch.stack([-rot_sin, rot_cos, zeros]), torch.stack([zeros, zeros, ones]) ]) elif axis == 0 or axis == -3: rot_mat_T = torch.stack([ torch.stack([ones, zeros, zeros]), torch.stack([zeros, rot_cos, rot_sin]), torch.stack([zeros, -rot_sin, rot_cos]) ]) else: raise ValueError(f'axis should in range ' f'[-3, -2, -1, 0, 1, 2], got {axis}') else: rot_mat_T = torch.stack([ torch.stack([rot_cos, rot_sin]), torch.stack([-rot_sin, rot_cos]) ]) if clockwise: rot_mat_T = rot_mat_T.transpose(0, 1) if points.shape[0] == 0: points_new = points else: points_new = torch.einsum('aij,jka->aik', points, rot_mat_T) if batch_free: points_new = points_new.squeeze(0) if return_mat: rot_mat_T = torch.einsum('jka->ajk', rot_mat_T) if batch_free: rot_mat_T = rot_mat_T.squeeze(0) return points_new, rot_mat_T else: return points_new @array_converter(apply_to=('boxes_xywhr', )) def xywhr2xyxyr(boxes_xywhr): """Convert a rotated boxes in XYWHR format to XYXYR format. Args: boxes_xywhr (torch.Tensor | np.ndarray): Rotated boxes in XYWHR format. Returns: (torch.Tensor | np.ndarray): Converted boxes in XYXYR format. """ boxes = torch.zeros_like(boxes_xywhr) half_w = boxes_xywhr[..., 2] / 2 half_h = boxes_xywhr[..., 3] / 2 boxes[..., 0] = boxes_xywhr[..., 0] - half_w boxes[..., 1] = boxes_xywhr[..., 1] - half_h boxes[..., 2] = boxes_xywhr[..., 0] + half_w boxes[..., 3] = boxes_xywhr[..., 1] + half_h boxes[..., 4] = boxes_xywhr[..., 4] return boxes def get_box_type(box_type): """Get the type and mode of box structure. Args: box_type (str): The type of box structure. 
The valid value are "LiDAR", "Camera", or "Depth". Raises: ValueError: A ValueError is raised when `box_type` does not belong to the three valid types. Returns: tuple: Box type and box mode. """ from .box_3d_mode import (Box3DMode, CameraInstance3DBoxes, DepthInstance3DBoxes, LiDARInstance3DBoxes) box_type_lower = box_type.lower() if box_type_lower == 'lidar': box_type_3d = LiDARInstance3DBoxes box_mode_3d = Box3DMode.LIDAR elif box_type_lower == 'camera': box_type_3d = CameraInstance3DBoxes box_mode_3d = Box3DMode.CAM elif box_type_lower == 'depth': box_type_3d = DepthInstance3DBoxes box_mode_3d = Box3DMode.DEPTH else: raise ValueError('Only "box_type" of "camera", "lidar", "depth"' f' are supported, got {box_type}') return box_type_3d, box_mode_3d @array_converter(apply_to=('points_3d', 'proj_mat')) def points_cam2img(points_3d, proj_mat, with_depth=False): """Project points in camera coordinates to image coordinates. Args: points_3d (torch.Tensor | np.ndarray): Points in shape (N, 3) proj_mat (torch.Tensor | np.ndarray): Transformation matrix between coordinates. with_depth (bool, optional): Whether to keep depth in the output. Defaults to False. Returns: (torch.Tensor | np.ndarray): Points in image coordinates, with shape [N, 2] if `with_depth=False`, else [N, 3]. """ points_shape = list(points_3d.shape) points_shape[-1] = 1 assert len(proj_mat.shape) == 2, 'The dimension of the projection'\ f' matrix should be 2 instead of {len(proj_mat.shape)}.' d1, d2 = proj_mat.shape[:2] assert (d1 == 3 and d2 == 3) or (d1 == 3 and d2 == 4) or ( d1 == 4 and d2 == 4), 'The shape of the projection matrix'\ f' ({d1}*{d2}) is not supported.' if d1 == 3: proj_mat_expanded = torch.eye( 4, device=proj_mat.device, dtype=proj_mat.dtype) proj_mat_expanded[:d1, :d2] = proj_mat proj_mat = proj_mat_expanded # previous implementation use new_zeros, new_one yields better results points_4 = torch.cat([points_3d, points_3d.new_ones(points_shape)], dim=-1) point_2d = points_4 @ proj_mat.T point_2d_res = point_2d[..., :2] / point_2d[..., 2:3] if with_depth: point_2d_res = torch.cat([point_2d_res, point_2d[..., 2:3]], dim=-1) return point_2d_res @array_converter(apply_to=('points', 'cam2img')) def points_img2cam(points, cam2img): """Project points in image coordinates to camera coordinates. Args: points (torch.Tensor): 2.5D points in 2D images, [N, 3], 3 corresponds with x, y in the image and depth. cam2img (torch.Tensor): Camera intrinsic matrix. The shape can be [3, 3], [3, 4] or [4, 4]. Returns: torch.Tensor: points in 3D space. [N, 3], 3 corresponds with x, y, z in 3D space. """ assert cam2img.shape[0] <= 4 assert cam2img.shape[1] <= 4 assert points.shape[1] == 3 xys = points[:, :2] depths = points[:, 2].view(-1, 1) unnormed_xys = torch.cat([xys * depths, depths], dim=1) pad_cam2img = torch.eye(4, dtype=xys.dtype, device=xys.device) pad_cam2img[:cam2img.shape[0], :cam2img.shape[1]] = cam2img inv_pad_cam2img = torch.inverse(pad_cam2img).transpose(0, 1) # Do operation in homogeneous coordinates. num_points = unnormed_xys.shape[0] homo_xys = torch.cat([unnormed_xys, xys.new_ones((num_points, 1))], dim=1) points3D = torch.mm(homo_xys, inv_pad_cam2img)[:, :3] return points3D def mono_cam_box2vis(cam_box): """This is a post-processing function on the bboxes from Mono-3D task. If we want to perform projection visualization, we need to: 1. rotate the box along x-axis for np.pi / 2 (roll) 2. change orientation from local yaw to global yaw 3. 
convert yaw by (np.pi / 2 - yaw) After applying this function, we can project and draw it on 2D images. Args: cam_box (:obj:`CameraInstance3DBoxes`): 3D bbox in camera coordinate system before conversion. Could be gt bbox loaded from dataset or network prediction output. Returns: :obj:`CameraInstance3DBoxes`: Box after conversion. """ warning.warn('DeprecationWarning: The hack of yaw and dimension in the ' 'monocular 3D detection on nuScenes has been removed. The ' 'function mono_cam_box2vis will be deprecated.') from . import CameraInstance3DBoxes assert isinstance(cam_box, CameraInstance3DBoxes), \ 'input bbox should be CameraInstance3DBoxes!' loc = cam_box.gravity_center dim = cam_box.dims yaw = cam_box.yaw feats = cam_box.tensor[:, 7:] # rotate along x-axis for np.pi / 2 # see also here: https://github.com/open-mmlab/mmdetection3d/blob/master/mmdet3d/datasets/nuscenes_mono_dataset.py#L557 # noqa dim[:, [1, 2]] = dim[:, [2, 1]] # change local yaw to global yaw for visualization # refer to https://github.com/open-mmlab/mmdetection3d/blob/master/mmdet3d/datasets/nuscenes_mono_dataset.py#L164-L166 # noqa yaw += torch.atan2(loc[:, 0], loc[:, 2]) # convert yaw by (-yaw - np.pi / 2) # this is because mono 3D box class such as `NuScenesBox` has different # definition of rotation with our `CameraInstance3DBoxes` yaw = -yaw - np.pi / 2 cam_box = torch.cat([loc, dim, yaw[:, None], feats], dim=1) cam_box = CameraInstance3DBoxes( cam_box, box_dim=cam_box.shape[-1], origin=(0.5, 0.5, 0.5)) return cam_box def get_proj_mat_by_coord_type(img_meta, coord_type): """Obtain image features using points. Args: img_meta (dict): Meta info. coord_type (str): 'DEPTH' or 'CAMERA' or 'LIDAR'. Can be case-insensitive. Returns: torch.Tensor: transformation matrix. """ coord_type = coord_type.upper() mapping = {'LIDAR': 'lidar2img', 'DEPTH': 'depth2img', 'CAMERA': 'cam2img'} assert coord_type in mapping.keys() return img_meta[mapping[coord_type]] def yaw2local(yaw, loc): """Transform global yaw to local yaw (alpha in kitti) in camera coordinates, ranges from -pi to pi. Args: yaw (torch.Tensor): A vector with local yaw of each box. shape: (N, ) loc (torch.Tensor): gravity center of each box. shape: (N, 3) Returns: torch.Tensor: local yaw (alpha in kitti). """ local_yaw = yaw - torch.atan2(loc[:, 0], loc[:, 2]) larger_idx = (local_yaw > np.pi).nonzero(as_tuple=False) small_idx = (local_yaw < -np.pi).nonzero(as_tuple=False) if len(larger_idx) != 0: local_yaw[larger_idx] -= 2 * np.pi if len(small_idx) != 0: local_yaw[small_idx] += 2 * np.pi return local_yaw ================================================ FILE: mmdet3d/core/bbox/transforms.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import torch def bbox3d_mapping_back(bboxes, scale_factor, flip_horizontal, flip_vertical): """Map bboxes from testing scale to original image scale. Args: bboxes (:obj:`BaseInstance3DBoxes`): Boxes to be mapped back. scale_factor (float): Scale factor. flip_horizontal (bool): Whether to flip horizontally. flip_vertical (bool): Whether to flip vertically. Returns: :obj:`BaseInstance3DBoxes`: Boxes mapped back. """ new_bboxes = bboxes.clone() if flip_horizontal: new_bboxes.flip('horizontal') if flip_vertical: new_bboxes.flip('vertical') new_bboxes.scale(1 / scale_factor) return new_bboxes def bbox3d2roi(bbox_list): """Convert a list of bounding boxes to roi format. Args: bbox_list (list[torch.Tensor]): A list of bounding boxes corresponding to a batch of images. 
Returns: torch.Tensor: Region of interests in shape (n, c), where the channels are in order of [batch_ind, x, y ...]. """ rois_list = [] for img_id, bboxes in enumerate(bbox_list): if bboxes.size(0) > 0: img_inds = bboxes.new_full((bboxes.size(0), 1), img_id) rois = torch.cat([img_inds, bboxes], dim=-1) else: rois = torch.zeros_like(bboxes) rois_list.append(rois) rois = torch.cat(rois_list, 0) return rois def bbox3d2result(bboxes, scores, labels, attrs=None): """Convert detection results to a list of numpy arrays. Args: bboxes (torch.Tensor): Bounding boxes with shape (N, 5). labels (torch.Tensor): Labels with shape (N, ). scores (torch.Tensor): Scores with shape (N, ). attrs (torch.Tensor, optional): Attributes with shape (N, ). Defaults to None. Returns: dict[str, torch.Tensor]: Bounding box results in cpu mode. - boxes_3d (torch.Tensor): 3D boxes. - scores (torch.Tensor): Prediction scores. - labels_3d (torch.Tensor): Box labels. - attrs_3d (torch.Tensor, optional): Box attributes. """ result_dict = dict( boxes_3d=bboxes.to('cpu'), scores_3d=scores.cpu(), labels_3d=labels.cpu()) if attrs is not None: result_dict['attrs_3d'] = attrs.cpu() return result_dict ================================================ FILE: mmdet3d/core/bbox/util.py ================================================ import torch def normalize_bbox(bboxes, pc_range): cx = bboxes[..., 0:1] cy = bboxes[..., 1:2] cz = bboxes[..., 2:3] w = bboxes[..., 3:4].log() l = bboxes[..., 4:5].log() h = bboxes[..., 5:6].log() rot = bboxes[..., 6:7] if bboxes.size(-1) > 7: vx = bboxes[..., 7:8] vy = bboxes[..., 8:9] normalized_bboxes = torch.cat( (cx, cy, w, l, cz, h, rot.sin(), rot.cos(), vx, vy), dim=-1 ) else: normalized_bboxes = torch.cat( (cx, cy, w, l, cz, h, rot.sin(), rot.cos()), dim=-1 ) return normalized_bboxes def denormalize_bbox(normalized_bboxes, pc_range): # rotation rot_sine = normalized_bboxes[..., 6:7] rot_cosine = normalized_bboxes[..., 7:8] rot = torch.atan2(rot_sine, rot_cosine) # center in the bev cx = normalized_bboxes[..., 0:1] cy = normalized_bboxes[..., 1:2] cz = normalized_bboxes[..., 4:5] # size w = normalized_bboxes[..., 2:3] l = normalized_bboxes[..., 3:4] h = normalized_bboxes[..., 5:6] w = w.exp() l = l.exp() h = h.exp() if normalized_bboxes.size(-1) > 8: # velocity vx = normalized_bboxes[:, 8:9] vy = normalized_bboxes[:, 9:10] denormalized_bboxes = torch.cat([cx, cy, cz, w, l, h, rot, vx, vy], dim=-1) else: denormalized_bboxes = torch.cat([cx, cy, cz, w, l, h, rot], dim=-1) return denormalized_bboxes ================================================ FILE: mmdet3d/core/evaluation/__init__.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. from .indoor_eval import indoor_eval from .instance_seg_eval import instance_seg_eval from .kitti_utils import kitti_eval, kitti_eval_coco_style from .lyft_eval import lyft_eval from .seg_eval import seg_eval __all__ = [ 'kitti_eval_coco_style', 'kitti_eval', 'indoor_eval', 'lyft_eval', 'seg_eval', 'instance_seg_eval' ] ================================================ FILE: mmdet3d/core/evaluation/indoor_eval.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import numpy as np import torch from mmcv.utils import print_log from terminaltables import AsciiTable def average_precision(recalls, precisions, mode='area'): """Calculate average precision (for single or multiple scales). Args: recalls (np.ndarray): Recalls with shape of (num_scales, num_dets) or (num_dets, ). 
precisions (np.ndarray): Precisions with shape of (num_scales, num_dets) or (num_dets, ). mode (str): 'area' or '11points', 'area' means calculating the area under precision-recall curve, '11points' means calculating the average precision of recalls at [0, 0.1, ..., 1] Returns: float or np.ndarray: Calculated average precision. """ if recalls.ndim == 1: recalls = recalls[np.newaxis, :] precisions = precisions[np.newaxis, :] assert recalls.shape == precisions.shape assert recalls.ndim == 2 num_scales = recalls.shape[0] ap = np.zeros(num_scales, dtype=np.float32) if mode == 'area': zeros = np.zeros((num_scales, 1), dtype=recalls.dtype) ones = np.ones((num_scales, 1), dtype=recalls.dtype) mrec = np.hstack((zeros, recalls, ones)) mpre = np.hstack((zeros, precisions, zeros)) for i in range(mpre.shape[1] - 1, 0, -1): mpre[:, i - 1] = np.maximum(mpre[:, i - 1], mpre[:, i]) for i in range(num_scales): ind = np.where(mrec[i, 1:] != mrec[i, :-1])[0] ap[i] = np.sum( (mrec[i, ind + 1] - mrec[i, ind]) * mpre[i, ind + 1]) elif mode == '11points': for i in range(num_scales): for thr in np.arange(0, 1 + 1e-3, 0.1): precs = precisions[i, recalls[i, :] >= thr] prec = precs.max() if precs.size > 0 else 0 ap[i] += prec ap /= 11 else: raise ValueError( 'Unrecognized mode, only "area" and "11points" are supported') return ap def eval_det_cls(pred, gt, iou_thr=None): """Generic functions to compute precision/recall for object detection for a single class. Args: pred (dict): Predictions mapping from image id to bounding boxes and scores. gt (dict): Ground truths mapping from image id to bounding boxes. iou_thr (list[float]): A list of iou thresholds. Return: tuple (np.ndarray, np.ndarray, float): Recalls, precisions and average precision. """ # {img_id: {'bbox': box structure, 'det': matched list}} class_recs = {} npos = 0 for img_id in gt.keys(): cur_gt_num = len(gt[img_id]) if cur_gt_num != 0: gt_cur = torch.zeros([cur_gt_num, 7], dtype=torch.float32) for i in range(cur_gt_num): gt_cur[i] = gt[img_id][i].tensor bbox = gt[img_id][0].new_box(gt_cur) else: bbox = gt[img_id] det = [[False] * len(bbox) for i in iou_thr] npos += len(bbox) class_recs[img_id] = {'bbox': bbox, 'det': det} # construct dets image_ids = [] confidence = [] ious = [] for img_id in pred.keys(): cur_num = len(pred[img_id]) if cur_num == 0: continue pred_cur = torch.zeros((cur_num, 7), dtype=torch.float32) box_idx = 0 for box, score in pred[img_id]: image_ids.append(img_id) confidence.append(score) pred_cur[box_idx] = box.tensor box_idx += 1 pred_cur = box.new_box(pred_cur) gt_cur = class_recs[img_id]['bbox'] if len(gt_cur) > 0: # calculate iou in each image iou_cur = pred_cur.overlaps(pred_cur, gt_cur) for i in range(cur_num): ious.append(iou_cur[i]) else: for i in range(cur_num): ious.append(np.zeros(1)) confidence = np.array(confidence) # sort by confidence sorted_ind = np.argsort(-confidence) image_ids = [image_ids[x] for x in sorted_ind] ious = [ious[x] for x in sorted_ind] # go down dets and mark TPs and FPs nd = len(image_ids) tp_thr = [np.zeros(nd) for i in iou_thr] fp_thr = [np.zeros(nd) for i in iou_thr] for d in range(nd): R = class_recs[image_ids[d]] iou_max = -np.inf BBGT = R['bbox'] cur_iou = ious[d] if len(BBGT) > 0: # compute overlaps for j in range(len(BBGT)): # iou = get_iou_main(get_iou_func, (bb, BBGT[j,...])) iou = cur_iou[j] if iou > iou_max: iou_max = iou jmax = j for iou_idx, thresh in enumerate(iou_thr): if iou_max > thresh: if not R['det'][iou_idx][jmax]: tp_thr[iou_idx][d] = 1. 
R['det'][iou_idx][jmax] = 1 else: fp_thr[iou_idx][d] = 1. else: fp_thr[iou_idx][d] = 1. ret = [] for iou_idx, thresh in enumerate(iou_thr): # compute precision recall fp = np.cumsum(fp_thr[iou_idx]) tp = np.cumsum(tp_thr[iou_idx]) recall = tp / float(npos) # avoid divide by zero in case the first detection matches a difficult # ground truth precision = tp / np.maximum(tp + fp, np.finfo(np.float64).eps) ap = average_precision(recall, precision) ret.append((recall, precision, ap)) return ret def eval_map_recall(pred, gt, ovthresh=None): """Evaluate mAP and recall. Generic functions to compute precision/recall for object detection for multiple classes. Args: pred (dict): Information of detection results, which maps class_id and predictions. gt (dict): Information of ground truths, which maps class_id and ground truths. ovthresh (list[float], optional): iou threshold. Default: None. Return: tuple[dict]: dict results of recall, AP, and precision for all classes. """ ret_values = {} for classname in gt.keys(): if classname in pred: ret_values[classname] = eval_det_cls(pred[classname], gt[classname], ovthresh) recall = [{} for i in ovthresh] precision = [{} for i in ovthresh] ap = [{} for i in ovthresh] for label in gt.keys(): for iou_idx, thresh in enumerate(ovthresh): if label in pred: recall[iou_idx][label], precision[iou_idx][label], ap[iou_idx][ label] = ret_values[label][iou_idx] else: recall[iou_idx][label] = np.zeros(1) precision[iou_idx][label] = np.zeros(1) ap[iou_idx][label] = np.zeros(1) return recall, precision, ap def indoor_eval(gt_annos, dt_annos, metric, label2cat, logger=None, box_type_3d=None, box_mode_3d=None): """Indoor Evaluation. Evaluate the result of the detection. Args: gt_annos (list[dict]): Ground truth annotations. dt_annos (list[dict]): Detection annotations. the dict includes the following keys - labels_3d (torch.Tensor): Labels of boxes. - boxes_3d (:obj:`BaseInstance3DBoxes`): 3D bounding boxes in Depth coordinate. - scores_3d (torch.Tensor): Scores of boxes. metric (list[float]): IoU thresholds for computing average precisions. label2cat (dict): Map from label to category. logger (logging.Logger | str, optional): The way to print the mAP summary. See `mmdet.utils.print_log()` for details. Default: None. Return: dict[str, float]: Dict of results. 
""" assert len(dt_annos) == len(gt_annos) pred = {} # map {class_id: pred} gt = {} # map {class_id: gt} for img_id in range(len(dt_annos)): # parse detected annotations det_anno = dt_annos[img_id] for i in range(len(det_anno['labels_3d'])): label = det_anno['labels_3d'].numpy()[i] bbox = det_anno['boxes_3d'].convert_to(box_mode_3d)[i] score = det_anno['scores_3d'].numpy()[i] if label not in pred: pred[int(label)] = {} if img_id not in pred[label]: pred[int(label)][img_id] = [] if label not in gt: gt[int(label)] = {} if img_id not in gt[label]: gt[int(label)][img_id] = [] pred[int(label)][img_id].append((bbox, score)) # parse gt annotations gt_anno = gt_annos[img_id] if gt_anno['gt_num'] != 0: gt_boxes = box_type_3d( gt_anno['gt_boxes_upright_depth'], box_dim=gt_anno['gt_boxes_upright_depth'].shape[-1], origin=(0.5, 0.5, 0.5)).convert_to(box_mode_3d) labels_3d = gt_anno['class'] else: gt_boxes = box_type_3d(np.array([], dtype=np.float32)) labels_3d = np.array([], dtype=np.int64) for i in range(len(labels_3d)): label = labels_3d[i] bbox = gt_boxes[i] if label not in gt: gt[label] = {} if img_id not in gt[label]: gt[label][img_id] = [] gt[label][img_id].append(bbox) rec, prec, ap = eval_map_recall(pred, gt, metric) ret_dict = dict() header = ['classes'] table_columns = [[label2cat[label] for label in ap[0].keys()] + ['Overall']] for i, iou_thresh in enumerate(metric): header.append(f'AP_{iou_thresh:.2f}') header.append(f'AR_{iou_thresh:.2f}') rec_list = [] for label in ap[i].keys(): ret_dict[f'{label2cat[label]}_AP_{iou_thresh:.2f}'] = float( ap[i][label][0]) ret_dict[f'mAP_{iou_thresh:.2f}'] = float( np.mean(list(ap[i].values()))) table_columns.append(list(map(float, list(ap[i].values())))) table_columns[-1] += [ret_dict[f'mAP_{iou_thresh:.2f}']] table_columns[-1] = [f'{x:.4f}' for x in table_columns[-1]] for label in rec[i].keys(): ret_dict[f'{label2cat[label]}_rec_{iou_thresh:.2f}'] = float( rec[i][label][-1]) rec_list.append(rec[i][label][-1]) ret_dict[f'mAR_{iou_thresh:.2f}'] = float(np.mean(rec_list)) table_columns.append(list(map(float, rec_list))) table_columns[-1] += [ret_dict[f'mAR_{iou_thresh:.2f}']] table_columns[-1] = [f'{x:.4f}' for x in table_columns[-1]] table_data = [header] table_rows = list(zip(*table_columns)) table_data += table_rows table = AsciiTable(table_data) table.inner_footing_row_border = True print_log('\n' + table.table, logger=logger) return ret_dict ================================================ FILE: mmdet3d/core/evaluation/instance_seg_eval.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import numpy as np from mmcv.utils import print_log from terminaltables import AsciiTable from .scannet_utils.evaluate_semantic_instance import scannet_eval def aggregate_predictions(masks, labels, scores, valid_class_ids): """Maps predictions to ScanNet evaluator format. Args: masks (list[torch.Tensor]): Per scene predicted instance masks. labels (list[torch.Tensor]): Per scene predicted instance labels. scores (list[torch.Tensor]): Per scene predicted instance scores. valid_class_ids (tuple[int]): Ids of valid categories. Returns: list[dict]: Per scene aggregated predictions. 
""" infos = [] for id, (mask, label, score) in enumerate(zip(masks, labels, scores)): mask = mask.clone().numpy() label = label.clone().numpy() score = score.clone().numpy() info = dict() n_instances = mask.max() + 1 for i in range(n_instances): # match pred_instance['filename'] from assign_instances_for_scan file_name = f'{id}_{i}' info[file_name] = dict() info[file_name]['mask'] = (mask == i).astype(np.int) info[file_name]['label_id'] = valid_class_ids[label[i]] info[file_name]['conf'] = score[i] infos.append(info) return infos def rename_gt(gt_semantic_masks, gt_instance_masks, valid_class_ids): """Maps gt instance and semantic masks to instance masks for ScanNet evaluator. Args: gt_semantic_masks (list[torch.Tensor]): Per scene gt semantic masks. gt_instance_masks (list[torch.Tensor]): Per scene gt instance masks. valid_class_ids (tuple[int]): Ids of valid categories. Returns: list[np.array]: Per scene instance masks. """ renamed_instance_masks = [] for semantic_mask, instance_mask in zip(gt_semantic_masks, gt_instance_masks): semantic_mask = semantic_mask.clone().numpy() instance_mask = instance_mask.clone().numpy() unique = np.unique(instance_mask) assert len(unique) < 1000 for i in unique: semantic_instance = semantic_mask[instance_mask == i] semantic_unique = np.unique(semantic_instance) assert len(semantic_unique) == 1 if semantic_unique[0] < len(valid_class_ids): instance_mask[ instance_mask == i] = 1000 * valid_class_ids[semantic_unique[0]] + i renamed_instance_masks.append(instance_mask) return renamed_instance_masks def instance_seg_eval(gt_semantic_masks, gt_instance_masks, pred_instance_masks, pred_instance_labels, pred_instance_scores, valid_class_ids, class_labels, options=None, logger=None): """Instance Segmentation Evaluation. Evaluate the result of the instance segmentation. Args: gt_semantic_masks (list[torch.Tensor]): Ground truth semantic masks. gt_instance_masks (list[torch.Tensor]): Ground truth instance masks. pred_instance_masks (list[torch.Tensor]): Predicted instance masks. pred_instance_labels (list[torch.Tensor]): Predicted instance labels. pred_instance_scores (list[torch.Tensor]): Predicted instance labels. valid_class_ids (tuple[int]): Ids of valid categories. class_labels (tuple[str]): Names of valid categories. options (dict, optional): Additional options. Keys may contain: `overlaps`, `min_region_sizes`, `distance_threshes`, `distance_confs`. Default: None. logger (logging.Logger | str, optional): The way to print the mAP summary. See `mmdet.utils.print_log()` for details. Default: None. Returns: dict[str, float]: Dict of results. 
""" assert len(valid_class_ids) == len(class_labels) id_to_label = { valid_class_ids[i]: class_labels[i] for i in range(len(valid_class_ids)) } preds = aggregate_predictions( masks=pred_instance_masks, labels=pred_instance_labels, scores=pred_instance_scores, valid_class_ids=valid_class_ids) gts = rename_gt(gt_semantic_masks, gt_instance_masks, valid_class_ids) metrics = scannet_eval( preds=preds, gts=gts, options=options, valid_class_ids=valid_class_ids, class_labels=class_labels, id_to_label=id_to_label) header = ['classes', 'AP_0.25', 'AP_0.50', 'AP'] rows = [] for label, data in metrics['classes'].items(): aps = [data['ap25%'], data['ap50%'], data['ap']] rows.append([label] + [f'{ap:.4f}' for ap in aps]) aps = metrics['all_ap_25%'], metrics['all_ap_50%'], metrics['all_ap'] footer = ['Overall'] + [f'{ap:.4f}' for ap in aps] table = AsciiTable([header] + rows + [footer]) table.inner_footing_row_border = True print_log('\n' + table.table, logger=logger) return metrics ================================================ FILE: mmdet3d/core/evaluation/kitti_utils/__init__.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. from .eval import kitti_eval, kitti_eval_coco_style __all__ = ['kitti_eval', 'kitti_eval_coco_style'] ================================================ FILE: mmdet3d/core/evaluation/kitti_utils/eval.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import gc import io as sysio import numba import numpy as np @numba.jit def get_thresholds(scores: np.ndarray, num_gt, num_sample_pts=41): scores.sort() scores = scores[::-1] current_recall = 0 thresholds = [] for i, score in enumerate(scores): l_recall = (i + 1) / num_gt if i < (len(scores) - 1): r_recall = (i + 2) / num_gt else: r_recall = l_recall if (((r_recall - current_recall) < (current_recall - l_recall)) and (i < (len(scores) - 1))): continue # recall = l_recall thresholds.append(score) current_recall += 1 / (num_sample_pts - 1.0) return thresholds def clean_data(gt_anno, dt_anno, current_class, difficulty): CLASS_NAMES = ['car', 'pedestrian', 'cyclist'] MIN_HEIGHT = [40, 25, 25] MAX_OCCLUSION = [0, 1, 2] MAX_TRUNCATION = [0.15, 0.3, 0.5] dc_bboxes, ignored_gt, ignored_dt = [], [], [] current_cls_name = CLASS_NAMES[current_class].lower() num_gt = len(gt_anno['name']) num_dt = len(dt_anno['name']) num_valid_gt = 0 for i in range(num_gt): bbox = gt_anno['bbox'][i] gt_name = gt_anno['name'][i].lower() height = bbox[3] - bbox[1] valid_class = -1 if (gt_name == current_cls_name): valid_class = 1 elif (current_cls_name == 'Pedestrian'.lower() and 'Person_sitting'.lower() == gt_name): valid_class = 0 elif (current_cls_name == 'Car'.lower() and 'Van'.lower() == gt_name): valid_class = 0 else: valid_class = -1 ignore = False if ((gt_anno['occluded'][i] > MAX_OCCLUSION[difficulty]) or (gt_anno['truncated'][i] > MAX_TRUNCATION[difficulty]) or (height <= MIN_HEIGHT[difficulty])): ignore = True if valid_class == 1 and not ignore: ignored_gt.append(0) num_valid_gt += 1 elif (valid_class == 0 or (ignore and (valid_class == 1))): ignored_gt.append(1) else: ignored_gt.append(-1) # for i in range(num_gt): if gt_anno['name'][i] == 'DontCare': dc_bboxes.append(gt_anno['bbox'][i]) for i in range(num_dt): if (dt_anno['name'][i].lower() == current_cls_name): valid_class = 1 else: valid_class = -1 height = abs(dt_anno['bbox'][i, 3] - dt_anno['bbox'][i, 1]) if height < MIN_HEIGHT[difficulty]: ignored_dt.append(1) elif valid_class == 1: 
ignored_dt.append(0) else: ignored_dt.append(-1) return num_valid_gt, ignored_gt, ignored_dt, dc_bboxes @numba.jit(nopython=True) def image_box_overlap(boxes, query_boxes, criterion=-1): N = boxes.shape[0] K = query_boxes.shape[0] overlaps = np.zeros((N, K), dtype=boxes.dtype) for k in range(K): qbox_area = ((query_boxes[k, 2] - query_boxes[k, 0]) * (query_boxes[k, 3] - query_boxes[k, 1])) for n in range(N): iw = ( min(boxes[n, 2], query_boxes[k, 2]) - max(boxes[n, 0], query_boxes[k, 0])) if iw > 0: ih = ( min(boxes[n, 3], query_boxes[k, 3]) - max(boxes[n, 1], query_boxes[k, 1])) if ih > 0: if criterion == -1: ua = ((boxes[n, 2] - boxes[n, 0]) * (boxes[n, 3] - boxes[n, 1]) + qbox_area - iw * ih) elif criterion == 0: ua = ((boxes[n, 2] - boxes[n, 0]) * (boxes[n, 3] - boxes[n, 1])) elif criterion == 1: ua = qbox_area else: ua = 1.0 overlaps[n, k] = iw * ih / ua return overlaps def bev_box_overlap(boxes, qboxes, criterion=-1): from .rotate_iou import rotate_iou_gpu_eval riou = rotate_iou_gpu_eval(boxes, qboxes, criterion) return riou @numba.jit(nopython=True, parallel=True) def d3_box_overlap_kernel(boxes, qboxes, rinc, criterion=-1): # ONLY support overlap in CAMERA, not lidar. # TODO: change to use prange for parallel mode, should check the difference N, K = boxes.shape[0], qboxes.shape[0] for i in numba.prange(N): for j in numba.prange(K): if rinc[i, j] > 0: # iw = (min(boxes[i, 1] + boxes[i, 4], qboxes[j, 1] + # qboxes[j, 4]) - max(boxes[i, 1], qboxes[j, 1])) iw = ( min(boxes[i, 1], qboxes[j, 1]) - max(boxes[i, 1] - boxes[i, 4], qboxes[j, 1] - qboxes[j, 4])) if iw > 0: area1 = boxes[i, 3] * boxes[i, 4] * boxes[i, 5] area2 = qboxes[j, 3] * qboxes[j, 4] * qboxes[j, 5] inc = iw * rinc[i, j] if criterion == -1: ua = (area1 + area2 - inc) elif criterion == 0: ua = area1 elif criterion == 1: ua = area2 else: ua = inc rinc[i, j] = inc / ua else: rinc[i, j] = 0.0 def d3_box_overlap(boxes, qboxes, criterion=-1): from .rotate_iou import rotate_iou_gpu_eval rinc = rotate_iou_gpu_eval(boxes[:, [0, 2, 3, 5, 6]], qboxes[:, [0, 2, 3, 5, 6]], 2) d3_box_overlap_kernel(boxes, qboxes, rinc, criterion) return rinc @numba.jit(nopython=True) def compute_statistics_jit(overlaps, gt_datas, dt_datas, ignored_gt, ignored_det, dc_bboxes, metric, min_overlap, thresh=0, compute_fp=False, compute_aos=False): det_size = dt_datas.shape[0] gt_size = gt_datas.shape[0] dt_scores = dt_datas[:, -1] dt_alphas = dt_datas[:, 4] gt_alphas = gt_datas[:, 4] dt_bboxes = dt_datas[:, :4] # gt_bboxes = gt_datas[:, :4] assigned_detection = [False] * det_size ignored_threshold = [False] * det_size if compute_fp: for i in range(det_size): if (dt_scores[i] < thresh): ignored_threshold[i] = True NO_DETECTION = -10000000 tp, fp, fn, similarity = 0, 0, 0, 0 # thresholds = [0.0] # delta = [0.0] thresholds = np.zeros((gt_size, )) thresh_idx = 0 delta = np.zeros((gt_size, )) delta_idx = 0 for i in range(gt_size): if ignored_gt[i] == -1: continue det_idx = -1 valid_detection = NO_DETECTION max_overlap = 0 assigned_ignored_det = False for j in range(det_size): if (ignored_det[j] == -1): continue if (assigned_detection[j]): continue if (ignored_threshold[j]): continue overlap = overlaps[j, i] dt_score = dt_scores[j] if (not compute_fp and (overlap > min_overlap) and dt_score > valid_detection): det_idx = j valid_detection = dt_score elif (compute_fp and (overlap > min_overlap) and (overlap > max_overlap or assigned_ignored_det) and ignored_det[j] == 0): max_overlap = overlap det_idx = j valid_detection = 1 assigned_ignored_det = False elif 
(compute_fp and (overlap > min_overlap) and (valid_detection == NO_DETECTION) and ignored_det[j] == 1): det_idx = j valid_detection = 1 assigned_ignored_det = True if (valid_detection == NO_DETECTION) and ignored_gt[i] == 0: fn += 1 elif ((valid_detection != NO_DETECTION) and (ignored_gt[i] == 1 or ignored_det[det_idx] == 1)): assigned_detection[det_idx] = True elif valid_detection != NO_DETECTION: tp += 1 # thresholds.append(dt_scores[det_idx]) thresholds[thresh_idx] = dt_scores[det_idx] thresh_idx += 1 if compute_aos: # delta.append(gt_alphas[i] - dt_alphas[det_idx]) delta[delta_idx] = gt_alphas[i] - dt_alphas[det_idx] delta_idx += 1 assigned_detection[det_idx] = True if compute_fp: for i in range(det_size): if (not (assigned_detection[i] or ignored_det[i] == -1 or ignored_det[i] == 1 or ignored_threshold[i])): fp += 1 nstuff = 0 if metric == 0: overlaps_dt_dc = image_box_overlap(dt_bboxes, dc_bboxes, 0) for i in range(dc_bboxes.shape[0]): for j in range(det_size): if (assigned_detection[j]): continue if (ignored_det[j] == -1 or ignored_det[j] == 1): continue if (ignored_threshold[j]): continue if overlaps_dt_dc[j, i] > min_overlap: assigned_detection[j] = True nstuff += 1 fp -= nstuff if compute_aos: tmp = np.zeros((fp + delta_idx, )) # tmp = [0] * fp for i in range(delta_idx): tmp[i + fp] = (1.0 + np.cos(delta[i])) / 2.0 # tmp.append((1.0 + np.cos(delta[i])) / 2.0) # assert len(tmp) == fp + tp # assert len(delta) == tp if tp > 0 or fp > 0: similarity = np.sum(tmp) else: similarity = -1 return tp, fp, fn, similarity, thresholds[:thresh_idx] def get_split_parts(num, num_part): same_part = num // num_part remain_num = num % num_part if remain_num == 0: return [same_part] * num_part else: return [same_part] * num_part + [remain_num] @numba.jit(nopython=True) def fused_compute_statistics(overlaps, pr, gt_nums, dt_nums, dc_nums, gt_datas, dt_datas, dontcares, ignored_gts, ignored_dets, metric, min_overlap, thresholds, compute_aos=False): gt_num = 0 dt_num = 0 dc_num = 0 for i in range(gt_nums.shape[0]): for t, thresh in enumerate(thresholds): overlap = overlaps[dt_num:dt_num + dt_nums[i], gt_num:gt_num + gt_nums[i]] gt_data = gt_datas[gt_num:gt_num + gt_nums[i]] dt_data = dt_datas[dt_num:dt_num + dt_nums[i]] ignored_gt = ignored_gts[gt_num:gt_num + gt_nums[i]] ignored_det = ignored_dets[dt_num:dt_num + dt_nums[i]] dontcare = dontcares[dc_num:dc_num + dc_nums[i]] tp, fp, fn, similarity, _ = compute_statistics_jit( overlap, gt_data, dt_data, ignored_gt, ignored_det, dontcare, metric, min_overlap=min_overlap, thresh=thresh, compute_fp=True, compute_aos=compute_aos) pr[t, 0] += tp pr[t, 1] += fp pr[t, 2] += fn if similarity != -1: pr[t, 3] += similarity gt_num += gt_nums[i] dt_num += dt_nums[i] dc_num += dc_nums[i] def calculate_iou_partly(gt_annos, dt_annos, metric, num_parts=50): """Fast iou algorithm. this function can be used independently to do result analysis. Must be used in CAMERA coordinate system. Args: gt_annos (dict): Must from get_label_annos() in kitti_common.py. dt_annos (dict): Must from get_label_annos() in kitti_common.py. metric (int): Eval type. 0: bbox, 1: bev, 2: 3d. num_parts (int): A parameter for fast calculate algorithm. 
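Returns: tuple: (overlaps, parted_overlaps, total_gt_num, total_dt_num), where ``overlaps`` is a per-example list of IoU matrices of shape (num_gt_i, num_dt_i), ``parted_overlaps`` holds the per-part IoU matrices computed on the concatenated boxes of each part, and ``total_gt_num`` / ``total_dt_num`` are the per-example box counts.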
""" assert len(gt_annos) == len(dt_annos) total_dt_num = np.stack([len(a['name']) for a in dt_annos], 0) total_gt_num = np.stack([len(a['name']) for a in gt_annos], 0) num_examples = len(gt_annos) split_parts = get_split_parts(num_examples, num_parts) parted_overlaps = [] example_idx = 0 for num_part in split_parts: gt_annos_part = gt_annos[example_idx:example_idx + num_part] dt_annos_part = dt_annos[example_idx:example_idx + num_part] if metric == 0: gt_boxes = np.concatenate([a['bbox'] for a in gt_annos_part], 0) dt_boxes = np.concatenate([a['bbox'] for a in dt_annos_part], 0) overlap_part = image_box_overlap(gt_boxes, dt_boxes) elif metric == 1: loc = np.concatenate( [a['location'][:, [0, 2]] for a in gt_annos_part], 0) dims = np.concatenate( [a['dimensions'][:, [0, 2]] for a in gt_annos_part], 0) rots = np.concatenate([a['rotation_y'] for a in gt_annos_part], 0) gt_boxes = np.concatenate([loc, dims, rots[..., np.newaxis]], axis=1) loc = np.concatenate( [a['location'][:, [0, 2]] for a in dt_annos_part], 0) dims = np.concatenate( [a['dimensions'][:, [0, 2]] for a in dt_annos_part], 0) rots = np.concatenate([a['rotation_y'] for a in dt_annos_part], 0) dt_boxes = np.concatenate([loc, dims, rots[..., np.newaxis]], axis=1) overlap_part = bev_box_overlap(gt_boxes, dt_boxes).astype(np.float64) elif metric == 2: loc = np.concatenate([a['location'] for a in gt_annos_part], 0) dims = np.concatenate([a['dimensions'] for a in gt_annos_part], 0) rots = np.concatenate([a['rotation_y'] for a in gt_annos_part], 0) gt_boxes = np.concatenate([loc, dims, rots[..., np.newaxis]], axis=1) loc = np.concatenate([a['location'] for a in dt_annos_part], 0) dims = np.concatenate([a['dimensions'] for a in dt_annos_part], 0) rots = np.concatenate([a['rotation_y'] for a in dt_annos_part], 0) dt_boxes = np.concatenate([loc, dims, rots[..., np.newaxis]], axis=1) overlap_part = d3_box_overlap(gt_boxes, dt_boxes).astype(np.float64) else: raise ValueError('unknown metric') parted_overlaps.append(overlap_part) example_idx += num_part overlaps = [] example_idx = 0 for j, num_part in enumerate(split_parts): gt_annos_part = gt_annos[example_idx:example_idx + num_part] dt_annos_part = dt_annos[example_idx:example_idx + num_part] gt_num_idx, dt_num_idx = 0, 0 for i in range(num_part): gt_box_num = total_gt_num[example_idx + i] dt_box_num = total_dt_num[example_idx + i] overlaps.append( parted_overlaps[j][gt_num_idx:gt_num_idx + gt_box_num, dt_num_idx:dt_num_idx + dt_box_num]) gt_num_idx += gt_box_num dt_num_idx += dt_box_num example_idx += num_part return overlaps, parted_overlaps, total_gt_num, total_dt_num def _prepare_data(gt_annos, dt_annos, current_class, difficulty): gt_datas_list = [] dt_datas_list = [] total_dc_num = [] ignored_gts, ignored_dets, dontcares = [], [], [] total_num_valid_gt = 0 for i in range(len(gt_annos)): rets = clean_data(gt_annos[i], dt_annos[i], current_class, difficulty) num_valid_gt, ignored_gt, ignored_det, dc_bboxes = rets ignored_gts.append(np.array(ignored_gt, dtype=np.int64)) ignored_dets.append(np.array(ignored_det, dtype=np.int64)) if len(dc_bboxes) == 0: dc_bboxes = np.zeros((0, 4)).astype(np.float64) else: dc_bboxes = np.stack(dc_bboxes, 0).astype(np.float64) total_dc_num.append(dc_bboxes.shape[0]) dontcares.append(dc_bboxes) total_num_valid_gt += num_valid_gt gt_datas = np.concatenate( [gt_annos[i]['bbox'], gt_annos[i]['alpha'][..., np.newaxis]], 1) dt_datas = np.concatenate([ dt_annos[i]['bbox'], dt_annos[i]['alpha'][..., np.newaxis], dt_annos[i]['score'][..., np.newaxis] ], 1) 
gt_datas_list.append(gt_datas) dt_datas_list.append(dt_datas) total_dc_num = np.stack(total_dc_num, axis=0) return (gt_datas_list, dt_datas_list, ignored_gts, ignored_dets, dontcares, total_dc_num, total_num_valid_gt) def eval_class(gt_annos, dt_annos, current_classes, difficultys, metric, min_overlaps, compute_aos=False, num_parts=200): """Kitti eval. support 2d/bev/3d/aos eval. support 0.5:0.05:0.95 coco AP. Args: gt_annos (dict): Must from get_label_annos() in kitti_common.py. dt_annos (dict): Must from get_label_annos() in kitti_common.py. current_classes (list[int]): 0: car, 1: pedestrian, 2: cyclist. difficultys (list[int]): Eval difficulty, 0: easy, 1: normal, 2: hard metric (int): Eval type. 0: bbox, 1: bev, 2: 3d min_overlaps (float): Min overlap. format: [num_overlap, metric, class]. num_parts (int): A parameter for fast calculate algorithm Returns: dict[str, np.ndarray]: recall, precision and aos """ assert len(gt_annos) == len(dt_annos) num_examples = len(gt_annos) if num_examples < num_parts: num_parts = num_examples split_parts = get_split_parts(num_examples, num_parts) rets = calculate_iou_partly(dt_annos, gt_annos, metric, num_parts) overlaps, parted_overlaps, total_dt_num, total_gt_num = rets N_SAMPLE_PTS = 41 num_minoverlap = len(min_overlaps) num_class = len(current_classes) num_difficulty = len(difficultys) precision = np.zeros( [num_class, num_difficulty, num_minoverlap, N_SAMPLE_PTS]) recall = np.zeros( [num_class, num_difficulty, num_minoverlap, N_SAMPLE_PTS]) aos = np.zeros([num_class, num_difficulty, num_minoverlap, N_SAMPLE_PTS]) for m, current_class in enumerate(current_classes): for idx_l, difficulty in enumerate(difficultys): rets = _prepare_data(gt_annos, dt_annos, current_class, difficulty) (gt_datas_list, dt_datas_list, ignored_gts, ignored_dets, dontcares, total_dc_num, total_num_valid_gt) = rets for k, min_overlap in enumerate(min_overlaps[:, metric, m]): thresholdss = [] for i in range(len(gt_annos)): rets = compute_statistics_jit( overlaps[i], gt_datas_list[i], dt_datas_list[i], ignored_gts[i], ignored_dets[i], dontcares[i], metric, min_overlap=min_overlap, thresh=0.0, compute_fp=False) tp, fp, fn, similarity, thresholds = rets thresholdss += thresholds.tolist() thresholdss = np.array(thresholdss) thresholds = get_thresholds(thresholdss, total_num_valid_gt) thresholds = np.array(thresholds) pr = np.zeros([len(thresholds), 4]) idx = 0 for j, num_part in enumerate(split_parts): gt_datas_part = np.concatenate( gt_datas_list[idx:idx + num_part], 0) dt_datas_part = np.concatenate( dt_datas_list[idx:idx + num_part], 0) dc_datas_part = np.concatenate( dontcares[idx:idx + num_part], 0) ignored_dets_part = np.concatenate( ignored_dets[idx:idx + num_part], 0) ignored_gts_part = np.concatenate( ignored_gts[idx:idx + num_part], 0) fused_compute_statistics( parted_overlaps[j], pr, total_gt_num[idx:idx + num_part], total_dt_num[idx:idx + num_part], total_dc_num[idx:idx + num_part], gt_datas_part, dt_datas_part, dc_datas_part, ignored_gts_part, ignored_dets_part, metric, min_overlap=min_overlap, thresholds=thresholds, compute_aos=compute_aos) idx += num_part for i in range(len(thresholds)): recall[m, idx_l, k, i] = pr[i, 0] / (pr[i, 0] + pr[i, 2]) precision[m, idx_l, k, i] = pr[i, 0] / ( pr[i, 0] + pr[i, 1]) if compute_aos: aos[m, idx_l, k, i] = pr[i, 3] / (pr[i, 0] + pr[i, 1]) for i in range(len(thresholds)): precision[m, idx_l, k, i] = np.max( precision[m, idx_l, k, i:], axis=-1) recall[m, idx_l, k, i] = np.max( recall[m, idx_l, k, i:], axis=-1) if compute_aos: 
aos[m, idx_l, k, i] = np.max( aos[m, idx_l, k, i:], axis=-1) ret_dict = { 'recall': recall, 'precision': precision, 'orientation': aos, } # clean temp variables del overlaps del parted_overlaps gc.collect() return ret_dict def get_mAP11(prec): sums = 0 for i in range(0, prec.shape[-1], 4): sums = sums + prec[..., i] return sums / 11 * 100 def get_mAP40(prec): sums = 0 for i in range(1, prec.shape[-1]): sums = sums + prec[..., i] return sums / 40 * 100 def print_str(value, *arg, sstream=None): if sstream is None: sstream = sysio.StringIO() sstream.truncate(0) sstream.seek(0) print(value, *arg, file=sstream) return sstream.getvalue() def do_eval(gt_annos, dt_annos, current_classes, min_overlaps, eval_types=['bbox', 'bev', '3d']): # min_overlaps: [num_minoverlap, metric, num_class] difficultys = [0, 1, 2] mAP11_bbox = None mAP11_aos = None mAP40_bbox = None mAP40_aos = None if 'bbox' in eval_types: ret = eval_class( gt_annos, dt_annos, current_classes, difficultys, 0, min_overlaps, compute_aos=('aos' in eval_types)) # ret: [num_class, num_diff, num_minoverlap, num_sample_points] mAP11_bbox = get_mAP11(ret['precision']) mAP40_bbox = get_mAP40(ret['precision']) if 'aos' in eval_types: mAP11_aos = get_mAP11(ret['orientation']) mAP40_aos = get_mAP40(ret['orientation']) mAP11_bev = None mAP40_bev = None if 'bev' in eval_types: ret = eval_class(gt_annos, dt_annos, current_classes, difficultys, 1, min_overlaps) mAP11_bev = get_mAP11(ret['precision']) mAP40_bev = get_mAP40(ret['precision']) mAP11_3d = None mAP40_3d = None if '3d' in eval_types: ret = eval_class(gt_annos, dt_annos, current_classes, difficultys, 2, min_overlaps) mAP11_3d = get_mAP11(ret['precision']) mAP40_3d = get_mAP40(ret['precision']) return (mAP11_bbox, mAP11_bev, mAP11_3d, mAP11_aos, mAP40_bbox, mAP40_bev, mAP40_3d, mAP40_aos) def do_coco_style_eval(gt_annos, dt_annos, current_classes, overlap_ranges, compute_aos): # overlap_ranges: [range, metric, num_class] min_overlaps = np.zeros([10, *overlap_ranges.shape[1:]]) for i in range(overlap_ranges.shape[1]): for j in range(overlap_ranges.shape[2]): min_overlaps[:, i, j] = np.linspace(*overlap_ranges[:, i, j]) mAP_bbox, mAP_bev, mAP_3d, mAP_aos, _, _, \ _, _ = do_eval(gt_annos, dt_annos, current_classes, min_overlaps, compute_aos) # ret: [num_class, num_diff, num_minoverlap] mAP_bbox = mAP_bbox.mean(-1) mAP_bev = mAP_bev.mean(-1) mAP_3d = mAP_3d.mean(-1) if mAP_aos is not None: mAP_aos = mAP_aos.mean(-1) return mAP_bbox, mAP_bev, mAP_3d, mAP_aos def kitti_eval(gt_annos, dt_annos, current_classes, eval_types=['bbox', 'bev', '3d']): """KITTI evaluation. Args: gt_annos (list[dict]): Contain gt information of each sample. dt_annos (list[dict]): Contain detected information of each sample. current_classes (list[str]): Classes to evaluation. eval_types (list[str], optional): Types to eval. Defaults to ['bbox', 'bev', '3d']. Returns: tuple: String and dict of evaluation results. 
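    Example:
        A sketch of the call pattern only; ``gt_annos`` and ``dt_annos`` are
        assumed to come from ``get_label_annos()`` in kitti_common.py and are
        not constructed here:

        >>> result_str, result_dict = kitti_eval(
        ...     gt_annos, dt_annos, current_classes=['Car', 'Pedestrian'],
        ...     eval_types=['bbox', 'bev', '3d'])
        >>> # result_dict holds entries such as
        >>> # 'KITTI/Car_3D_AP40_moderate_strict'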
""" assert len(eval_types) > 0, 'must contain at least one evaluation type' if 'aos' in eval_types: assert 'bbox' in eval_types, 'must evaluate bbox when evaluating aos' overlap_0_7 = np.array([[0.7, 0.5, 0.5, 0.7, 0.5], [0.7, 0.5, 0.5, 0.7, 0.5], [0.7, 0.5, 0.5, 0.7, 0.5]]) overlap_0_5 = np.array([[0.7, 0.5, 0.5, 0.7, 0.5], [0.5, 0.25, 0.25, 0.5, 0.25], [0.5, 0.25, 0.25, 0.5, 0.25]]) min_overlaps = np.stack([overlap_0_7, overlap_0_5], axis=0) # [2, 3, 5] class_to_name = { 0: 'Car', 1: 'Pedestrian', 2: 'Cyclist', 3: 'Van', 4: 'Person_sitting', } name_to_class = {v: n for n, v in class_to_name.items()} if not isinstance(current_classes, (list, tuple)): current_classes = [current_classes] current_classes_int = [] for curcls in current_classes: if isinstance(curcls, str): current_classes_int.append(name_to_class[curcls]) else: current_classes_int.append(curcls) current_classes = current_classes_int min_overlaps = min_overlaps[:, :, current_classes] result = '' # check whether alpha is valid compute_aos = False pred_alpha = False valid_alpha_gt = False for anno in dt_annos: mask = (anno['alpha'] != -10) if anno['alpha'][mask].shape[0] != 0: pred_alpha = True break for anno in gt_annos: if anno['alpha'][0] != -10: valid_alpha_gt = True break compute_aos = (pred_alpha and valid_alpha_gt) if compute_aos: eval_types.append('aos') mAP11_bbox, mAP11_bev, mAP11_3d, mAP11_aos, mAP40_bbox, mAP40_bev, \ mAP40_3d, mAP40_aos = do_eval(gt_annos, dt_annos, current_classes, min_overlaps, eval_types) ret_dict = {} difficulty = ['easy', 'moderate', 'hard'] # calculate AP11 result += '\n----------- AP11 Results ------------\n\n' for j, curcls in enumerate(current_classes): # mAP threshold array: [num_minoverlap, metric, class] # mAP result: [num_class, num_diff, num_minoverlap] curcls_name = class_to_name[curcls] for i in range(min_overlaps.shape[0]): # prepare results for print result += ('{} AP11@{:.2f}, {:.2f}, {:.2f}:\n'.format( curcls_name, *min_overlaps[i, :, j])) if mAP11_bbox is not None: result += 'bbox AP11:{:.4f}, {:.4f}, {:.4f}\n'.format( *mAP11_bbox[j, :, i]) if mAP11_bev is not None: result += 'bev AP11:{:.4f}, {:.4f}, {:.4f}\n'.format( *mAP11_bev[j, :, i]) if mAP11_3d is not None: result += '3d AP11:{:.4f}, {:.4f}, {:.4f}\n'.format( *mAP11_3d[j, :, i]) if compute_aos: result += 'aos AP11:{:.2f}, {:.2f}, {:.2f}\n'.format( *mAP11_aos[j, :, i]) # prepare results for logger for idx in range(3): if i == 0: postfix = f'{difficulty[idx]}_strict' else: postfix = f'{difficulty[idx]}_loose' prefix = f'KITTI/{curcls_name}' if mAP11_3d is not None: ret_dict[f'{prefix}_3D_AP11_{postfix}'] =\ mAP11_3d[j, idx, i] if mAP11_bev is not None: ret_dict[f'{prefix}_BEV_AP11_{postfix}'] =\ mAP11_bev[j, idx, i] if mAP11_bbox is not None: ret_dict[f'{prefix}_2D_AP11_{postfix}'] =\ mAP11_bbox[j, idx, i] # calculate mAP11 over all classes if there are multiple classes if len(current_classes) > 1: # prepare results for print result += ('\nOverall AP11@{}, {}, {}:\n'.format(*difficulty)) if mAP11_bbox is not None: mAP11_bbox = mAP11_bbox.mean(axis=0) result += 'bbox AP11:{:.4f}, {:.4f}, {:.4f}\n'.format( *mAP11_bbox[:, 0]) if mAP11_bev is not None: mAP11_bev = mAP11_bev.mean(axis=0) result += 'bev AP11:{:.4f}, {:.4f}, {:.4f}\n'.format( *mAP11_bev[:, 0]) if mAP11_3d is not None: mAP11_3d = mAP11_3d.mean(axis=0) result += '3d AP11:{:.4f}, {:.4f}, {:.4f}\n'.format(*mAP11_3d[:, 0]) if compute_aos: mAP11_aos = mAP11_aos.mean(axis=0) result += 'aos AP11:{:.2f}, {:.2f}, {:.2f}\n'.format( *mAP11_aos[:, 0]) # prepare results for 
logger for idx in range(3): postfix = f'{difficulty[idx]}' if mAP11_3d is not None: ret_dict[f'KITTI/Overall_3D_AP11_{postfix}'] = mAP11_3d[idx, 0] if mAP11_bev is not None: ret_dict[f'KITTI/Overall_BEV_AP11_{postfix}'] =\ mAP11_bev[idx, 0] if mAP11_bbox is not None: ret_dict[f'KITTI/Overall_2D_AP11_{postfix}'] =\ mAP11_bbox[idx, 0] # Calculate AP40 result += '\n----------- AP40 Results ------------\n\n' for j, curcls in enumerate(current_classes): # mAP threshold array: [num_minoverlap, metric, class] # mAP result: [num_class, num_diff, num_minoverlap] curcls_name = class_to_name[curcls] for i in range(min_overlaps.shape[0]): # prepare results for print result += ('{} AP40@{:.2f}, {:.2f}, {:.2f}:\n'.format( curcls_name, *min_overlaps[i, :, j])) if mAP40_bbox is not None: result += 'bbox AP40:{:.4f}, {:.4f}, {:.4f}\n'.format( *mAP40_bbox[j, :, i]) if mAP40_bev is not None: result += 'bev AP40:{:.4f}, {:.4f}, {:.4f}\n'.format( *mAP40_bev[j, :, i]) if mAP40_3d is not None: result += '3d AP40:{:.4f}, {:.4f}, {:.4f}\n'.format( *mAP40_3d[j, :, i]) if compute_aos: result += 'aos AP40:{:.2f}, {:.2f}, {:.2f}\n'.format( *mAP40_aos[j, :, i]) # prepare results for logger for idx in range(3): if i == 0: postfix = f'{difficulty[idx]}_strict' else: postfix = f'{difficulty[idx]}_loose' prefix = f'KITTI/{curcls_name}' if mAP40_3d is not None: ret_dict[f'{prefix}_3D_AP40_{postfix}'] =\ mAP40_3d[j, idx, i] if mAP40_bev is not None: ret_dict[f'{prefix}_BEV_AP40_{postfix}'] =\ mAP40_bev[j, idx, i] if mAP40_bbox is not None: ret_dict[f'{prefix}_2D_AP40_{postfix}'] =\ mAP40_bbox[j, idx, i] # calculate mAP40 over all classes if there are multiple classes if len(current_classes) > 1: # prepare results for print result += ('\nOverall AP40@{}, {}, {}:\n'.format(*difficulty)) if mAP40_bbox is not None: mAP40_bbox = mAP40_bbox.mean(axis=0) result += 'bbox AP40:{:.4f}, {:.4f}, {:.4f}\n'.format( *mAP40_bbox[:, 0]) if mAP40_bev is not None: mAP40_bev = mAP40_bev.mean(axis=0) result += 'bev AP40:{:.4f}, {:.4f}, {:.4f}\n'.format( *mAP40_bev[:, 0]) if mAP40_3d is not None: mAP40_3d = mAP40_3d.mean(axis=0) result += '3d AP40:{:.4f}, {:.4f}, {:.4f}\n'.format(*mAP40_3d[:, 0]) if compute_aos: mAP40_aos = mAP40_aos.mean(axis=0) result += 'aos AP40:{:.2f}, {:.2f}, {:.2f}\n'.format( *mAP40_aos[:, 0]) # prepare results for logger for idx in range(3): postfix = f'{difficulty[idx]}' if mAP40_3d is not None: ret_dict[f'KITTI/Overall_3D_AP40_{postfix}'] = mAP40_3d[idx, 0] if mAP40_bev is not None: ret_dict[f'KITTI/Overall_BEV_AP40_{postfix}'] =\ mAP40_bev[idx, 0] if mAP40_bbox is not None: ret_dict[f'KITTI/Overall_2D_AP40_{postfix}'] =\ mAP40_bbox[idx, 0] return result, ret_dict def kitti_eval_coco_style(gt_annos, dt_annos, current_classes): """coco style evaluation of kitti. Args: gt_annos (list[dict]): Contain gt information of each sample. dt_annos (list[dict]): Contain detected information of each sample. current_classes (list[str]): Classes to evaluation. Returns: string: Evaluation results. 
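    Example:
        A sketch of the call, with annotations again assumed to come from
        ``get_label_annos()``:

        >>> result_str = kitti_eval_coco_style(gt_annos, dt_annos, ['Car'])
        >>> # reports AP averaged over IoU thresholds 0.50:0.05:0.95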
""" class_to_name = { 0: 'Car', 1: 'Pedestrian', 2: 'Cyclist', 3: 'Van', 4: 'Person_sitting', } class_to_range = { 0: [0.5, 0.95, 10], 1: [0.25, 0.7, 10], 2: [0.25, 0.7, 10], 3: [0.5, 0.95, 10], 4: [0.25, 0.7, 10], } name_to_class = {v: n for n, v in class_to_name.items()} if not isinstance(current_classes, (list, tuple)): current_classes = [current_classes] current_classes_int = [] for curcls in current_classes: if isinstance(curcls, str): current_classes_int.append(name_to_class[curcls]) else: current_classes_int.append(curcls) current_classes = current_classes_int overlap_ranges = np.zeros([3, 3, len(current_classes)]) for i, curcls in enumerate(current_classes): overlap_ranges[:, :, i] = np.array(class_to_range[curcls])[:, np.newaxis] result = '' # check whether alpha is valid compute_aos = False for anno in dt_annos: if anno['alpha'].shape[0] != 0: if anno['alpha'][0] != -10: compute_aos = True break mAPbbox, mAPbev, mAP3d, mAPaos = do_coco_style_eval( gt_annos, dt_annos, current_classes, overlap_ranges, compute_aos) for j, curcls in enumerate(current_classes): # mAP threshold array: [num_minoverlap, metric, class] # mAP result: [num_class, num_diff, num_minoverlap] o_range = np.array(class_to_range[curcls])[[0, 2, 1]] o_range[1] = (o_range[2] - o_range[0]) / (o_range[1] - 1) result += print_str((f'{class_to_name[curcls]} ' 'coco AP@{:.2f}:{:.2f}:{:.2f}:'.format(*o_range))) result += print_str((f'bbox AP:{mAPbbox[j, 0]:.2f}, ' f'{mAPbbox[j, 1]:.2f}, ' f'{mAPbbox[j, 2]:.2f}')) result += print_str((f'bev AP:{mAPbev[j, 0]:.2f}, ' f'{mAPbev[j, 1]:.2f}, ' f'{mAPbev[j, 2]:.2f}')) result += print_str((f'3d AP:{mAP3d[j, 0]:.2f}, ' f'{mAP3d[j, 1]:.2f}, ' f'{mAP3d[j, 2]:.2f}')) if compute_aos: result += print_str((f'aos AP:{mAPaos[j, 0]:.2f}, ' f'{mAPaos[j, 1]:.2f}, ' f'{mAPaos[j, 2]:.2f}')) return result ================================================ FILE: mmdet3d/core/evaluation/kitti_utils/rotate_iou.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. 
##################### # Based on https://github.com/hongzhenwang/RRPN-revise # Licensed under The MIT License # Author: yanyan, scrin@foxmail.com ##################### import math import numba import numpy as np from numba import cuda @numba.jit(nopython=True) def div_up(m, n): return m // n + (m % n > 0) @cuda.jit(device=True, inline=True) def trangle_area(a, b, c): return ((a[0] - c[0]) * (b[1] - c[1]) - (a[1] - c[1]) * (b[0] - c[0])) / 2.0 @cuda.jit(device=True, inline=True) def area(int_pts, num_of_inter): area_val = 0.0 for i in range(num_of_inter - 2): area_val += abs( trangle_area(int_pts[:2], int_pts[2 * i + 2:2 * i + 4], int_pts[2 * i + 4:2 * i + 6])) return area_val @cuda.jit(device=True, inline=True) def sort_vertex_in_convex_polygon(int_pts, num_of_inter): if num_of_inter > 0: center = cuda.local.array((2, ), dtype=numba.float32) center[:] = 0.0 for i in range(num_of_inter): center[0] += int_pts[2 * i] center[1] += int_pts[2 * i + 1] center[0] /= num_of_inter center[1] /= num_of_inter v = cuda.local.array((2, ), dtype=numba.float32) vs = cuda.local.array((16, ), dtype=numba.float32) for i in range(num_of_inter): v[0] = int_pts[2 * i] - center[0] v[1] = int_pts[2 * i + 1] - center[1] d = math.sqrt(v[0] * v[0] + v[1] * v[1]) v[0] = v[0] / d v[1] = v[1] / d if v[1] < 0: v[0] = -2 - v[0] vs[i] = v[0] j = 0 temp = 0 for i in range(1, num_of_inter): if vs[i - 1] > vs[i]: temp = vs[i] tx = int_pts[2 * i] ty = int_pts[2 * i + 1] j = i while j > 0 and vs[j - 1] > temp: vs[j] = vs[j - 1] int_pts[j * 2] = int_pts[j * 2 - 2] int_pts[j * 2 + 1] = int_pts[j * 2 - 1] j -= 1 vs[j] = temp int_pts[j * 2] = tx int_pts[j * 2 + 1] = ty @cuda.jit(device=True, inline=True) def line_segment_intersection(pts1, pts2, i, j, temp_pts): A = cuda.local.array((2, ), dtype=numba.float32) B = cuda.local.array((2, ), dtype=numba.float32) C = cuda.local.array((2, ), dtype=numba.float32) D = cuda.local.array((2, ), dtype=numba.float32) A[0] = pts1[2 * i] A[1] = pts1[2 * i + 1] B[0] = pts1[2 * ((i + 1) % 4)] B[1] = pts1[2 * ((i + 1) % 4) + 1] C[0] = pts2[2 * j] C[1] = pts2[2 * j + 1] D[0] = pts2[2 * ((j + 1) % 4)] D[1] = pts2[2 * ((j + 1) % 4) + 1] BA0 = B[0] - A[0] BA1 = B[1] - A[1] DA0 = D[0] - A[0] CA0 = C[0] - A[0] DA1 = D[1] - A[1] CA1 = C[1] - A[1] acd = DA1 * CA0 > CA1 * DA0 bcd = (D[1] - B[1]) * (C[0] - B[0]) > (C[1] - B[1]) * (D[0] - B[0]) if acd != bcd: abc = CA1 * BA0 > BA1 * CA0 abd = DA1 * BA0 > BA1 * DA0 if abc != abd: DC0 = D[0] - C[0] DC1 = D[1] - C[1] ABBA = A[0] * B[1] - B[0] * A[1] CDDC = C[0] * D[1] - D[0] * C[1] DH = BA1 * DC0 - BA0 * DC1 Dx = ABBA * DC0 - BA0 * CDDC Dy = ABBA * DC1 - BA1 * CDDC temp_pts[0] = Dx / DH temp_pts[1] = Dy / DH return True return False @cuda.jit(device=True, inline=True) def line_segment_intersection_v1(pts1, pts2, i, j, temp_pts): a = cuda.local.array((2, ), dtype=numba.float32) b = cuda.local.array((2, ), dtype=numba.float32) c = cuda.local.array((2, ), dtype=numba.float32) d = cuda.local.array((2, ), dtype=numba.float32) a[0] = pts1[2 * i] a[1] = pts1[2 * i + 1] b[0] = pts1[2 * ((i + 1) % 4)] b[1] = pts1[2 * ((i + 1) % 4) + 1] c[0] = pts2[2 * j] c[1] = pts2[2 * j + 1] d[0] = pts2[2 * ((j + 1) % 4)] d[1] = pts2[2 * ((j + 1) % 4) + 1] area_abc = trangle_area(a, b, c) area_abd = trangle_area(a, b, d) if area_abc * area_abd >= 0: return False area_cda = trangle_area(c, d, a) area_cdb = area_cda + area_abc - area_abd if area_cda * area_cdb >= 0: return False t = area_cda / (area_abd - area_abc) dx = t * (b[0] - a[0]) dy = t * (b[1] - a[1]) temp_pts[0] = a[0] + dx 
temp_pts[1] = a[1] + dy return True @cuda.jit(device=True, inline=True) def point_in_quadrilateral(pt_x, pt_y, corners): ab0 = corners[2] - corners[0] ab1 = corners[3] - corners[1] ad0 = corners[6] - corners[0] ad1 = corners[7] - corners[1] ap0 = pt_x - corners[0] ap1 = pt_y - corners[1] abab = ab0 * ab0 + ab1 * ab1 abap = ab0 * ap0 + ab1 * ap1 adad = ad0 * ad0 + ad1 * ad1 adap = ad0 * ap0 + ad1 * ap1 return abab >= abap and abap >= 0 and adad >= adap and adap >= 0 @cuda.jit(device=True, inline=True) def quadrilateral_intersection(pts1, pts2, int_pts): num_of_inter = 0 for i in range(4): if point_in_quadrilateral(pts1[2 * i], pts1[2 * i + 1], pts2): int_pts[num_of_inter * 2] = pts1[2 * i] int_pts[num_of_inter * 2 + 1] = pts1[2 * i + 1] num_of_inter += 1 if point_in_quadrilateral(pts2[2 * i], pts2[2 * i + 1], pts1): int_pts[num_of_inter * 2] = pts2[2 * i] int_pts[num_of_inter * 2 + 1] = pts2[2 * i + 1] num_of_inter += 1 temp_pts = cuda.local.array((2, ), dtype=numba.float32) for i in range(4): for j in range(4): has_pts = line_segment_intersection(pts1, pts2, i, j, temp_pts) if has_pts: int_pts[num_of_inter * 2] = temp_pts[0] int_pts[num_of_inter * 2 + 1] = temp_pts[1] num_of_inter += 1 return num_of_inter @cuda.jit(device=True, inline=True) def rbbox_to_corners(corners, rbbox): # generate clockwise corners and rotate it clockwise angle = rbbox[4] a_cos = math.cos(angle) a_sin = math.sin(angle) center_x = rbbox[0] center_y = rbbox[1] x_d = rbbox[2] y_d = rbbox[3] corners_x = cuda.local.array((4, ), dtype=numba.float32) corners_y = cuda.local.array((4, ), dtype=numba.float32) corners_x[0] = -x_d / 2 corners_x[1] = -x_d / 2 corners_x[2] = x_d / 2 corners_x[3] = x_d / 2 corners_y[0] = -y_d / 2 corners_y[1] = y_d / 2 corners_y[2] = y_d / 2 corners_y[3] = -y_d / 2 for i in range(4): corners[2 * i] = a_cos * corners_x[i] + a_sin * corners_y[i] + center_x corners[2 * i + 1] = -a_sin * corners_x[i] + a_cos * corners_y[i] + center_y @cuda.jit(device=True, inline=True) def inter(rbbox1, rbbox2): """Compute intersection of two rotated boxes. Args: rbox1 (np.ndarray, shape=[5]): Rotated 2d box. rbox2 (np.ndarray, shape=[5]): Rotated 2d box. Returns: float: Intersection of two rotated boxes. """ corners1 = cuda.local.array((8, ), dtype=numba.float32) corners2 = cuda.local.array((8, ), dtype=numba.float32) intersection_corners = cuda.local.array((16, ), dtype=numba.float32) rbbox_to_corners(corners1, rbbox1) rbbox_to_corners(corners2, rbbox2) num_intersection = quadrilateral_intersection(corners1, corners2, intersection_corners) sort_vertex_in_convex_polygon(intersection_corners, num_intersection) # print(intersection_corners.reshape([-1, 2])[:num_intersection]) return area(intersection_corners, num_intersection) @cuda.jit(device=True, inline=True) def devRotateIoUEval(rbox1, rbox2, criterion=-1): """Compute rotated iou on device. Args: rbox1 (np.ndarray, shape=[5]): Rotated 2d box. rbox2 (np.ndarray, shape=[5]): Rotated 2d box. criterion (int, optional): Indicate different type of iou. -1 indicate `area_inter / (area1 + area2 - area_inter)`, 0 indicate `area_inter / area1`, 1 indicate `area_inter / area2`. Returns: float: iou between two input boxes. 
""" area1 = rbox1[2] * rbox1[3] area2 = rbox2[2] * rbox2[3] area_inter = inter(rbox1, rbox2) if criterion == -1: return area_inter / (area1 + area2 - area_inter) elif criterion == 0: return area_inter / area1 elif criterion == 1: return area_inter / area2 else: return area_inter @cuda.jit( '(int64, int64, float32[:], float32[:], float32[:], int32)', fastmath=False) def rotate_iou_kernel_eval(N, K, dev_boxes, dev_query_boxes, dev_iou, criterion=-1): """Kernel of computing rotated IoU. This function is for bev boxes in camera coordinate system ONLY (the rotation is clockwise). Args: N (int): The number of boxes. K (int): The number of query boxes. dev_boxes (np.ndarray): Boxes on device. dev_query_boxes (np.ndarray): Query boxes on device. dev_iou (np.ndarray): Computed iou to return. criterion (int, optional): Indicate different type of iou. -1 indicate `area_inter / (area1 + area2 - area_inter)`, 0 indicate `area_inter / area1`, 1 indicate `area_inter / area2`. """ threadsPerBlock = 8 * 8 row_start = cuda.blockIdx.x col_start = cuda.blockIdx.y tx = cuda.threadIdx.x row_size = min(N - row_start * threadsPerBlock, threadsPerBlock) col_size = min(K - col_start * threadsPerBlock, threadsPerBlock) block_boxes = cuda.shared.array(shape=(64 * 5, ), dtype=numba.float32) block_qboxes = cuda.shared.array(shape=(64 * 5, ), dtype=numba.float32) dev_query_box_idx = threadsPerBlock * col_start + tx dev_box_idx = threadsPerBlock * row_start + tx if (tx < col_size): block_qboxes[tx * 5 + 0] = dev_query_boxes[dev_query_box_idx * 5 + 0] block_qboxes[tx * 5 + 1] = dev_query_boxes[dev_query_box_idx * 5 + 1] block_qboxes[tx * 5 + 2] = dev_query_boxes[dev_query_box_idx * 5 + 2] block_qboxes[tx * 5 + 3] = dev_query_boxes[dev_query_box_idx * 5 + 3] block_qboxes[tx * 5 + 4] = dev_query_boxes[dev_query_box_idx * 5 + 4] if (tx < row_size): block_boxes[tx * 5 + 0] = dev_boxes[dev_box_idx * 5 + 0] block_boxes[tx * 5 + 1] = dev_boxes[dev_box_idx * 5 + 1] block_boxes[tx * 5 + 2] = dev_boxes[dev_box_idx * 5 + 2] block_boxes[tx * 5 + 3] = dev_boxes[dev_box_idx * 5 + 3] block_boxes[tx * 5 + 4] = dev_boxes[dev_box_idx * 5 + 4] cuda.syncthreads() if tx < row_size: for i in range(col_size): offset = ( row_start * threadsPerBlock * K + col_start * threadsPerBlock + tx * K + i) dev_iou[offset] = devRotateIoUEval(block_qboxes[i * 5:i * 5 + 5], block_boxes[tx * 5:tx * 5 + 5], criterion) def rotate_iou_gpu_eval(boxes, query_boxes, criterion=-1, device_id=0): """Rotated box iou running in gpu. 500x faster than cpu version (take 5ms in one example with numba.cuda code). convert from [this project]( https://github.com/hongzhenwang/RRPN-revise/tree/master/lib/rotation). This function is for bev boxes in camera coordinate system ONLY (the rotation is clockwise). Args: boxes (torch.Tensor): rbboxes. format: centers, dims, angles(clockwise when positive) with the shape of [N, 5]. query_boxes (torch.FloatTensor, shape=(K, 5)): rbboxes to compute iou with boxes. device_id (int, optional): Defaults to 0. Device to use. criterion (int, optional): Indicate different type of iou. -1 indicate `area_inter / (area1 + area2 - area_inter)`, 0 indicate `area_inter / area1`, 1 indicate `area_inter / area2`. Returns: np.ndarray: IoU results. 
""" boxes = boxes.astype(np.float32) query_boxes = query_boxes.astype(np.float32) N = boxes.shape[0] K = query_boxes.shape[0] iou = np.zeros((N, K), dtype=np.float32) if N == 0 or K == 0: return iou threadsPerBlock = 8 * 8 cuda.select_device(device_id) blockspergrid = (div_up(N, threadsPerBlock), div_up(K, threadsPerBlock)) stream = cuda.stream() with stream.auto_synchronize(): boxes_dev = cuda.to_device(boxes.reshape([-1]), stream) query_boxes_dev = cuda.to_device(query_boxes.reshape([-1]), stream) iou_dev = cuda.to_device(iou.reshape([-1]), stream) rotate_iou_kernel_eval[blockspergrid, threadsPerBlock, stream](N, K, boxes_dev, query_boxes_dev, iou_dev, criterion) iou_dev.copy_to_host(iou.reshape([-1]), stream=stream) return iou.astype(boxes.dtype) ================================================ FILE: mmdet3d/core/evaluation/lyft_eval.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. from os import path as osp import mmcv import numpy as np from lyft_dataset_sdk.eval.detection.mAP_evaluation import (Box3D, get_ap, get_class_names, get_ious, group_by_key, wrap_in_box) from mmcv.utils import print_log from terminaltables import AsciiTable def load_lyft_gts(lyft, data_root, eval_split, logger=None): """Loads ground truth boxes from database. Args: lyft (:obj:`LyftDataset`): Lyft class in the sdk. data_root (str): Root of data for reading splits. eval_split (str): Name of the split for evaluation. logger (logging.Logger | str, optional): Logger used for printing related information during evaluation. Default: None. Returns: list[dict]: List of annotation dictionaries. """ split_scenes = mmcv.list_from_file( osp.join(data_root, f'{eval_split}.txt')) # Read out all sample_tokens in DB. sample_tokens_all = [s['token'] for s in lyft.sample] assert len(sample_tokens_all) > 0, 'Error: Database has no samples!' if eval_split == 'test': # Check that you aren't trying to cheat :) assert len(lyft.sample_annotation) > 0, \ 'Error: You are trying to evaluate on the test set \ but you do not have the annotations!' sample_tokens = [] for sample_token in sample_tokens_all: scene_token = lyft.get('sample', sample_token)['scene_token'] scene_record = lyft.get('scene', scene_token) if scene_record['name'] in split_scenes: sample_tokens.append(sample_token) all_annotations = [] print_log('Loading ground truth annotations...', logger=logger) # Load annotations and filter predictions and annotations. for sample_token in mmcv.track_iter_progress(sample_tokens): sample = lyft.get('sample', sample_token) sample_annotation_tokens = sample['anns'] for sample_annotation_token in sample_annotation_tokens: # Get label name in detection task and filter unused labels. sample_annotation = \ lyft.get('sample_annotation', sample_annotation_token) detection_name = sample_annotation['category_name'] if detection_name is None: continue annotation = { 'sample_token': sample_token, 'translation': sample_annotation['translation'], 'size': sample_annotation['size'], 'rotation': sample_annotation['rotation'], 'name': detection_name, } all_annotations.append(annotation) return all_annotations def load_lyft_predictions(res_path): """Load Lyft predictions from json file. Args: res_path (str): Path of result json file recording detections. Returns: list[dict]: List of prediction dictionaries. 
""" predictions = mmcv.load(res_path) predictions = predictions['results'] all_preds = [] for sample_token in predictions.keys(): all_preds.extend(predictions[sample_token]) return all_preds def lyft_eval(lyft, data_root, res_path, eval_set, output_dir, logger=None): """Evaluation API for Lyft dataset. Args: lyft (:obj:`LyftDataset`): Lyft class in the sdk. data_root (str): Root of data for reading splits. res_path (str): Path of result json file recording detections. eval_set (str): Name of the split for evaluation. output_dir (str): Output directory for output json files. logger (logging.Logger | str, optional): Logger used for printing related information during evaluation. Default: None. Returns: dict[str, float]: The evaluation results. """ # evaluate by lyft metrics gts = load_lyft_gts(lyft, data_root, eval_set, logger) predictions = load_lyft_predictions(res_path) class_names = get_class_names(gts) print('Calculating mAP@0.5:0.95...') iou_thresholds = [0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95] metrics = {} average_precisions = \ get_classwise_aps(gts, predictions, class_names, iou_thresholds) APs_data = [['IOU', 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95]] mAPs = np.mean(average_precisions, axis=0) mAPs_cate = np.mean(average_precisions, axis=1) final_mAP = np.mean(mAPs) metrics['average_precisions'] = average_precisions.tolist() metrics['mAPs'] = mAPs.tolist() metrics['Final mAP'] = float(final_mAP) metrics['class_names'] = class_names metrics['mAPs_cate'] = mAPs_cate.tolist() APs_data = [['class', 'mAP@0.5:0.95']] for i in range(len(class_names)): row = [class_names[i], round(mAPs_cate[i], 3)] APs_data.append(row) APs_data.append(['Overall', round(final_mAP, 3)]) APs_table = AsciiTable(APs_data, title='mAPs@0.5:0.95') APs_table.inner_footing_row_border = True print_log(APs_table.table, logger=logger) res_path = osp.join(output_dir, 'lyft_metrics.json') mmcv.dump(metrics, res_path) return metrics def get_classwise_aps(gt, predictions, class_names, iou_thresholds): """Returns an array with an average precision per class. Note: Ground truth and predictions should have the following format. .. code-block:: gt = [{ 'sample_token': '0f0e3ce89d2324d8b45aa55a7b4f8207 fbb039a550991a5149214f98cec136ac', 'translation': [974.2811881299899, 1714.6815014457964, -23.689857123368846], 'size': [1.796, 4.488, 1.664], 'rotation': [0.14882026466054782, 0, 0, 0.9888642620837121], 'name': 'car' }] predictions = [{ 'sample_token': '0f0e3ce89d2324d8b45aa55a7b4f8207 fbb039a550991a5149214f98cec136ac', 'translation': [971.8343488872263, 1713.6816097857359, -25.82534357061308], 'size': [2.519726579986132, 7.810161372666739, 3.483438286096803], 'rotation': [0.10913582721095375, 0.04099572636992043, 0.01927712319721745, 1.029328402625659], 'name': 'car', 'score': 0.3077029437237213 }] Args: gt (list[dict]): list of dictionaries in the format described below. predictions (list[dict]): list of dictionaries in the format described below. class_names (list[str]): list of the class names. iou_thresholds (list[float]): IOU thresholds used to calculate TP / FN Returns: np.ndarray: an array with an average precision per class. 
""" assert all([0 <= iou_th <= 1 for iou_th in iou_thresholds]) gt_by_class_name = group_by_key(gt, 'name') pred_by_class_name = group_by_key(predictions, 'name') average_precisions = np.zeros((len(class_names), len(iou_thresholds))) for class_id, class_name in enumerate(class_names): if class_name in pred_by_class_name: recalls, precisions, average_precision = get_single_class_aps( gt_by_class_name[class_name], pred_by_class_name[class_name], iou_thresholds) average_precisions[class_id, :] = average_precision return average_precisions def get_single_class_aps(gt, predictions, iou_thresholds): """Compute recall and precision for all iou thresholds. Adapted from LyftDatasetDevkit. Args: gt (list[dict]): list of dictionaries in the format described above. predictions (list[dict]): list of dictionaries in the format described below. iou_thresholds (list[float]): IOU thresholds used to calculate TP / FN Returns: tuple[np.ndarray]: Returns (recalls, precisions, average precisions) for each class. """ num_gts = len(gt) image_gts = group_by_key(gt, 'sample_token') image_gts = wrap_in_box(image_gts) sample_gt_checked = { sample_token: np.zeros((len(boxes), len(iou_thresholds))) for sample_token, boxes in image_gts.items() } predictions = sorted(predictions, key=lambda x: x['score'], reverse=True) # go down dets and mark TPs and FPs num_predictions = len(predictions) tps = np.zeros((num_predictions, len(iou_thresholds))) fps = np.zeros((num_predictions, len(iou_thresholds))) for prediction_index, prediction in enumerate(predictions): predicted_box = Box3D(**prediction) sample_token = prediction['sample_token'] max_overlap = -np.inf jmax = -1 if sample_token in image_gts: gt_boxes = image_gts[sample_token] # gt_boxes per sample gt_checked = sample_gt_checked[sample_token] # gt flags per sample else: gt_boxes = [] gt_checked = None if len(gt_boxes) > 0: overlaps = get_ious(gt_boxes, predicted_box) max_overlap = np.max(overlaps) jmax = np.argmax(overlaps) for i, iou_threshold in enumerate(iou_thresholds): if max_overlap > iou_threshold: if gt_checked[jmax, i] == 0: tps[prediction_index, i] = 1.0 gt_checked[jmax, i] = 1 else: fps[prediction_index, i] = 1.0 else: fps[prediction_index, i] = 1.0 # compute precision recall fps = np.cumsum(fps, axis=0) tps = np.cumsum(tps, axis=0) recalls = tps / float(num_gts) # avoid divide by zero in case the first detection # matches a difficult ground truth precisions = tps / np.maximum(tps + fps, np.finfo(np.float64).eps) aps = [] for i in range(len(iou_thresholds)): recall = recalls[:, i] precision = precisions[:, i] assert np.all(0 <= recall) & np.all(recall <= 1) assert np.all(0 <= precision) & np.all(precision <= 1) ap = get_ap(recall, precision) aps.append(ap) aps = np.array(aps) return recalls, precisions, aps ================================================ FILE: mmdet3d/core/evaluation/scannet_utils/__init__.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. from .evaluate_semantic_instance import evaluate_matches, scannet_eval __all__ = ['scannet_eval', 'evaluate_matches'] ================================================ FILE: mmdet3d/core/evaluation/scannet_utils/evaluate_semantic_instance.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. # adapted from https://github.com/ScanNet/ScanNet/blob/master/BenchmarkScripts/3d_evaluation/evaluate_semantic_instance.py # noqa from copy import deepcopy import numpy as np from . 
import util_3d def evaluate_matches(matches, class_labels, options): """Evaluate instance segmentation from matched gt and predicted instances for all scenes. Args: matches (dict): Contains gt2pred and pred2gt infos for every scene. class_labels (tuple[str]): Class names. options (dict): ScanNet evaluator options. See get_options. Returns: np.array: Average precision scores for all thresholds and categories. """ overlaps = options['overlaps'] min_region_sizes = [options['min_region_sizes'][0]] dist_threshes = [options['distance_threshes'][0]] dist_confs = [options['distance_confs'][0]] # results: class x overlap ap = np.zeros((len(dist_threshes), len(class_labels), len(overlaps)), np.float) for di, (min_region_size, distance_thresh, distance_conf) in enumerate( zip(min_region_sizes, dist_threshes, dist_confs)): for oi, overlap_th in enumerate(overlaps): pred_visited = {} for m in matches: for label_name in class_labels: for p in matches[m]['pred'][label_name]: if 'filename' in p: pred_visited[p['filename']] = False for li, label_name in enumerate(class_labels): y_true = np.empty(0) y_score = np.empty(0) hard_false_negatives = 0 has_gt = False has_pred = False for m in matches: pred_instances = matches[m]['pred'][label_name] gt_instances = matches[m]['gt'][label_name] # filter groups in ground truth gt_instances = [ gt for gt in gt_instances if gt['instance_id'] >= 1000 and gt['vert_count'] >= min_region_size and gt['med_dist'] <= distance_thresh and gt['dist_conf'] >= distance_conf ] if gt_instances: has_gt = True if pred_instances: has_pred = True cur_true = np.ones(len(gt_instances)) cur_score = np.ones(len(gt_instances)) * (-float('inf')) cur_match = np.zeros(len(gt_instances), dtype=np.bool) # collect matches for (gti, gt) in enumerate(gt_instances): found_match = False for pred in gt['matched_pred']: # greedy assignments if pred_visited[pred['filename']]: continue overlap = float(pred['intersection']) / ( gt['vert_count'] + pred['vert_count'] - pred['intersection']) if overlap > overlap_th: confidence = pred['confidence'] # if already have a prediction for this gt, # the prediction with the lower score is automatically a false positive # noqa if cur_match[gti]: max_score = max(cur_score[gti], confidence) min_score = min(cur_score[gti], confidence) cur_score[gti] = max_score # append false positive cur_true = np.append(cur_true, 0) cur_score = np.append(cur_score, min_score) cur_match = np.append(cur_match, True) # otherwise set score else: found_match = True cur_match[gti] = True cur_score[gti] = confidence pred_visited[pred['filename']] = True if not found_match: hard_false_negatives += 1 # remove non-matched ground truth instances cur_true = cur_true[cur_match] cur_score = cur_score[cur_match] # collect non-matched predictions as false positive for pred in pred_instances: found_gt = False for gt in pred['matched_gt']: overlap = float(gt['intersection']) / ( gt['vert_count'] + pred['vert_count'] - gt['intersection']) if overlap > overlap_th: found_gt = True break if not found_gt: num_ignore = pred['void_intersection'] for gt in pred['matched_gt']: # group? 
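# gt ids below 1000 come from rename_gt(): instances whose semantic class is
# not among valid_class_ids keep their raw id, so their overlap is added to
# the ignored area instead of being counted against the prediction as a FP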
if gt['instance_id'] < 1000: num_ignore += gt['intersection'] # small ground truth instances if gt['vert_count'] < min_region_size or gt[ 'med_dist'] > distance_thresh or gt[ 'dist_conf'] < distance_conf: num_ignore += gt['intersection'] proportion_ignore = float( num_ignore) / pred['vert_count'] # if not ignored append false positive if proportion_ignore <= overlap_th: cur_true = np.append(cur_true, 0) confidence = pred['confidence'] cur_score = np.append(cur_score, confidence) # append to overall results y_true = np.append(y_true, cur_true) y_score = np.append(y_score, cur_score) # compute average precision if has_gt and has_pred: # compute precision recall curve first # sorting and cumsum score_arg_sort = np.argsort(y_score) y_score_sorted = y_score[score_arg_sort] y_true_sorted = y_true[score_arg_sort] y_true_sorted_cumsum = np.cumsum(y_true_sorted) # unique thresholds (thresholds, unique_indices) = np.unique( y_score_sorted, return_index=True) num_prec_recall = len(unique_indices) + 1 # prepare precision recall num_examples = len(y_score_sorted) # follow https://github.com/ScanNet/ScanNet/pull/26 ? # noqa num_true_examples = y_true_sorted_cumsum[-1] if len( y_true_sorted_cumsum) > 0 else 0 precision = np.zeros(num_prec_recall) recall = np.zeros(num_prec_recall) # deal with the first point y_true_sorted_cumsum = np.append(y_true_sorted_cumsum, 0) # deal with remaining for idx_res, idx_scores in enumerate(unique_indices): cumsum = y_true_sorted_cumsum[idx_scores - 1] tp = num_true_examples - cumsum fp = num_examples - idx_scores - tp fn = cumsum + hard_false_negatives p = float(tp) / (tp + fp) r = float(tp) / (tp + fn) precision[idx_res] = p recall[idx_res] = r # first point in curve is artificial precision[-1] = 1. recall[-1] = 0. # compute average of precision-recall curve recall_for_conv = np.copy(recall) recall_for_conv = np.append(recall_for_conv[0], recall_for_conv) recall_for_conv = np.append(recall_for_conv, 0.) stepWidths = np.convolve(recall_for_conv, [-0.5, 0, 0.5], 'valid') # integrate is now simply a dot product ap_current = np.dot(precision, stepWidths) elif has_gt: ap_current = 0.0 else: ap_current = float('nan') ap[di, li, oi] = ap_current return ap def compute_averages(aps, options, class_labels): """Averages AP scores for all categories. Args: aps (np.array): AP scores for all thresholds and categories. options (dict): ScanNet evaluator options. See get_options. class_labels (tuple[str]): Class names. Returns: dict: Overall and per-category AP scores. """ d_inf = 0 o50 = np.where(np.isclose(options['overlaps'], 0.5)) o25 = np.where(np.isclose(options['overlaps'], 0.25)) o_all_but25 = np.where( np.logical_not(np.isclose(options['overlaps'], 0.25))) avg_dict = {} avg_dict['all_ap'] = np.nanmean(aps[d_inf, :, o_all_but25]) avg_dict['all_ap_50%'] = np.nanmean(aps[d_inf, :, o50]) avg_dict['all_ap_25%'] = np.nanmean(aps[d_inf, :, o25]) avg_dict['classes'] = {} for (li, label_name) in enumerate(class_labels): avg_dict['classes'][label_name] = {} avg_dict['classes'][label_name]['ap'] = np.average(aps[d_inf, li, o_all_but25]) avg_dict['classes'][label_name]['ap50%'] = np.average(aps[d_inf, li, o50]) avg_dict['classes'][label_name]['ap25%'] = np.average(aps[d_inf, li, o25]) return avg_dict def assign_instances_for_scan(pred_info, gt_ids, options, valid_class_ids, class_labels, id_to_label): """Assign gt and predicted instances for a single scene. Args: pred_info (dict): Predicted masks, labels and scores. gt_ids (np.array): Ground truth instance masks. 
options (dict): ScanNet evaluator options. See get_options. valid_class_ids (tuple[int]): Ids of valid categories. class_labels (tuple[str]): Class names. id_to_label (dict[int, str]): Mapping of valid class id to class label. Returns: dict: Per class assigned gt to predicted instances. dict: Per class assigned predicted to gt instances. """ # get gt instances gt_instances = util_3d.get_instances(gt_ids, valid_class_ids, class_labels, id_to_label) # associate gt2pred = deepcopy(gt_instances) for label in gt2pred: for gt in gt2pred[label]: gt['matched_pred'] = [] pred2gt = {} for label in class_labels: pred2gt[label] = [] num_pred_instances = 0 # mask of void labels in the ground truth bool_void = np.logical_not(np.in1d(gt_ids // 1000, valid_class_ids)) # go through all prediction masks for pred_mask_file in pred_info: label_id = int(pred_info[pred_mask_file]['label_id']) conf = pred_info[pred_mask_file]['conf'] if not label_id in id_to_label: # noqa E713 continue label_name = id_to_label[label_id] # read the mask pred_mask = pred_info[pred_mask_file]['mask'] if len(pred_mask) != len(gt_ids): raise ValueError('len(pred_mask) != len(gt_ids)') # convert to binary pred_mask = np.not_equal(pred_mask, 0) num = np.count_nonzero(pred_mask) if num < options['min_region_sizes'][0]: continue # skip if empty pred_instance = {} pred_instance['filename'] = pred_mask_file pred_instance['pred_id'] = num_pred_instances pred_instance['label_id'] = label_id pred_instance['vert_count'] = num pred_instance['confidence'] = conf pred_instance['void_intersection'] = np.count_nonzero( np.logical_and(bool_void, pred_mask)) # matched gt instances matched_gt = [] # go through all gt instances with matching label for (gt_num, gt_inst) in enumerate(gt2pred[label_name]): intersection = np.count_nonzero( np.logical_and(gt_ids == gt_inst['instance_id'], pred_mask)) if intersection > 0: gt_copy = gt_inst.copy() pred_copy = pred_instance.copy() gt_copy['intersection'] = intersection pred_copy['intersection'] = intersection matched_gt.append(gt_copy) gt2pred[label_name][gt_num]['matched_pred'].append(pred_copy) pred_instance['matched_gt'] = matched_gt num_pred_instances += 1 pred2gt[label_name].append(pred_instance) return gt2pred, pred2gt def scannet_eval(preds, gts, options, valid_class_ids, class_labels, id_to_label): """Evaluate instance segmentation in ScanNet protocol. Args: preds (list[dict]): Per scene predictions of mask, label and confidence. gts (list[np.array]): Per scene ground truth instance masks. options (dict): ScanNet evaluator options. See get_options. valid_class_ids (tuple[int]): Ids of valid categories. class_labels (tuple[str]): Class names. id_to_label (dict[int, str]): Mapping of valid class id to class label. Returns: dict: Overall and per-category AP scores. """ options = get_options(options) matches = {} for i, (pred, gt) in enumerate(zip(preds, gts)): matches_key = i # assign gt to predictions gt2pred, pred2gt = assign_instances_for_scan(pred, gt, options, valid_class_ids, class_labels, id_to_label) matches[matches_key] = {} matches[matches_key]['gt'] = gt2pred matches[matches_key]['pred'] = pred2gt ap_scores = evaluate_matches(matches, class_labels, options) avgs = compute_averages(ap_scores, options, class_labels) return avgs def get_options(options=None): """Set ScanNet evaluator options. Args: options (dict, optional): Not default options. Default: None. Returns: dict: Updated options with all 4 keys. 
""" assert options is None or isinstance(options, dict) _options = dict( overlaps=np.append(np.arange(0.5, 0.95, 0.05), 0.25), min_region_sizes=np.array([100]), distance_threshes=np.array([float('inf')]), distance_confs=np.array([-float('inf')])) if options is not None: _options.update(options) return _options ================================================ FILE: mmdet3d/core/evaluation/scannet_utils/util_3d.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. # adapted from https://github.com/ScanNet/ScanNet/blob/master/BenchmarkScripts/util_3d.py # noqa import json import numpy as np class Instance: """Single instance for ScanNet evaluator. Args: mesh_vert_instances (np.array): Instance ids for each point. instance_id: Id of single instance. """ instance_id = 0 label_id = 0 vert_count = 0 med_dist = -1 dist_conf = 0.0 def __init__(self, mesh_vert_instances, instance_id): if instance_id == -1: return self.instance_id = int(instance_id) self.label_id = int(self.get_label_id(instance_id)) self.vert_count = int( self.get_instance_verts(mesh_vert_instances, instance_id)) @staticmethod def get_label_id(instance_id): return int(instance_id // 1000) @staticmethod def get_instance_verts(mesh_vert_instances, instance_id): return (mesh_vert_instances == instance_id).sum() def to_json(self): return json.dumps( self, default=lambda o: o.__dict__, sort_keys=True, indent=4) def to_dict(self): dict = {} dict['instance_id'] = self.instance_id dict['label_id'] = self.label_id dict['vert_count'] = self.vert_count dict['med_dist'] = self.med_dist dict['dist_conf'] = self.dist_conf return dict def from_json(self, data): self.instance_id = int(data['instance_id']) self.label_id = int(data['label_id']) self.vert_count = int(data['vert_count']) if 'med_dist' in data: self.med_dist = float(data['med_dist']) self.dist_conf = float(data['dist_conf']) def __str__(self): return '(' + str(self.instance_id) + ')' def get_instances(ids, class_ids, class_labels, id2label): """Transform gt instance mask to Instance objects. Args: ids (np.array): Instance ids for each point. class_ids: (tuple[int]): Ids of valid categories. class_labels (tuple[str]): Class names. id2label: (dict[int, str]): Mapping of valid class id to class label. Returns: dict [str, list]: Instance objects grouped by class label. """ instances = {} for label in class_labels: instances[label] = [] instance_ids = np.unique(ids) for id in instance_ids: if id == 0: continue inst = Instance(ids, id) if inst.label_id in class_ids: instances[id2label[inst.label_id]].append(inst.to_dict()) return instances ================================================ FILE: mmdet3d/core/evaluation/seg_eval.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import numpy as np from mmcv.utils import print_log from terminaltables import AsciiTable def fast_hist(preds, labels, num_classes): """Compute the confusion matrix for every batch. Args: preds (np.ndarray): Prediction labels of points with shape of (num_points, ). labels (np.ndarray): Ground truth labels of points with shape of (num_points, ). num_classes (int): number of classes Returns: np.ndarray: Calculated confusion matrix. """ k = (labels >= 0) & (labels < num_classes) bin_count = np.bincount( num_classes * labels[k].astype(int) + preds[k], minlength=num_classes**2) return bin_count[:num_classes**2].reshape(num_classes, num_classes) def per_class_iou(hist): """Compute the per class iou. 
Args: hist(np.ndarray): Overall confusion martix (num_classes, num_classes ). Returns: np.ndarray: Calculated per class iou """ return np.diag(hist) / (hist.sum(1) + hist.sum(0) - np.diag(hist)) def get_acc(hist): """Compute the overall accuracy. Args: hist(np.ndarray): Overall confusion martix (num_classes, num_classes ). Returns: float: Calculated overall acc """ return np.diag(hist).sum() / hist.sum() def get_acc_cls(hist): """Compute the class average accuracy. Args: hist(np.ndarray): Overall confusion martix (num_classes, num_classes ). Returns: float: Calculated class average acc """ return np.nanmean(np.diag(hist) / hist.sum(axis=1)) def seg_eval(gt_labels, seg_preds, label2cat, ignore_index, logger=None): """Semantic Segmentation Evaluation. Evaluate the result of the Semantic Segmentation. Args: gt_labels (list[torch.Tensor]): Ground truth labels. seg_preds (list[torch.Tensor]): Predictions. label2cat (dict): Map from label to category name. ignore_index (int): Index that will be ignored in evaluation. logger (logging.Logger | str, optional): The way to print the mAP summary. See `mmdet.utils.print_log()` for details. Default: None. Returns: dict[str, float]: Dict of results. """ assert len(seg_preds) == len(gt_labels) num_classes = len(label2cat) hist_list = [] for i in range(len(gt_labels)): gt_seg = gt_labels[i].clone().numpy().astype(np.int) pred_seg = seg_preds[i].clone().numpy().astype(np.int) # filter out ignored points pred_seg[gt_seg == ignore_index] = -1 gt_seg[gt_seg == ignore_index] = -1 # calculate one instance result hist_list.append(fast_hist(pred_seg, gt_seg, num_classes)) iou = per_class_iou(sum(hist_list)) miou = np.nanmean(iou) acc = get_acc(sum(hist_list)) acc_cls = get_acc_cls(sum(hist_list)) header = ['classes'] for i in range(len(label2cat)): header.append(label2cat[i]) header.extend(['miou', 'acc', 'acc_cls']) ret_dict = dict() table_columns = [['results']] for i in range(len(label2cat)): ret_dict[label2cat[i]] = float(iou[i]) table_columns.append([f'{iou[i]:.4f}']) ret_dict['miou'] = float(miou) ret_dict['acc'] = float(acc) ret_dict['acc_cls'] = float(acc_cls) table_columns.append([f'{miou:.4f}']) table_columns.append([f'{acc:.4f}']) table_columns.append([f'{acc_cls:.4f}']) table_data = [header] table_rows = list(zip(*table_columns)) table_data += table_rows table = AsciiTable(table_data) table.inner_footing_row_border = True print_log('\n' + table.table, logger=logger) return ret_dict ================================================ FILE: mmdet3d/core/evaluation/waymo_utils/__init__.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. from .prediction_kitti_to_waymo import KITTI2Waymo __all__ = ['KITTI2Waymo'] ================================================ FILE: mmdet3d/core/evaluation/waymo_utils/prediction_kitti_to_waymo.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. r"""Adapted from `Waymo to KITTI converter `_. """ try: from waymo_open_dataset import dataset_pb2 as open_dataset except ImportError: raise ImportError( 'Please run "pip install waymo-open-dataset-tf-2-1-0==1.2.0" ' 'to install the official devkit first.') from glob import glob from os.path import join import mmcv import numpy as np import tensorflow as tf from waymo_open_dataset import label_pb2 from waymo_open_dataset.protos import metrics_pb2 class KITTI2Waymo(object): """KITTI predictions to Waymo converter. 
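
The seg_eval pipeline above boils down to a confusion matrix built with np.bincount and a diagonal-based IoU. A self-contained NumPy sketch with toy labels (not from the repository) that reproduces the fast_hist / per_class_iou arithmetic:

import numpy as np

num_classes = 3
labels = np.array([0, 0, 1, 1, 2, 2])   # toy ground truth
preds = np.array([0, 1, 1, 1, 2, 0])    # toy predictions

# Same trick as fast_hist: flatten (gt, pred) pairs into a single index,
# count them, then reshape into a (num_classes, num_classes) matrix.
k = (labels >= 0) & (labels < num_classes)
hist = np.bincount(
    num_classes * labels[k].astype(int) + preds[k],
    minlength=num_classes**2).reshape(num_classes, num_classes)

# per_class_iou: TP / (TP + FP + FN) for every class.
iou = np.diag(hist) / (hist.sum(1) + hist.sum(0) - np.diag(hist))
print(hist)   # rows: ground truth, columns: prediction
print(iou)    # e.g. class 1 -> 2 / (2 + 1 + 0) = 0.667
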
This class serves as the converter to change predictions from KITTI to Waymo format. Args: kitti_result_files (list[dict]): Predictions in KITTI format. waymo_tfrecords_dir (str): Directory to load waymo raw data. waymo_results_save_dir (str): Directory to save converted predictions in waymo format (.bin files). waymo_results_final_path (str): Path to save combined predictions in waymo format (.bin file), like 'a/b/c.bin'. prefix (str): Prefix of filename. In general, 0 for training, 1 for validation and 2 for testing. workers (str): Number of parallel processes. """ def __init__(self, kitti_result_files, waymo_tfrecords_dir, waymo_results_save_dir, waymo_results_final_path, prefix, workers=64): self.kitti_result_files = kitti_result_files self.waymo_tfrecords_dir = waymo_tfrecords_dir self.waymo_results_save_dir = waymo_results_save_dir self.waymo_results_final_path = waymo_results_final_path self.prefix = prefix self.workers = int(workers) self.name2idx = {} for idx, result in enumerate(kitti_result_files): if len(result['sample_idx']) > 0: self.name2idx[str(result['sample_idx'][0])] = idx # turn on eager execution for older tensorflow versions if int(tf.__version__.split('.')[0]) < 2: tf.enable_eager_execution() self.k2w_cls_map = { 'Car': label_pb2.Label.TYPE_VEHICLE, 'Pedestrian': label_pb2.Label.TYPE_PEDESTRIAN, 'Sign': label_pb2.Label.TYPE_SIGN, 'Cyclist': label_pb2.Label.TYPE_CYCLIST, } self.T_ref_to_front_cam = np.array([[0.0, 0.0, 1.0, 0.0], [-1.0, 0.0, 0.0, 0.0], [0.0, -1.0, 0.0, 0.0], [0.0, 0.0, 0.0, 1.0]]) self.get_file_names() self.create_folder() def get_file_names(self): """Get file names of waymo raw data.""" self.waymo_tfrecord_pathnames = sorted( glob(join(self.waymo_tfrecords_dir, '*.tfrecord'))) print(len(self.waymo_tfrecord_pathnames), 'tfrecords found.') def create_folder(self): """Create folder for data conversion.""" mmcv.mkdir_or_exist(self.waymo_results_save_dir) def parse_objects(self, kitti_result, T_k2w, context_name, frame_timestamp_micros): """Parse one prediction with several instances in kitti format and convert them to `Object` proto. Args: kitti_result (dict): Predictions in kitti format. - name (np.ndarray): Class labels of predictions. - dimensions (np.ndarray): Height, width, length of boxes. - location (np.ndarray): Bottom center of boxes (x, y, z). - rotation_y (np.ndarray): Orientation of boxes. - score (np.ndarray): Scores of predictions. T_k2w (np.ndarray): Transformation matrix from kitti to waymo. context_name (str): Context name of the frame. frame_timestamp_micros (int): Frame timestamp. Returns: :obj:`Object`: Predictions in waymo dataset Object proto. """ def parse_one_object(instance_idx): """Parse one instance in kitti format and convert them to `Object` proto. Args: instance_idx (int): Index of the instance to be converted. Returns: :obj:`Object`: Predicted instance in waymo dataset Object proto. 
""" cls = kitti_result['name'][instance_idx] length = round(kitti_result['dimensions'][instance_idx, 0], 4) height = round(kitti_result['dimensions'][instance_idx, 1], 4) width = round(kitti_result['dimensions'][instance_idx, 2], 4) x = round(kitti_result['location'][instance_idx, 0], 4) y = round(kitti_result['location'][instance_idx, 1], 4) z = round(kitti_result['location'][instance_idx, 2], 4) rotation_y = round(kitti_result['rotation_y'][instance_idx], 4) score = round(kitti_result['score'][instance_idx], 4) # y: downwards; move box origin from bottom center (kitti) to # true center (waymo) y -= height / 2 # frame transformation: kitti -> waymo x, y, z = self.transform(T_k2w, x, y, z) # different conventions heading = -(rotation_y + np.pi / 2) while heading < -np.pi: heading += 2 * np.pi while heading > np.pi: heading -= 2 * np.pi box = label_pb2.Label.Box() box.center_x = x box.center_y = y box.center_z = z box.length = length box.width = width box.height = height box.heading = heading o = metrics_pb2.Object() o.object.box.CopyFrom(box) o.object.type = self.k2w_cls_map[cls] o.score = score o.context_name = context_name o.frame_timestamp_micros = frame_timestamp_micros return o objects = metrics_pb2.Objects() for instance_idx in range(len(kitti_result['name'])): o = parse_one_object(instance_idx) objects.objects.append(o) return objects def convert_one(self, file_idx): """Convert action for single file. Args: file_idx (int): Index of the file to be converted. """ file_pathname = self.waymo_tfrecord_pathnames[file_idx] file_data = tf.data.TFRecordDataset(file_pathname, compression_type='') for frame_num, frame_data in enumerate(file_data): frame = open_dataset.Frame() frame.ParseFromString(bytearray(frame_data.numpy())) filename = f'{self.prefix}{file_idx:03d}{frame_num:03d}' for camera in frame.context.camera_calibrations: # FRONT = 1, see dataset.proto for details if camera.name == 1: T_front_cam_to_vehicle = np.array( camera.extrinsic.transform).reshape(4, 4) T_k2w = T_front_cam_to_vehicle @ self.T_ref_to_front_cam context_name = frame.context.name frame_timestamp_micros = frame.timestamp_micros if filename in self.name2idx: kitti_result = \ self.kitti_result_files[self.name2idx[filename]] objects = self.parse_objects(kitti_result, T_k2w, context_name, frame_timestamp_micros) else: print(filename, 'not found.') objects = metrics_pb2.Objects() with open( join(self.waymo_results_save_dir, f'{filename}.bin'), 'wb') as f: f.write(objects.SerializeToString()) def convert(self): """Convert action.""" print('Start converting ...') mmcv.track_parallel_progress(self.convert_one, range(len(self)), self.workers) print('\nFinished ...') # combine all files into one .bin pathnames = sorted(glob(join(self.waymo_results_save_dir, '*.bin'))) combined = self.combine(pathnames) with open(self.waymo_results_final_path, 'wb') as f: f.write(combined.SerializeToString()) def __len__(self): """Length of the filename list.""" return len(self.waymo_tfrecord_pathnames) def transform(self, T, x, y, z): """Transform the coordinates with matrix T. Args: T (np.ndarray): Transformation matrix. x(float): Coordinate in x axis. y(float): Coordinate in y axis. z(float): Coordinate in z axis. Returns: list: Coordinates after transformation. """ pt_bef = np.array([x, y, z, 1.0]).reshape(4, 1) pt_aft = np.matmul(T, pt_bef) return pt_aft[:3].flatten().tolist() def combine(self, pathnames): """Combine predictions in waymo format for each sample together. Args: pathnames (str): Paths to save predictions. 
Returns: :obj:`Objects`: Combined predictions in Objects proto. """ combined = metrics_pb2.Objects() for pathname in pathnames: objects = metrics_pb2.Objects() with open(pathname, 'rb') as f: objects.ParseFromString(f.read()) for o in objects.objects: combined.objects.append(o) return combined ================================================ FILE: mmdet3d/core/hook/__init__.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. from .ema import MEGVIIEMAHook from .utils import is_parallel from .sequentialsontrol import SequentialControlHook __all__ = ['MEGVIIEMAHook', 'is_parallel', 'SequentialControlHook'] ================================================ FILE: mmdet3d/core/hook/ema.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. # modified from megvii-bevdepth. import math import os from copy import deepcopy import torch from mmcv.runner import load_state_dict from mmcv.runner.dist_utils import master_only from mmcv.runner.hooks import HOOKS, Hook from mmdet3d.core.hook.utils import is_parallel __all__ = ['ModelEMA'] class ModelEMA: """Model Exponential Moving Average from https://github.com/rwightman/ pytorch-image-models Keep a moving average of everything in the model state_dict (parameters and buffers). This is intended to allow functionality like https://www.tensorflow.org/api_docs/python/tf/train/ ExponentialMovingAverage A smoothed version of the weights is necessary for some training schemes to perform well. This class is sensitive where it is initialized in the sequence of model init, GPU assignment and distributed training wrappers. """ def __init__(self, model, decay=0.9999, updates=0): """ Args: model (nn.Module): model to apply EMA. decay (float): ema decay reate. updates (int): counter of EMA updates. """ # Create EMA(FP32) self.ema_model = deepcopy(model).eval() self.ema = self.ema_model.module.module if is_parallel( self.ema_model.module) else self.ema_model.module self.updates = updates # decay exponential ramp (to help early epochs) self.decay = lambda x: decay * (1 - math.exp(-x / 2000)) for p in self.ema.parameters(): p.requires_grad_(False) def update(self, trainer, model): # Update EMA parameters with torch.no_grad(): self.updates += 1 d = self.decay(self.updates) msd = model.module.state_dict() if is_parallel( model) else model.state_dict() # model state_dict for k, v in self.ema.state_dict().items(): if v.dtype.is_floating_point: v *= d v += (1.0 - d) * msd[k].detach() @HOOKS.register_module() class MEGVIIEMAHook(Hook): """EMAHook used in BEVDepth. Modified from https://github.com/Megvii-Base Detection/BEVDepth/blob/main/callbacks/ema.py. 
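
The decay used by ModelEMA above is not constant: it ramps from roughly zero towards the configured base value, so the EMA weights can track the model quickly during early iterations. A quick numeric sketch of the ramp and of one scalar update step (toy values only):

import math

base_decay = 0.9999

def ramped_decay(updates):
    # Same ramp as ModelEMA: tiny decay early on, converging to base_decay.
    return base_decay * (1 - math.exp(-updates / 2000))

for updates in (1, 100, 2000, 20000):
    print(updates, round(ramped_decay(updates), 6))
# 1 -> ~0.0005, 100 -> ~0.0488, 2000 -> ~0.632, 20000 -> ~0.99985

# One EMA step for a single scalar weight, mirroring ModelEMA.update:
ema_w, model_w, d = 1.0, 0.0, ramped_decay(100)
ema_w = d * ema_w + (1 - d) * model_w
print(ema_w)   # early on the EMA mostly follows the current model weight
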
""" def __init__(self, init_updates=0, decay=0.9990, resume=None, interval=-1): super().__init__() self.init_updates = init_updates self.resume = resume self.decay = decay self.interval = interval def before_run(self, runner): from torch.nn.modules.batchnorm import SyncBatchNorm bn_model_list = list() bn_model_dist_group_list = list() for model_ref in runner.model.modules(): if isinstance(model_ref, SyncBatchNorm): bn_model_list.append(model_ref) bn_model_dist_group_list.append(model_ref.process_group) model_ref.process_group = None runner.ema_model = ModelEMA(runner.model, self.decay) for bn_model, dist_group in zip(bn_model_list, bn_model_dist_group_list): bn_model.process_group = dist_group runner.ema_model.updates = self.init_updates if self.resume is not None: runner.logger.info(f'resume ema checkpoint from {self.resume}') cpt = torch.load(self.resume, map_location='cpu') load_state_dict(runner.ema_model.ema, cpt['state_dict']) runner.ema_model.updates = cpt['updates'] def after_train_iter(self, runner): runner.ema_model.update(runner, runner.model.module) curr_step = runner.iter if self.interval>0: if curr_step % self.interval==0 and curr_step>0: self.save_checkpoint_iter(runner) def after_train_epoch(self, runner): self.save_checkpoint(runner) def after_run(self, runner): self.save_checkpoint_iter(runner) @master_only def save_checkpoint(self, runner): state_dict = runner.ema_model.ema.state_dict() ema_checkpoint = { 'epoch': runner.epoch, 'state_dict': state_dict, 'updates': runner.ema_model.updates } save_path = f'epoch_{runner.epoch+1}_ema.pth' save_path = os.path.join(runner.work_dir, save_path) torch.save(ema_checkpoint, save_path) runner.logger.info(f'Saving ema checkpoint at {save_path}') @master_only def save_checkpoint_iter(self, runner): state_dict = runner.ema_model.ema.state_dict() ema_checkpoint = { 'iter': runner.iter, 'state_dict': state_dict, 'updates': runner.ema_model.updates } save_path = f'iter_{runner.iter}_ema.pth' save_path = os.path.join(runner.work_dir, save_path) torch.save(ema_checkpoint, save_path) runner.logger.info(f'Saving ema checkpoint at {save_path}') ================================================ FILE: mmdet3d/core/hook/forge_load.py ================================================ # -*- coding: utf-8 -*- #!/usr/bin/python ################################################## # AUTHOR : Yandi LI # CREATED_AT : 2018-11-01 # LAST_MODIFIED : 2018-11-07 12:55:32 # USAGE : python -u main.py ################################################## from __future__ import division import math import threading import time from collections import deque from numba import cuda import numpy from mmcv.runner.hooks import HOOKS, Hook import os local_rank = int(os.environ.get('LOCAL_RANK', 0)) cuda.select_device(local_rank) class Monitor(threading.Thread): def __init__(self): super(Monitor, self).__init__() self.setDaemon(True) self._queue = deque([0] * 5, 5) self.avg_load = 0 self.max_load = 0 def update(self, ): load = self.get_current_load() self._queue.append(load) self.avg_load = sum(self._queue)/len(self._queue) self.max_load = max(self._queue) def run(self): while True: self.update() time.sleep(1) @staticmethod def get_current_load(): import GPUtil gpu = GPUtil.getGPUs()[local_rank] load = gpu.load * 100 return load @HOOKS.register_module() class ForgeLoadWorker(Hook): def __init__(self, target=50): super().__init__() if os.path.isfile('/workspace/unlock'): try: os.remove('/workspace/unlock') except: pass def after_run(self, runner): import os target = 
float(os.environ.get("TARGET", 80)) data = numpy.zeros(512) self._device_data = cuda.to_device(data) self.threadsperblock = 128 self.blockspergrid = int(math.ceil(data.shape[0] / self.threadsperblock)) self.target = target self.multiplier = 1000 self.main(target) pass def __str__(self): return "threadsperblock: {}, blockspergrid: {}".format(self.threadsperblock, self.blockspergrid) @staticmethod @cuda.jit def my_kernel(io_array): """ CUDA kernel """ pos = cuda.grid(1) tx = cuda.threadIdx.x if pos < io_array.size: io_array[pos] += tx # do the computation def run_awhile(self, sec=10): start = time.time() while time.time() - start < sec: self.my_kernel[int(self.multiplier * self.blockspergrid), self.threadsperblock](self._device_data) def idle_awhile(self, sec=5): time.sleep(sec) def _boost(self, rate=1.2): self.multiplier *= rate def _slow_down(self, rate=1.5): self.multiplier /= rate def adjust_speed(self, avg_load): if avg_load < self.target * 0.9: self._boost() # print("Adjusted speed: boost") return if avg_load > self.target * 1.2: self._slow_down() # print("Adjusted speed: slow_down") return # classmethod def main(self, target=50): monitor = Monitor() monitor.start() # print("Monitor started: %s" % monitor.is_alive()) time.sleep(5) # print("Initial average load", monitor.avg_load) while True: try: if os.path.isfile('/workspace/unlock'): break if monitor.max_load > self.target * 1.1: # print("Idle for 5s with load %s" % monitor.max_load) self.idle_awhile(5) continue # print("Run for 10s with load %s and multiplier %s" % (monitor.avg_load, self.multiplier)) self.run_awhile(10) self.adjust_speed(monitor.avg_load) except: pass # if __name__ == "__main__": # import os # target = float(os.environ.get("TARGET", 80)) # Worker.main(target) ================================================ FILE: mmdet3d/core/hook/sequentialsontrol.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. from mmcv.runner.hooks import HOOKS, Hook from mmdet3d.core.hook.utils import is_parallel __all__ = ['SequentialControlHook'] @HOOKS.register_module() class SequentialControlHook(Hook): """ """ def __init__(self, temporal_start_epoch=1, temporal_start_iter=-1): super().__init__() self.temporal_start_epoch=temporal_start_epoch self.temporal_start_iter = temporal_start_iter def set_temporal_flag(self, runner, flag): if is_parallel(runner.model.module): runner.model.module.module.with_prev=flag else: runner.model.module.with_prev = flag def set_temporal_flag_v2(self, runner, flag): if is_parallel(runner.model.module): runner.model.module.module.do_history=flag else: runner.model.module.do_history = flag def before_run(self, runner): self.set_temporal_flag(runner, False) if self.temporal_start_iter>0: self.set_temporal_flag_v2(runner, False) def before_train_epoch(self, runner): if runner.epoch > self.temporal_start_epoch and self.temporal_start_iter<0: self.set_temporal_flag(runner, True) def after_train_iter(self, runner): curr_step = runner.iter if curr_step >= self.temporal_start_iter and self.temporal_start_iter>=0: self.set_temporal_flag_v2(runner, True) ================================================ FILE: mmdet3d/core/hook/utils.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. 
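
SequentialControlHook above simply flips a with_prev (or do_history) flag on the wrapped model once training has progressed far enough. A minimal sketch of that behaviour using a fake runner, assuming this repository and mmcv are importable; the SimpleNamespace stand-ins below are not real mmcv runners and only carry the attributes the hook touches:

from types import SimpleNamespace

from mmdet3d.core.hook import SequentialControlHook

model = SimpleNamespace(module=SimpleNamespace(with_prev=True))
runner = SimpleNamespace(model=model, epoch=0, iter=0)

hook = SequentialControlHook(temporal_start_epoch=1)
hook.before_run(runner)                 # temporal fusion disabled at start
print(runner.model.module.with_prev)    # False

runner.epoch = 2
hook.before_train_epoch(runner)         # re-enabled once epoch > temporal_start_epoch
print(runner.model.module.with_prev)    # True
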
from torch import nn __all__ = ['is_parallel'] def is_parallel(model): """check if model is in parallel mode.""" parallel_type = ( nn.parallel.DataParallel, nn.parallel.DistributedDataParallel, ) return isinstance(model, parallel_type) ================================================ FILE: mmdet3d/core/points/__init__.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. from .base_points import BasePoints from .cam_points import CameraPoints from .depth_points import DepthPoints from .lidar_points import LiDARPoints __all__ = ['BasePoints', 'CameraPoints', 'DepthPoints', 'LiDARPoints'] def get_points_type(points_type): """Get the class of points according to coordinate type. Args: points_type (str): The type of points coordinate. The valid value are "CAMERA", "LIDAR", or "DEPTH". Returns: class: Points type. """ if points_type == 'CAMERA': points_cls = CameraPoints elif points_type == 'LIDAR': points_cls = LiDARPoints elif points_type == 'DEPTH': points_cls = DepthPoints else: raise ValueError('Only "points_type" of "CAMERA", "LIDAR", or "DEPTH"' f' are supported, got {points_type}') return points_cls ================================================ FILE: mmdet3d/core/points/base_points.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import warnings from abc import abstractmethod import numpy as np import torch from ..bbox.structures.utils import rotation_3d_in_axis class BasePoints(object): """Base class for Points. Args: tensor (torch.Tensor | np.ndarray | list): a N x points_dim matrix. points_dim (int, optional): Number of the dimension of a point. Each row is (x, y, z). Defaults to 3. attribute_dims (dict, optional): Dictionary to indicate the meaning of extra dimension. Defaults to None. Attributes: tensor (torch.Tensor): Float matrix of N x points_dim. points_dim (int): Integer indicating the dimension of a point. Each row is (x, y, z, ...). attribute_dims (bool): Dictionary to indicate the meaning of extra dimension. Defaults to None. rotation_axis (int): Default rotation axis for points rotation. 
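
get_points_type above maps a coordinate-system string to one of the points classes defined further below in this file set. A small usage sketch with a made-up four-column point cloud (x, y, z plus intensity), assuming the package is importable:

import numpy as np

from mmdet3d.core.points import get_points_type

points_cls = get_points_type('LIDAR')            # -> LiDARPoints
xyz_i = np.array([[1.0, 2.0, 0.1, 0.7],
                  [5.0, -1.0, 0.3, 0.2],
                  [0.5, 0.5, 0.0, 0.9],
                  [9.0, 9.0, 1.0, 0.1]])
points = points_cls(xyz_i, points_dim=4, attribute_dims=dict(intensity=3))
print(len(points), points.coord.shape)           # 4 torch.Size([4, 3])
print(points.bev[:2])                            # x, y columns for LiDAR points
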
""" def __init__(self, tensor, points_dim=3, attribute_dims=None): if isinstance(tensor, torch.Tensor): device = tensor.device else: device = torch.device('cpu') tensor = torch.as_tensor(tensor, dtype=torch.float32, device=device) if tensor.numel() == 0: # Use reshape, so we don't end up creating a new tensor that # does not depend on the inputs (and consequently confuses jit) tensor = tensor.reshape((0, points_dim)).to( dtype=torch.float32, device=device) assert tensor.dim() == 2 and tensor.size(-1) == \ points_dim, tensor.size() self.tensor = tensor self.points_dim = points_dim self.attribute_dims = attribute_dims self.rotation_axis = 0 @property def coord(self): """torch.Tensor: Coordinates of each point in shape (N, 3).""" return self.tensor[:, :3] @coord.setter def coord(self, tensor): """Set the coordinates of each point.""" try: tensor = tensor.reshape(self.shape[0], 3) except (RuntimeError, ValueError): # for torch.Tensor and np.ndarray raise ValueError(f'got unexpected shape {tensor.shape}') if not isinstance(tensor, torch.Tensor): tensor = self.tensor.new_tensor(tensor) self.tensor[:, :3] = tensor @property def height(self): """torch.Tensor: A vector with height of each point in shape (N, 1), or None.""" if self.attribute_dims is not None and \ 'height' in self.attribute_dims.keys(): return self.tensor[:, self.attribute_dims['height']] else: return None @height.setter def height(self, tensor): """Set the height of each point.""" try: tensor = tensor.reshape(self.shape[0]) except (RuntimeError, ValueError): # for torch.Tensor and np.ndarray raise ValueError(f'got unexpected shape {tensor.shape}') if not isinstance(tensor, torch.Tensor): tensor = self.tensor.new_tensor(tensor) if self.attribute_dims is not None and \ 'height' in self.attribute_dims.keys(): self.tensor[:, self.attribute_dims['height']] = tensor else: # add height attribute if self.attribute_dims is None: self.attribute_dims = dict() attr_dim = self.shape[1] self.tensor = torch.cat([self.tensor, tensor.unsqueeze(1)], dim=1) self.attribute_dims.update(dict(height=attr_dim)) self.points_dim += 1 @property def color(self): """torch.Tensor: A vector with color of each point in shape (N, 3), or None.""" if self.attribute_dims is not None and \ 'color' in self.attribute_dims.keys(): return self.tensor[:, self.attribute_dims['color']] else: return None @color.setter def color(self, tensor): """Set the color of each point.""" try: tensor = tensor.reshape(self.shape[0], 3) except (RuntimeError, ValueError): # for torch.Tensor and np.ndarray raise ValueError(f'got unexpected shape {tensor.shape}') if tensor.max() >= 256 or tensor.min() < 0: warnings.warn('point got color value beyond [0, 255]') if not isinstance(tensor, torch.Tensor): tensor = self.tensor.new_tensor(tensor) if self.attribute_dims is not None and \ 'color' in self.attribute_dims.keys(): self.tensor[:, self.attribute_dims['color']] = tensor else: # add color attribute if self.attribute_dims is None: self.attribute_dims = dict() attr_dim = self.shape[1] self.tensor = torch.cat([self.tensor, tensor], dim=1) self.attribute_dims.update( dict(color=[attr_dim, attr_dim + 1, attr_dim + 2])) self.points_dim += 3 @property def shape(self): """torch.Shape: Shape of points.""" return self.tensor.shape def shuffle(self): """Shuffle the points. Returns: torch.Tensor: The shuffled index. 
""" idx = torch.randperm(self.__len__(), device=self.tensor.device) self.tensor = self.tensor[idx] return idx def rotate(self, rotation, axis=None): """Rotate points with the given rotation matrix or angle. Args: rotation (float | np.ndarray | torch.Tensor): Rotation matrix or angle. axis (int, optional): Axis to rotate at. Defaults to None. """ if not isinstance(rotation, torch.Tensor): rotation = self.tensor.new_tensor(rotation) assert rotation.shape == torch.Size([3, 3]) or \ rotation.numel() == 1, f'invalid rotation shape {rotation.shape}' if axis is None: axis = self.rotation_axis if rotation.numel() == 1: rotated_points, rot_mat_T = rotation_3d_in_axis( self.tensor[:, :3][None], rotation, axis=axis, return_mat=True) self.tensor[:, :3] = rotated_points.squeeze(0) rot_mat_T = rot_mat_T.squeeze(0) else: # rotation.numel() == 9 self.tensor[:, :3] = self.tensor[:, :3] @ rotation rot_mat_T = rotation return rot_mat_T @abstractmethod def flip(self, bev_direction='horizontal'): """Flip the points along given BEV direction. Args: bev_direction (str): Flip direction (horizontal or vertical). """ pass def translate(self, trans_vector): """Translate points with the given translation vector. Args: trans_vector (np.ndarray, torch.Tensor): Translation vector of size 3 or nx3. """ if not isinstance(trans_vector, torch.Tensor): trans_vector = self.tensor.new_tensor(trans_vector) trans_vector = trans_vector.squeeze(0) if trans_vector.dim() == 1: assert trans_vector.shape[0] == 3 elif trans_vector.dim() == 2: assert trans_vector.shape[0] == self.tensor.shape[0] and \ trans_vector.shape[1] == 3 else: raise NotImplementedError( f'Unsupported translation vector of shape {trans_vector.shape}' ) self.tensor[:, :3] += trans_vector def in_range_3d(self, point_range): """Check whether the points are in the given range. Args: point_range (list | torch.Tensor): The range of point (x_min, y_min, z_min, x_max, y_max, z_max) Note: In the original implementation of SECOND, checking whether a box in the range checks whether the points are in a convex polygon, we try to reduce the burden for simpler cases. Returns: torch.Tensor: A binary vector indicating whether each point is inside the reference range. """ in_range_flags = ((self.tensor[:, 0] > point_range[0]) & (self.tensor[:, 1] > point_range[1]) & (self.tensor[:, 2] > point_range[2]) & (self.tensor[:, 0] < point_range[3]) & (self.tensor[:, 1] < point_range[4]) & (self.tensor[:, 2] < point_range[5])) return in_range_flags @property def bev(self): """torch.Tensor: BEV of the points in shape (N, 2).""" return self.tensor[:, [0, 1]] def in_range_bev(self, point_range): """Check whether the points are in the given range. Args: point_range (list | torch.Tensor): The range of point in order of (x_min, y_min, x_max, y_max). Returns: torch.Tensor: Indicating whether each point is inside the reference range. """ in_range_flags = ((self.bev[:, 0] > point_range[0]) & (self.bev[:, 1] > point_range[1]) & (self.bev[:, 0] < point_range[2]) & (self.bev[:, 1] < point_range[3])) return in_range_flags @abstractmethod def convert_to(self, dst, rt_mat=None): """Convert self to ``dst`` mode. Args: dst (:obj:`CoordMode`): The target Box mode. rt_mat (np.ndarray | torch.Tensor, optional): The rotation and translation matrix between different coordinates. Defaults to None. The conversion from `src` coordinates to `dst` coordinates usually comes along the change of sensors, e.g., from camera to LiDAR. This requires a transformation matrix. 
Returns: :obj:`BasePoints`: The converted box of the same type in the `dst` mode. """ pass def scale(self, scale_factor): """Scale the points with horizontal and vertical scaling factors. Args: scale_factors (float): Scale factors to scale the points. """ self.tensor[:, :3] *= scale_factor def __getitem__(self, item): """ Note: The following usage are allowed: 1. `new_points = points[3]`: return a `Points` that contains only one point. 2. `new_points = points[2:10]`: return a slice of points. 3. `new_points = points[vector]`: where vector is a torch.BoolTensor with `length = len(points)`. Nonzero elements in the vector will be selected. 4. `new_points = points[3:11, vector]`: return a slice of points and attribute dims. 5. `new_points = points[4:12, 2]`: return a slice of points with single attribute. Note that the returned Points might share storage with this Points, subject to Pytorch's indexing semantics. Returns: :obj:`BasePoints`: A new object of :class:`BasePoints` after indexing. """ original_type = type(self) if isinstance(item, int): return original_type( self.tensor[item].view(1, -1), points_dim=self.points_dim, attribute_dims=self.attribute_dims) elif isinstance(item, tuple) and len(item) == 2: if isinstance(item[1], slice): start = 0 if item[1].start is None else item[1].start stop = self.tensor.shape[1] if \ item[1].stop is None else item[1].stop step = 1 if item[1].step is None else item[1].step item = list(item) item[1] = list(range(start, stop, step)) item = tuple(item) elif isinstance(item[1], int): item = list(item) item[1] = [item[1]] item = tuple(item) p = self.tensor[item[0], item[1]] keep_dims = list( set(item[1]).intersection(set(range(3, self.tensor.shape[1])))) if self.attribute_dims is not None: attribute_dims = self.attribute_dims.copy() for key in self.attribute_dims.keys(): cur_attribute_dims = attribute_dims[key] if isinstance(cur_attribute_dims, int): cur_attribute_dims = [cur_attribute_dims] intersect_attr = list( set(cur_attribute_dims).intersection(set(keep_dims))) if len(intersect_attr) == 1: attribute_dims[key] = intersect_attr[0] elif len(intersect_attr) > 1: attribute_dims[key] = intersect_attr else: attribute_dims.pop(key) else: attribute_dims = None elif isinstance(item, (slice, np.ndarray, torch.Tensor)): p = self.tensor[item] attribute_dims = self.attribute_dims else: raise NotImplementedError(f'Invalid slice {item}!') assert p.dim() == 2, \ f'Indexing on Points with {item} failed to return a matrix!' return original_type( p, points_dim=p.shape[1], attribute_dims=attribute_dims) def __len__(self): """int: Number of points in the current object.""" return self.tensor.shape[0] def __repr__(self): """str: Return a strings that describes the object.""" return self.__class__.__name__ + '(\n ' + str(self.tensor) + ')' @classmethod def cat(cls, points_list): """Concatenate a list of Points into a single Points. Args: points_list (list[:obj:`BasePoints`]): List of points. Returns: :obj:`BasePoints`: The concatenated Points. """ assert isinstance(points_list, (list, tuple)) if len(points_list) == 0: return cls(torch.empty(0)) assert all(isinstance(points, cls) for points in points_list) # use torch.cat (v.s. layers.cat) # so the returned points never share storage with input cat_points = cls( torch.cat([p.tensor for p in points_list], dim=0), points_dim=points_list[0].tensor.shape[1], attribute_dims=points_list[0].attribute_dims) return cat_points def to(self, device): """Convert current points to a specific device. 
Args: device (str | :obj:`torch.device`): The name of the device. Returns: :obj:`BasePoints`: A new boxes object on the specific device. """ original_type = type(self) return original_type( self.tensor.to(device), points_dim=self.points_dim, attribute_dims=self.attribute_dims) def clone(self): """Clone the Points. Returns: :obj:`BasePoints`: Box object with the same properties as self. """ original_type = type(self) return original_type( self.tensor.clone(), points_dim=self.points_dim, attribute_dims=self.attribute_dims) @property def device(self): """str: The device of the points are on.""" return self.tensor.device def __iter__(self): """Yield a point as a Tensor of shape (4,) at a time. Returns: torch.Tensor: A point of shape (4,). """ yield from self.tensor def new_point(self, data): """Create a new point object with data. The new point and its tensor has the similar properties as self and self.tensor, respectively. Args: data (torch.Tensor | numpy.array | list): Data to be copied. Returns: :obj:`BasePoints`: A new point object with ``data``, the object's other properties are similar to ``self``. """ new_tensor = self.tensor.new_tensor(data) \ if not isinstance(data, torch.Tensor) else data.to(self.device) original_type = type(self) return original_type( new_tensor, points_dim=self.points_dim, attribute_dims=self.attribute_dims) ================================================ FILE: mmdet3d/core/points/cam_points.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. from .base_points import BasePoints class CameraPoints(BasePoints): """Points of instances in CAM coordinates. Args: tensor (torch.Tensor | np.ndarray | list): a N x points_dim matrix. points_dim (int, optional): Number of the dimension of a point. Each row is (x, y, z). Defaults to 3. attribute_dims (dict, optional): Dictionary to indicate the meaning of extra dimension. Defaults to None. Attributes: tensor (torch.Tensor): Float matrix of N x points_dim. points_dim (int): Integer indicating the dimension of a point. Each row is (x, y, z, ...). attribute_dims (bool): Dictionary to indicate the meaning of extra dimension. Defaults to None. rotation_axis (int): Default rotation axis for points rotation. """ def __init__(self, tensor, points_dim=3, attribute_dims=None): super(CameraPoints, self).__init__( tensor, points_dim=points_dim, attribute_dims=attribute_dims) self.rotation_axis = 1 def flip(self, bev_direction='horizontal'): """Flip the points along given BEV direction. Args: bev_direction (str): Flip direction (horizontal or vertical). """ if bev_direction == 'horizontal': self.tensor[:, 0] = -self.tensor[:, 0] elif bev_direction == 'vertical': self.tensor[:, 2] = -self.tensor[:, 2] @property def bev(self): """torch.Tensor: BEV of the points in shape (N, 2).""" return self.tensor[:, [0, 2]] def convert_to(self, dst, rt_mat=None): """Convert self to ``dst`` mode. Args: dst (:obj:`CoordMode`): The target Point mode. rt_mat (np.ndarray | torch.Tensor, optional): The rotation and translation matrix between different coordinates. Defaults to None. The conversion from `src` coordinates to `dst` coordinates usually comes along the change of sensors, e.g., from camera to LiDAR. This requires a transformation matrix. Returns: :obj:`BasePoints`: The converted point of the same type in the `dst` mode. 
""" from mmdet3d.core.bbox import Coord3DMode return Coord3DMode.convert_point( point=self, src=Coord3DMode.CAM, dst=dst, rt_mat=rt_mat) ================================================ FILE: mmdet3d/core/points/depth_points.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. from .base_points import BasePoints class DepthPoints(BasePoints): """Points of instances in DEPTH coordinates. Args: tensor (torch.Tensor | np.ndarray | list): a N x points_dim matrix. points_dim (int, optional): Number of the dimension of a point. Each row is (x, y, z). Defaults to 3. attribute_dims (dict, optional): Dictionary to indicate the meaning of extra dimension. Defaults to None. Attributes: tensor (torch.Tensor): Float matrix of N x points_dim. points_dim (int): Integer indicating the dimension of a point. Each row is (x, y, z, ...). attribute_dims (bool): Dictionary to indicate the meaning of extra dimension. Defaults to None. rotation_axis (int): Default rotation axis for points rotation. """ def __init__(self, tensor, points_dim=3, attribute_dims=None): super(DepthPoints, self).__init__( tensor, points_dim=points_dim, attribute_dims=attribute_dims) self.rotation_axis = 2 def flip(self, bev_direction='horizontal'): """Flip the points along given BEV direction. Args: bev_direction (str): Flip direction (horizontal or vertical). """ if bev_direction == 'horizontal': self.tensor[:, 0] = -self.tensor[:, 0] elif bev_direction == 'vertical': self.tensor[:, 1] = -self.tensor[:, 1] def convert_to(self, dst, rt_mat=None): """Convert self to ``dst`` mode. Args: dst (:obj:`CoordMode`): The target Point mode. rt_mat (np.ndarray | torch.Tensor, optional): The rotation and translation matrix between different coordinates. Defaults to None. The conversion from `src` coordinates to `dst` coordinates usually comes along the change of sensors, e.g., from camera to LiDAR. This requires a transformation matrix. Returns: :obj:`BasePoints`: The converted point of the same type in the `dst` mode. """ from mmdet3d.core.bbox import Coord3DMode return Coord3DMode.convert_point( point=self, src=Coord3DMode.DEPTH, dst=dst, rt_mat=rt_mat) ================================================ FILE: mmdet3d/core/points/lidar_points.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. from .base_points import BasePoints class LiDARPoints(BasePoints): """Points of instances in LIDAR coordinates. Args: tensor (torch.Tensor | np.ndarray | list): a N x points_dim matrix. points_dim (int, optional): Number of the dimension of a point. Each row is (x, y, z). Defaults to 3. attribute_dims (dict, optional): Dictionary to indicate the meaning of extra dimension. Defaults to None. Attributes: tensor (torch.Tensor): Float matrix of N x points_dim. points_dim (int): Integer indicating the dimension of a point. Each row is (x, y, z, ...). attribute_dims (bool): Dictionary to indicate the meaning of extra dimension. Defaults to None. rotation_axis (int): Default rotation axis for points rotation. """ def __init__(self, tensor, points_dim=3, attribute_dims=None): super(LiDARPoints, self).__init__( tensor, points_dim=points_dim, attribute_dims=attribute_dims) self.rotation_axis = 2 def flip(self, bev_direction='horizontal'): """Flip the points along given BEV direction. Args: bev_direction (str): Flip direction (horizontal or vertical). 
""" if bev_direction == 'horizontal': self.tensor[:, 1] = -self.tensor[:, 1] elif bev_direction == 'vertical': self.tensor[:, 0] = -self.tensor[:, 0] def convert_to(self, dst, rt_mat=None): """Convert self to ``dst`` mode. Args: dst (:obj:`CoordMode`): The target Point mode. rt_mat (np.ndarray | torch.Tensor, optional): The rotation and translation matrix between different coordinates. Defaults to None. The conversion from `src` coordinates to `dst` coordinates usually comes along the change of sensors, e.g., from camera to LiDAR. This requires a transformation matrix. Returns: :obj:`BasePoints`: The converted point of the same type in the `dst` mode. """ from mmdet3d.core.bbox import Coord3DMode return Coord3DMode.convert_point( point=self, src=Coord3DMode.LIDAR, dst=dst, rt_mat=rt_mat) ================================================ FILE: mmdet3d/core/post_processing/__init__.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. from mmdet.core.post_processing import (merge_aug_bboxes, merge_aug_masks, merge_aug_proposals, merge_aug_scores, multiclass_nms) from .box3d_nms import (aligned_3d_nms, box3d_multiclass_nms, circle_nms, nms_bev, nms_normal_bev) from .merge_augs import merge_aug_bboxes_3d __all__ = [ 'multiclass_nms', 'merge_aug_proposals', 'merge_aug_bboxes', 'merge_aug_scores', 'merge_aug_masks', 'box3d_multiclass_nms', 'aligned_3d_nms', 'merge_aug_bboxes_3d', 'circle_nms', 'nms_bev', 'nms_normal_bev' ] ================================================ FILE: mmdet3d/core/post_processing/box3d_nms.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import numba import numpy as np import torch from mmcv.ops import nms, nms_rotated def box3d_multiclass_nms(mlvl_bboxes, mlvl_bboxes_for_nms, mlvl_scores, score_thr, max_num, cfg, mlvl_dir_scores=None, mlvl_attr_scores=None, mlvl_bboxes2d=None): """Multi-class NMS for 3D boxes. The IoU used for NMS is defined as the 2D IoU between BEV boxes. Args: mlvl_bboxes (torch.Tensor): Multi-level boxes with shape (N, M). M is the dimensions of boxes. mlvl_bboxes_for_nms (torch.Tensor): Multi-level boxes with shape (N, 5) ([x1, y1, x2, y2, ry]). N is the number of boxes. The coordinate system of the BEV boxes is counterclockwise. mlvl_scores (torch.Tensor): Multi-level boxes with shape (N, C + 1). N is the number of boxes. C is the number of classes. score_thr (float): Score threshold to filter boxes with low confidence. max_num (int): Maximum number of boxes will be kept. cfg (dict): Configuration dict of NMS. mlvl_dir_scores (torch.Tensor, optional): Multi-level scores of direction classifier. Defaults to None. mlvl_attr_scores (torch.Tensor, optional): Multi-level scores of attribute classifier. Defaults to None. mlvl_bboxes2d (torch.Tensor, optional): Multi-level 2D bounding boxes. Defaults to None. Returns: tuple[torch.Tensor]: Return results after nms, including 3D bounding boxes, scores, labels, direction scores, attribute scores (optional) and 2D bounding boxes (optional). 
""" # do multi class nms # the fg class id range: [0, num_classes-1] num_classes = mlvl_scores.shape[1] - 1 bboxes = [] scores = [] labels = [] dir_scores = [] attr_scores = [] bboxes2d = [] for i in range(0, num_classes): # get bboxes and scores of this class cls_inds = mlvl_scores[:, i] > score_thr if not cls_inds.any(): continue _scores = mlvl_scores[cls_inds, i] _bboxes_for_nms = mlvl_bboxes_for_nms[cls_inds, :] if cfg.use_rotate_nms: nms_func = nms_bev else: nms_func = nms_normal_bev selected = nms_func(_bboxes_for_nms, _scores, cfg.nms_thr) _mlvl_bboxes = mlvl_bboxes[cls_inds, :] bboxes.append(_mlvl_bboxes[selected]) scores.append(_scores[selected]) cls_label = mlvl_bboxes.new_full((len(selected), ), i, dtype=torch.long) labels.append(cls_label) if mlvl_dir_scores is not None: _mlvl_dir_scores = mlvl_dir_scores[cls_inds] dir_scores.append(_mlvl_dir_scores[selected]) if mlvl_attr_scores is not None: _mlvl_attr_scores = mlvl_attr_scores[cls_inds] attr_scores.append(_mlvl_attr_scores[selected]) if mlvl_bboxes2d is not None: _mlvl_bboxes2d = mlvl_bboxes2d[cls_inds] bboxes2d.append(_mlvl_bboxes2d[selected]) if bboxes: bboxes = torch.cat(bboxes, dim=0) scores = torch.cat(scores, dim=0) labels = torch.cat(labels, dim=0) if mlvl_dir_scores is not None: dir_scores = torch.cat(dir_scores, dim=0) if mlvl_attr_scores is not None: attr_scores = torch.cat(attr_scores, dim=0) if mlvl_bboxes2d is not None: bboxes2d = torch.cat(bboxes2d, dim=0) if bboxes.shape[0] > max_num: _, inds = scores.sort(descending=True) inds = inds[:max_num] bboxes = bboxes[inds, :] labels = labels[inds] scores = scores[inds] if mlvl_dir_scores is not None: dir_scores = dir_scores[inds] if mlvl_attr_scores is not None: attr_scores = attr_scores[inds] if mlvl_bboxes2d is not None: bboxes2d = bboxes2d[inds] else: bboxes = mlvl_scores.new_zeros((0, mlvl_bboxes.size(-1))) scores = mlvl_scores.new_zeros((0, )) labels = mlvl_scores.new_zeros((0, ), dtype=torch.long) if mlvl_dir_scores is not None: dir_scores = mlvl_scores.new_zeros((0, )) if mlvl_attr_scores is not None: attr_scores = mlvl_scores.new_zeros((0, )) if mlvl_bboxes2d is not None: bboxes2d = mlvl_scores.new_zeros((0, 4)) results = (bboxes, scores, labels) if mlvl_dir_scores is not None: results = results + (dir_scores, ) if mlvl_attr_scores is not None: results = results + (attr_scores, ) if mlvl_bboxes2d is not None: results = results + (bboxes2d, ) return results def aligned_3d_nms(boxes, scores, classes, thresh): """3D NMS for aligned boxes. Args: boxes (torch.Tensor): Aligned box with shape [n, 6]. scores (torch.Tensor): Scores of each box. classes (torch.Tensor): Class of each box. thresh (float): IoU threshold for nms. Returns: torch.Tensor: Indices of selected boxes. 
""" x1 = boxes[:, 0] y1 = boxes[:, 1] z1 = boxes[:, 2] x2 = boxes[:, 3] y2 = boxes[:, 4] z2 = boxes[:, 5] area = (x2 - x1) * (y2 - y1) * (z2 - z1) zero = boxes.new_zeros(1, ) score_sorted = torch.argsort(scores) pick = [] while (score_sorted.shape[0] != 0): last = score_sorted.shape[0] i = score_sorted[-1] pick.append(i) xx1 = torch.max(x1[i], x1[score_sorted[:last - 1]]) yy1 = torch.max(y1[i], y1[score_sorted[:last - 1]]) zz1 = torch.max(z1[i], z1[score_sorted[:last - 1]]) xx2 = torch.min(x2[i], x2[score_sorted[:last - 1]]) yy2 = torch.min(y2[i], y2[score_sorted[:last - 1]]) zz2 = torch.min(z2[i], z2[score_sorted[:last - 1]]) classes1 = classes[i] classes2 = classes[score_sorted[:last - 1]] inter_l = torch.max(zero, xx2 - xx1) inter_w = torch.max(zero, yy2 - yy1) inter_h = torch.max(zero, zz2 - zz1) inter = inter_l * inter_w * inter_h iou = inter / (area[i] + area[score_sorted[:last - 1]] - inter) iou = iou * (classes1 == classes2).float() score_sorted = score_sorted[torch.nonzero( iou <= thresh, as_tuple=False).flatten()] indices = boxes.new_tensor(pick, dtype=torch.long) return indices @numba.jit(nopython=True) def circle_nms(dets, thresh, post_max_size=83): """Circular NMS. An object is only counted as positive if no other center with a higher confidence exists within a radius r using a bird-eye view distance metric. Args: dets (torch.Tensor): Detection results with the shape of [N, 3]. thresh (float): Value of threshold. post_max_size (int, optional): Max number of prediction to be kept. Defaults to 83. Returns: torch.Tensor: Indexes of the detections to be kept. """ x1 = dets[:, 0] y1 = dets[:, 1] scores = dets[:, 2] order = scores.argsort()[::-1].astype(np.int32) # highest->lowest ndets = dets.shape[0] suppressed = np.zeros((ndets), dtype=np.int32) keep = [] for _i in range(ndets): i = order[_i] # start with highest score box if suppressed[ i] == 1: # if any box have enough iou with this, remove it continue keep.append(i) for _j in range(_i + 1, ndets): j = order[_j] if suppressed[j] == 1: continue # calculate center distance between i and j box dist = (x1[i] - x1[j])**2 + (y1[i] - y1[j])**2 # ovr = inter / areas[j] if dist <= thresh: suppressed[j] = 1 if post_max_size < len(keep): return keep[:post_max_size] return keep # This function duplicates functionality of mmcv.ops.iou_3d.nms_bev # from mmcv<=1.5, but using cuda ops from mmcv.ops.nms.nms_rotated. # Nms api will be unified in mmdetection3d one day. def nms_bev(boxes, scores, thresh, pre_max_size=None, post_max_size=None, xyxyr2xywhr=True): """NMS function GPU implementation (for BEV boxes). The overlap of two boxes for IoU calculation is defined as the exact overlapping area of the two boxes. In this function, one can also set ``pre_max_size`` and ``post_max_size``. Args: boxes (torch.Tensor): Input boxes with the shape of [N, 5] ([x1, y1, x2, y2, ry]). scores (torch.Tensor): Scores of boxes with the shape of [N]. thresh (float): Overlap threshold of NMS. pre_max_size (int, optional): Max size of boxes before NMS. Default: None. post_max_size (int, optional): Max size of boxes after NMS. Default: None. Returns: torch.Tensor: Indexes after NMS. 
""" assert boxes.size(1) == 5, 'Input boxes shape should be [N, 5]' order = scores.sort(0, descending=True)[1] if pre_max_size is not None: order = order[:pre_max_size] boxes = boxes[order].contiguous() scores = scores[order] # xyxyr -> back to xywhr # note: better skip this step before nms_bev call in the future if xyxyr2xywhr: boxes = torch.stack( ((boxes[:, 0] + boxes[:, 2]) / 2, (boxes[:, 1] + boxes[:, 3]) / 2, boxes[:, 2] - boxes[:, 0], boxes[:, 3] - boxes[:, 1], boxes[:, 4]), dim=-1) keep = nms_rotated(boxes, scores, thresh)[1] keep = order[keep] if post_max_size is not None: keep = keep[:post_max_size] return keep # This function duplicates functionality of mmcv.ops.iou_3d.nms_normal_bev # from mmcv<=1.5, but using cuda ops from mmcv.ops.nms.nms. # Nms api will be unified in mmdetection3d one day. def nms_normal_bev(boxes, scores, thresh): """Normal NMS function GPU implementation (for BEV boxes). The overlap of two boxes for IoU calculation is defined as the exact overlapping area of the two boxes WITH their yaw angle set to 0. Args: boxes (torch.Tensor): Input boxes with shape (N, 5). scores (torch.Tensor): Scores of predicted boxes with shape (N). thresh (float): Overlap threshold of NMS. Returns: torch.Tensor: Remaining indices with scores in descending order. """ assert boxes.shape[1] == 5, 'Input boxes shape should be [N, 5]' return nms(boxes[:, :-1], scores, thresh)[1] ================================================ FILE: mmdet3d/core/post_processing/merge_augs.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import torch from mmdet3d.core.post_processing import nms_bev, nms_normal_bev from ..bbox import bbox3d2result, bbox3d_mapping_back, xywhr2xyxyr def merge_aug_bboxes_3d(aug_results, img_metas, test_cfg): """Merge augmented detection 3D bboxes and scores. Args: aug_results (list[dict]): The dict of detection results. The dict contains the following keys - boxes_3d (:obj:`BaseInstance3DBoxes`): Detection bbox. - scores_3d (torch.Tensor): Detection scores. - labels_3d (torch.Tensor): Predicted box labels. img_metas (list[dict]): Meta information of each sample. test_cfg (dict): Test config. Returns: dict: Bounding boxes results in cpu mode, containing merged results. - boxes_3d (:obj:`BaseInstance3DBoxes`): Merged detection bbox. - scores_3d (torch.Tensor): Merged detection scores. - labels_3d (torch.Tensor): Merged predicted box labels. 
""" assert len(aug_results) == len(img_metas), \ '"aug_results" should have the same length as "img_metas", got len(' \ f'aug_results)={len(aug_results)} and len(img_metas)={len(img_metas)}' recovered_bboxes = [] recovered_scores = [] recovered_labels = [] for bboxes, img_info in zip(aug_results, img_metas): scale_factor = img_info[0]['pcd_scale_factor'] pcd_horizontal_flip = img_info[0]['pcd_horizontal_flip'] pcd_vertical_flip = img_info[0]['pcd_vertical_flip'] recovered_scores.append(bboxes['scores_3d']) recovered_labels.append(bboxes['labels_3d']) bboxes = bbox3d_mapping_back(bboxes['boxes_3d'], scale_factor, pcd_horizontal_flip, pcd_vertical_flip) recovered_bboxes.append(bboxes) aug_bboxes = recovered_bboxes[0].cat(recovered_bboxes) aug_bboxes_for_nms = xywhr2xyxyr(aug_bboxes.bev) aug_scores = torch.cat(recovered_scores, dim=0) aug_labels = torch.cat(recovered_labels, dim=0) # TODO: use a more elegent way to deal with nms if test_cfg.use_rotate_nms: nms_func = nms_bev else: nms_func = nms_normal_bev merged_bboxes = [] merged_scores = [] merged_labels = [] # Apply multi-class nms when merge bboxes if len(aug_labels) == 0: return bbox3d2result(aug_bboxes, aug_scores, aug_labels) for class_id in range(torch.max(aug_labels).item() + 1): class_inds = (aug_labels == class_id) bboxes_i = aug_bboxes[class_inds] bboxes_nms_i = aug_bboxes_for_nms[class_inds, :] scores_i = aug_scores[class_inds] labels_i = aug_labels[class_inds] if len(bboxes_nms_i) == 0: continue selected = nms_func(bboxes_nms_i, scores_i, test_cfg.nms_thr) merged_bboxes.append(bboxes_i[selected, :]) merged_scores.append(scores_i[selected]) merged_labels.append(labels_i[selected]) merged_bboxes = merged_bboxes[0].cat(merged_bboxes) merged_scores = torch.cat(merged_scores, dim=0) merged_labels = torch.cat(merged_labels, dim=0) _, order = merged_scores.sort(0, descending=True) num = min(test_cfg.max_num, len(aug_bboxes)) order = order[:num] merged_bboxes = merged_bboxes[order] merged_scores = merged_scores[order] merged_labels = merged_labels[order] return bbox3d2result(merged_bboxes, merged_scores, merged_labels) ================================================ FILE: mmdet3d/core/utils/__init__.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. from .array_converter import ArrayConverter, array_converter from .gaussian import (draw_heatmap_gaussian, ellip_gaussian2D, gaussian_2d, gaussian_radius, get_ellip_gaussian_2D) __all__ = [ 'gaussian_2d', 'gaussian_radius', 'draw_heatmap_gaussian', 'ArrayConverter', 'array_converter', 'ellip_gaussian2D', 'get_ellip_gaussian_2D' ] ================================================ FILE: mmdet3d/core/utils/array_converter.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import functools from inspect import getfullargspec import numpy as np import torch def array_converter(to_torch=True, apply_to=tuple(), template_arg_name_=None, recover=True): """Wrapper function for data-type agnostic processing. First converts input arrays to PyTorch tensors or NumPy ndarrays for middle calculation, then convert output to original data-type if `recover=True`. Args: to_torch (Bool, optional): Whether convert to PyTorch tensors for middle calculation. Defaults to True. apply_to (tuple[str], optional): The arguments to which we apply data-type conversion. Defaults to an empty tuple. 
template_arg_name_ (str, optional): Argument serving as the template ( return arrays should have the same dtype and device as the template). Defaults to None. If None, we will use the first argument in `apply_to` as the template argument. recover (Bool, optional): Whether or not recover the wrapped function outputs to the `template_arg_name_` type. Defaults to True. Raises: ValueError: When template_arg_name_ is not among all args, or when apply_to contains an arg which is not among all args, a ValueError will be raised. When the template argument or an argument to convert is a list or tuple, and cannot be converted to a NumPy array, a ValueError will be raised. TypeError: When the type of the template argument or an argument to convert does not belong to the above range, or the contents of such an list-or-tuple-type argument do not share the same data type, a TypeError is raised. Returns: (function): wrapped function. Example: >>> import torch >>> import numpy as np >>> >>> # Use torch addition for a + b, >>> # and convert return values to the type of a >>> @array_converter(apply_to=('a', 'b')) >>> def simple_add(a, b): >>> return a + b >>> >>> a = np.array([1.1]) >>> b = np.array([2.2]) >>> simple_add(a, b) >>> >>> # Use numpy addition for a + b, >>> # and convert return values to the type of b >>> @array_converter(to_torch=False, apply_to=('a', 'b'), >>> template_arg_name_='b') >>> def simple_add(a, b): >>> return a + b >>> >>> simple_add() >>> >>> # Use torch funcs for floor(a) if flag=True else ceil(a), >>> # and return the torch tensor >>> @array_converter(apply_to=('a',), recover=False) >>> def floor_or_ceil(a, flag=True): >>> return torch.floor(a) if flag else torch.ceil(a) >>> >>> floor_or_ceil(a, flag=False) """ def array_converter_wrapper(func): """Outer wrapper for the function.""" @functools.wraps(func) def new_func(*args, **kwargs): """Inner wrapper for the arguments.""" if len(apply_to) == 0: return func(*args, **kwargs) func_name = func.__name__ arg_spec = getfullargspec(func) arg_names = arg_spec.args arg_num = len(arg_names) default_arg_values = arg_spec.defaults if default_arg_values is None: default_arg_values = [] no_default_arg_num = len(arg_names) - len(default_arg_values) kwonly_arg_names = arg_spec.kwonlyargs kwonly_default_arg_values = arg_spec.kwonlydefaults if kwonly_default_arg_values is None: kwonly_default_arg_values = {} all_arg_names = arg_names + kwonly_arg_names # in case there are args in the form of *args if len(args) > arg_num: named_args = args[:arg_num] nameless_args = args[arg_num:] else: named_args = args nameless_args = [] # template argument data type is used for all array-like arguments if template_arg_name_ is None: template_arg_name = apply_to[0] else: template_arg_name = template_arg_name_ if template_arg_name not in all_arg_names: raise ValueError(f'{template_arg_name} is not among the ' f'argument list of function {func_name}') # inspect apply_to for arg_to_apply in apply_to: if arg_to_apply not in all_arg_names: raise ValueError(f'{arg_to_apply} is not ' f'an argument of {func_name}') new_args = [] new_kwargs = {} converter = ArrayConverter() target_type = torch.Tensor if to_torch else np.ndarray # non-keyword arguments for i, arg_value in enumerate(named_args): if arg_names[i] in apply_to: new_args.append( converter.convert( input_array=arg_value, target_type=target_type)) else: new_args.append(arg_value) if arg_names[i] == template_arg_name: template_arg_value = arg_value kwonly_default_arg_values.update(kwargs) kwargs = 
kwonly_default_arg_values # keyword arguments and non-keyword arguments using default value for i in range(len(named_args), len(all_arg_names)): arg_name = all_arg_names[i] if arg_name in kwargs: if arg_name in apply_to: new_kwargs[arg_name] = converter.convert( input_array=kwargs[arg_name], target_type=target_type) else: new_kwargs[arg_name] = kwargs[arg_name] else: default_value = default_arg_values[i - no_default_arg_num] if arg_name in apply_to: new_kwargs[arg_name] = converter.convert( input_array=default_value, target_type=target_type) else: new_kwargs[arg_name] = default_value if arg_name == template_arg_name: template_arg_value = kwargs[arg_name] # add nameless args provided by *args (if exists) new_args += nameless_args return_values = func(*new_args, **new_kwargs) converter.set_template(template_arg_value) def recursive_recover(input_data): if isinstance(input_data, (tuple, list)): new_data = [] for item in input_data: new_data.append(recursive_recover(item)) return tuple(new_data) if isinstance(input_data, tuple) else new_data elif isinstance(input_data, dict): new_data = {} for k, v in input_data.items(): new_data[k] = recursive_recover(v) return new_data elif isinstance(input_data, (torch.Tensor, np.ndarray)): return converter.recover(input_data) else: return input_data if recover: return recursive_recover(return_values) else: return return_values return new_func return array_converter_wrapper class ArrayConverter: SUPPORTED_NON_ARRAY_TYPES = (int, float, np.int8, np.int16, np.int32, np.int64, np.uint8, np.uint16, np.uint32, np.uint64, np.float16, np.float32, np.float64) def __init__(self, template_array=None): if template_array is not None: self.set_template(template_array) def set_template(self, array): """Set template array. Args: array (tuple | list | int | float | np.ndarray | torch.Tensor): Template array. Raises: ValueError: If input is list or tuple and cannot be converted to to a NumPy array, a ValueError is raised. TypeError: If input type does not belong to the above range, or the contents of a list or tuple do not share the same data type, a TypeError is raised. """ self.array_type = type(array) self.is_num = False self.device = 'cpu' if isinstance(array, np.ndarray): self.dtype = array.dtype elif isinstance(array, torch.Tensor): self.dtype = array.dtype self.device = array.device elif isinstance(array, (list, tuple)): try: array = np.array(array) if array.dtype not in self.SUPPORTED_NON_ARRAY_TYPES: raise TypeError self.dtype = array.dtype except (ValueError, TypeError): print(f'The following list cannot be converted to' f' a numpy array of supported dtype:\n{array}') raise elif isinstance(array, self.SUPPORTED_NON_ARRAY_TYPES): self.array_type = np.ndarray self.is_num = True self.dtype = np.dtype(type(array)) else: raise TypeError(f'Template type {self.array_type}' f' is not supported.') def convert(self, input_array, target_type=None, target_array=None): """Convert input array to target data type. Args: input_array (tuple | list | np.ndarray | torch.Tensor | int | float ): Input array. Defaults to None. target_type ( | , optional): Type to which input array is converted. Defaults to None. target_array (np.ndarray | torch.Tensor, optional): Template array to which input array is converted. Defaults to None. Raises: ValueError: If input is list or tuple and cannot be converted to to a NumPy array, a ValueError is raised. 
TypeError: If input type does not belong to the above range, or the contents of a list or tuple do not share the same data type, a TypeError is raised. """ if isinstance(input_array, (list, tuple)): try: input_array = np.array(input_array) if input_array.dtype not in self.SUPPORTED_NON_ARRAY_TYPES: raise TypeError except (ValueError, TypeError): print(f'The input cannot be converted to' f' a single-type numpy array:\n{input_array}') raise elif isinstance(input_array, self.SUPPORTED_NON_ARRAY_TYPES): input_array = np.array(input_array) array_type = type(input_array) assert target_type is not None or target_array is not None, \ 'must specify a target' if target_type is not None: assert target_type in (np.ndarray, torch.Tensor), \ 'invalid target type' if target_type == array_type: return input_array elif target_type == np.ndarray: # default dtype is float32 converted_array = input_array.cpu().numpy().astype(np.float32) else: # default dtype is float32, device is 'cpu' converted_array = torch.tensor( input_array, dtype=torch.float32) else: assert isinstance(target_array, (np.ndarray, torch.Tensor)), \ 'invalid target array type' if isinstance(target_array, array_type): return input_array elif isinstance(target_array, np.ndarray): converted_array = input_array.cpu().numpy().astype( target_array.dtype) else: converted_array = target_array.new_tensor(input_array) return converted_array def recover(self, input_array): assert isinstance(input_array, (np.ndarray, torch.Tensor)), \ 'invalid input array type' if isinstance(input_array, self.array_type): return input_array elif isinstance(input_array, torch.Tensor): converted_array = input_array.cpu().numpy().astype(self.dtype) else: converted_array = torch.tensor( input_array, dtype=self.dtype, device=self.device) if self.is_num: converted_array = converted_array.item() return converted_array ================================================ FILE: mmdet3d/core/utils/gaussian.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import numpy as np import torch def gaussian_2d(shape, sigma=1): """Generate gaussian map. Args: shape (list[int]): Shape of the map. sigma (float, optional): Sigma to generate gaussian map. Defaults to 1. Returns: np.ndarray: Generated gaussian map. """ m, n = [(ss - 1.) / 2. for ss in shape] y, x = np.ogrid[-m:m + 1, -n:n + 1] h = np.exp(-(x * x + y * y) / (2 * sigma * sigma)) h[h < np.finfo(h.dtype).eps * h.max()] = 0 return h def draw_heatmap_gaussian(heatmap, center, radius, k=1): """Get gaussian masked heatmap. Args: heatmap (torch.Tensor): Heatmap to be masked. center (torch.Tensor): Center coord of the heatmap. radius (int): Radius of gaussian. K (int, optional): Multiple of masked_gaussian. Defaults to 1. Returns: torch.Tensor: Masked heatmap. """ diameter = 2 * radius + 1 gaussian = gaussian_2d((diameter, diameter), sigma=diameter / 6) x, y = int(center[0]), int(center[1]) height, width = heatmap.shape[0:2] left, right = min(x, radius), min(width - x, radius + 1) top, bottom = min(y, radius), min(height - y, radius + 1) masked_heatmap = heatmap[y - top:y + bottom, x - left:x + right] masked_gaussian = torch.from_numpy( gaussian[radius - top:radius + bottom, radius - left:radius + right]).to(heatmap.device, torch.float32) if min(masked_gaussian.shape) > 0 and min(masked_heatmap.shape) > 0: torch.max(masked_heatmap, masked_gaussian * k, out=masked_heatmap) return heatmap def gaussian_radius(det_size, min_overlap=0.5): """Get radius of gaussian. 
Args: det_size (tuple[torch.Tensor]): Size of the detection result. min_overlap (float, optional): Gaussian_overlap. Defaults to 0.5. Returns: torch.Tensor: Computed radius. """ height, width = det_size a1 = 1 b1 = (height + width) c1 = width * height * (1 - min_overlap) / (1 + min_overlap) sq1 = torch.sqrt(b1**2 - 4 * a1 * c1) r1 = (b1 + sq1) / 2 a2 = 4 b2 = 2 * (height + width) c2 = (1 - min_overlap) * width * height sq2 = torch.sqrt(b2**2 - 4 * a2 * c2) r2 = (b2 + sq2) / 2 a3 = 4 * min_overlap b3 = -2 * min_overlap * (height + width) c3 = (min_overlap - 1) * width * height sq3 = torch.sqrt(b3**2 - 4 * a3 * c3) r3 = (b3 + sq3) / 2 return min(r1, r2, r3) def get_ellip_gaussian_2D(heatmap, center, radius_x, radius_y, k=1): """Generate 2D ellipse gaussian heatmap. Args: heatmap (Tensor): Input heatmap, the gaussian kernel will cover on it and maintain the max value. center (list[int]): Coord of gaussian kernel's center. radius_x (int): X-axis radius of gaussian kernel. radius_y (int): Y-axis radius of gaussian kernel. k (int, optional): Coefficient of gaussian kernel. Default: 1. Returns: out_heatmap (Tensor): Updated heatmap covered by gaussian kernel. """ diameter_x, diameter_y = 2 * radius_x + 1, 2 * radius_y + 1 gaussian_kernel = ellip_gaussian2D((radius_x, radius_y), sigma_x=diameter_x / 6, sigma_y=diameter_y / 6, dtype=heatmap.dtype, device=heatmap.device) x, y = int(center[0]), int(center[1]) height, width = heatmap.shape[0:2] left, right = min(x, radius_x), min(width - x, radius_x + 1) top, bottom = min(y, radius_y), min(height - y, radius_y + 1) masked_heatmap = heatmap[y - top:y + bottom, x - left:x + right] masked_gaussian = gaussian_kernel[radius_y - top:radius_y + bottom, radius_x - left:radius_x + right] out_heatmap = heatmap torch.max( masked_heatmap, masked_gaussian * k, out=out_heatmap[y - top:y + bottom, x - left:x + right]) return out_heatmap def ellip_gaussian2D(radius, sigma_x, sigma_y, dtype=torch.float32, device='cpu'): """Generate 2D ellipse gaussian kernel. Args: radius (tuple(int)): Ellipse radius (radius_x, radius_y) of gaussian kernel. sigma_x (int): X-axis sigma of gaussian function. sigma_y (int): Y-axis sigma of gaussian function. dtype (torch.dtype, optional): Dtype of gaussian tensor. Default: torch.float32. device (str, optional): Device of gaussian tensor. Default: 'cpu'. Returns: h (Tensor): Gaussian kernel with a ``(2 * radius_y + 1) * (2 * radius_x + 1)`` shape. """ x = torch.arange( -radius[0], radius[0] + 1, dtype=dtype, device=device).view(1, -1) y = torch.arange( -radius[1], radius[1] + 1, dtype=dtype, device=device).view(-1, 1) h = (-(x * x) / (2 * sigma_x * sigma_x) - (y * y) / (2 * sigma_y * sigma_y)).exp() h[h < torch.finfo(h.dtype).eps * h.max()] = 0 return h ================================================ FILE: mmdet3d/core/visualizer/__init__.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. from .show_result import (show_multi_modality_result, show_result, show_seg_result) __all__ = ['show_result', 'show_seg_result', 'show_multi_modality_result'] ================================================ FILE: mmdet3d/core/visualizer/image_vis.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import copy import cv2 import numpy as np import torch from matplotlib import pyplot as plt def project_pts_on_img(points, raw_img, lidar2img_rt, max_distance=70, thickness=-1): """Project the 3D points cloud on 2D image. 
Args: points (numpy.array): 3D points cloud (x, y, z) to visualize. raw_img (numpy.array): The numpy array of image. lidar2img_rt (numpy.array, shape=[4, 4]): The projection matrix according to the camera intrinsic parameters. max_distance (float, optional): the max distance of the points cloud. Default: 70. thickness (int, optional): The thickness of 2D points. Default: -1. """ img = raw_img.copy() num_points = points.shape[0] pts_4d = np.concatenate([points[:, :3], np.ones((num_points, 1))], axis=-1) pts_2d = pts_4d @ lidar2img_rt.T # cam_points is Tensor of Nx4 whose last column is 1 # transform camera coordinate to image coordinate pts_2d[:, 2] = np.clip(pts_2d[:, 2], a_min=1e-5, a_max=99999) pts_2d[:, 0] /= pts_2d[:, 2] pts_2d[:, 1] /= pts_2d[:, 2] fov_inds = ((pts_2d[:, 0] < img.shape[1]) & (pts_2d[:, 0] >= 0) & (pts_2d[:, 1] < img.shape[0]) & (pts_2d[:, 1] >= 0)) imgfov_pts_2d = pts_2d[fov_inds, :3] # u, v, d cmap = plt.cm.get_cmap('hsv', 256) cmap = np.array([cmap(i) for i in range(256)])[:, :3] * 255 for i in range(imgfov_pts_2d.shape[0]): depth = imgfov_pts_2d[i, 2] color = cmap[np.clip(int(max_distance * 10 / depth), 0, 255), :] cv2.circle( img, center=(int(np.round(imgfov_pts_2d[i, 0])), int(np.round(imgfov_pts_2d[i, 1]))), radius=1, color=tuple(color), thickness=thickness, ) cv2.imshow('project_pts_img', img.astype(np.uint8)) cv2.waitKey(100) def plot_rect3d_on_img(img, num_rects, rect_corners, color=(0, 255, 0), thickness=1): """Plot the boundary lines of 3D rectangular on 2D images. Args: img (numpy.array): The numpy array of image. num_rects (int): Number of 3D rectangulars. rect_corners (numpy.array): Coordinates of the corners of 3D rectangulars. Should be in the shape of [num_rect, 8, 2]. color (tuple[int], optional): The color to draw bboxes. Default: (0, 255, 0). thickness (int, optional): The thickness of bboxes. Default: 1. """ line_indices = ((0, 1), (0, 3), (0, 4), (1, 2), (1, 5), (3, 2), (3, 7), (4, 5), (4, 7), (2, 6), (5, 6), (6, 7)) for i in range(num_rects): corners = rect_corners[i].astype(np.int) for start, end in line_indices: cv2.line(img, (corners[start, 0], corners[start, 1]), (corners[end, 0], corners[end, 1]), color, thickness, cv2.LINE_AA) return img.astype(np.uint8) def draw_lidar_bbox3d_on_img(bboxes3d, raw_img, lidar2img_rt, img_metas, color=(0, 255, 0), thickness=1): """Project the 3D bbox on 2D plane and draw on input image. Args: bboxes3d (:obj:`LiDARInstance3DBoxes`): 3d bbox in lidar coordinate system to visualize. raw_img (numpy.array): The numpy array of image. lidar2img_rt (numpy.array, shape=[4, 4]): The projection matrix according to the camera intrinsic parameters. img_metas (dict): Useless here. color (tuple[int], optional): The color to draw bboxes. Default: (0, 255, 0). thickness (int, optional): The thickness of bboxes. Default: 1. 
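    Example:
        A minimal, illustrative sketch; the single box and the identity
        ``lidar2img_rt`` below are placeholders rather than a real camera
        calibration:

        >>> import numpy as np
        >>> from mmdet3d.core.bbox import LiDARInstance3DBoxes
        >>> img = np.zeros((900, 1600, 3), dtype=np.uint8)
        >>> boxes = LiDARInstance3DBoxes(
        ...     np.array([[10.0, 2.0, -1.0, 4.0, 1.8, 1.5, 0.3]]))
        >>> out_img = draw_lidar_bbox3d_on_img(
        ...     boxes, img, np.eye(4), img_metas=None)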
""" img = raw_img.copy() corners_3d = bboxes3d.corners num_bbox = corners_3d.shape[0] pts_4d = np.concatenate( [corners_3d.reshape(-1, 3), np.ones((num_bbox * 8, 1))], axis=-1) lidar2img_rt = copy.deepcopy(lidar2img_rt).reshape(4, 4) if isinstance(lidar2img_rt, torch.Tensor): lidar2img_rt = lidar2img_rt.cpu().numpy() pts_2d = pts_4d @ lidar2img_rt.T pts_2d[:, 2] = np.clip(pts_2d[:, 2], a_min=1e-5, a_max=1e5) pts_2d[:, 0] /= pts_2d[:, 2] pts_2d[:, 1] /= pts_2d[:, 2] imgfov_pts_2d = pts_2d[..., :2].reshape(num_bbox, 8, 2) return plot_rect3d_on_img(img, num_bbox, imgfov_pts_2d, color, thickness) # TODO: remove third parameter in all functions here in favour of img_metas def draw_depth_bbox3d_on_img(bboxes3d, raw_img, calibs, img_metas, color=(0, 255, 0), thickness=1): """Project the 3D bbox on 2D plane and draw on input image. Args: bboxes3d (:obj:`DepthInstance3DBoxes`, shape=[M, 7]): 3d bbox in depth coordinate system to visualize. raw_img (numpy.array): The numpy array of image. calibs (dict): Camera calibration information, Rt and K. img_metas (dict): Used in coordinates transformation. color (tuple[int], optional): The color to draw bboxes. Default: (0, 255, 0). thickness (int, optional): The thickness of bboxes. Default: 1. """ from mmdet3d.core.bbox import points_cam2img from mmdet3d.models import apply_3d_transformation img = raw_img.copy() img_metas = copy.deepcopy(img_metas) corners_3d = bboxes3d.corners num_bbox = corners_3d.shape[0] points_3d = corners_3d.reshape(-1, 3) # first reverse the data transformations xyz_depth = apply_3d_transformation( points_3d, 'DEPTH', img_metas, reverse=True) # project to 2d to get image coords (uv) uv_origin = points_cam2img(xyz_depth, xyz_depth.new_tensor(img_metas['depth2img'])) uv_origin = (uv_origin - 1).round() imgfov_pts_2d = uv_origin[..., :2].reshape(num_bbox, 8, 2).numpy() return plot_rect3d_on_img(img, num_bbox, imgfov_pts_2d, color, thickness) def draw_camera_bbox3d_on_img(bboxes3d, raw_img, cam2img, img_metas, color=(0, 255, 0), thickness=1): """Project the 3D bbox on 2D plane and draw on input image. Args: bboxes3d (:obj:`CameraInstance3DBoxes`, shape=[M, 7]): 3d bbox in camera coordinate system to visualize. raw_img (numpy.array): The numpy array of image. cam2img (dict): Camera intrinsic matrix, denoted as `K` in depth bbox coordinate system. img_metas (dict): Useless here. color (tuple[int], optional): The color to draw bboxes. Default: (0, 255, 0). thickness (int, optional): The thickness of bboxes. Default: 1. """ from mmdet3d.core.bbox import points_cam2img img = raw_img.copy() cam2img = copy.deepcopy(cam2img) corners_3d = bboxes3d.corners num_bbox = corners_3d.shape[0] points_3d = corners_3d.reshape(-1, 3) if not isinstance(cam2img, torch.Tensor): cam2img = torch.from_numpy(np.array(cam2img)) assert (cam2img.shape == torch.Size([3, 3]) or cam2img.shape == torch.Size([4, 4])) cam2img = cam2img.float().cpu() # project to 2d to get image coords (uv) uv_origin = points_cam2img(points_3d, cam2img) uv_origin = (uv_origin - 1).round() imgfov_pts_2d = uv_origin[..., :2].reshape(num_bbox, 8, 2).numpy() return plot_rect3d_on_img(img, num_bbox, imgfov_pts_2d, color, thickness) ================================================ FILE: mmdet3d/core/visualizer/open3d_vis.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. 
import copy import numpy as np import torch try: import open3d as o3d from open3d import geometry except ImportError: raise ImportError( 'Please run "pip install open3d" to install open3d first.') def _draw_points(points, vis, points_size=2, point_color=(0.5, 0.5, 0.5), mode='xyz'): """Draw points on visualizer. Args: points (numpy.array | torch.tensor, shape=[N, 3+C]): points to visualize. vis (:obj:`open3d.visualization.Visualizer`): open3d visualizer. points_size (int, optional): the size of points to show on visualizer. Default: 2. point_color (tuple[float], optional): the color of points. Default: (0.5, 0.5, 0.5). mode (str, optional): indicate type of the input points, available mode ['xyz', 'xyzrgb']. Default: 'xyz'. Returns: tuple: points, color of each point. """ vis.get_render_option().point_size = points_size # set points size if isinstance(points, torch.Tensor): points = points.cpu().numpy() points = points.copy() pcd = geometry.PointCloud() if mode == 'xyz': pcd.points = o3d.utility.Vector3dVector(points[:, :3]) points_colors = np.tile(np.array(point_color), (points.shape[0], 1)) elif mode == 'xyzrgb': pcd.points = o3d.utility.Vector3dVector(points[:, :3]) points_colors = points[:, 3:6] # normalize to [0, 1] for open3d drawing if not ((points_colors >= 0.0) & (points_colors <= 1.0)).all(): points_colors /= 255.0 else: raise NotImplementedError pcd.colors = o3d.utility.Vector3dVector(points_colors) vis.add_geometry(pcd) return pcd, points_colors def _draw_bboxes(bbox3d, vis, points_colors, pcd=None, bbox_color=(0, 1, 0), points_in_box_color=(1, 0, 0), rot_axis=2, center_mode='lidar_bottom', mode='xyz'): """Draw bbox on visualizer and change the color of points inside bbox3d. Args: bbox3d (numpy.array | torch.tensor, shape=[M, 7]): 3d bbox (x, y, z, x_size, y_size, z_size, yaw) to visualize. vis (:obj:`open3d.visualization.Visualizer`): open3d visualizer. points_colors (numpy.array): color of each points. pcd (:obj:`open3d.geometry.PointCloud`, optional): point cloud. Default: None. bbox_color (tuple[float], optional): the color of bbox. Default: (0, 1, 0). points_in_box_color (tuple[float], optional): the color of points inside bbox3d. Default: (1, 0, 0). rot_axis (int, optional): rotation axis of bbox. Default: 2. center_mode (bool, optional): indicate the center of bbox is bottom center or gravity center. available mode ['lidar_bottom', 'camera_bottom']. Default: 'lidar_bottom'. mode (str, optional): indicate type of the input points, available mode ['xyz', 'xyzrgb']. Default: 'xyz'. 
""" if isinstance(bbox3d, torch.Tensor): bbox3d = bbox3d.cpu().numpy() bbox3d = bbox3d.copy() in_box_color = np.array(points_in_box_color) for i in range(len(bbox3d)): center = bbox3d[i, 0:3] dim = bbox3d[i, 3:6] yaw = np.zeros(3) yaw[rot_axis] = bbox3d[i, 6] rot_mat = geometry.get_rotation_matrix_from_xyz(yaw) if center_mode == 'lidar_bottom': center[rot_axis] += dim[ rot_axis] / 2 # bottom center to gravity center elif center_mode == 'camera_bottom': center[rot_axis] -= dim[ rot_axis] / 2 # bottom center to gravity center box3d = geometry.OrientedBoundingBox(center, rot_mat, dim) line_set = geometry.LineSet.create_from_oriented_bounding_box(box3d) line_set.paint_uniform_color(bbox_color) # draw bboxes on visualizer vis.add_geometry(line_set) # change the color of points which are in box if pcd is not None and mode == 'xyz': indices = box3d.get_point_indices_within_bounding_box(pcd.points) points_colors[indices] = in_box_color # update points colors if pcd is not None: pcd.colors = o3d.utility.Vector3dVector(points_colors) vis.update_geometry(pcd) def show_pts_boxes(points, bbox3d=None, show=True, save_path=None, points_size=2, point_color=(0.5, 0.5, 0.5), bbox_color=(0, 1, 0), points_in_box_color=(1, 0, 0), rot_axis=2, center_mode='lidar_bottom', mode='xyz'): """Draw bbox and points on visualizer. Args: points (numpy.array | torch.tensor, shape=[N, 3+C]): points to visualize. bbox3d (numpy.array | torch.tensor, shape=[M, 7], optional): 3D bbox (x, y, z, x_size, y_size, z_size, yaw) to visualize. Defaults to None. show (bool, optional): whether to show the visualization results. Default: True. save_path (str, optional): path to save visualized results. Default: None. points_size (int, optional): the size of points to show on visualizer. Default: 2. point_color (tuple[float], optional): the color of points. Default: (0.5, 0.5, 0.5). bbox_color (tuple[float], optional): the color of bbox. Default: (0, 1, 0). points_in_box_color (tuple[float], optional): the color of points which are in bbox3d. Default: (1, 0, 0). rot_axis (int, optional): rotation axis of bbox. Default: 2. center_mode (bool, optional): indicate the center of bbox is bottom center or gravity center. available mode ['lidar_bottom', 'camera_bottom']. Default: 'lidar_bottom'. mode (str, optional): indicate type of the input points, available mode ['xyz', 'xyzrgb']. Default: 'xyz'. """ # TODO: support score and class info assert 0 <= rot_axis <= 2 # init visualizer vis = o3d.visualization.Visualizer() vis.create_window() mesh_frame = geometry.TriangleMesh.create_coordinate_frame( size=1, origin=[0, 0, 0]) # create coordinate frame vis.add_geometry(mesh_frame) # draw points pcd, points_colors = _draw_points(points, vis, points_size, point_color, mode) # draw boxes if bbox3d is not None: _draw_bboxes(bbox3d, vis, points_colors, pcd, bbox_color, points_in_box_color, rot_axis, center_mode, mode) if show: vis.run() if save_path is not None: vis.capture_screen_image(save_path) vis.destroy_window() def _draw_bboxes_ind(bbox3d, vis, indices, points_colors, pcd=None, bbox_color=(0, 1, 0), points_in_box_color=(1, 0, 0), rot_axis=2, center_mode='lidar_bottom', mode='xyz'): """Draw bbox on visualizer and change the color or points inside bbox3d with indices. Args: bbox3d (numpy.array | torch.tensor, shape=[M, 7]): 3d bbox (x, y, z, x_size, y_size, z_size, yaw) to visualize. vis (:obj:`open3d.visualization.Visualizer`): open3d visualizer. indices (numpy.array | torch.tensor, shape=[N, M]): indicate which bbox3d that each point lies in. 
points_colors (numpy.array): color of each points. pcd (:obj:`open3d.geometry.PointCloud`, optional): point cloud. Default: None. bbox_color (tuple[float], optional): the color of bbox. Default: (0, 1, 0). points_in_box_color (tuple[float], optional): the color of points which are in bbox3d. Default: (1, 0, 0). rot_axis (int, optional): rotation axis of bbox. Default: 2. center_mode (bool, optional): indicate the center of bbox is bottom center or gravity center. available mode ['lidar_bottom', 'camera_bottom']. Default: 'lidar_bottom'. mode (str, optional): indicate type of the input points, available mode ['xyz', 'xyzrgb']. Default: 'xyz'. """ if isinstance(bbox3d, torch.Tensor): bbox3d = bbox3d.cpu().numpy() if isinstance(indices, torch.Tensor): indices = indices.cpu().numpy() bbox3d = bbox3d.copy() in_box_color = np.array(points_in_box_color) for i in range(len(bbox3d)): center = bbox3d[i, 0:3] dim = bbox3d[i, 3:6] yaw = np.zeros(3) # TODO: fix problem of current coordinate system # dim[0], dim[1] = dim[1], dim[0] # for current coordinate # yaw[rot_axis] = -(bbox3d[i, 6] - 0.5 * np.pi) yaw[rot_axis] = -bbox3d[i, 6] rot_mat = geometry.get_rotation_matrix_from_xyz(yaw) if center_mode == 'lidar_bottom': center[rot_axis] += dim[ rot_axis] / 2 # bottom center to gravity center elif center_mode == 'camera_bottom': center[rot_axis] -= dim[ rot_axis] / 2 # bottom center to gravity center box3d = geometry.OrientedBoundingBox(center, rot_mat, dim) line_set = geometry.LineSet.create_from_oriented_bounding_box(box3d) line_set.paint_uniform_color(bbox_color) # draw bboxes on visualizer vis.add_geometry(line_set) # change the color of points which are in box if pcd is not None and mode == 'xyz': points_colors[indices[:, i].astype(np.bool)] = in_box_color # update points colors if pcd is not None: pcd.colors = o3d.utility.Vector3dVector(points_colors) vis.update_geometry(pcd) def show_pts_index_boxes(points, bbox3d=None, show=True, indices=None, save_path=None, points_size=2, point_color=(0.5, 0.5, 0.5), bbox_color=(0, 1, 0), points_in_box_color=(1, 0, 0), rot_axis=2, center_mode='lidar_bottom', mode='xyz'): """Draw bbox and points on visualizer with indices that indicate which bbox3d that each point lies in. Args: points (numpy.array | torch.tensor, shape=[N, 3+C]): points to visualize. bbox3d (numpy.array | torch.tensor, shape=[M, 7]): 3D bbox (x, y, z, x_size, y_size, z_size, yaw) to visualize. Defaults to None. show (bool, optional): whether to show the visualization results. Default: True. indices (numpy.array | torch.tensor, shape=[N, M], optional): indicate which bbox3d that each point lies in. Default: None. save_path (str, optional): path to save visualized results. Default: None. points_size (int, optional): the size of points to show on visualizer. Default: 2. point_color (tuple[float], optional): the color of points. Default: (0.5, 0.5, 0.5). bbox_color (tuple[float], optional): the color of bbox. Default: (0, 1, 0). points_in_box_color (tuple[float], optional): the color of points which are in bbox3d. Default: (1, 0, 0). rot_axis (int, optional): rotation axis of bbox. Default: 2. center_mode (bool, optional): indicate the center of bbox is bottom center or gravity center. available mode ['lidar_bottom', 'camera_bottom']. Default: 'lidar_bottom'. mode (str, optional): indicate type of the input points, available mode ['xyz', 'xyzrgb']. Default: 'xyz'. 
""" # TODO: support score and class info assert 0 <= rot_axis <= 2 # init visualizer vis = o3d.visualization.Visualizer() vis.create_window() mesh_frame = geometry.TriangleMesh.create_coordinate_frame( size=1, origin=[0, 0, 0]) # create coordinate frame vis.add_geometry(mesh_frame) # draw points pcd, points_colors = _draw_points(points, vis, points_size, point_color, mode) # draw boxes if bbox3d is not None: _draw_bboxes_ind(bbox3d, vis, indices, points_colors, pcd, bbox_color, points_in_box_color, rot_axis, center_mode, mode) if show: vis.run() if save_path is not None: vis.capture_screen_image(save_path) vis.destroy_window() class Visualizer(object): r"""Online visualizer implemented with Open3d. Args: points (numpy.array, shape=[N, 3+C]): Points to visualize. The Points cloud is in mode of Coord3DMode.DEPTH (please refer to core.structures.coord_3d_mode). bbox3d (numpy.array, shape=[M, 7], optional): 3D bbox (x, y, z, x_size, y_size, z_size, yaw) to visualize. The 3D bbox is in mode of Box3DMode.DEPTH with gravity_center (please refer to core.structures.box_3d_mode). Default: None. save_path (str, optional): path to save visualized results. Default: None. points_size (int, optional): the size of points to show on visualizer. Default: 2. point_color (tuple[float], optional): the color of points. Default: (0.5, 0.5, 0.5). bbox_color (tuple[float], optional): the color of bbox. Default: (0, 1, 0). points_in_box_color (tuple[float], optional): the color of points which are in bbox3d. Default: (1, 0, 0). rot_axis (int, optional): rotation axis of bbox. Default: 2. center_mode (bool, optional): indicate the center of bbox is bottom center or gravity center. available mode ['lidar_bottom', 'camera_bottom']. Default: 'lidar_bottom'. mode (str, optional): indicate type of the input points, available mode ['xyz', 'xyzrgb']. Default: 'xyz'. """ def __init__(self, points, bbox3d=None, save_path=None, points_size=2, point_color=(0.5, 0.5, 0.5), bbox_color=(0, 1, 0), points_in_box_color=(1, 0, 0), rot_axis=2, center_mode='lidar_bottom', mode='xyz'): super(Visualizer, self).__init__() assert 0 <= rot_axis <= 2 # init visualizer self.o3d_visualizer = o3d.visualization.Visualizer() self.o3d_visualizer.create_window() mesh_frame = geometry.TriangleMesh.create_coordinate_frame( size=1, origin=[0, 0, 0]) # create coordinate frame self.o3d_visualizer.add_geometry(mesh_frame) self.points_size = points_size self.point_color = point_color self.bbox_color = bbox_color self.points_in_box_color = points_in_box_color self.rot_axis = rot_axis self.center_mode = center_mode self.mode = mode self.seg_num = 0 # draw points if points is not None: self.pcd, self.points_colors = _draw_points( points, self.o3d_visualizer, points_size, point_color, mode) # draw boxes if bbox3d is not None: _draw_bboxes(bbox3d, self.o3d_visualizer, self.points_colors, self.pcd, bbox_color, points_in_box_color, rot_axis, center_mode, mode) def add_bboxes(self, bbox3d, bbox_color=None, points_in_box_color=None): """Add bounding box to visualizer. Args: bbox3d (numpy.array, shape=[M, 7]): 3D bbox (x, y, z, x_size, y_size, z_size, yaw) to be visualized. The 3d bbox is in mode of Box3DMode.DEPTH with gravity_center (please refer to core.structures.box_3d_mode). bbox_color (tuple[float]): the color of bbox. Default: None. points_in_box_color (tuple[float]): the color of points which are in bbox3d. Default: None. 
""" if bbox_color is None: bbox_color = self.bbox_color if points_in_box_color is None: points_in_box_color = self.points_in_box_color _draw_bboxes(bbox3d, self.o3d_visualizer, self.points_colors, self.pcd, bbox_color, points_in_box_color, self.rot_axis, self.center_mode, self.mode) def add_seg_mask(self, seg_mask_colors): """Add segmentation mask to visualizer via per-point colorization. Args: seg_mask_colors (numpy.array, shape=[N, 6]): The segmentation mask whose first 3 dims are point coordinates and last 3 dims are converted colors. """ # we can't draw the colors on existing points # in case gt and pred mask would overlap # instead we set a large offset along x-axis for each seg mask self.seg_num += 1 offset = (np.array(self.pcd.points).max(0) - np.array(self.pcd.points).min(0))[0] * 1.2 * self.seg_num mesh_frame = geometry.TriangleMesh.create_coordinate_frame( size=1, origin=[offset, 0, 0]) # create coordinate frame for seg self.o3d_visualizer.add_geometry(mesh_frame) seg_points = copy.deepcopy(seg_mask_colors) seg_points[:, 0] += offset _draw_points( seg_points, self.o3d_visualizer, self.points_size, mode='xyzrgb') def show(self, save_path=None): """Visualize the points cloud. Args: save_path (str, optional): path to save image. Default: None. """ self.o3d_visualizer.run() if save_path is not None: self.o3d_visualizer.capture_screen_image(save_path) self.o3d_visualizer.destroy_window() return ================================================ FILE: mmdet3d/core/visualizer/show_result.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. from os import path as osp import mmcv import numpy as np import trimesh from .image_vis import (draw_camera_bbox3d_on_img, draw_depth_bbox3d_on_img, draw_lidar_bbox3d_on_img) def _write_obj(points, out_filename): """Write points into ``obj`` format for meshlab visualization. Args: points (np.ndarray): Points in shape (N, dim). out_filename (str): Filename to be saved. """ N = points.shape[0] fout = open(out_filename, 'w') for i in range(N): if points.shape[1] == 6: c = points[i, 3:].astype(int) fout.write( 'v %f %f %f %d %d %d\n' % (points[i, 0], points[i, 1], points[i, 2], c[0], c[1], c[2])) else: fout.write('v %f %f %f\n' % (points[i, 0], points[i, 1], points[i, 2])) fout.close() def _write_oriented_bbox(scene_bbox, out_filename): """Export oriented (around Z axis) scene bbox to meshes. Args: scene_bbox(list[ndarray] or ndarray): xyz pos of center and 3 lengths (x_size, y_size, z_size) and heading angle around Z axis. Y forward, X right, Z upward. heading angle of positive X is 0, heading angle of positive Y is 90 degrees. out_filename(str): Filename. 
""" def heading2rotmat(heading_angle): rotmat = np.zeros((3, 3)) rotmat[2, 2] = 1 cosval = np.cos(heading_angle) sinval = np.sin(heading_angle) rotmat[0:2, 0:2] = np.array([[cosval, -sinval], [sinval, cosval]]) return rotmat def convert_oriented_box_to_trimesh_fmt(box): ctr = box[:3] lengths = box[3:6] trns = np.eye(4) trns[0:3, 3] = ctr trns[3, 3] = 1.0 trns[0:3, 0:3] = heading2rotmat(box[6]) box_trimesh_fmt = trimesh.creation.box(lengths, trns) return box_trimesh_fmt if len(scene_bbox) == 0: scene_bbox = np.zeros((1, 7)) scene = trimesh.scene.Scene() for box in scene_bbox: scene.add_geometry(convert_oriented_box_to_trimesh_fmt(box)) mesh_list = trimesh.util.concatenate(scene.dump()) # save to obj file trimesh.io.export.export_mesh(mesh_list, out_filename, file_type='obj') return def show_result(points, gt_bboxes, pred_bboxes, out_dir, filename, show=False, snapshot=False, pred_labels=None): """Convert results into format that is directly readable for meshlab. Args: points (np.ndarray): Points. gt_bboxes (np.ndarray): Ground truth boxes. pred_bboxes (np.ndarray): Predicted boxes. out_dir (str): Path of output directory filename (str): Filename of the current frame. show (bool, optional): Visualize the results online. Defaults to False. snapshot (bool, optional): Whether to save the online results. Defaults to False. pred_labels (np.ndarray, optional): Predicted labels of boxes. Defaults to None. """ result_path = osp.join(out_dir, filename) mmcv.mkdir_or_exist(result_path) if show: from .open3d_vis import Visualizer vis = Visualizer(points) if pred_bboxes is not None: if pred_labels is None: vis.add_bboxes(bbox3d=pred_bboxes) else: palette = np.random.randint( 0, 255, size=(pred_labels.max() + 1, 3)) / 256 labelDict = {} for j in range(len(pred_labels)): i = int(pred_labels[j].numpy()) if labelDict.get(i) is None: labelDict[i] = [] labelDict[i].append(pred_bboxes[j]) for i in labelDict: vis.add_bboxes( bbox3d=np.array(labelDict[i]), bbox_color=palette[i], points_in_box_color=palette[i]) if gt_bboxes is not None: vis.add_bboxes(bbox3d=gt_bboxes, bbox_color=(0, 0, 1)) show_path = osp.join(result_path, f'{filename}_online.png') if snapshot else None vis.show(show_path) if points is not None: _write_obj(points, osp.join(result_path, f'{filename}_points.obj')) if gt_bboxes is not None: # bottom center to gravity center gt_bboxes[..., 2] += gt_bboxes[..., 5] / 2 _write_oriented_bbox(gt_bboxes, osp.join(result_path, f'{filename}_gt.obj')) if pred_bboxes is not None: # bottom center to gravity center pred_bboxes[..., 2] += pred_bboxes[..., 5] / 2 _write_oriented_bbox(pred_bboxes, osp.join(result_path, f'{filename}_pred.obj')) def show_seg_result(points, gt_seg, pred_seg, out_dir, filename, palette, ignore_index=None, show=False, snapshot=False): """Convert results into format that is directly readable for meshlab. Args: points (np.ndarray): Points. gt_seg (np.ndarray): Ground truth segmentation mask. pred_seg (np.ndarray): Predicted segmentation mask. out_dir (str): Path of output directory filename (str): Filename of the current frame. palette (np.ndarray): Mapping between class labels and colors. ignore_index (int, optional): The label index to be ignored, e.g. unannotated points. Defaults to None. show (bool, optional): Visualize the results online. Defaults to False. snapshot (bool, optional): Whether to save the online results. Defaults to False. 
""" # we need 3D coordinates to visualize segmentation mask if gt_seg is not None or pred_seg is not None: assert points is not None, \ '3D coordinates are required for segmentation visualization' # filter out ignored points if gt_seg is not None and ignore_index is not None: if points is not None: points = points[gt_seg != ignore_index] if pred_seg is not None: pred_seg = pred_seg[gt_seg != ignore_index] gt_seg = gt_seg[gt_seg != ignore_index] if gt_seg is not None: gt_seg_color = palette[gt_seg] gt_seg_color = np.concatenate([points[:, :3], gt_seg_color], axis=1) if pred_seg is not None: pred_seg_color = palette[pred_seg] pred_seg_color = np.concatenate([points[:, :3], pred_seg_color], axis=1) result_path = osp.join(out_dir, filename) mmcv.mkdir_or_exist(result_path) # online visualization of segmentation mask # we show three masks in a row, scene_points, gt_mask, pred_mask if show: from .open3d_vis import Visualizer mode = 'xyzrgb' if points.shape[1] == 6 else 'xyz' vis = Visualizer(points, mode=mode) if gt_seg is not None: vis.add_seg_mask(gt_seg_color) if pred_seg is not None: vis.add_seg_mask(pred_seg_color) show_path = osp.join(result_path, f'{filename}_online.png') if snapshot else None vis.show(show_path) if points is not None: _write_obj(points, osp.join(result_path, f'{filename}_points.obj')) if gt_seg is not None: _write_obj(gt_seg_color, osp.join(result_path, f'{filename}_gt.obj')) if pred_seg is not None: _write_obj(pred_seg_color, osp.join(result_path, f'{filename}_pred.obj')) def show_multi_modality_result(img, gt_bboxes, pred_bboxes, proj_mat, out_dir, filename, box_mode='lidar', img_metas=None, show=False, gt_bbox_color=(61, 102, 255), pred_bbox_color=(241, 101, 72)): """Convert multi-modality detection results into 2D results. Project the predicted 3D bbox to 2D image plane and visualize them. Args: img (np.ndarray): The numpy array of image in cv2 fashion. gt_bboxes (:obj:`BaseInstance3DBoxes`): Ground truth boxes. pred_bboxes (:obj:`BaseInstance3DBoxes`): Predicted boxes. proj_mat (numpy.array, shape=[4, 4]): The projection matrix according to the camera intrinsic parameters. out_dir (str): Path of output directory. filename (str): Filename of the current frame. box_mode (str, optional): Coordinate system the boxes are in. Should be one of 'depth', 'lidar' and 'camera'. Defaults to 'lidar'. img_metas (dict, optional): Used in projecting depth bbox. Defaults to None. show (bool, optional): Visualize the results online. Defaults to False. gt_bbox_color (str or tuple(int), optional): Color of bbox lines. The tuple of color should be in BGR order. Default: (255, 102, 61). pred_bbox_color (str or tuple(int), optional): Color of bbox lines. The tuple of color should be in BGR order. Default: (72, 101, 241). 
""" if box_mode == 'depth': draw_bbox = draw_depth_bbox3d_on_img elif box_mode == 'lidar': draw_bbox = draw_lidar_bbox3d_on_img elif box_mode == 'camera': draw_bbox = draw_camera_bbox3d_on_img else: raise NotImplementedError(f'unsupported box mode {box_mode}') result_path = osp.join(out_dir, filename) mmcv.mkdir_or_exist(result_path) if show: show_img = img.copy() if gt_bboxes is not None: show_img = draw_bbox( gt_bboxes, show_img, proj_mat, img_metas, color=gt_bbox_color) if pred_bboxes is not None: show_img = draw_bbox( pred_bboxes, show_img, proj_mat, img_metas, color=pred_bbox_color) mmcv.imshow(show_img, win_name='project_bbox3d_img', wait_time=0) if img is not None: mmcv.imwrite(img, osp.join(result_path, f'{filename}_img.png')) if gt_bboxes is not None: gt_img = draw_bbox( gt_bboxes, img, proj_mat, img_metas, color=gt_bbox_color) mmcv.imwrite(gt_img, osp.join(result_path, f'{filename}_gt.png')) if pred_bboxes is not None: pred_img = draw_bbox( pred_bboxes, img, proj_mat, img_metas, color=pred_bbox_color) mmcv.imwrite(pred_img, osp.join(result_path, f'{filename}_pred.png')) ================================================ FILE: mmdet3d/core/voxel/__init__.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. from .builder import build_voxel_generator from .voxel_generator import VoxelGenerator __all__ = ['build_voxel_generator', 'VoxelGenerator'] ================================================ FILE: mmdet3d/core/voxel/builder.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import mmcv from . import voxel_generator def build_voxel_generator(cfg, **kwargs): """Builder of voxel generator.""" if isinstance(cfg, voxel_generator.VoxelGenerator): return cfg elif isinstance(cfg, dict): return mmcv.runner.obj_from_dict( cfg, voxel_generator, default_args=kwargs) else: raise TypeError('Invalid type {} for building a sampler'.format( type(cfg))) ================================================ FILE: mmdet3d/core/voxel/voxel_generator.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import numba import numpy as np class VoxelGenerator(object): """Voxel generator in numpy implementation. Args: voxel_size (list[float]): Size of a single voxel point_cloud_range (list[float]): Range of points max_num_points (int): Maximum number of points in a single voxel max_voxels (int, optional): Maximum number of voxels. Defaults to 20000. 
""" def __init__(self, voxel_size, point_cloud_range, max_num_points, max_voxels=20000): point_cloud_range = np.array(point_cloud_range, dtype=np.float32) # [0, -40, -3, 70.4, 40, 1] voxel_size = np.array(voxel_size, dtype=np.float32) grid_size = (point_cloud_range[3:] - point_cloud_range[:3]) / voxel_size grid_size = np.round(grid_size).astype(np.int64) self._voxel_size = voxel_size self._point_cloud_range = point_cloud_range self._max_num_points = max_num_points self._max_voxels = max_voxels self._grid_size = grid_size def generate(self, points): """Generate voxels given points.""" return points_to_voxel(points, self._voxel_size, self._point_cloud_range, self._max_num_points, True, self._max_voxels) @property def voxel_size(self): """list[float]: Size of a single voxel.""" return self._voxel_size @property def max_num_points_per_voxel(self): """int: Maximum number of points per voxel.""" return self._max_num_points @property def point_cloud_range(self): """list[float]: Range of point cloud.""" return self._point_cloud_range @property def grid_size(self): """np.ndarray: The size of grids.""" return self._grid_size def __repr__(self): """str: Return a string that describes the module.""" repr_str = self.__class__.__name__ indent = ' ' * (len(repr_str) + 1) repr_str += f'(voxel_size={self._voxel_size},\n' repr_str += indent + 'point_cloud_range=' repr_str += f'{self._point_cloud_range.tolist()},\n' repr_str += indent + f'max_num_points={self._max_num_points},\n' repr_str += indent + f'max_voxels={self._max_voxels},\n' repr_str += indent + f'grid_size={self._grid_size.tolist()}' repr_str += ')' return repr_str def points_to_voxel(points, voxel_size, coors_range, max_points=35, reverse_index=True, max_voxels=20000): """convert kitti points(N, >=3) to voxels. Args: points (np.ndarray): [N, ndim]. points[:, :3] contain xyz points and points[:, 3:] contain other information such as reflectivity. voxel_size (list, tuple, np.ndarray): [3] xyz, indicate voxel size coors_range (list[float | tuple[float] | ndarray]): Voxel range. format: xyzxyz, minmax max_points (int): Indicate maximum points contained in a voxel. reverse_index (bool): Whether return reversed coordinates. if points has xyz format and reverse_index is True, output coordinates will be zyx format, but points in features always xyz format. max_voxels (int): Maximum number of voxels this function creates. For second, 20000 is a good choice. Points should be shuffled for randomness before this function because max_voxels drops points. Returns: tuple[np.ndarray]: voxels: [M, max_points, ndim] float tensor. only contain points. coordinates: [M, 3] int32 tensor. num_points_per_voxel: [M] int32 tensor. """ if not isinstance(voxel_size, np.ndarray): voxel_size = np.array(voxel_size, dtype=points.dtype) if not isinstance(coors_range, np.ndarray): coors_range = np.array(coors_range, dtype=points.dtype) voxelmap_shape = (coors_range[3:] - coors_range[:3]) / voxel_size voxelmap_shape = tuple(np.round(voxelmap_shape).astype(np.int32).tolist()) if reverse_index: voxelmap_shape = voxelmap_shape[::-1] # don't create large array in jit(nopython=True) code. 
num_points_per_voxel = np.zeros(shape=(max_voxels, ), dtype=np.int32) coor_to_voxelidx = -np.ones(shape=voxelmap_shape, dtype=np.int32) voxels = np.zeros( shape=(max_voxels, max_points, points.shape[-1]), dtype=points.dtype) coors = np.zeros(shape=(max_voxels, 3), dtype=np.int32) if reverse_index: voxel_num = _points_to_voxel_reverse_kernel( points, voxel_size, coors_range, num_points_per_voxel, coor_to_voxelidx, voxels, coors, max_points, max_voxels) else: voxel_num = _points_to_voxel_kernel(points, voxel_size, coors_range, num_points_per_voxel, coor_to_voxelidx, voxels, coors, max_points, max_voxels) coors = coors[:voxel_num] voxels = voxels[:voxel_num] num_points_per_voxel = num_points_per_voxel[:voxel_num] return voxels, coors, num_points_per_voxel @numba.jit(nopython=True) def _points_to_voxel_reverse_kernel(points, voxel_size, coors_range, num_points_per_voxel, coor_to_voxelidx, voxels, coors, max_points=35, max_voxels=20000): """convert kitti points(N, >=3) to voxels. Args: points (np.ndarray): [N, ndim]. points[:, :3] contain xyz points and points[:, 3:] contain other information such as reflectivity. voxel_size (list, tuple, np.ndarray): [3] xyz, indicate voxel size coors_range (list[float | tuple[float] | ndarray]): Range of voxels. format: xyzxyz, minmax num_points_per_voxel (int): Number of points per voxel. coor_to_voxel_idx (np.ndarray): A voxel grid of shape (D, H, W), which has the same shape as the complete voxel map. It indicates the index of each corresponding voxel. voxels (np.ndarray): Created empty voxels. coors (np.ndarray): Created coordinates of each voxel. max_points (int): Indicate maximum points contained in a voxel. max_voxels (int): Maximum number of voxels this function create. for second, 20000 is a good choice. Points should be shuffled for randomness before this function because max_voxels drops points. Returns: tuple[np.ndarray]: voxels: Shape [M, max_points, ndim], only contain points. coordinates: Shape [M, 3]. num_points_per_voxel: Shape [M]. """ # put all computations to one loop. # we shouldn't create large array in main jit code, otherwise # reduce performance N = points.shape[0] # ndim = points.shape[1] - 1 ndim = 3 ndim_minus_1 = ndim - 1 grid_size = (coors_range[3:] - coors_range[:3]) / voxel_size # np.round(grid_size) # grid_size = np.round(grid_size).astype(np.int64)(np.int32) grid_size = np.round(grid_size, 0, grid_size).astype(np.int32) coor = np.zeros(shape=(3, ), dtype=np.int32) voxel_num = 0 failed = False for i in range(N): failed = False for j in range(ndim): c = np.floor((points[i, j] - coors_range[j]) / voxel_size[j]) if c < 0 or c >= grid_size[j]: failed = True break coor[ndim_minus_1 - j] = c if failed: continue voxelidx = coor_to_voxelidx[coor[0], coor[1], coor[2]] if voxelidx == -1: voxelidx = voxel_num if voxel_num >= max_voxels: continue voxel_num += 1 coor_to_voxelidx[coor[0], coor[1], coor[2]] = voxelidx coors[voxelidx] = coor num = num_points_per_voxel[voxelidx] if num < max_points: voxels[voxelidx, num] = points[i] num_points_per_voxel[voxelidx] += 1 return voxel_num @numba.jit(nopython=True) def _points_to_voxel_kernel(points, voxel_size, coors_range, num_points_per_voxel, coor_to_voxelidx, voxels, coors, max_points=35, max_voxels=20000): """convert kitti points(N, >=3) to voxels. Args: points (np.ndarray): [N, ndim]. points[:, :3] contain xyz points and points[:, 3:] contain other information such as reflectivity. voxel_size (list, tuple, np.ndarray): [3] xyz, indicate voxel size. 
coors_range (list[float | tuple[float] | ndarray]): Range of voxels. format: xyzxyz, minmax num_points_per_voxel (int): Number of points per voxel. coor_to_voxel_idx (np.ndarray): A voxel grid of shape (D, H, W), which has the same shape as the complete voxel map. It indicates the index of each corresponding voxel. voxels (np.ndarray): Created empty voxels. coors (np.ndarray): Created coordinates of each voxel. max_points (int): Indicate maximum points contained in a voxel. max_voxels (int): Maximum number of voxels this function create. for second, 20000 is a good choice. Points should be shuffled for randomness before this function because max_voxels drops points. Returns: tuple[np.ndarray]: voxels: Shape [M, max_points, ndim], only contain points. coordinates: Shape [M, 3]. num_points_per_voxel: Shape [M]. """ N = points.shape[0] # ndim = points.shape[1] - 1 ndim = 3 grid_size = (coors_range[3:] - coors_range[:3]) / voxel_size # grid_size = np.round(grid_size).astype(np.int64)(np.int32) grid_size = np.round(grid_size, 0, grid_size).astype(np.int32) # lower_bound = coors_range[:3] # upper_bound = coors_range[3:] coor = np.zeros(shape=(3, ), dtype=np.int32) voxel_num = 0 failed = False for i in range(N): failed = False for j in range(ndim): c = np.floor((points[i, j] - coors_range[j]) / voxel_size[j]) if c < 0 or c >= grid_size[j]: failed = True break coor[j] = c if failed: continue voxelidx = coor_to_voxelidx[coor[0], coor[1], coor[2]] if voxelidx == -1: voxelidx = voxel_num if voxel_num >= max_voxels: continue voxel_num += 1 coor_to_voxelidx[coor[0], coor[1], coor[2]] = voxelidx coors[voxelidx] = coor num = num_points_per_voxel[voxelidx] if num < max_points: voxels[voxelidx, num] = points[i] num_points_per_voxel[voxelidx] += 1 return voxel_num ================================================ FILE: mmdet3d/datasets/__init__.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. 
# from mmdet.datasets.builder import build_dataloader, from .builder import DATASETS, PIPELINES, build_dataset, build_dataloader from .custom_3d import Custom3DDataset from .custom_3d_seg import Custom3DSegDataset from .kitti_dataset import KittiDataset from .kitti_mono_dataset import KittiMonoDataset from .lyft_dataset import LyftDataset from .nuscenes_dataset import NuScenesDataset, NuscenesOccupancy from .nuscenes_mono_dataset import NuScenesMonoDataset # yapf: disable from .pipelines import (AffineResize, BackgroundPointsFilter, GlobalAlignment, GlobalRotScaleTrans, IndoorPatchPointSample, IndoorPointSample, LoadAnnotations3D, LoadPointsFromDict, LoadPointsFromFile, LoadPointsFromMultiSweeps, MultiViewWrapper, NormalizePointsColor, ObjectNameFilter, ObjectNoise, ObjectRangeFilter, ObjectSample, PointSample, PointShuffle, PointsRangeFilter, RandomDropPointsColor, RandomFlip3D, RandomJitterPoints, RandomRotate, RandomShiftScale, RangeLimitedRandomCrop, VoxelBasedPointSampler) # yapf: enable from .s3dis_dataset import S3DISDataset, S3DISSegDataset from .scannet_dataset import (ScanNetDataset, ScanNetInstanceSegDataset, ScanNetSegDataset) from .semantickitti_dataset import SemanticKITTIDataset from .sunrgbd_dataset import SUNRGBDDataset from .utils import get_loading_pipeline from .waymo_dataset import WaymoDataset from .samplers import InfiniteGroupEachSampleInBatchSampler __all__ = [ 'KittiDataset', 'KittiMonoDataset', 'build_dataloader', 'DATASETS', 'build_dataset', 'NuScenesDataset', 'NuScenesMonoDataset', 'LyftDataset', 'ObjectSample', 'RandomFlip3D', 'ObjectNoise', 'GlobalRotScaleTrans', 'PointShuffle', 'ObjectRangeFilter', 'PointsRangeFilter', 'LoadPointsFromFile', 'S3DISSegDataset', 'S3DISDataset', 'NormalizePointsColor', 'IndoorPatchPointSample', 'IndoorPointSample', 'PointSample', 'LoadAnnotations3D', 'GlobalAlignment', 'SUNRGBDDataset', 'ScanNetDataset', 'ScanNetSegDataset', 'ScanNetInstanceSegDataset', 'SemanticKITTIDataset', 'Custom3DDataset', 'Custom3DSegDataset', 'LoadPointsFromMultiSweeps', 'WaymoDataset', 'BackgroundPointsFilter', 'VoxelBasedPointSampler', 'get_loading_pipeline', 'RandomDropPointsColor', 'RandomJitterPoints', 'ObjectNameFilter', 'AffineResize', 'RandomShiftScale', 'LoadPointsFromDict', 'PIPELINES', 'RangeLimitedRandomCrop', 'RandomRotate', 'MultiViewWrapper' ] ================================================ FILE: mmdet3d/datasets/builder.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. 
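# Illustrative usage sketch (not executed anywhere in this file); ``cfg`` and
# the keyword values below are placeholders, not the actual BEV-Planner
# training settings:
#
#   dataset = build_dataset(cfg.data.train)
#   loader = build_dataloader(
#       dataset,
#       samples_per_gpu=1,
#       workers_per_gpu=4,
#       dist=True,
#       shuffle=True,
#       seed=0,
#       runner_type='IterBasedRunner')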
import platform from functools import partial from mmcv.utils import Registry, build_from_cfg from mmcv.parallel import collate from mmcv.runner import get_dist_info from mmdet.datasets import DATASETS from mmdet.datasets.builder import _concat_dataset, worker_init_fn from torch.utils.data import DataLoader from mmdet.datasets.samplers import (DistributedGroupSampler, DistributedSampler, GroupSampler) from .samplers import InfiniteGroupEachSampleInBatchSampler, CustomDistributedSampler, InfiniteGroupEachSampleInBatchSamplerEval, TTADistributedSampler if platform.system() != 'Windows': # https://github.com/pytorch/pytorch/issues/973 import resource rlimit = resource.getrlimit(resource.RLIMIT_NOFILE) base_soft_limit = rlimit[0] hard_limit = rlimit[1] soft_limit = min(max(4096, base_soft_limit), hard_limit) resource.setrlimit(resource.RLIMIT_NOFILE, (soft_limit, hard_limit)) OBJECTSAMPLERS = Registry('Object sampler') DATASETS = Registry('dataset') PIPELINES = Registry('pipeline') def build_dataset(cfg, default_args=None): from mmdet3d.datasets.dataset_wrappers import CBGSDataset from mmdet.datasets.dataset_wrappers import (ClassBalancedDataset, ConcatDataset, RepeatDataset) if isinstance(cfg, (list, tuple)): dataset = ConcatDataset([build_dataset(c, default_args) for c in cfg]) elif cfg['type'] == 'ConcatDataset': dataset = ConcatDataset( [build_dataset(c, default_args) for c in cfg['datasets']], cfg.get('separate_eval', True)) elif cfg['type'] == 'RepeatDataset': dataset = RepeatDataset( build_dataset(cfg['dataset'], default_args), cfg['times']) elif cfg['type'] == 'ClassBalancedDataset': dataset = ClassBalancedDataset( build_dataset(cfg['dataset'], default_args), cfg['oversample_thr']) elif cfg['type'] == 'CBGSDataset': dataset = CBGSDataset(build_dataset(cfg['dataset'], default_args)) elif isinstance(cfg.get('ann_file'), (list, tuple)): dataset = _concat_dataset(cfg, default_args) elif cfg['type'] in DATASETS._module_dict.keys(): dataset = build_from_cfg(cfg, DATASETS, default_args) else: dataset = build_from_cfg(cfg, MMDET_DATASETS, default_args) return dataset # https://github.com/open-mmlab/mmdetection/blob/v2.14.0/mmdet/datasets/builder.py def build_dataloader(dataset, samples_per_gpu, workers_per_gpu, num_gpus=1, dist=True, shuffle=True, seed=None, runner_type='EpochBasedRunner', val=False, **kwargs): """Build PyTorch DataLoader. In distributed training, each GPU/process has a dataloader. In non-distributed training, there is only one dataloader for all GPUs. Args: dataset (Dataset): A PyTorch dataset. samples_per_gpu (int): Number of training samples on each GPU, i.e., batch size of each GPU. workers_per_gpu (int): How many subprocesses to use for data loading for each GPU. num_gpus (int): Number of GPUs. Only used in non-distributed training. dist (bool): Distributed training/test or not. Default: True. shuffle (bool): Whether to shuffle the data at every epoch. Default: True. kwargs: any keyword argument to be used to initialize DataLoader Returns: DataLoader: A PyTorch dataloader. """ rank, world_size = get_dist_info() if dist: # When model is :obj:`DistributedDataParallel`, # `batch_size` of :obj:`dataloader` is the # number of training samples on each GPU. 
batch_size = samples_per_gpu num_workers = workers_per_gpu else: # When model is obj:`DataParallel` # the batch size is samples on all the GPUS batch_size = num_gpus * samples_per_gpu num_workers = num_gpus * workers_per_gpu if val: # runner_type = 'EpochBasedRunner' assert not shuffle if runner_type == 'IterBasedRunner': # TODO: original has more options, but I'm not using them # https://github.com/open-mmlab/mmdetection/blob/3b72b12fe9b14de906d1363982b9fba05e7d47c1/mmdet/datasets/builder.py#L145-L157 batch_sampler = InfiniteGroupEachSampleInBatchSampler( dataset, batch_size, world_size, rank, seed=seed) batch_size = 1 sampler = None elif runner_type == 'IterBasedRunnerEval': # TODO: original has more options, but I'm not using them # https://github.com/open-mmlab/mmdetection/blob/3b72b12fe9b14de906d1363982b9fba05e7d47c1/mmdet/datasets/builder.py#L145-L157 batch_sampler = InfiniteGroupEachSampleInBatchSamplerEval( dataset, batch_size, world_size, rank, seed=seed) batch_size = 1 sampler = None elif runner_type == 'TTARunnerEval': # TODO: original has more options, but I'm not using them # https://github.com/open-mmlab/mmdetection/blob/3b72b12fe9b14de906d1363982b9fba05e7d47c1/mmdet/datasets/builder.py#L145-L157 batch_sampler = TTADistributedSampler( dataset, samples_per_gpu, world_size, rank, seed=seed) sampler = None else: if dist: # DistributedGroupSampler will definitely shuffle the data to satisfy # that images on each GPU are in the same group if shuffle: sampler = DistributedGroupSampler( dataset, samples_per_gpu, world_size, rank, seed=seed) else: if val: sampler = CustomDistributedSampler( dataset, world_size, rank, shuffle=False, seed=seed) else: sampler = DistributedSampler( dataset, world_size, rank, shuffle=False, seed=seed) else: sampler = GroupSampler(dataset, samples_per_gpu) if shuffle else None batch_sampler = None init_fn = partial( worker_init_fn, num_workers=num_workers, rank=rank, seed=seed) if seed is not None else None data_loader = DataLoader( dataset, batch_size=batch_size, sampler=sampler, num_workers=num_workers, batch_sampler=batch_sampler, collate_fn=partial(collate, samples_per_gpu=samples_per_gpu), pin_memory=False, worker_init_fn=init_fn, **kwargs) return data_loader ================================================ FILE: mmdet3d/datasets/custom_3d.py ================================================ # Copyright (c) 2022-2023, NVIDIA Corporation & Affiliates. All rights reserved. # # This work is made available under the Nvidia Source Code License-NC. # To view a copy of this license, visit # https://github.com/NVlabs/FB-BEV/blob/main/LICENSE # Copyright (c) OpenMMLab. All rights reserved. import tempfile import warnings from os import path as osp import mmcv import numpy as np from torch.utils.data import Dataset from ..core.bbox import get_box_type from .builder import DATASETS from .pipelines import Compose from .utils import extract_result_dict, get_loading_pipeline import time @DATASETS.register_module() class Custom3DDataset(Dataset): """Customized 3D dataset. This is the base dataset of SUNRGB-D, ScanNet, nuScenes, and KITTI dataset. .. code-block:: none [ {'sample_idx': 'lidar_points': {'lidar_path': velodyne_path, .... }, 'annos': {'box_type_3d': (str) 'LiDAR/Camera/Depth' 'gt_bboxes_3d': (n, 7) 'gt_names': [list] .... } 'calib': { .....} 'images': { .....} } ] Args: data_root (str): Path of dataset root. ann_file (str): Path of annotation file. pipeline (list[dict], optional): Pipeline used for data processing. Defaults to None. 
classes (tuple[str], optional): Classes used in the dataset. Defaults to None. modality (dict, optional): Modality to specify the sensor data used as input. Defaults to None. box_type_3d (str, optional): Type of 3D box of this dataset. Based on the `box_type_3d`, the dataset will encapsulate the box to its original format then converted them to `box_type_3d`. Defaults to 'LiDAR'. Available options includes - 'LiDAR': Box in LiDAR coordinates. - 'Depth': Box in depth coordinates, usually for indoor dataset. - 'Camera': Box in camera coordinates. filter_empty_gt (bool, optional): Whether to filter empty GT. Defaults to True. test_mode (bool, optional): Whether the dataset is in test mode. Defaults to False. """ def __init__(self, data_root, ann_file, pipeline=None, classes=None, modality=None, box_type_3d='LiDAR', filter_empty_gt=True, test_mode=False, file_client_args=dict(backend='disk')): super().__init__() self.data_root = data_root self.ann_file = ann_file self.test_mode = test_mode self.modality = modality self.filter_empty_gt = filter_empty_gt self.box_type_3d, self.box_mode_3d = get_box_type(box_type_3d) self.CLASSES = self.get_classes(classes) self.file_client = mmcv.FileClient(**file_client_args) self.cat2id = {name: i for i, name in enumerate(self.CLASSES)} # load annotations if hasattr(self.file_client, 'get_local_path'): with self.file_client.get_local_path(self.ann_file) as local_path: self.data_infos = self.load_annotations(open(local_path, 'rb')) else: warnings.warn( 'The used MMCV version does not have get_local_path. ' f'We treat the {self.ann_file} as local paths and it ' 'might cause errors if the path is not a local path. ' 'Please use MMCV>= 1.3.16 if you meet errors.') self.data_infos = self.load_annotations(self.ann_file) # process pipeline if pipeline is not None: self.pipeline = Compose(pipeline) # set group flag for the samplers if not self.test_mode: self._set_group_flag() def load_annotations(self, ann_file): """Load annotations from ann_file. Args: ann_file (str): Path of the annotation file. Returns: list[dict]: List of annotations. """ # loading data from a file-like object needs file format return mmcv.load(ann_file, file_format='pkl') def get_data_info(self, index): """Get data info according to the given index. Args: index (int): Index of the sample data to get. Returns: dict: Data information that will be passed to the data preprocessing pipelines. It includes the following keys: - sample_idx (str): Sample index. - pts_filename (str): Filename of point clouds. - file_name (str): Filename of point clouds. - ann_info (dict): Annotation info. """ info = self.data_infos[index] sample_idx = info['sample_idx'] pts_filename = osp.join(self.data_root, info['lidar_points']['lidar_path']) input_dict = dict( pts_filename=pts_filename, sample_idx=sample_idx, file_name=pts_filename) if not self.test_mode: annos = self.get_ann_info(index) input_dict['ann_info'] = annos if self.filter_empty_gt and ~(annos['gt_labels_3d'] != -1).any(): return None return input_dict def get_ann_info(self, index): """Get annotation info according to the given index. Args: index (int): Index of the annotation data to get. Returns: dict: Annotation information consists of the following keys: - gt_bboxes_3d (:obj:`LiDARInstance3DBoxes`): 3D ground truth bboxes - gt_labels_3d (np.ndarray): Labels of ground truths. - gt_names (list[str]): Class names of ground truths. 
""" info = self.data_infos[index] gt_bboxes_3d = info['annos']['gt_bboxes_3d'] gt_names_3d = info['annos']['gt_names'] gt_labels_3d = [] for cat in gt_names_3d: if cat in self.CLASSES: gt_labels_3d.append(self.CLASSES.index(cat)) else: gt_labels_3d.append(-1) gt_labels_3d = np.array(gt_labels_3d) # Obtain original box 3d type in info file ori_box_type_3d = info['annos']['box_type_3d'] ori_box_type_3d, _ = get_box_type(ori_box_type_3d) # turn original box type to target box type gt_bboxes_3d = ori_box_type_3d( gt_bboxes_3d, box_dim=gt_bboxes_3d.shape[-1], origin=(0.5, 0.5, 0.5)).convert_to(self.box_mode_3d) anns_results = dict( gt_bboxes_3d=gt_bboxes_3d, gt_labels_3d=gt_labels_3d, gt_names=gt_names_3d) return anns_results def pre_pipeline(self, results): """Initialization before data preparation. Args: results (dict): Dict before data preprocessing. - img_fields (list): Image fields. - bbox3d_fields (list): 3D bounding boxes fields. - pts_mask_fields (list): Mask fields of points. - pts_seg_fields (list): Mask fields of point segments. - bbox_fields (list): Fields of bounding boxes. - mask_fields (list): Fields of masks. - seg_fields (list): Segment fields. - box_type_3d (str): 3D box type. - box_mode_3d (str): 3D box mode. """ results['img_fields'] = [] results['bbox3d_fields'] = [] results['pts_mask_fields'] = [] results['pts_seg_fields'] = [] results['bbox_fields'] = [] results['mask_fields'] = [] results['seg_fields'] = [] results['box_type_3d'] = self.box_type_3d results['box_mode_3d'] = self.box_mode_3d def prepare_train_data(self, index): """Training data preparation. Args: index (int): Index for accessing the target data. Returns: dict: Training data dict of the corresponding index. """ input_dict = self.get_data_info(index) if input_dict is None: return None self.pre_pipeline(input_dict) example = self.pipeline(input_dict) if self.filter_empty_gt and \ (example is None or ~(example['gt_labels_3d']._data != -1).any()): return None return example def prepare_test_data(self, index): """Prepare data for testing. Args: index (int): Index for accessing the target data. Returns: dict: Testing data dict of the corresponding index. """ input_dict = self.get_data_info(index) self.pre_pipeline(input_dict) example = self.pipeline(input_dict) return example @classmethod def get_classes(cls, classes=None): """Get class names of current dataset. Args: classes (Sequence[str] | str): If classes is None, use default CLASSES defined by builtin dataset. If classes is a string, take it as a file name. The file contains the name of classes where each line contains one class name. If classes is a tuple or list, override the CLASSES defined by the dataset. Return: list[str]: A list of class names. """ if classes is None: return cls.CLASSES if isinstance(classes, str): # take it as a file path class_names = mmcv.list_from_file(classes) elif isinstance(classes, (tuple, list)): class_names = classes else: raise ValueError(f'Unsupported type {type(classes)} of classes.') return class_names def format_results(self, outputs, pklfile_prefix=None, submission_prefix=None): """Format the results to pkl file. Args: outputs (list[dict]): Testing results of the dataset. pklfile_prefix (str): The prefix of pkl files. It includes the file path and the prefix of filename, e.g., "a/b/prefix". If not specified, a temp file will be created. Default: None. 
Returns: tuple: (outputs, tmp_dir), outputs is the detection results, tmp_dir is the temporal directory created for saving json files when ``jsonfile_prefix`` is not specified. """ if pklfile_prefix is None: tmp_dir = tempfile.TemporaryDirectory() pklfile_prefix = osp.join(tmp_dir.name, 'results') out = f'{pklfile_prefix}.pkl' mmcv.dump(outputs, out) return outputs, tmp_dir def evaluate(self, results, metric=None, iou_thr=(0.25, 0.5), logger=None, show=False, out_dir=None, pipeline=None): """Evaluate. Evaluation in indoor protocol. Args: results (list[dict]): List of results. metric (str | list[str], optional): Metrics to be evaluated. Defaults to None. iou_thr (list[float]): AP IoU thresholds. Defaults to (0.25, 0.5). logger (logging.Logger | str, optional): Logger used for printing related information during evaluation. Defaults to None. show (bool, optional): Whether to visualize. Default: False. out_dir (str, optional): Path to save the visualization results. Default: None. pipeline (list[dict], optional): raw data loading for showing. Default: None. Returns: dict: Evaluation results. """ from mmdet3d.core.evaluation import indoor_eval assert isinstance( results, list), f'Expect results to be list, got {type(results)}.' assert len(results) > 0, 'Expect length of results > 0.' assert len(results) == len(self.data_infos) assert isinstance( results[0], dict ), f'Expect elements in results to be dict, got {type(results[0])}.' gt_annos = [info['annos'] for info in self.data_infos] label2cat = {i: cat_id for i, cat_id in enumerate(self.CLASSES)} ret_dict = indoor_eval( gt_annos, results, iou_thr, label2cat, logger=logger, box_type_3d=self.box_type_3d, box_mode_3d=self.box_mode_3d) if show: self.show(results, out_dir, pipeline=pipeline) return ret_dict def _build_default_pipeline(self): """Build the default pipeline for this dataset.""" raise NotImplementedError('_build_default_pipeline is not implemented ' f'for dataset {self.__class__.__name__}') def _get_pipeline(self, pipeline): """Get data loading pipeline in self.show/evaluate function. Args: pipeline (list[dict]): Input pipeline. If None is given, get from self.pipeline. """ if pipeline is None: if not hasattr(self, 'pipeline') or self.pipeline is None: warnings.warn( 'Use default pipeline for data loading, this may cause ' 'errors when data is on ceph') return self._build_default_pipeline() loading_pipeline = get_loading_pipeline(self.pipeline.transforms) return Compose(loading_pipeline) return Compose(pipeline) def _extract_data(self, index, pipeline, key, load_annos=False): """Load data using input pipeline and extract data according to key. Args: index (int): Index for accessing the target data. pipeline (:obj:`Compose`): Composed data loading pipeline. key (str | list[str]): One single or a list of data key. load_annos (bool): Whether to load data annotations. If True, need to set self.test_mode as False before loading. Returns: np.ndarray | torch.Tensor | list[np.ndarray | torch.Tensor]: A single or a list of loaded data. """ assert pipeline is not None, 'data loading pipeline is not provided' # when we want to load ground-truth via pipeline (e.g. 
bbox, seg mask) # we need to set self.test_mode as False so that we have 'annos' if load_annos: original_test_mode = self.test_mode self.test_mode = False input_dict = self.get_data_info(index) self.pre_pipeline(input_dict) example = pipeline(input_dict) # extract data items according to keys if isinstance(key, str): data = extract_result_dict(example, key) else: data = [extract_result_dict(example, k) for k in key] if load_annos: self.test_mode = original_test_mode return data def __len__(self): """Return the length of data infos. Returns: int: Length of data infos. """ return len(self.data_infos) def _rand_another(self, idx): """Randomly get another item with the same flag. Returns: int: Another index of item with the same flag. """ pool = np.where(self.flag == self.flag[idx])[0] return np.random.choice(pool) def __getitem__(self, idx): """Get item from infos according to the given index. Returns: dict: Data dictionary of the corresponding index. """ if self.test_mode: return self.prepare_test_data(idx) while True: data = self.prepare_train_data(idx) if data is None: idx = self._rand_another(idx) continue return data def _set_group_flag(self): """Set flag according to image aspect ratio. Images with aspect ratio greater than 1 will be set as group 1, otherwise group 0. In 3D datasets, they are all the same, thus are all zeros. """ self.flag = np.zeros(len(self), dtype=np.uint8) ================================================ FILE: mmdet3d/datasets/custom_3d_seg.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import tempfile import warnings from os import path as osp import mmcv import numpy as np from torch.utils.data import Dataset from mmseg.datasets import DATASETS as SEG_DATASETS from .builder import DATASETS from .pipelines import Compose from .utils import extract_result_dict, get_loading_pipeline @DATASETS.register_module() @SEG_DATASETS.register_module() class Custom3DSegDataset(Dataset): """Customized 3D dataset for semantic segmentation task. This is the base dataset of ScanNet and S3DIS dataset. Args: data_root (str): Path of dataset root. ann_file (str): Path of annotation file. pipeline (list[dict], optional): Pipeline used for data processing. Defaults to None. classes (tuple[str], optional): Classes used in the dataset. Defaults to None. palette (list[list[int]], optional): The palette of segmentation map. Defaults to None. modality (dict, optional): Modality to specify the sensor data used as input. Defaults to None. test_mode (bool, optional): Whether the dataset is in test mode. Defaults to False. ignore_index (int, optional): The label index to be ignored, e.g. unannotated points. If None is given, set to len(self.CLASSES) to be consistent with PointSegClassMapping function in pipeline. Defaults to None. scene_idxs (np.ndarray | str, optional): Precomputed index to load data. For scenes with many points, we may sample it several times. Defaults to None. 
""" # names of all classes data used for the task CLASSES = None # class_ids used for training VALID_CLASS_IDS = None # all possible class_ids in loaded segmentation mask ALL_CLASS_IDS = None # official color for visualization PALETTE = None def __init__(self, data_root, ann_file, pipeline=None, classes=None, palette=None, modality=None, test_mode=False, ignore_index=None, scene_idxs=None, file_client_args=dict(backend='disk')): super().__init__() self.data_root = data_root self.ann_file = ann_file self.test_mode = test_mode self.modality = modality self.file_client = mmcv.FileClient(**file_client_args) # load annotations if hasattr(self.file_client, 'get_local_path'): with self.file_client.get_local_path(self.ann_file) as local_path: self.data_infos = self.load_annotations(open(local_path, 'rb')) else: warnings.warn( 'The used MMCV version does not have get_local_path. ' f'We treat the {self.ann_file} as local paths and it ' 'might cause errors if the path is not a local path. ' 'Please use MMCV>= 1.3.16 if you meet errors.') self.data_infos = self.load_annotations(self.ann_file) if pipeline is not None: self.pipeline = Compose(pipeline) self.ignore_index = len(self.CLASSES) if \ ignore_index is None else ignore_index self.scene_idxs = self.get_scene_idxs(scene_idxs) self.CLASSES, self.PALETTE = \ self.get_classes_and_palette(classes, palette) # set group flag for the sampler if not self.test_mode: self._set_group_flag() def load_annotations(self, ann_file): """Load annotations from ann_file. Args: ann_file (str): Path of the annotation file. Returns: list[dict]: List of annotations. """ # loading data from a file-like object needs file format return mmcv.load(ann_file, file_format='pkl') def get_data_info(self, index): """Get data info according to the given index. Args: index (int): Index of the sample data to get. Returns: dict: Data information that will be passed to the data preprocessing pipelines. It includes the following keys: - sample_idx (str): Sample index. - pts_filename (str): Filename of point clouds. - file_name (str): Filename of point clouds. - ann_info (dict): Annotation info. """ info = self.data_infos[index] sample_idx = info['point_cloud']['lidar_idx'] pts_filename = osp.join(self.data_root, info['pts_path']) input_dict = dict( pts_filename=pts_filename, sample_idx=sample_idx, file_name=pts_filename) if not self.test_mode: annos = self.get_ann_info(index) input_dict['ann_info'] = annos return input_dict def pre_pipeline(self, results): """Initialization before data preparation. Args: results (dict): Dict before data preprocessing. - img_fields (list): Image fields. - pts_mask_fields (list): Mask fields of points. - pts_seg_fields (list): Mask fields of point segments. - mask_fields (list): Fields of masks. - seg_fields (list): Segment fields. """ results['img_fields'] = [] results['pts_mask_fields'] = [] results['pts_seg_fields'] = [] results['mask_fields'] = [] results['seg_fields'] = [] results['bbox3d_fields'] = [] def prepare_train_data(self, index): """Training data preparation. Args: index (int): Index for accessing the target data. Returns: dict: Training data dict of the corresponding index. """ input_dict = self.get_data_info(index) if input_dict is None: return None self.pre_pipeline(input_dict) example = self.pipeline(input_dict) return example def prepare_test_data(self, index): """Prepare data for testing. Args: index (int): Index for accessing the target data. Returns: dict: Testing data dict of the corresponding index. 
""" input_dict = self.get_data_info(index) self.pre_pipeline(input_dict) example = self.pipeline(input_dict) return example def get_classes_and_palette(self, classes=None, palette=None): """Get class names of current dataset. This function is taken from MMSegmentation. Args: classes (Sequence[str] | str): If classes is None, use default CLASSES defined by builtin dataset. If classes is a string, take it as a file name. The file contains the name of classes where each line contains one class name. If classes is a tuple or list, override the CLASSES defined by the dataset. Defaults to None. palette (Sequence[Sequence[int]]] | np.ndarray): The palette of segmentation map. If None is given, random palette will be generated. Defaults to None. """ if classes is None: self.custom_classes = False # map id in the loaded mask to label used for training self.label_map = { cls_id: self.ignore_index for cls_id in self.ALL_CLASS_IDS } self.label_map.update( {cls_id: i for i, cls_id in enumerate(self.VALID_CLASS_IDS)}) # map label to category name self.label2cat = { i: cat_name for i, cat_name in enumerate(self.CLASSES) } return self.CLASSES, self.PALETTE self.custom_classes = True if isinstance(classes, str): # take it as a file path class_names = mmcv.list_from_file(classes) elif isinstance(classes, (tuple, list)): class_names = classes else: raise ValueError(f'Unsupported type {type(classes)} of classes.') if self.CLASSES: if not set(class_names).issubset(self.CLASSES): raise ValueError('classes is not a subset of CLASSES.') # update valid_class_ids self.VALID_CLASS_IDS = [ self.VALID_CLASS_IDS[self.CLASSES.index(cls_name)] for cls_name in class_names ] # dictionary, its keys are the old label ids and its values # are the new label ids. # used for changing pixel labels in load_annotations. self.label_map = { cls_id: self.ignore_index for cls_id in self.ALL_CLASS_IDS } self.label_map.update( {cls_id: i for i, cls_id in enumerate(self.VALID_CLASS_IDS)}) self.label2cat = { i: cat_name for i, cat_name in enumerate(class_names) } # modify palette for visualization palette = [ self.PALETTE[self.CLASSES.index(cls_name)] for cls_name in class_names ] return class_names, palette def get_scene_idxs(self, scene_idxs): """Compute scene_idxs for data sampling. We sample more times for scenes with more points. """ if self.test_mode: # when testing, we load one whole scene every time return np.arange(len(self.data_infos)).astype(np.int32) # we may need to re-sample different scenes according to scene_idxs # this is necessary for indoor scene segmentation such as ScanNet if scene_idxs is None: scene_idxs = np.arange(len(self.data_infos)) if isinstance(scene_idxs, str): with self.file_client.get_local_path(scene_idxs) as local_path: scene_idxs = np.load(local_path) else: scene_idxs = np.array(scene_idxs) return scene_idxs.astype(np.int32) def format_results(self, outputs, pklfile_prefix=None, submission_prefix=None): """Format the results to pkl file. Args: outputs (list[dict]): Testing results of the dataset. pklfile_prefix (str): The prefix of pkl files. It includes the file path and the prefix of filename, e.g., "a/b/prefix". If not specified, a temp file will be created. Default: None. Returns: tuple: (outputs, tmp_dir), outputs is the detection results, tmp_dir is the temporal directory created for saving json files when ``jsonfile_prefix`` is not specified. 
""" if pklfile_prefix is None: tmp_dir = tempfile.TemporaryDirectory() pklfile_prefix = osp.join(tmp_dir.name, 'results') out = f'{pklfile_prefix}.pkl' mmcv.dump(outputs, out) return outputs, tmp_dir def evaluate(self, results, metric=None, logger=None, show=False, out_dir=None, pipeline=None): """Evaluate. Evaluation in semantic segmentation protocol. Args: results (list[dict]): List of results. metric (str | list[str]): Metrics to be evaluated. logger (logging.Logger | str, optional): Logger used for printing related information during evaluation. Defaults to None. show (bool, optional): Whether to visualize. Defaults to False. out_dir (str, optional): Path to save the visualization results. Defaults to None. pipeline (list[dict], optional): raw data loading for showing. Default: None. Returns: dict: Evaluation results. """ from mmdet3d.core.evaluation import seg_eval assert isinstance( results, list), f'Expect results to be list, got {type(results)}.' assert len(results) > 0, 'Expect length of results > 0.' assert len(results) == len(self.data_infos) assert isinstance( results[0], dict ), f'Expect elements in results to be dict, got {type(results[0])}.' load_pipeline = self._get_pipeline(pipeline) pred_sem_masks = [result['semantic_mask'] for result in results] gt_sem_masks = [ self._extract_data( i, load_pipeline, 'pts_semantic_mask', load_annos=True) for i in range(len(self.data_infos)) ] ret_dict = seg_eval( gt_sem_masks, pred_sem_masks, self.label2cat, self.ignore_index, logger=logger) if show: self.show(pred_sem_masks, out_dir, pipeline=pipeline) return ret_dict def _rand_another(self, idx): """Randomly get another item with the same flag. Returns: int: Another index of item with the same flag. """ pool = np.where(self.flag == self.flag[idx])[0] return np.random.choice(pool) def _build_default_pipeline(self): """Build the default pipeline for this dataset.""" raise NotImplementedError('_build_default_pipeline is not implemented ' f'for dataset {self.__class__.__name__}') def _get_pipeline(self, pipeline): """Get data loading pipeline in self.show/evaluate function. Args: pipeline (list[dict]): Input pipeline. If None is given, get from self.pipeline. """ if pipeline is None: if not hasattr(self, 'pipeline') or self.pipeline is None: warnings.warn( 'Use default pipeline for data loading, this may cause ' 'errors when data is on ceph') return self._build_default_pipeline() loading_pipeline = get_loading_pipeline(self.pipeline.transforms) return Compose(loading_pipeline) return Compose(pipeline) def _extract_data(self, index, pipeline, key, load_annos=False): """Load data using input pipeline and extract data according to key. Args: index (int): Index for accessing the target data. pipeline (:obj:`Compose`): Composed data loading pipeline. key (str | list[str]): One single or a list of data key. load_annos (bool): Whether to load data annotations. If True, need to set self.test_mode as False before loading. Returns: np.ndarray | torch.Tensor | list[np.ndarray | torch.Tensor]: A single or a list of loaded data. """ assert pipeline is not None, 'data loading pipeline is not provided' # when we want to load ground-truth via pipeline (e.g. 
bbox, seg mask)
        # we need to set self.test_mode as False so that we have 'annos'
        if load_annos:
            original_test_mode = self.test_mode
            self.test_mode = False
        input_dict = self.get_data_info(index)
        self.pre_pipeline(input_dict)
        example = pipeline(input_dict)

        # extract data items according to keys
        if isinstance(key, str):
            data = extract_result_dict(example, key)
        else:
            data = [extract_result_dict(example, k) for k in key]
        if load_annos:
            self.test_mode = original_test_mode

        return data

    def __len__(self):
        """Return the length of scene_idxs.

        Returns:
            int: Length of data infos.
        """
        return len(self.scene_idxs)

    def __getitem__(self, idx):
        """Get item from infos according to the given index.

        In indoor scene segmentation task, each scene contains millions of
        points. However, we only sample less than 10k points within a patch
        each time. Therefore, we use `scene_idxs` to re-sample different rooms.

        Returns:
            dict: Data dictionary of the corresponding index.
        """
        scene_idx = self.scene_idxs[idx]  # map to scene idx
        if self.test_mode:
            return self.prepare_test_data(scene_idx)
        while True:
            data = self.prepare_train_data(scene_idx)
            if data is None:
                idx = self._rand_another(idx)
                scene_idx = self.scene_idxs[idx]  # map to scene idx
                continue
            return data

    def _set_group_flag(self):
        """Set flag according to image aspect ratio.

        Images with aspect ratio greater than 1 will be set as group 1,
        otherwise group 0. In 3D datasets, they are all the same, thus are all
        zeros.
        """
        self.flag = np.zeros(len(self), dtype=np.uint8)



================================================
FILE: mmdet3d/datasets/dataset_wrappers.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import numpy as np

from .builder import DATASETS


@DATASETS.register_module()
class CBGSDataset(object):
    """A wrapper that applies class-balanced sampling to a dataset.

    Implementation of the paper `Class-balanced Grouping and Sampling for
    Point Cloud 3D Object Detection <https://arxiv.org/abs/1908.09492>`_.

    Balance the number of scenes under different classes.

    Args:
        dataset (:obj:`CustomDataset`): The dataset to be class sampled.
    """

    def __init__(self, dataset):
        self.dataset = dataset
        self.CLASSES = dataset.CLASSES
        self.cat2id = {name: i for i, name in enumerate(self.CLASSES)}
        self.sample_indices = self._get_sample_indices()
        # self.dataset.data_infos = self.data_infos
        if hasattr(self.dataset, 'flag'):
            self.flag = np.array(
                [self.dataset.flag[ind] for ind in self.sample_indices],
                dtype=np.uint8)

    def _get_sample_indices(self):
        """Compute class-balanced sample indices over the wrapped dataset.

        Returns:
            list[int]: Sample indices after class-balanced resampling.
        """
        class_sample_idxs = {cat_id: [] for cat_id in self.cat2id.values()}
        for idx in range(len(self.dataset)):
            sample_cat_ids = self.dataset.get_cat_ids(idx)
            for cat_id in sample_cat_ids:
                class_sample_idxs[cat_id].append(idx)
        duplicated_samples = sum(
            [len(v) for _, v in class_sample_idxs.items()])
        class_distribution = {
            k: len(v) / duplicated_samples
            for k, v in class_sample_idxs.items()
        }

        sample_indices = []

        # Each class is targeted to contribute roughly 1/num_classes of the
        # resampled indices, so rare classes are drawn (with replacement) more
        # than once, while frequent classes are subsampled.
        frac = 1.0 / len(self.CLASSES)
        ratios = [frac / v for v in class_distribution.values()]
        for cls_inds, ratio in zip(list(class_sample_idxs.values()), ratios):
            sample_indices += np.random.choice(
                cls_inds, int(len(cls_inds) * ratio)).tolist()
        return sample_indices

    def __getitem__(self, idx):
        """Get item from infos according to the given index.

        Returns:
            dict: Data dictionary of the corresponding index.
""" ori_idx = self.sample_indices[idx] return self.dataset[ori_idx] def __len__(self): """Return the length of data infos. Returns: int: Length of data infos. """ return len(self.sample_indices) ================================================ FILE: mmdet3d/datasets/evals/eval_utils.py ================================================ import json import torch import tqdm from typing import List, Dict, Tuple, Callable, Union from nuscenes import NuScenes from pyquaternion import Quaternion import numpy as np from .metric_utils import min_ade, min_fde, miss_rate from nuscenes.utils.splits import create_splits_scenes from nuscenes.eval.detection.utils import category_to_detection_name from nuscenes.prediction import PredictHelper, convert_local_coords_to_global from nuscenes.eval.common.data_classes import EvalBox, EvalBoxes from nuscenes.eval.detection.data_classes import DetectionBox from nuscenes.eval.detection.data_classes import DetectionMetricData, DetectionMetricDataList, DetectionMetrics from nuscenes.eval.common.utils import center_distance, scale_iou, yaw_diff, velocity_l2, attr_acc, cummean def category_to_motion_name(category_name: str): """ Default label mapping from nuScenes to nuScenes detection classes. Note that pedestrian does not include personal_mobility, stroller and wheelchair. :param category_name: Generic nuScenes class. :return: nuScenes detection class. """ detection_mapping = { 'movable_object.barrier': 'barrier', 'vehicle.bicycle': 'car', 'vehicle.bus.bendy': 'car', 'vehicle.bus.rigid': 'car', 'vehicle.car': 'car', 'vehicle.construction': 'car', 'vehicle.motorcycle': 'car', 'human.pedestrian.adult': 'pedestrian', 'human.pedestrian.child': 'pedestrian', 'human.pedestrian.construction_worker': 'pedestrian', 'human.pedestrian.police_officer': 'pedestrian', 'movable_object.trafficcone': 'barrier', 'vehicle.trailer': 'car', 'vehicle.truck': 'car' } if category_name in detection_mapping: return detection_mapping[category_name] else: return None def detection_prediction_category_to_motion_name(category_name: str): """ Default label mapping from nuScenes to nuScenes detection classes. Note that pedestrian does not include personal_mobility, stroller and wheelchair. :param category_name: Generic nuScenes class. :return: nuScenes detection class. """ detection_mapping = { 'car': 'car', 'truck': 'car', 'construction_vehicle': 'car', 'bus': 'car', 'trailer': 'car', 'motorcycle': 'car', 'bicycle': 'car', 'pedestrian': 'pedestrian', 'traffic_cone': 'barrier', 'barrier': 'barrier', } if category_name in detection_mapping: return detection_mapping[category_name] else: return None class DetectionMotionMetrics(DetectionMetrics): """ Stores average precision and true positive metric results. Provides properties to summarize. """ @classmethod def deserialize(cls, content: dict): """ Initialize from serialized dictionary. """ cfg = DetectionConfig.deserialize(content['cfg']) metrics = cls(cfg=cfg) metrics.add_runtime(content['eval_time']) for detection_name, label_aps in content['label_aps'].items(): for dist_th, ap in label_aps.items(): metrics.add_label_ap(detection_name=detection_name, dist_th=float(dist_th), ap=float(ap)) for detection_name, label_tps in content['label_tp_errors'].items(): for metric_name, tp in label_tps.items(): metrics.add_label_tp(detection_name=detection_name, metric_name=metric_name, tp=float(tp)) return metrics class DetectionMotionMetricDataList(DetectionMetricDataList): """ This stores a set of MetricData in a dict indexed by (name, match-distance). 
""" @classmethod def deserialize(cls, content: dict): mdl = cls() for key, md in content.items(): name, distance = key.split(':') mdl.set(name, float(distance), DetectionMotionMetricData.deserialize(md)) return mdl class DetectionMotionMetricData(DetectionMetricData): """ This class holds accumulated and interpolated data required to calculate the detection metrics. """ nelem = 101 def __init__(self, recall: np.array, precision: np.array, confidence: np.array, trans_err: np.array, vel_err: np.array, scale_err: np.array, orient_err: np.array, attr_err: np.array, min_ade_err: np.array, min_fde_err: np.array, miss_rate_err: np.array): # Assert lengths. assert len(recall) == self.nelem assert len(precision) == self.nelem assert len(confidence) == self.nelem assert len(trans_err) == self.nelem assert len(vel_err) == self.nelem assert len(scale_err) == self.nelem assert len(orient_err) == self.nelem assert len(attr_err) == self.nelem assert len(min_ade_err) == self.nelem assert len(min_fde_err) == self.nelem assert len(miss_rate_err) == self.nelem # Assert ordering. assert all(confidence == sorted(confidence, reverse=True)) # Confidences should be descending. assert all(recall == sorted(recall)) # Recalls should be ascending. # Set attributes explicitly to help IDEs figure out what is going on. self.recall = recall self.precision = precision self.confidence = confidence self.trans_err = trans_err self.vel_err = vel_err self.scale_err = scale_err self.orient_err = orient_err self.attr_err = attr_err self.min_ade_err = min_ade_err self.min_fde_err = min_fde_err self.miss_rate_err = miss_rate_err def __eq__(self, other): eq = True for key in self.serialize().keys(): eq = eq and np.array_equal(getattr(self, key), getattr(other, key)) return eq @property def max_recall_ind(self): """ Returns index of max recall achieved. """ # Last instance of confidence > 0 is index of max achieved recall. non_zero = np.nonzero(self.confidence)[0] if len(non_zero) == 0: # If there are no matches, all the confidence values will be zero. max_recall_ind = 0 else: max_recall_ind = non_zero[-1] return max_recall_ind @property def max_recall(self): """ Returns max recall achieved. """ return self.recall[self.max_recall_ind] def serialize(self): """ Serialize instance into json-friendly format. """ return { 'recall': self.recall.tolist(), 'precision': self.precision.tolist(), 'confidence': self.confidence.tolist(), 'trans_err': self.trans_err.tolist(), 'vel_err': self.vel_err.tolist(), 'scale_err': self.scale_err.tolist(), 'orient_err': self.orient_err.tolist(), 'attr_err': self.attr_err.tolist(), 'min_ade_err': self.min_ade_err.tolist(), 'min_fde_err': self.min_fde_err.tolist(), 'miss_rate_err': self.miss_rate_err.tolist(), } @classmethod def deserialize(cls, content: dict): """ Initialize from serialized content. """ return cls(recall=np.array(content['recall']), precision=np.array(content['precision']), confidence=np.array(content['confidence']), trans_err=np.array(content['trans_err']), vel_err=np.array(content['vel_err']), scale_err=np.array(content['scale_err']), orient_err=np.array(content['orient_err']), attr_err=np.array(content['attr_err']), min_ade_err=np.array(content['min_ade_err']), min_fde_err=np.array(content['min_fde_err']), miss_rate_err=np.array(content['miss_rate_err'])) @classmethod def no_predictions(cls): """ Returns a md instance corresponding to having no predictions. 
""" return cls(recall=np.linspace(0, 1, cls.nelem), precision=np.zeros(cls.nelem), confidence=np.zeros(cls.nelem), trans_err=np.ones(cls.nelem), vel_err=np.ones(cls.nelem), scale_err=np.ones(cls.nelem), orient_err=np.ones(cls.nelem), attr_err=np.ones(cls.nelem), min_ade_err=np.ones(cls.nelem), min_fde_err=np.ones(cls.nelem), miss_rate_err=np.ones(cls.nelem)) @classmethod def random_md(cls): """ Returns an md instance corresponding to a random results. """ return cls(recall=np.linspace(0, 1, cls.nelem), precision=np.random.random(cls.nelem), confidence=np.linspace(0, 1, cls.nelem)[::-1], trans_err=np.random.random(cls.nelem), vel_err=np.random.random(cls.nelem), scale_err=np.random.random(cls.nelem), orient_err=np.random.random(cls.nelem), attr_err=np.random.random(cls.nelem), min_ade_err=np.random.random(cls.nelem), min_fde_err=np.random.random(cls.nelem), miss_rate_err=np.random.random(cls.nelem)) class DetectionMotionBox(DetectionBox): def __init__(self, sample_token: str = "", translation: Tuple[float, float, float] = (0, 0, 0), size: Tuple[float, float, float] = (0, 0, 0), rotation: Tuple[float, float, float, float] = (0, 0, 0, 0), velocity: Tuple[float, float] = (0, 0), ego_translation: [float, float, float] = (0, 0, 0), # Translation to ego vehicle in meters. num_pts: int = -1, # Nbr. LIDAR or RADAR inside the box. Only for gt boxes. detection_name: str = 'car', # The class name used in the detection challenge. detection_score: float = -1.0, # GT samples do not have a score. tracking_id = -1, attribute_name: str = '', traj=None, traj_scores=None): # Box attribute. Each box can have at most 1 attribute. super(DetectionBox, self).__init__(sample_token, translation, size, rotation, velocity, ego_translation, num_pts) assert detection_name is not None, 'Error: detection_name cannot be empty!' # assert detection_name in DETECTION_NAMES, 'Error: Unknown detection_name %s' % detection_name # assert attribute_name in ATTRIBUTE_NAMES or attribute_name == '', \ # 'Error: Unknown attribute_name %s' % attribute_name assert type(detection_score) == float, 'Error: detection_score must be a float!' assert not np.any(np.isnan(detection_score)), 'Error: detection_score may not be NaN!' # Assign. self.detection_name = detection_name self.attribute_name = attribute_name self.detection_score = detection_score self.traj = traj self.traj_scores = traj_scores self.traj_index = None def __eq__(self, other): return (self.sample_token == other.sample_token and self.translation == other.translation and self.size == other.size and self.rotation == other.rotation and self.velocity == other.velocity and self.ego_translation == other.ego_translation and self.num_pts == other.num_pts and self.detection_name == other.detection_name and self.detection_score == other.detection_score and self.attribute_name == other.attribute_name and np.all(self.traj == other.traj) and np.all(self.traj_scores == other.traj_scores)) def serialize(self) -> dict: """ Serialize instance into json-friendly format. """ return { 'sample_token': self.sample_token, 'translation': self.translation, 'size': self.size, 'rotation': self.rotation, 'velocity': self.velocity, 'ego_translation': self.ego_translation, 'num_pts': self.num_pts, 'detection_name': self.detection_name, 'detection_score': self.detection_score, 'attribute_name': self.attribute_name, 'traj': self.traj, 'traj_scores': self.traj_scores } @classmethod def deserialize(cls, content: dict): """ Initialize from serialized content. 
""" return cls(sample_token=content['sample_token'], translation=tuple(content['translation']), size=tuple(content['size']), rotation=tuple(content['rotation']), velocity=tuple(content['velocity']), ego_translation=(0.0, 0.0, 0.0) if 'ego_translation' not in content else tuple(content['ego_translation']), num_pts=-1 if 'num_pts' not in content else int(content['num_pts']), detection_name=content['detection_name'], detection_score=-1.0 if 'detection_score' not in content else float(content['detection_score']), attribute_name=content['attribute_name'], traj=content['traj'], traj_scores=content['traj_scores']) class DetectionMotionBox_modified(DetectionMotionBox): def __init__(self, *args, token=None, visibility=None, index=None, **kwargs): ''' add annotation token ''' super().__init__(*args, **kwargs) self.token = token self.visibility = visibility self.index = index def serialize(self) -> dict: """ Serialize instance into json-friendly format. """ return { 'token': self.token, 'sample_token': self.sample_token, 'translation': self.translation, 'size': self.size, 'rotation': self.rotation, 'velocity': self.velocity, 'ego_translation': self.ego_translation, 'num_pts': self.num_pts, 'detection_name': self.detection_name, 'detection_score': self.detection_score, 'attribute_name': self.attribute_name, 'visibility': self.visibility, 'index': self.index, 'traj': self.traj, 'traj_scores': self.traj_scores } @classmethod def deserialize(cls, content: dict): """ Initialize from serialized content. """ return cls( token=content['token'], sample_token=content['sample_token'], translation=tuple(content['translation']), size=tuple(content['size']), rotation=tuple(content['rotation']), velocity=tuple(content['velocity']), ego_translation=(0.0, 0.0, 0.0) if 'ego_translation' not in content else tuple(content['ego_translation']), num_pts=-1 if 'num_pts' not in content else int(content['num_pts']), detection_name=content['detection_name'], detection_score=-1.0 if 'detection_score' not in content else float(content['detection_score']), attribute_name=content['attribute_name'], visibility=content['visibility'], index=content['index'], traj=content['traj'], ) def load_prediction(result_path: str, max_boxes_per_sample: int, box_cls, verbose: bool = False, category_convert_type='detection_category') \ -> Tuple[EvalBoxes, Dict]: """ Loads object predictions from file. :param result_path: Path to the .json result file provided by the user. :param max_boxes_per_sample: Maximim number of boxes allowed per sample. :param box_cls: Type of box to load, e.g. DetectionBox, DetectionMotionBox or TrackingBox. :param verbose: Whether to print messages to stdout. :return: The deserialized results and meta data. """ # Load from file and check that the format is correct. with open(result_path) as f: data = json.load(f) assert 'results' in data, 'Error: No field `results` in result file. Please note that the result format changed.' \ 'See https://www.nuscenes.org/object-detection for more information.' if category_convert_type == 'motion_category': for key in data['results'].keys(): for i in range(len(data['results'][key])): data['results'][key][i]['detection_name'] = detection_prediction_category_to_motion_name(data['results'][key][i]['detection_name']) # Deserialize results and get meta data. all_results = EvalBoxes.deserialize(data['results'], box_cls) meta = data['meta'] if verbose: print("Loaded results from {}. Found detections for {} samples." 
.format(result_path, len(all_results.sample_tokens))) # Check that each sample has no more than x predicted boxes. for sample_token in all_results.sample_tokens: assert len(all_results.boxes[sample_token]) <= max_boxes_per_sample, \ "Error: Only <= %d boxes per sample allowed!" % max_boxes_per_sample return all_results, meta def load_gt(nusc: NuScenes, eval_split: str, box_cls, data_infos = None, verbose: bool = False, category_convert_type='detection_category'): """ Loads ground truth boxes from DB. :param nusc: A NuScenes instance. :param eval_split: The evaluation split for which we load GT boxes. :param box_cls: Type of box to load, e.g. DetectionBox or TrackingBox. :param verbose: Whether to print messages to stdout. :return: The GT boxes. """ predict_helper = PredictHelper(nusc) # Init. if box_cls == DetectionMotionBox_modified: attribute_map = {a['token']: a['name'] for a in nusc.attribute} if verbose: print('Loading annotations for {} split from nuScenes version: {}'.format(eval_split, nusc.version)) # Read out all sample_tokens in DB. sample_tokens_all = [s['token'] for s in nusc.sample] assert len(sample_tokens_all) > 0, "Error: Database has no samples!" # Only keep samples from this split. splits = create_splits_scenes() # Check compatibility of split with nusc_version. version = nusc.version if eval_split in {'train', 'val', 'train_detect', 'train_track'}: assert version.endswith('trainval'), \ 'Error: Requested split {} which is not compatible with NuScenes version {}'.format(eval_split, version) elif eval_split in {'mini_train', 'mini_val'}: assert version.endswith('mini'), \ 'Error: Requested split {} which is not compatible with NuScenes version {}'.format(eval_split, version) elif eval_split == 'test': assert version.endswith('test'), \ 'Error: Requested split {} which is not compatible with NuScenes version {}'.format(eval_split, version) else: raise ValueError('Error: Requested split {} which this function cannot map to the correct NuScenes version.' .format(eval_split)) if eval_split == 'test': # Check that you aren't trying to cheat :). assert len(nusc.sample_annotation) > 0, \ 'Error: You are trying to evaluate on the test set but you do not have the annotations!' index_map = {} for scene in nusc.scene: first_sample_token = scene['first_sample_token'] sample = nusc.get('sample', first_sample_token) index_map[first_sample_token] = 1 index = 2 while sample['next'] != '': sample = nusc.get('sample', sample['next']) index_map[sample['token']] = index index += 1 sample_tokens = [] for sample_token in sample_tokens_all: scene_token = nusc.get('sample', sample_token)['scene_token'] scene_record = nusc.get('scene', scene_token) if scene_record['name'] in splits[eval_split]: sample_tokens.append(sample_token) all_annotations = EvalBoxes() # Load annotations and filter predictions and annotations. tracking_id_set = set() for sample_token in tqdm.tqdm(sample_tokens, leave=verbose): sample = nusc.get('sample', sample_token) sample_annotation_tokens = sample['anns'] # info = data_infos[sample_token] sample_boxes = [] for sample_annotation_token in sample_annotation_tokens: sample_annotation = nusc.get('sample_annotation', sample_annotation_token) if box_cls == DetectionMotionBox_modified: # Get label name in detection task and filter unused labels. 
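                # Example of the two label conversions applied below: under
                # 'motion_category', fine-grained nuScenes labels collapse to
                # three coarse classes, e.g.
                #   'vehicle.truck'              -> 'car'
                #   'human.pedestrian.adult'     -> 'pedestrian'
                #   'movable_object.trafficcone' -> 'barrier'
                # Categories outside the mapping return None and are skipped.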
if category_convert_type == 'detection_category': detection_name = category_to_detection_name(sample_annotation['category_name']) elif category_convert_type == 'motion_category': detection_name = category_to_motion_name(sample_annotation['category_name']) else: raise NotImplementedError if detection_name is None: continue # Get attribute_name. attr_tokens = sample_annotation['attribute_tokens'] attr_count = len(attr_tokens) if attr_count == 0: attribute_name = '' elif attr_count == 1: attribute_name = attribute_map[attr_tokens[0]] else: raise Exception('Error: GT annotations must not have more than one attribute!') instance_token = nusc.get('sample_annotation', sample_annotation['token'])['instance_token'] fut_traj_global = predict_helper.get_future_for_agent(instance_token, sample_token, seconds=4, in_agent_frame=False) fut_traj_scence_centric = np.zeros((0,)) # if fut_traj_local.shape[0] > 0: # _, boxes, _ = nusc.get_sample_data(sample['data']['LIDAR_TOP'], selected_anntokens=[sample_annotation['token']]) # box = boxes[0] # trans = box.center # rot = Quaternion(matrix=box.rotation_matrix) # fut_traj_scence_centric = convert_local_coords_to_global(fut_traj_local, trans, rot) sample_boxes.append( box_cls( token=sample_annotation_token, sample_token=sample_token, translation=sample_annotation['translation'], size=sample_annotation['size'], rotation=sample_annotation['rotation'], velocity=nusc.box_velocity(sample_annotation['token'])[:2], num_pts=sample_annotation['num_lidar_pts'] + sample_annotation['num_radar_pts'], detection_name=detection_name, detection_score=-1.0, # GT samples do not have a score. attribute_name=attribute_name, visibility=sample_annotation['visibility_token'], index=index_map[sample_token], traj=fut_traj_global, ) ) elif box_cls == TrackingBox: assert False else: raise NotImplementedError('Error: Invalid box_cls %s!' % box_cls) all_annotations.add_boxes(sample_token, sample_boxes) if verbose: print("Loaded ground truth annotations for {} samples.".format(len(all_annotations.sample_tokens))) return all_annotations def prediction_metrics(gt_box_match, pred_box): pred_traj = np.array(pred_box.traj) gt_traj_steps = gt_box_match.traj.reshape((-1, 2)) valid_steps = gt_traj_steps.shape[0] if valid_steps <= 0: return np.array([0]), np.array([0]), 0 nmodes = pred_traj.shape[0] pred_steps = pred_traj.shape[1] valid_mask = np.zeros((pred_steps, )) gt_traj = np.zeros((pred_steps, 2)) gt_traj[:valid_steps, :] = gt_traj_steps valid_mask[: valid_steps] = 1 pred_traj = torch.tensor(pred_traj[None]) gt_traj = torch.tensor(gt_traj[None]) valid_mask = torch.tensor(valid_mask[None]) ade_err, inds = min_ade(pred_traj, gt_traj, 1 - valid_mask) fde_err, inds = min_fde(pred_traj, gt_traj, 1 - valid_mask) mr_err = miss_rate(pred_traj, gt_traj, 1 - valid_mask, dist_thresh=2) return ade_err.numpy(), fde_err.numpy(), mr_err.numpy() def accumulate(gt_boxes: EvalBoxes, pred_boxes: EvalBoxes, class_name: str, dist_fcn: Callable, dist_th: float, verbose: bool = False) -> DetectionMotionMetricData: """ Average Precision over predefined different recall thresholds for a single distance threshold. The recall/conf thresholds and other raw metrics will be used in secondary metrics. :param gt_boxes: Maps every sample_token to a list of its sample_annotations. :param pred_boxes: Maps every sample_token to a list of its sample_results. :param class_name: Class to compute AP on. :param dist_fcn: Distance function used to match detections and ground truths. :param dist_th: Distance threshold for a match. 
:param verbose: If true, print debug messages. :return: (average_prec, metrics). The average precision value and raw data for a number of metrics. """ # --------------------------------------------- # Organize input and initialize accumulators. # --------------------------------------------- # Count the positives. npos = len([1 for gt_box in gt_boxes.all if gt_box.detection_name == class_name]) if verbose: print("Found {} GT of class {} out of {} total across {} samples.". format(npos, class_name, len(gt_boxes.all), len(gt_boxes.sample_tokens))) # For missing classes in the GT, return a data structure corresponding to no predictions. if npos == 0: return DetectionMotionMetricData.no_predictions(), 0, 0, 0 # Organize the predictions in a single list. pred_boxes_list = [box for box in pred_boxes.all if box.detection_name == class_name] pred_confs = [box.detection_score for box in pred_boxes_list] if verbose: print("Found {} PRED of class {} out of {} total across {} samples.". format(len(pred_confs), class_name, len(pred_boxes.all), len(pred_boxes.sample_tokens))) # Sort by confidence. sortind = [i for (v, i) in sorted((v, i) for (i, v) in enumerate(pred_confs))][::-1] # Do the actual matching. tp = [] # Accumulator of true positives. fp = [] # Accumulator of false positives. conf = [] # Accumulator of confidences. # match_data holds the extra metrics we calculate for each match. match_data = {'trans_err': [], 'vel_err': [], 'scale_err': [], 'orient_err': [], 'attr_err': [], 'conf': [], 'min_ade_err': [], 'min_fde_err': [], 'miss_rate_err': []} # --------------------------------------------- # Match and accumulate match data. # --------------------------------------------- taken = set() # Initially no gt bounding box is matched. for ind in sortind: pred_box = pred_boxes_list[ind] min_dist = np.inf match_gt_idx = None for gt_idx, gt_box in enumerate(gt_boxes[pred_box.sample_token]): # Find closest match among ground truth boxes if gt_box.detection_name == class_name and not (pred_box.sample_token, gt_idx) in taken: this_distance = dist_fcn(gt_box, pred_box) if this_distance < min_dist: min_dist = this_distance match_gt_idx = gt_idx # If the closest match is close enough according to threshold we have a match! is_match = min_dist < dist_th if is_match: taken.add((pred_box.sample_token, match_gt_idx)) # Update tp, fp and confs. tp.append(1) fp.append(0) conf.append(pred_box.detection_score) # Since it is a match, update match data also. gt_box_match = gt_boxes[pred_box.sample_token][match_gt_idx] match_data['trans_err'].append(center_distance(gt_box_match, pred_box)) match_data['vel_err'].append(velocity_l2(gt_box_match, pred_box)) match_data['scale_err'].append(1 - scale_iou(gt_box_match, pred_box)) # Barrier orientation is only determined up to 180 degree. (For cones orientation is discarded later) period = np.pi if class_name == 'barrier' else 2 * np.pi match_data['orient_err'].append(yaw_diff(gt_box_match, pred_box, period=period)) match_data['attr_err'].append(1 - attr_acc(gt_box_match, pred_box)) minade, minfde, m_r = prediction_metrics(gt_box_match, pred_box) match_data['min_ade_err'].append(minade) match_data['min_fde_err'].append(minfde) match_data['miss_rate_err'].append(m_r) match_data['conf'].append(pred_box.detection_score) else: # No match. Mark this as a false positive. tp.append(0) fp.append(1) conf.append(pred_box.detection_score) # Check if we have any matches. If not, just return a "no predictions" array. 
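    # Note on the loop above: matching is greedy in descending confidence
    # order and each ground-truth box can be claimed at most once (via the
    # `taken` set), so duplicate detections of the same object fall through
    # to the false-positive branch.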
if len(match_data['trans_err']) == 0: return DetectionMotionMetricData.no_predictions(), 0, 0, 0 # --------------------------------------------- # Calculate and interpolate precision and recall # --------------------------------------------- # Accumulate. N_tp = np.sum(tp) N_fp = np.sum(fp) tp = np.cumsum(tp).astype(float) fp = np.cumsum(fp).astype(float) conf = np.array(conf) # Calculate precision and recall. prec = tp / (fp + tp) rec = tp / float(npos) rec_interp = np.linspace(0, 1, DetectionMotionMetricData.nelem) # 101 steps, from 0% to 100% recall. prec = np.interp(rec_interp, rec, prec, right=0) conf = np.interp(rec_interp, rec, conf, right=0) rec = rec_interp # --------------------------------------------- # Re-sample the match-data to match, prec, recall and conf. # --------------------------------------------- for key in match_data.keys(): if key == "conf": continue # Confidence is used as reference to align with fp and tp. So skip in this step. else: # For each match_data, we first calculate the accumulated mean. tmp = cummean(np.array(match_data[key])) # Then interpolate based on the confidences. (Note reversing since np.interp needs increasing arrays) match_data[key] = np.interp(conf[::-1], match_data['conf'][::-1], tmp[::-1])[::-1] # --------------------------------------------- # Done. Instantiate MetricData and return # --------------------------------------------- return DetectionMotionMetricData(recall=rec, precision=prec, confidence=conf, trans_err=match_data['trans_err'], vel_err=match_data['vel_err'], scale_err=match_data['scale_err'], orient_err=match_data['orient_err'], attr_err=match_data['attr_err'], min_ade_err=match_data['min_ade_err'], min_fde_err=match_data['min_fde_err'], miss_rate_err=match_data['miss_rate_err'] ), N_tp, N_fp, npos def accumulate_motion(gt_boxes: EvalBoxes, pred_boxes: EvalBoxes, class_name: str, dist_fcn: Callable, traj_fcn: Callable, dist_th: float, traj_dist_th: float, verbose: bool = False, final_step: float = 12) -> DetectionMotionMetricData: """ Average Precision over predefined different recall thresholds for a single distance threshold. The recall/conf thresholds and other raw metrics will be used in secondary metrics. :param gt_boxes: Maps every sample_token to a list of its sample_annotations. :param pred_boxes: Maps every sample_token to a list of its sample_results. :param class_name: Class to compute AP on. :param dist_fcn: Distance function used to match detections and ground truths. :param dist_th: Distance threshold for a match. :param verbose: If true, print debug messages. :return: (average_prec, metrics). The average precision value and raw data for a number of metrics. """ # --------------------------------------------- # Organize input and initialize accumulators. # --------------------------------------------- # Count the positives. npos = len([1 for gt_box in gt_boxes.all if gt_box.detection_name == class_name]) if verbose: print("Found {} GT of class {} out of {} total across {} samples.". format(npos, class_name, len(gt_boxes.all), len(gt_boxes.sample_tokens))) # For missing classes in the GT, return a data structure corresponding to no predictions. if npos == 0: return DetectionMotionMetricData.no_predictions(), 0, 0, 0 # # Organize the predictions in a single list. 
pred_boxes_list = [] pred_confs = [] pred_boxes_list = [box for box in pred_boxes.all if box.detection_name == class_name] pred_confs = [box.detection_score for box in pred_boxes_list] # for box in pred_boxes.all: # if box.detection_name == class_name: # box.traj_scores = np.exp(box.traj_scores) # for i in range(len(box.traj_scores)): # box.traj_index = i # pred_boxes_list.append(box) # pred_confs = [box.detection_score * box.traj_scores[box.traj_index] for box in pred_boxes_list] if verbose: print("Found {} PRED of class {} out of {} total across {} samples.". format(len(pred_confs), class_name, len(pred_boxes.all), len(pred_boxes.sample_tokens))) # Sort by confidence. sortind = [i for (v, i) in sorted((v, i) for (i, v) in enumerate(pred_confs))][::-1] # Do the actual matching. tp = [] # Accumulator of true positives. fp = [] # Accumulator of false positives. conf = [] # Accumulator of confidences. # match_data holds the extra metrics we calculate for each match. match_data = {'trans_err': [], 'vel_err': [], 'scale_err': [], 'orient_err': [], 'attr_err': [], 'conf': [], 'min_ade_err': [], 'min_fde_err': [], 'miss_rate_err': []} # --------------------------------------------- # Match and accumulate match data. # --------------------------------------------- taken = set() # Initially no gt bounding box is matched. for ind in sortind: pred_box = pred_boxes_list[ind] min_dist = np.inf match_gt_idx = None for gt_idx, gt_box in enumerate(gt_boxes[pred_box.sample_token]): # Find closest match among ground truth boxes if gt_box.detection_name == class_name and not (pred_box.sample_token, gt_idx) in taken: this_distance = dist_fcn(gt_box, pred_box) if this_distance < min_dist: min_dist = this_distance match_gt_idx = gt_idx fde_distance = traj_fcn(gt_box, pred_box, final_step) # If the closest match is close enough according to threshold we have a match! is_match = min_dist < dist_th and fde_distance < traj_dist_th if is_match: taken.add((pred_box.sample_token, match_gt_idx)) # Update tp, fp and confs. tp.append(1) fp.append(0) conf.append(pred_box.detection_score) # Since it is a match, update match data also. gt_box_match = gt_boxes[pred_box.sample_token][match_gt_idx] match_data['trans_err'].append(center_distance(gt_box_match, pred_box)) match_data['vel_err'].append(velocity_l2(gt_box_match, pred_box)) match_data['scale_err'].append(1 - scale_iou(gt_box_match, pred_box)) # Barrier orientation is only determined up to 180 degree. (For cones orientation is discarded later) period = np.pi if class_name == 'barrier' else 2 * np.pi match_data['orient_err'].append(yaw_diff(gt_box_match, pred_box, period=period)) match_data['attr_err'].append(1 - attr_acc(gt_box_match, pred_box)) minade, minfde, m_r = prediction_metrics(gt_box_match, pred_box) match_data['min_ade_err'].append(minade) match_data['min_fde_err'].append(minfde) match_data['miss_rate_err'].append(m_r) match_data['conf'].append(pred_box.detection_score) else: # No match. Mark this as a false positive. tp.append(0) fp.append(1) conf.append(pred_box.detection_score) # conf.append(pred_box.detection_score * pred_box.traj_scores[pred_box.traj_index]) # # Check if we have any matches. If not, just return a "no predictions" array. if len(match_data['trans_err']) == 0: return DetectionMotionMetricData.no_predictions(), 0, 0, 0 # --------------------------------------------- # Calculate and interpolate precision and recall # --------------------------------------------- # Accumulate. 
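    # Worked example of the accumulation below: with tp = [1, 0, 1] and
    # npos = 4, the cumulative sums give tp = [1, 1, 2] and fp = [0, 1, 1],
    # hence prec = [1.0, 0.5, 0.667] and rec = [0.25, 0.25, 0.5]; both are
    # then interpolated onto the fixed 101-point recall grid
    # (DetectionMotionMetricData.nelem).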
N_tp = np.sum(tp) N_fp = np.sum(fp) tp = np.cumsum(tp).astype(float) fp = np.cumsum(fp).astype(float) conf = np.array(conf) # Calculate precision and recall. prec = tp / (fp + tp) rec = tp / float(npos) rec_interp = np.linspace(0, 1, DetectionMotionMetricData.nelem) # 101 steps, from 0% to 100% recall. prec = np.interp(rec_interp, rec, prec, right=0) conf = np.interp(rec_interp, rec, conf, right=0) rec = rec_interp # --------------------------------------------- # Re-sample the match-data to match, prec, recall and conf. # --------------------------------------------- for key in match_data.keys(): if key == "conf": continue # Confidence is used as reference to align with fp and tp. So skip in this step. else: # For each match_data, we first calculate the accumulated mean. tmp = cummean(np.array(match_data[key])) # Then interpolate based on the confidences. (Note reversing since np.interp needs increasing arrays) match_data[key] = np.interp(conf[::-1], match_data['conf'][::-1], tmp[::-1])[::-1] # --------------------------------------------- # Done. Instantiate MetricData and return # --------------------------------------------- return DetectionMotionMetricData(recall=rec, precision=prec, confidence=conf, trans_err=match_data['trans_err'], vel_err=match_data['vel_err'], scale_err=match_data['scale_err'], orient_err=match_data['orient_err'], attr_err=match_data['attr_err'], min_ade_err=match_data['min_ade_err'], min_fde_err=match_data['min_fde_err'], miss_rate_err=match_data['miss_rate_err'] ), N_tp, N_fp, npos ================================================ FILE: mmdet3d/datasets/evals/map_api.py ================================================ # nuScenes dev-kit. # Code written by Sergi Adipraja Widjaja, 2019. # + Map mask by Kiwoo Shin, 2019. # + Methods operating on NuScenesMap and NuScenes by Holger Caesar, 2019. import json import os import random from typing import Dict, List, Tuple, Optional, Union import cv2 import math import descartes import matplotlib.gridspec as gridspec import matplotlib.pyplot as plt import numpy as np from PIL import Image from matplotlib.axes import Axes from matplotlib.figure import Figure from matplotlib.patches import Rectangle, Arrow from mpl_toolkits.axes_grid1.inset_locator import mark_inset from pyquaternion import Quaternion from shapely import affinity from shapely.geometry import Polygon, MultiPolygon, LineString, Point, box from tqdm import tqdm from nuscenes.map_expansion.arcline_path_utils import discretize_lane, ArcLinePath from nuscenes.map_expansion.bitmap import BitMap from nuscenes.nuscenes import NuScenes from nuscenes.utils.geometry_utils import view_points from functools import partial # Recommended style to use as the plots will show grids. plt.style.use('seaborn-whitegrid') # Define a map geometry type for polygons and lines. Geometry = Union[Polygon, LineString] locations = ['singapore-onenorth', 'singapore-hollandvillage', 'singapore-queenstown', 'boston-seaport'] class NuScenesMap: """ NuScenesMap database class for querying and retrieving information from the semantic maps. Before using this class please use the provided tutorial `map_expansion_tutorial.ipynb`. 
Below you can find the map origins (south western corner, in [lat, lon]) for each of the 4 maps in nuScenes: boston-seaport: [42.336849169438615, -71.05785369873047] singapore-onenorth: [1.2882100868743724, 103.78475189208984] singapore-hollandvillage: [1.2993652317780957, 103.78217697143555] singapore-queenstown: [1.2782562240223188, 103.76741409301758] The dimensions of the maps are as follows ([width, height] in meters): singapore-onenorth: [1585.6, 2025.0] singapore-hollandvillage: [2808.3, 2922.9] singapore-queenstown: [3228.6, 3687.1] boston-seaport: [2979.5, 2118.1] The rasterized semantic maps (e.g. singapore-onenorth.png) published with nuScenes v1.0 have a scale of 10px/m, hence the above numbers are the image dimensions divided by 10. We use the same WGS 84 Web Mercator (EPSG:3857) projection as Google Maps/Earth. """ def __init__(self, dataroot: str = '/data/sets/nuscenes', map_name: str = 'singapore-onenorth'): """ Loads the layers, create reverse indices and shortcuts, initializes the explorer class. :param dataroot: Path to the layers in the form of a .json file. :param map_name: Which map out of `singapore-onenorth`, `singepore-hollandvillage`, `singapore-queenstown`, `boston-seaport` that we want to load. """ assert map_name in locations, 'Error: Unknown map name %s!' % map_name self.dataroot = dataroot self.map_name = map_name self.geometric_layers = ['polygon', 'line', 'node'] # These are the non-geometric layers which have polygons as the geometric descriptors. self.non_geometric_polygon_layers = ['drivable_area', 'road_segment', 'road_block', 'lane', 'ped_crossing', 'walkway', 'stop_line', 'carpark_area'] # We want to be able to search for lane connectors, but not render them. self.lookup_polygon_layers = self.non_geometric_polygon_layers + ['lane_connector'] # These are the non-geometric layers which have line strings as the geometric descriptors. self.non_geometric_line_layers = ['road_divider', 'lane_divider', 'traffic_light'] self.non_geometric_layers = self.non_geometric_polygon_layers + self.non_geometric_line_layers self.layer_names = self.geometric_layers + self.lookup_polygon_layers + self.non_geometric_line_layers # Load the selected map. self.json_fname = os.path.join(self.dataroot, 'maps', 'expansion', '{}.json'.format(self.map_name)) with open(self.json_fname, 'r') as fh: self.json_obj = json.load(fh) # Parse the map version and print an error for deprecated maps. if 'version' in self.json_obj: self.version = self.json_obj['version'] else: self.version = '1.0' if self.version < '1.3': raise Exception('Error: You are using an outdated map version (%s)! ' 'Please go to https://www.nuscenes.org/download to download the latest map!') self.canvas_edge = self.json_obj['canvas_edge'] self._load_layers() self._make_token2ind() self._make_shortcuts() self.explorer = NuScenesMapExplorer(self) def _load_layer(self, layer_name: str) -> List[dict]: """ Returns a list of records corresponding to the layer name. :param layer_name: Name of the layer that will be loaded. :return: A list of records corresponding to a layer. """ return self.json_obj[layer_name] def _load_layer_dict(self, layer_name: str) -> Dict[str, Union[dict, list]]: """ Returns a dict of records corresponding to the layer name. :param layer_name: Name of the layer that will be loaded. :return: A dict of records corresponding to a layer. """ return self.json_obj[layer_name] def _load_layers(self) -> None: """ Loads each available layer. 
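# Usage sketch for the class defined here (the path is a placeholder; the constructor
# expects the nuScenes map expansion JSONs under <dataroot>/maps/expansion/):
#   nusc_map = NuScenesMap(dataroot='/data/sets/nuscenes', map_name='singapore-onenorth')
#   nusc_map.non_geometric_layers          # drivable_area, lane, ..., traffic_light
#   first_lane = nusc_map.lane[0]          # raw layer records are plain dicts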
""" # Explicit assignment of layers are necessary to help the IDE determine valid class members. self.polygon = self._load_layer('polygon') self.line = self._load_layer('line') self.node = self._load_layer('node') self.drivable_area = self._load_layer('drivable_area') self.road_segment = self._load_layer('road_segment') self.road_block = self._load_layer('road_block') self.lane = self._load_layer('lane') self.ped_crossing = self._load_layer('ped_crossing') self.walkway = self._load_layer('walkway') self.stop_line = self._load_layer('stop_line') self.carpark_area = self._load_layer('carpark_area') self.road_divider = self._load_layer('road_divider') self.lane_divider = self._load_layer('lane_divider') self.traffic_light = self._load_layer('traffic_light') self.arcline_path_3: Dict[str, List[dict]] = self._load_layer_dict('arcline_path_3') self.connectivity: Dict[str, dict] = self._load_layer_dict('connectivity') self.lane_connector = self._load_layer('lane_connector') def _make_token2ind(self) -> None: """ Store the mapping from token to layer index for each layer. """ self._token2ind = dict() for layer_name in self.layer_names: self._token2ind[layer_name] = dict() for ind, member in enumerate(getattr(self, layer_name)): self._token2ind[layer_name][member['token']] = ind def _make_shortcuts(self) -> None: """ Makes the record shortcuts. """ # Makes a shortcut between non geometric records to their nodes. for layer_name in self.non_geometric_polygon_layers: if layer_name == 'drivable_area': # Drivable area has more than one geometric representation. pass else: for record in self.__dict__[layer_name]: polygon_obj = self.get('polygon', record['polygon_token']) record['exterior_node_tokens'] = polygon_obj['exterior_node_tokens'] record['holes'] = polygon_obj['holes'] for layer_name in self.non_geometric_line_layers: for record in self.__dict__[layer_name]: record['node_tokens'] = self.get('line', record['line_token'])['node_tokens'] # Makes a shortcut between stop lines to their cues, there's different cues for different types of stop line. # Refer to `_get_stop_line_cue()` for details. for record in self.stop_line: cue = self._get_stop_line_cue(record) record['cue'] = cue # Makes a shortcut between lanes to their lane divider segment nodes. for record in self.lane: record['left_lane_divider_segment_nodes'] = [self.get('node', segment['node_token']) for segment in record['left_lane_divider_segments']] record['right_lane_divider_segment_nodes'] = [self.get('node', segment['node_token']) for segment in record['right_lane_divider_segments']] def _get_stop_line_cue(self, stop_line_record: dict) -> List[dict]: """ Get the different cues for different types of stop lines. :param stop_line_record: A single stop line record. :return: The cue for that stop line. """ if stop_line_record['stop_line_type'] in ['PED_CROSSING', 'TURN_STOP']: return [self.get('ped_crossing', token) for token in stop_line_record['ped_crossing_tokens']] elif stop_line_record['stop_line_type'] in ['STOP_SIGN', 'YIELD']: return [] elif stop_line_record['stop_line_type'] == 'TRAFFIC_LIGHT': return [self.get('traffic_light', token) for token in stop_line_record['traffic_light_tokens']] def get(self, layer_name: str, token: str) -> dict: """ Returns a record from the layer in constant runtime. :param layer_name: Name of the layer that we are interested in. :param token: Token of the record. :return: A single layer record. 
""" assert layer_name in self.layer_names, "Layer {} not found".format(layer_name) return getattr(self, layer_name)[self.getind(layer_name, token)] def getind(self, layer_name: str, token: str) -> int: """ This returns the index of the record in a layer in constant runtime. :param layer_name: Name of the layer we are interested in. :param token: Token of the record. :return: The index of the record in the layer, layer is an array. """ return self._token2ind[layer_name][token] def render_record(self, layer_name: str, token: str, alpha: float = 0.5, figsize: Tuple[float, float] = None, other_layers: List[str] = None, bitmap: Optional[BitMap] = None) -> Tuple[Figure, Tuple[Axes, Axes]]: """ Render a single map record. By default will also render 3 layers which are `drivable_area`, `lane`, and `walkway` unless specified by `other_layers`. :param layer_name: Name of the layer that we are interested in. :param token: Token of the record that you want to render. :param alpha: The opacity of each layer that gets rendered. :param figsize: Size of the whole figure. :param other_layers: What other layers to render aside from the one specified in `layer_name`. :param bitmap: Optional BitMap object to render below the other map layers. :return: The matplotlib figure and axes of the rendered layers. """ return self.explorer.render_record(layer_name, token, alpha, figsize=figsize, other_layers=other_layers, bitmap=bitmap) def render_layers(self, layer_names: List[str], alpha: float = 0.5, figsize: Union[None, float, Tuple[float, float]] = None, tokens: List[str] = None, bitmap: Optional[BitMap] = None) -> Tuple[Figure, Axes]: """ Render a list of layer names. :param layer_names: A list of layer names. :param alpha: The opacity of each layer that gets rendered. :param figsize: Size of the whole figure. :param tokens: Optional list of tokens to render. None means all tokens are rendered. :param bitmap: Optional BitMap object to render below the other map layers. :return: The matplotlib figure and axes of the rendered layers. """ return self.explorer.render_layers(layer_names, alpha, figsize=figsize, tokens=tokens, bitmap=bitmap) def render_map_patch(self, box_coords: Tuple[float, float, float, float], layer_names: List[str] = None, alpha: float = 0.5, figsize: Tuple[int, int] = (15, 15), render_egoposes_range: bool = True, render_legend: bool = True, bitmap: Optional[BitMap] = None) -> Tuple[Figure, Axes]: """ Renders a rectangular patch specified by `box_coords`. By default renders all layers. :param box_coords: The rectangular patch coordinates (x_min, y_min, x_max, y_max). :param layer_names: All the non geometric layers that we want to render. :param alpha: The opacity of each layer. :param figsize: Size of the whole figure. :param render_egoposes_range: Whether to render a rectangle around all ego poses. :param render_legend: Whether to render the legend of map layers. :param bitmap: Optional BitMap object to render below the other map layers. :return: The matplotlib figure and axes of the rendered layers. 
""" return self.explorer.render_map_patch(box_coords, layer_names=layer_names, alpha=alpha, figsize=figsize, render_egoposes_range=render_egoposes_range, render_legend=render_legend, bitmap=bitmap) def render_map_in_image(self, nusc: NuScenes, sample_token: str, camera_channel: str = 'CAM_FRONT', alpha: float = 0.3, patch_radius: float = 10000, min_polygon_area: float = 1000, render_behind_cam: bool = True, render_outside_im: bool = True, layer_names: List[str] = None, verbose: bool = True, out_path: str = None) -> Tuple[Figure, Axes]: """ Render a nuScenes camera image and overlay the polygons for the specified map layers. Note that the projections are not always accurate as the localization is in 2d. :param nusc: The NuScenes instance to load the image from. :param sample_token: The image's corresponding sample_token. :param camera_channel: Camera channel name, e.g. 'CAM_FRONT'. :param alpha: The transparency value of the layers to render in [0, 1]. :param patch_radius: The radius in meters around the ego car in which to select map records. :param min_polygon_area: Minimum area a polygon needs to have to be rendered. :param render_behind_cam: Whether to render polygons where any point is behind the camera. :param render_outside_im: Whether to render polygons where any point is outside the image. :param layer_names: The names of the layers to render, e.g. ['lane']. If set to None, the recommended setting will be used. :param verbose: Whether to print to stdout. :param out_path: Optional path to save the rendered figure to disk. """ return self.explorer.render_map_in_image( nusc, sample_token, camera_channel=camera_channel, alpha=alpha, patch_radius=patch_radius, min_polygon_area=min_polygon_area, render_behind_cam=render_behind_cam, render_outside_im=render_outside_im, layer_names=layer_names, verbose=verbose, out_path=out_path) def get_map_mask_in_image(self, nusc: NuScenes, sample_token: str, camera_channel: str = 'CAM_FRONT', alpha: float = 0.3, patch_radius: float = 10000, min_polygon_area: float = 1000, render_behind_cam: bool = True, render_outside_im: bool = True, layer_names: List[str] = None, verbose: bool = False, out_path: str = None): """ Render a nuScenes camera image and overlay the polygons for the specified map layers. Note that the projections are not always accurate as the localization is in 2d. :param nusc: The NuScenes instance to load the image from. :param sample_token: The image's corresponding sample_token. :param camera_channel: Camera channel name, e.g. 'CAM_FRONT'. :param alpha: The transparency value of the layers to render in [0, 1]. :param patch_radius: The radius in meters around the ego car in which to select map records. :param min_polygon_area: Minimum area a polygon needs to have to be rendered. :param render_behind_cam: Whether to render polygons where any point is behind the camera. :param render_outside_im: Whether to render polygons where any point is outside the image. :param layer_names: The names of the layers to render, e.g. ['lane']. If set to None, the recommended setting will be used. :param verbose: Whether to print to stdout. :param out_path: Optional path to save the rendered figure to disk. 
""" return self.explorer.get_map_mask_in_image( nusc, sample_token, camera_channel=camera_channel, alpha=alpha, patch_radius=patch_radius, min_polygon_area=min_polygon_area, render_behind_cam=render_behind_cam, render_outside_im=render_outside_im, layer_names=layer_names, verbose=verbose, out_path=out_path) def render_egoposes_on_fancy_map(self, nusc: NuScenes, scene_tokens: List = None, verbose: bool = True, out_path: str = None, render_egoposes: bool = True, render_egoposes_range: bool = True, render_legend: bool = True, bitmap: Optional[BitMap] = None) -> Tuple[np.ndarray, Figure, Axes]: """ Renders each ego pose of a list of scenes on the map (around 40 poses per scene). This method is heavily inspired by NuScenes.render_egoposes_on_map(), but uses the map expansion pack maps. :param nusc: The NuScenes instance to load the ego poses from. :param scene_tokens: Optional list of scene tokens corresponding to the current map location. :param verbose: Whether to show status messages and progress bar. :param out_path: Optional path to save the rendered figure to disk. :param render_egoposes: Whether to render ego poses. :param render_egoposes_range: Whether to render a rectangle around all ego poses. :param render_legend: Whether to render the legend of map layers. :param bitmap: Optional BitMap object to render below the other map layers. :return: . Returns a matrix with n ego poses in global map coordinates. """ return self.explorer.render_egoposes_on_fancy_map(nusc, scene_tokens=scene_tokens, verbose=verbose, out_path=out_path, render_egoposes=render_egoposes, render_egoposes_range=render_egoposes_range, render_legend=render_legend, bitmap=bitmap) def render_centerlines(self, resolution_meters: float = 0.5, figsize: Union[None, float, Tuple[float, float]] = None, bitmap: Optional[BitMap] = None) -> Tuple[Figure, Axes]: """ Render the centerlines of all lanes and lane connectors. :param resolution_meters: How finely to discretize the lane. Smaller values ensure curved lanes are properly represented. :param figsize: Size of the figure. :param bitmap: Optional BitMap object to render below the other map layers. """ return self.explorer.render_centerlines(resolution_meters=resolution_meters, figsize=figsize, bitmap=bitmap) def render_map_mask(self, patch_box: Tuple[float, float, float, float], patch_angle: float, layer_names: List[str] = None, canvas_size: Tuple[int, int] = (100, 100), figsize: Tuple[int, int] = (15, 15), n_row: int = 2) -> Tuple[Figure, List[Axes]]: """ Render map mask of the patch specified by patch_box and patch_angle. :param patch_box: Patch box defined as [x_center, y_center, height, width]. :param patch_angle: Patch orientation in degrees. :param layer_names: A list of layer names to be returned. :param canvas_size: Size of the output mask (h, w). :param figsize: Size of the figure. :param n_row: Number of rows with plots. :return: The matplotlib figure and a list of axes of the rendered layers. """ return self.explorer.render_map_mask(patch_box, patch_angle, layer_names=layer_names, canvas_size=canvas_size, figsize=figsize, n_row=n_row) def get_map_mask(self, patch_box: Optional[Tuple[float, float, float, float]], patch_angle: float, layer_names: List[str] = None, canvas_size: Optional[Tuple[int, int]] = (100, 100)) -> np.ndarray: """ Return list of map mask layers of the specified patch. :param patch_box: Patch box defined as [x_center, y_center, height, width]. If None, this plots the entire map. :param patch_angle: Patch orientation in degrees. 
North-facing corresponds to 0. :param layer_names: A list of layer names to be extracted, or None for all non-geometric layers. :param canvas_size: Size of the output mask (h, w). If None, we use the default resolution of 10px/m. :return: Stacked numpy array of size [c x h x w] with c channels and the same width/height as the canvas. """ return self.explorer.get_map_mask(patch_box, patch_angle, layer_names=layer_names, canvas_size=canvas_size) def get_map_geom(self, patch_box: Tuple[float, float, float, float], patch_angle: float, layer_names: List[str]) -> List[Tuple[str, List[Geometry]]]: """ Returns a list of geometries in the specified patch_box. These are unscaled, but aligned with the patch angle. :param patch_box: Patch box defined as [x_center, y_center, height, width]. :param patch_angle: Patch orientation in degrees. North-facing corresponds to 0. :param layer_names: A list of layer names to be extracted, or None for all non-geometric layers. :return: List of layer names and their corresponding geometries. """ return self.explorer.get_map_geom(patch_box, patch_angle, layer_names) def get_records_in_patch(self, box_coords: Tuple[float, float, float, float], layer_names: List[str] = None, mode: str = 'intersect') -> Dict[str, List[str]]: """ Get all the record token that intersects or is within a particular rectangular patch. :param box_coords: The rectangular patch coordinates (x_min, y_min, x_max, y_max). :param layer_names: Names of the layers that we want to retrieve in a particular patch. By default will always look at the all non geometric layers. :param mode: "intersect" will return all non geometric records that intersects the patch, "within" will return all non geometric records that are within the patch. :return: Dictionary of layer_name - tokens pairs. """ return self.explorer.get_records_in_patch(box_coords, layer_names=layer_names, mode=mode) def is_record_in_patch(self, layer_name: str, token: str, box_coords: Tuple[float, float, float, float], mode: str = 'intersect') -> bool: """ Query whether a particular record is in a rectangular patch :param layer_name: The layer name of the record. :param token: The record token. :param box_coords: The rectangular patch coordinates (x_min, y_min, x_max, y_max). :param mode: "intersect" means it will return True if the geometric object intersects the patch, "within" will return True if the geometric object is within the patch. :return: Boolean value on whether a particular record intersects or within a particular patch. """ return self.explorer.is_record_in_patch(layer_name, token, box_coords, mode=mode) def layers_on_point(self, x: float, y: float, layer_names: List[str] = None) -> Dict[str, str]: """ Returns all the polygonal layers that a particular point is on. :param x: x coordinate of the point of interest. :param y: y coordinate of the point of interest. :param layer_names: The names of the layers to search for. :return: All the polygonal layers that a particular point is on. {: } """ return self.explorer.layers_on_point(x, y, layer_names=layer_names) def record_on_point(self, x: float, y: float, layer_name: str) -> str: """ Query what record of a layer a particular point is on. :param x: x coordinate of the point of interest. :param y: y coordinate of the point of interest. :param layer_name: The non geometric polygonal layer name that we are interested in. :return: The first token of a layer a particular point is on or '' if no layer is found. 
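# Query usage sketch (placeholder coordinates): results come back per layer as token
# lists; mode='intersect' keeps any record touching the patch, mode='within' only records
# fully inside it:
#   patch = (300.0, 1000.0, 500.0, 1200.0)                 # (x_min, y_min, x_max, y_max)
#   recs = nusc_map.get_records_in_patch(patch, ['lane', 'ped_crossing'], mode='intersect')
#   here = nusc_map.layers_on_point(390.0, 1100.0)         # {layer_name: token or ''}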
""" return self.explorer.record_on_point(x, y, layer_name) def extract_polygon(self, polygon_token: str) -> Polygon: """ Construct a shapely Polygon object out of a polygon token. :param polygon_token: The token of the polygon record. :return: The polygon wrapped in a shapely Polygon object. """ return self.explorer.extract_polygon(polygon_token) def extract_line(self, line_token: str) -> LineString: """ Construct a shapely LineString object out of a line token. :param line_token: The token of the line record. :return: The line wrapped in a LineString object. """ return self.explorer.extract_line(line_token) def get_bounds(self, layer_name: str, token: str) -> Tuple[float, float, float, float]: """ Get the bounds of the geometric object that corresponds to a non geometric record. :param layer_name: Name of the layer that we are interested in. :param token: Token of the record. :return: min_x, min_y, max_x, max_y of of the line representation. """ return self.explorer.get_bounds(layer_name, token) def get_records_in_radius(self, x: float, y: float, radius: float, layer_names: List[str], mode: str = 'intersect') -> Dict[str, List[str]]: """ Get all the record tokens that intersect a square patch of side length 2*radius centered on (x,y). :param x: X-coordinate in global frame. :param y: y-coordinate in global frame. :param radius: All records within radius meters of point (x, y) will be returned. :param layer_names: Names of the layers that we want to retrieve. By default will always look at the all non geometric layers. :param mode: "intersect" will return all non geometric records that intersects the patch, "within" will return all non geometric records that are within the patch. :return: Dictionary of layer_name - tokens pairs. """ patch = (x - radius, y - radius, x + radius, y + radius) return self.explorer.get_records_in_patch(patch, layer_names, mode=mode) def discretize_centerlines(self, resolution_meters: float) -> List[np.array]: """ Discretize the centerlines of lanes and lane connectors. :param resolution_meters: How finely to discretize the lane. Smaller values ensure curved lanes are properly represented. :return: A list of np.arrays with x, y and z values for each point. """ pose_lists = [] for lane in self.lane + self.lane_connector: my_lane = self.arcline_path_3.get(lane['token'], []) discretized = np.array(discretize_lane(my_lane, resolution_meters)) pose_lists.append(discretized) return pose_lists def discretize_lanes(self, tokens: List[str], resolution_meters: float) -> Dict[str, List[Tuple[float, float, float]]]: """ Discretizes a list of lane/lane connector tokens. :param tokens: List of lane and/or lane connector record tokens. Can be retrieved with get_records_in_radius or get_records_in_patch. :param resolution_meters: How finely to discretize the splines. :return: Mapping from lane/lane connector token to sequence of poses along the lane. """ return {ID: discretize_lane(self.arcline_path_3.get(ID, []), resolution_meters) for ID in tokens} def _get_connected_lanes(self, lane_token: str, incoming_outgoing: str) -> List[str]: """ Helper for getting the lanes connected to a given lane :param lane_token: Token for the lane. :param incoming_outgoing: Whether to get incoming or outgoing lanes :return: List of lane tokens this lane is connected to. 
""" if lane_token not in self.connectivity: raise ValueError(f"{lane_token} is not a valid lane.") return self.connectivity[lane_token][incoming_outgoing] def get_outgoing_lane_ids(self, lane_token: str) -> List[str]: """ Get the out-going lanes. :param lane_token: Token for the lane. :return: List of lane tokens that start at the end of this lane. """ return self._get_connected_lanes(lane_token, 'outgoing') def get_incoming_lane_ids(self, lane_token: str) -> List[str]: """ Get the incoming lanes. :param lane_token: Token for the lane. :return: List of lane tokens that end at the start of this lane. """ return self._get_connected_lanes(lane_token, 'incoming') def get_arcline_path(self, lane_token: str) -> List[ArcLinePath]: """ Get the arcline path representation for a lane. Note: This function was previously called `get_lane()`, but renamed to avoid confusion between lanes and arcline paths. :param lane_token: Token for the lane. :return: Arc line path representation of the lane. """ arcline_path = self.arcline_path_3.get(lane_token) if not arcline_path: raise ValueError(f'Error: Lane with token {lane_token} does not have a valid arcline path!') return arcline_path def get_closest_lane(self, x: float, y: float, radius: float = 5) -> str: """ Get closest lane id within a radius of query point. The distance from a point (x, y) to a lane is the minimum l2 distance from (x, y) to a point on the lane. :param x: X coordinate in global coordinate frame. :param y: Y Coordinate in global coordinate frame. :param radius: Radius around point to consider. :return: Lane id of closest lane within radius. """ lanes = self.get_records_in_radius(x, y, radius, ['lane', 'lane_connector']) lanes = lanes['lane'] + lanes['lane_connector'] discrete_points = self.discretize_lanes(lanes, 0.5) current_min = np.inf min_id = "" for lane_id, points in discrete_points.items(): distance = np.linalg.norm(np.array(points)[:, :2] - [x, y], axis=1).min() if distance <= current_min: current_min = distance min_id = lane_id return min_id def render_next_roads(self, x: float, y: float, alpha: float = 0.5, figsize: Union[None, float, Tuple[float, float]] = None, bitmap: Optional[BitMap] = None) -> Tuple[Figure, Axes]: """ Renders the possible next roads from a point of interest. :param x: x coordinate of the point of interest. :param y: y coordinate of the point of interest. :param alpha: The opacity of each layer that gets rendered. :param figsize: Size of the whole figure. :param bitmap: Optional BitMap object to render below the other map layers. """ return self.explorer.render_next_roads(x, y, alpha, figsize=figsize, bitmap=bitmap) def get_next_roads(self, x: float, y: float) -> Dict[str, List[str]]: """ Get the possible next roads from a point of interest. Returns road_segment, road_block and lane. :param x: x coordinate of the point of interest. :param y: y coordinate of the point of interest. :return: Dictionary of layer_name - tokens pairs. """ # Filter out irrelevant layers. road_layers = ['road_segment', 'road_block', 'lane'] layers = self.explorer.layers_on_point(x, y) rel_layers = {layer: layers[layer] for layer in road_layers} # Pick most fine-grained road layer (lane, road_block, road_segment) object that contains the point. rel_layer = None rel_token = None for layer in road_layers[::-1]: if rel_layers[layer] != '': rel_layer = layer rel_token = rel_layers[layer] break assert rel_layer is not None, 'Error: No suitable layer in the specified point location!' 
# Get all records that overlap with the bounding box of the selected road. box_coords = self.explorer.get_bounds(rel_layer, rel_token) intersect = self.explorer.get_records_in_patch(box_coords, road_layers, mode='intersect') # Go through all objects within the bounding box. result = {layer: [] for layer in road_layers} if rel_layer == 'road_segment': # For road segments, we do not have a direction. # Return objects that have ANY exterior points in common with the relevant layer. rel_exterior_nodes = self.get(rel_layer, rel_token)['exterior_node_tokens'] for layer in road_layers: for token in intersect[layer]: exterior_nodes = self.get(layer, token)['exterior_node_tokens'] if any(n in exterior_nodes for n in rel_exterior_nodes) \ and token != rel_layers[layer]: result[layer].append(token) else: # For lanes and road blocks, the next road is indicated by the edge line. # Return objects where ALL edge line nodes are included in the exterior nodes. to_edge_line = self.get(rel_layer, rel_token)['to_edge_line_token'] to_edge_nodes = self.get('line', to_edge_line)['node_tokens'] for layer in road_layers: for token in intersect[layer]: exterior_nodes = self.get(layer, token)['exterior_node_tokens'] if all(n in exterior_nodes for n in to_edge_nodes) \ and token != rel_layers[layer]: result[layer].append(token) return result class NuScenesMapExplorer: """ Helper class to explore the nuScenes map data. """ def __init__(self, map_api: NuScenesMap, representative_layers: Tuple[str] = ('drivable_area', 'lane', 'walkway'), color_map: dict = None): """ :param map_api: NuScenesMap database class. :param representative_layers: These are the layers that we feel are representative of the whole mapping data. :param color_map: Color map. """ # Mutable default argument. if color_map is None: color_map = dict(drivable_area='#a6cee3', road_segment='#1f78b4', road_block='#b2df8a', lane='#33a02c', ped_crossing='#fb9a99', walkway='#e31a1c', stop_line='#fdbf6f', carpark_area='#ff7f00', road_divider='#cab2d6', lane_divider='#6a3d9a', traffic_light='#7e772e') self.map_api = map_api self.representative_layers = representative_layers self.color_map = color_map self.canvas_max_x = self.map_api.canvas_edge[0] self.canvas_min_x = 0 self.canvas_max_y = self.map_api.canvas_edge[1] self.canvas_min_y = 0 self.canvas_aspect_ratio = (self.canvas_max_x - self.canvas_min_x) / (self.canvas_max_y - self.canvas_min_y) def render_centerlines(self, resolution_meters: float, figsize: Union[None, float, Tuple[float, float]] = None, bitmap: Optional[BitMap] = None) -> Tuple[Figure, Axes]: """ Render the centerlines of all lanes and lane connectors. :param resolution_meters: How finely to discretize the lane. Smaller values ensure curved lanes are properly represented. :param figsize: Size of the figure. :param bitmap: Optional BitMap object to render below the other map layers. """ # Discretize all lanes and lane connectors. pose_lists = self.map_api.discretize_centerlines(resolution_meters) # Render connectivity lines. 
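# Tiny sketch of the two adjacency tests used in get_next_roads above (toy token sets):
# road segments are neighbours if ANY exterior node is shared, while for lanes and road
# blocks every node of the outgoing edge line must appear in the candidate's exterior.
rel_exterior = {'n1', 'n2', 'n3'}
candidate_exterior = {'n3', 'n7', 'n8'}
to_edge_nodes = {'n3', 'n7'}
any_shared = any(n in candidate_exterior for n in rel_exterior)        # road_segment rule
all_contained = all(n in candidate_exterior for n in to_edge_nodes)    # lane / road_block rule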
fig = plt.figure(figsize=self._get_figsize(figsize)) ax = fig.add_axes([0, 0, 1, 1 / self.canvas_aspect_ratio]) if bitmap is not None: bitmap.render(self.map_api.canvas_edge, ax) for pose_list in pose_lists: if len(pose_list) > 0: plt.plot(pose_list[:, 0], pose_list[:, 1]) return fig, ax def render_map_mask(self, patch_box: Tuple[float, float, float, float], patch_angle: float, layer_names: List[str], canvas_size: Tuple[int, int], figsize: Tuple[int, int], n_row: int = 2) -> Tuple[Figure, List[Axes]]: """ Render map mask of the patch specified by patch_box and patch_angle. :param patch_box: Patch box defined as [x_center, y_center, height, width]. :param patch_angle: Patch orientation in degrees. :param layer_names: A list of layer names to be extracted. :param canvas_size: Size of the output mask (h, w). :param figsize: Size of the figure. :param n_row: Number of rows with plots. :return: The matplotlib figure and a list of axes of the rendered layers. """ if layer_names is None: layer_names = self.map_api.non_geometric_layers map_mask = self.get_map_mask(patch_box, patch_angle, layer_names, canvas_size) # If no canvas_size is specified, retrieve the default from the output of get_map_mask. if canvas_size is None: canvas_size = map_mask.shape[1:] fig = plt.figure(figsize=figsize) ax = fig.add_axes([0, 0, 1, 1]) ax.set_xlim(0, canvas_size[1]) ax.set_ylim(0, canvas_size[0]) n_col = len(map_mask) // n_row gs = gridspec.GridSpec(n_row, n_col) gs.update(wspace=0.025, hspace=0.05) for i in range(len(map_mask)): r = i // n_col c = i - r * n_col subax = plt.subplot(gs[r, c]) subax.imshow(map_mask[i], origin='lower') subax.text(canvas_size[0] * 0.5, canvas_size[1] * 1.1, layer_names[i]) subax.grid(False) return fig, fig.axes def get_map_geom(self, patch_box: Tuple[float, float, float, float], patch_angle: float, layer_names: List[str]) -> List[Tuple[str, List[Geometry]]]: """ Returns a list of geometries in the specified patch_box. These are unscaled, but aligned with the patch angle. :param patch_box: Patch box defined as [x_center, y_center, height, width]. :param patch_angle: Patch orientation in degrees. North-facing corresponds to 0. :param layer_names: A list of layer names to be extracted, or None for all non-geometric layers. :return: List of layer names and their corresponding geometries. """ # If None, return all geometric layers. if layer_names is None: layer_names = self.map_api.non_geometric_layers # Get each layer name and geometry and store them in a list. map_geom = [] for layer_name in layer_names: layer_geom = self._get_layer_geom(patch_box, patch_angle, layer_name) if layer_geom is None: continue map_geom.append((layer_name, layer_geom)) return map_geom def map_geom_to_mask(self, map_geom: List[Tuple[str, List[Geometry]]], local_box: Tuple[float, float, float, float], canvas_size: Tuple[int, int]) -> np.ndarray: """ Return list of map mask layers of the specified patch. :param map_geom: List of layer names and their corresponding geometries. :param local_box: The local patch box defined as (x_center, y_center, height, width), where typically x_center = y_center = 0. :param canvas_size: Size of the output mask (h, w). :return: Stacked numpy array of size [c x h x w] with c channels and the same height/width as the canvas. """ # Get each layer mask and stack them into a numpy tensor. 
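# Illustrative rasterization sketch, not the devkit's own _layer_geom_to_mask helper: the
# general idea behind turning one patch-aligned shapely Polygon into a binary mask channel,
# assuming the patch is centered at the origin and mapped onto a (h, w) canvas.
import cv2
import numpy as np
from shapely.geometry import Polygon

def polygon_to_mask_sketch(polygon: Polygon, patch_hw, canvas_hw):
    patch_h, patch_w = patch_hw
    canvas_h, canvas_w = canvas_hw
    mask = np.zeros((canvas_h, canvas_w), dtype=np.uint8)
    coords = np.array(polygon.exterior.coords)                           # (N, 2) x/y in patch meters
    coords[:, 0] = (coords[:, 0] + patch_w / 2.0) * canvas_w / patch_w   # meters -> pixel columns
    coords[:, 1] = (coords[:, 1] + patch_h / 2.0) * canvas_h / patch_h   # meters -> pixel rows
    cv2.fillPoly(mask, [coords.round().astype(np.int32)], 1)
    return mask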
map_mask = [] for layer_name, layer_geom in map_geom: layer_mask = self._layer_geom_to_mask(layer_name, layer_geom, local_box, canvas_size) if layer_mask is not None: map_mask.append(layer_mask) return np.array(map_mask) def get_map_mask(self, patch_box: Optional[Tuple[float, float, float, float]], patch_angle: float, layer_names: List[str] = None, canvas_size: Tuple[int, int] = (100, 100)) -> np.ndarray: """ Return list of map mask layers of the specified patch. :param patch_box: Patch box defined as [x_center, y_center, height, width]. If None, this plots the entire map. :param patch_angle: Patch orientation in degrees. North-facing corresponds to 0. :param layer_names: A list of layer names to be extracted, or None for all non-geometric layers. :param canvas_size: Size of the output mask (h, w). If None, we use the default resolution of 10px/m. :return: Stacked numpy array of size [c x h x w] with c channels and the same width/height as the canvas. """ # For some combination of parameters, we need to know the size of the current map. if self.map_api.map_name == 'singapore-onenorth': map_dims = [1585.6, 2025.0] elif self.map_api.map_name == 'singapore-hollandvillage': map_dims = [2808.3, 2922.9] elif self.map_api.map_name == 'singapore-queenstown': map_dims = [3228.6, 3687.1] elif self.map_api.map_name == 'boston-seaport': map_dims = [2979.5, 2118.1] else: raise Exception('Error: Invalid map!') # If None, return the entire map. if patch_box is None: patch_box = [map_dims[0] / 2, map_dims[1] / 2, map_dims[1], map_dims[0]] # If None, return all geometric layers. if layer_names is None: layer_names = self.map_api.non_geometric_layers # If None, return the specified patch in the original scale of 10px/m. if canvas_size is None: map_scale = 10 canvas_size = np.array((patch_box[2], patch_box[3])) * map_scale canvas_size = tuple(np.round(canvas_size).astype(np.int32)) # Get geometry of each layer. map_geom = self.get_map_geom(patch_box, patch_angle, layer_names) # Convert geometry of each layer into mask and stack them into a numpy tensor. # Convert the patch box from global coordinates to local coordinates by setting the center to (0, 0). local_box = (0.0, 0.0, patch_box[2], patch_box[3]) map_mask = self.map_geom_to_mask(map_geom, local_box, canvas_size) assert np.all(map_mask.shape[1:] == canvas_size) return map_mask def render_record(self, layer_name: str, token: str, alpha: float = 0.5, figsize: Union[None, float, Tuple[float, float]] = None, other_layers: List[str] = None, bitmap: Optional[BitMap] = None) -> Tuple[Figure, Tuple[Axes, Axes]]: """ Render a single map record. By default will also render 3 layers which are `drivable_area`, `lane`, and `walkway` unless specified by `other_layers`. :param layer_name: Name of the layer that we are interested in. :param token: Token of the record that you want to render. :param alpha: The opacity of each layer that gets rendered. :param figsize: Size of the whole figure. :param other_layers: What other layers to render aside from the one specified in `layer_name`. :param bitmap: Optional BitMap object to render below the other map layers. :return: The matplotlib figure and axes of the rendered layers. 
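# Worked example of the default-resolution branch above: with canvas_size=None the mask is
# rasterized at 10 px/m, so a (height, width) = (100 m, 50 m) patch gives a 1000 x 500 px
# canvas, and the full singapore-onenorth map comes out at (h, w) = (20250, 15856) px.
import numpy as np
patch_box = (300.0, 1700.0, 100.0, 50.0)       # placeholder (x_center, y_center, height, width)
canvas_size = tuple(np.round(np.array((patch_box[2], patch_box[3])) * 10).astype(np.int32))
assert canvas_size == (1000, 500)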
""" if other_layers is None: other_layers = list(self.representative_layers) for other_layer in other_layers: if other_layer not in self.map_api.non_geometric_layers: raise ValueError("{} is not a non geometric layer".format(layer_name)) x1, y1, x2, y2 = self.map_api.get_bounds(layer_name, token) local_width = x2 - x1 local_height = y2 - y1 assert local_height > 0, 'Error: Map has 0 height!' local_aspect_ratio = local_width / local_height # We obtained the values 0.65 and 0.66 by trials. fig = plt.figure(figsize=self._get_figsize(figsize)) global_ax = fig.add_axes([0, 0, 0.65, 0.65 / self.canvas_aspect_ratio]) local_ax = fig.add_axes([0.66, 0.66 / self.canvas_aspect_ratio, 0.34, 0.34 / local_aspect_ratio]) # To make sure the sequence of the layer overlays is always consistent after typesetting set(). random.seed('nutonomy') if bitmap is not None: bitmap.render(self.map_api.canvas_edge, global_ax) bitmap.render(self.map_api.canvas_edge, local_ax) layer_names = other_layers + [layer_name] layer_names = list(set(layer_names)) for layer in layer_names: self._render_layer(global_ax, layer, alpha) for layer in layer_names: self._render_layer(local_ax, layer, alpha) if layer_name == 'drivable_area': # Bad output aesthetically if we add spacing between the objects and the axes for drivable area. local_ax_xlim = (x1, x2) local_ax_ylim = (y1, y2) else: # Add some spacing between the object and the axes. local_ax_xlim = (x1 - local_width / 3, x2 + local_width / 3) local_ax_ylim = (y1 - local_height / 3, y2 + local_height / 3) # Draws the rectangular patch on the local_ax. local_ax.add_patch(Rectangle((x1, y1), local_width, local_height, linestyle='-.', color='red', fill=False, lw=2)) local_ax.set_xlim(*local_ax_xlim) local_ax.set_ylim(*local_ax_ylim) local_ax.set_title('Local View') global_ax.set_xlim(self.canvas_min_x, self.canvas_max_x) global_ax.set_ylim(self.canvas_min_y, self.canvas_max_y) global_ax.set_title('Global View') global_ax.legend() # Adds the zoomed in effect to the plot. mark_inset(global_ax, local_ax, loc1=2, loc2=4) return fig, (global_ax, local_ax) def render_layers(self, layer_names: List[str], alpha: float, figsize: Union[None, float, Tuple[float, float]], tokens: List[str] = None, bitmap: Optional[BitMap] = None) -> Tuple[Figure, Axes]: """ Render a list of layers. :param layer_names: A list of layer names. :param alpha: The opacity of each layer. :param figsize: Size of the whole figure. :param tokens: Optional list of tokens to render. None means all tokens are rendered. :param bitmap: Optional BitMap object to render below the other map layers. :return: The matplotlib figure and axes of the rendered layers. """ fig = plt.figure(figsize=self._get_figsize(figsize)) ax = fig.add_axes([0, 0, 1, 1 / self.canvas_aspect_ratio]) ax.set_xlim(self.canvas_min_x, self.canvas_max_x) ax.set_ylim(self.canvas_min_y, self.canvas_max_y) if bitmap is not None: bitmap.render(self.map_api.canvas_edge, ax) layer_names = list(set(layer_names)) for layer_name in layer_names: self._render_layer(ax, layer_name, alpha, tokens) ax.legend() return fig, ax def render_map_patch(self, box_coords: Tuple[float, float, float, float], layer_names: List[str] = None, alpha: float = 0.5, figsize: Tuple[float, float] = (15, 15), render_egoposes_range: bool = True, render_legend: bool = True, bitmap: Optional[BitMap] = None) -> Tuple[Figure, Axes]: """ Renders a rectangular patch specified by `box_coords`. By default renders all layers. 
:param box_coords: The rectangular patch coordinates (x_min, y_min, x_max, y_max). :param layer_names: All the non geometric layers that we want to render. :param alpha: The opacity of each layer. :param figsize: Size of the whole figure. :param render_egoposes_range: Whether to render a rectangle around all ego poses. :param render_legend: Whether to render the legend of map layers. :param bitmap: Optional BitMap object to render below the other map layers. :return: The matplotlib figure and axes of the rendered layers. """ x_min, y_min, x_max, y_max = box_coords if layer_names is None: layer_names = self.map_api.non_geometric_layers fig = plt.figure(figsize=figsize) local_width = x_max - x_min local_height = y_max - y_min assert local_height > 0, 'Error: Map patch has 0 height!' local_aspect_ratio = local_width / local_height ax = fig.add_axes([0, 0, 1, 1 / local_aspect_ratio]) if bitmap is not None: bitmap.render(self.map_api.canvas_edge, ax) for layer_name in layer_names: self._render_layer(ax, layer_name, alpha) x_margin = np.minimum(local_width / 4, 50) y_margin = np.minimum(local_height / 4, 10) ax.set_xlim(x_min - x_margin, x_max + x_margin) ax.set_ylim(y_min - y_margin, y_max + y_margin) if render_egoposes_range: ax.add_patch(Rectangle((x_min, y_min), local_width, local_height, fill=False, linestyle='-.', color='red', lw=2)) ax.text(x_min + local_width / 100, y_min + local_height / 2, "%g m" % local_height, fontsize=14, weight='bold') ax.text(x_min + local_width / 2, y_min + local_height / 100, "%g m" % local_width, fontsize=14, weight='bold') if render_legend: ax.legend(frameon=True, loc='upper right') return fig, ax def render_map_in_image(self, nusc: NuScenes, sample_token: str, camera_channel: str = 'CAM_FRONT', alpha: float = 0.3, patch_radius: float = 10000, min_polygon_area: float = 1000, render_behind_cam: bool = True, render_outside_im: bool = True, layer_names: List[str] = None, verbose: bool = True, out_path: str = None) -> Tuple[Figure, Axes]: """ Render a nuScenes camera image and overlay the polygons for the specified map layers. Note that the projections are not always accurate as the localization is in 2d. :param nusc: The NuScenes instance to load the image from. :param sample_token: The image's corresponding sample_token. :param camera_channel: Camera channel name, e.g. 'CAM_FRONT'. :param alpha: The transparency value of the layers to render in [0, 1]. :param patch_radius: The radius in meters around the ego car in which to select map records. :param min_polygon_area: Minimum area a polygon needs to have to be rendered. :param render_behind_cam: Whether to render polygons where any point is behind the camera. :param render_outside_im: Whether to render polygons where any point is outside the image. :param layer_names: The names of the layers to render, e.g. ['lane']. If set to None, the recommended setting will be used. :param verbose: Whether to print to stdout. :param out_path: Optional path to save the rendered figure to disk. """ near_plane = 1e-8 if verbose: print('Warning: Note that the projections are not always accurate as the localization is in 2d.') # Default layers. if layer_names is None: layer_names = ['road_segment', 'lane', 'ped_crossing', 'walkway', 'stop_line', 'carpark_area'] # Check layers whether we can render them. for layer_name in layer_names: assert layer_name in self.map_api.non_geometric_polygon_layers, \ 'Error: Can only render non-geometry polygons: %s' % layer_names # Check that NuScenesMap was loaded for the correct location. 
sample_record = nusc.get('sample', sample_token) scene_record = nusc.get('scene', sample_record['scene_token']) log_record = nusc.get('log', scene_record['log_token']) log_location = log_record['location'] assert self.map_api.map_name == log_location, \ 'Error: NuScenesMap loaded for location %s, should be %s!' % (self.map_api.map_name, log_location) # Grab the front camera image and intrinsics. cam_token = sample_record['data'][camera_channel] cam_record = nusc.get('sample_data', cam_token) cam_path = nusc.get_sample_data_path(cam_token) im = Image.open(cam_path) im_size = im.size cs_record = nusc.get('calibrated_sensor', cam_record['calibrated_sensor_token']) cam_intrinsic = np.array(cs_record['camera_intrinsic']) # Retrieve the current map. poserecord = nusc.get('ego_pose', cam_record['ego_pose_token']) ego_pose = poserecord['translation'] box_coords = ( ego_pose[0] - patch_radius, ego_pose[1] - patch_radius, ego_pose[0] + patch_radius, ego_pose[1] + patch_radius, ) records_in_patch = self.get_records_in_patch(box_coords, layer_names, 'intersect') # Init axes. fig = plt.figure(figsize=(9, 16)) ax = fig.add_axes([0, 0, 1, 1]) ax.set_xlim(0, im_size[0]) ax.set_ylim(0, im_size[1]) ax.imshow(im) # Retrieve and render each record. for layer_name in layer_names: for token in records_in_patch[layer_name]: record = self.map_api.get(layer_name, token) if layer_name == 'drivable_area': polygon_tokens = record['polygon_tokens'] else: polygon_tokens = [record['polygon_token']] for polygon_token in polygon_tokens: polygon = self.map_api.extract_polygon(polygon_token) # Convert polygon nodes to pointcloud with 0 height. points = np.array(polygon.exterior.xy) points = np.vstack((points, np.zeros((1, points.shape[1])))) # Transform into the ego vehicle frame for the timestamp of the image. points = points - np.array(poserecord['translation']).reshape((-1, 1)) points = np.dot(Quaternion(poserecord['rotation']).rotation_matrix.T, points) # Transform into the camera. points = points - np.array(cs_record['translation']).reshape((-1, 1)) points = np.dot(Quaternion(cs_record['rotation']).rotation_matrix.T, points) # Remove points that are partially behind the camera. depths = points[2, :] behind = depths < near_plane if np.all(behind): continue if render_behind_cam: # Perform clipping on polygons that are partially behind the camera. points = NuScenesMapExplorer._clip_points_behind_camera(points, near_plane) elif np.any(behind): # Otherwise ignore any polygon that is partially behind the camera. continue # Ignore polygons with less than 3 points after clipping. if len(points) == 0 or points.shape[1] < 3: continue # Take the actual picture (matrix multiplication with camera-matrix + renormalization). points = view_points(points, cam_intrinsic, normalize=True) # Skip polygons where all points are outside the image. # Leave a margin of 1 pixel for aesthetic reasons. 
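# Minimal sketch of the global -> ego -> camera -> pixel chain used above, with a toy pose
# and toy intrinsics (the real values come from the ego_pose / calibrated_sensor records);
# view_points is the nuScenes devkit helper imported at the top of this file.
import numpy as np
from pyquaternion import Quaternion
from nuscenes.utils.geometry_utils import view_points

pts_global = np.array([[10.0], [5.0], [0.0]])                       # one map point, (3, 1)
ego_t, ego_q = np.array([8.0, 4.0, 0.0]), Quaternion(axis=[0, 0, 1], angle=0.0)
cam_t, cam_q = np.array([1.5, 0.0, 1.6]), Quaternion(axis=[1, 0, 0], angle=-np.pi / 2)
K = np.array([[1266.0, 0.0, 800.0], [0.0, 1266.0, 450.0], [0.0, 0.0, 1.0]])

pts = pts_global - ego_t.reshape(-1, 1)
pts = ego_q.rotation_matrix.T @ pts                                  # global -> ego frame
pts = pts - cam_t.reshape(-1, 1)
pts = cam_q.rotation_matrix.T @ pts                                  # ego -> camera frame
uv = view_points(pts, K, normalize=True)                             # camera -> pixels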
inside = np.ones(points.shape[1], dtype=bool) inside = np.logical_and(inside, points[0, :] > 1) inside = np.logical_and(inside, points[0, :] < im.size[0] - 1) inside = np.logical_and(inside, points[1, :] > 1) inside = np.logical_and(inside, points[1, :] < im.size[1] - 1) if render_outside_im: if np.all(np.logical_not(inside)): continue else: if np.any(np.logical_not(inside)): continue points = points[:2, :] points = [(p0, p1) for (p0, p1) in zip(points[0], points[1])] polygon_proj = Polygon(points) # Filter small polygons if polygon_proj.area < min_polygon_area: continue label = layer_name ax.add_patch(descartes.PolygonPatch(polygon_proj, fc=self.color_map[layer_name], alpha=alpha, label=label)) # Display the image. plt.axis('off') ax.invert_yaxis() if out_path is not None: plt.tight_layout() plt.savefig(out_path, bbox_inches='tight', pad_inches=0) return fig, ax @staticmethod def points_transform(points, poserecord, cs_record, cam_intrinsic, im_size, near_plane=1e-8, render_behind_cam=True, render_outside_im=True): points = np.vstack((points, np.zeros((1, points.shape[1])))) # Transform into the ego vehicle frame for the timestamp of the image. points = points - np.array(poserecord['translation']).reshape((-1, 1)) points = np.dot(Quaternion(poserecord['rotation']).rotation_matrix.T, points) # Transform into the camera. points = points - np.array(cs_record['translation']).reshape((-1, 1)) points = np.dot(Quaternion(cs_record['rotation']).rotation_matrix.T, points) # Remove points that are partially behind the camera. depths = points[2, :] behind = depths < near_plane if np.all(behind): return None if render_behind_cam: # Perform clipping on polygons that are partially behind the camera. points = NuScenesMapExplorer._clip_points_behind_camera(points, near_plane) elif np.any(behind): # Otherwise ignore any polygon that is partially behind the camera. return None # Take the actual picture (matrix multiplication with camera-matrix + renormalization). points = view_points(points, cam_intrinsic, normalize=True) # Skip polygons where all points are outside the image. # Leave a margin of 1 pixel for aesthetic reasons. inside = np.ones(points.shape[1], dtype=bool) inside = np.logical_and(inside, points[0, :] > 1) inside = np.logical_and(inside, points[0, :] < im_size[0] - 1) inside = np.logical_and(inside, points[1, :] > 1) inside = np.logical_and(inside, points[1, :] < im_size[1] - 1) if render_outside_im: if np.all(np.logical_not(inside)): return None else: if np.any(np.logical_not(inside)): return None # points = points[:, inside] # Ignore polygons with less than 3 points after clipping. if len(points) == 0 or points.shape[1] < 3: return None points = points[:2, :] points = [(p0, p1) for (p0, p1) in zip(points[0], points[1])] return points def get_map_mask_in_image(self, nusc: NuScenes, sample_token: str, camera_channel: str = 'CAM_FRONT', alpha: float = 0.3, patch_radius: float = 10000, min_polygon_area: float = 1000, render_behind_cam: bool = True, render_outside_im: bool = True, layer_names: List[str] = None, verbose: bool = False, out_path: str = None) -> np.ndarray: """ Render a nuScenes camera image and overlay the polygons for the specified map layers. Note that the projections are not always accurate as the localization is in 2d. :param nusc: The NuScenes instance to load the image from. :param sample_token: The image's corresponding sample_token. :param camera_channel: Camera channel name, e.g. 'CAM_FRONT'. :param alpha: The transparency value of the layers to render in [0, 1]. 
:param patch_radius: The radius in meters around the ego car in which to select map records. :param min_polygon_area: Minimum area a polygon needs to have to be rendered. :param render_behind_cam: Whether to render polygons where any point is behind the camera. :param render_outside_im: Whether to render polygons where any point is outside the image. :param layer_names: The names of the layers to render, e.g. ['lane']. If set to None, the recommended setting will be used. :param verbose: Whether to print to stdout. :param out_path: Optional path to save the rendered figure to disk. """ near_plane = 1e-8 if verbose: print('Warning: Note that the projections are not always accurate as the localization is in 2d.') # Default layers. if layer_names is None: layer_names = ['road_segment', 'lane', 'ped_crossing', 'walkway', 'stop_line', 'carpark_area'] # # Check layers whether we can render them. # for layer_name in layer_names: # assert layer_name in self.map_api.non_geometric_polygon_layers, \ # 'Error: Can only render non-geometry polygons: %s' % layer_names # Check that NuScenesMap was loaded for the correct location. sample_record = nusc.get('sample', sample_token) scene_record = nusc.get('scene', sample_record['scene_token']) log_record = nusc.get('log', scene_record['log_token']) log_location = log_record['location'] assert self.map_api.map_name == log_location, \ 'Error: NuScenesMap loaded for location %s, should be %s!' % (self.map_api.map_name, log_location) # Grab the front camera image and intrinsics. cam_token = sample_record['data'][camera_channel] cam_record = nusc.get('sample_data', cam_token) cam_path = nusc.get_sample_data_path(cam_token) im = Image.open(cam_path) im_size = im.size cs_record = nusc.get('calibrated_sensor', cam_record['calibrated_sensor_token']) cam_intrinsic = np.array(cs_record['camera_intrinsic']) # Retrieve the current map. poserecord = nusc.get('ego_pose', cam_record['ego_pose_token']) ego_pose = poserecord['translation'] box_coords = ( ego_pose[0] - patch_radius, ego_pose[1] - patch_radius, ego_pose[0] + patch_radius, ego_pose[1] + patch_radius, ) records_in_patch = self.get_records_in_patch(box_coords, layer_names, 'intersect') if out_path is not None: # Init axes. fig = plt.figure(figsize=(9, 16)) ax = fig.add_axes([0, 0, 1, 1]) ax.set_xlim(0, im_size[0]) ax.set_ylim(0, im_size[1]) ax.imshow(im) points_transform = partial(self.points_transform, poserecord=poserecord, cs_record=cs_record, cam_intrinsic=cam_intrinsic, near_plane=near_plane, im_size=im_size, render_behind_cam=render_behind_cam, render_outside_im=render_outside_im) # Retrieve and render each record. map_geom = [] for layer_name in layer_names: if layer_name in self.map_api.non_geometric_line_layers: line_list = [] for token in records_in_patch[layer_name]: record = self.map_api.get(layer_name, token) line = self.map_api.extract_line(record['line_token']) if line.is_empty: # Skip lines without nodes. 
continue points = np.array(line.xy) points = points_transform(points) if points is None: continue line = LineString(points) line_list.append(line) # For visualize if out_path is not None: polygon = Polygon(points) ax.add_patch(descartes.PolygonPatch(polygon, fc=self.color_map[layer_name], alpha=alpha, label=layer_name)) map_geom.append((layer_name, line_list)) elif layer_name == 'drivable_area': polygon_list = [] for token in records_in_patch[layer_name]: record = self.map_api.get(layer_name, token) polygons = [self.map_api.extract_polygon(polygon_token) for polygon_token in record['polygon_tokens']] for polygon in polygons: ex_points = np.array(polygon.exterior.xy) ex_points = points_transform(ex_points) if ex_points is None: continue interiors = [] for interior in polygon.interiors: in_points = np.array(interior.xy) in_points = points_transform(in_points) if in_points is None: continue interiors.append(in_points) polygon = Polygon(ex_points, interiors) polygon = polygon.buffer(0.01) if polygon.geom_type == 'Polygon': polygon = MultiPolygon([polygon]) # Filter small polygons if polygon.area < min_polygon_area: continue polygon_list.append(polygon) # For visualize if out_path is not None: ax.add_patch(descartes.PolygonPatch(polygon, fc=self.color_map[layer_name], alpha=alpha, label=layer_name)) map_geom.append((layer_name, polygon_list)) else: polygon_list = [] for token in records_in_patch[layer_name]: record = self.map_api.get(layer_name, token) polygon = self.map_api.extract_polygon(record['polygon_token']) if polygon.is_valid: if not polygon.is_empty: ex_points = np.array(polygon.exterior.xy) ex_points = points_transform(ex_points) if ex_points is None: continue interiors = [] for interior in polygon.interiors: in_points = np.array(interior.xy) in_points = points_transform(in_points) if in_points is None: continue interiors.append(in_points) polygon = Polygon(ex_points, interiors) polygon = polygon.buffer(0.01) if polygon.geom_type == 'Polygon': polygon = MultiPolygon([polygon]) # Filter small polygons if polygon.area < min_polygon_area: continue polygon_list.append(polygon) # For visualize if out_path is not None: ax.add_patch(descartes.PolygonPatch(polygon, fc=self.color_map[layer_name], alpha=alpha, label=layer_name)) map_geom.append((layer_name, polygon_list)) # For visualize if out_path is not None: # Display the image. plt.axis('off') ax.invert_yaxis() plt.tight_layout() plt.savefig(out_path, bbox_inches='tight', pad_inches=0) plt.close() # Convert geometry of each layer into mask and stack them into a numpy tensor. # Convert the patch box from global coordinates to local coordinates by setting the center to (0, 0). local_box = (im_size[0] // 2, im_size[1] // 2, im_size[1], im_size[0]) canvas_size = (im_size[1], im_size[0]) img_mask = self.map_geom_to_mask(map_geom, local_box, canvas_size) assert np.all(img_mask.shape[1:] == canvas_size) return img_mask def render_egoposes_on_fancy_map(self, nusc: NuScenes, scene_tokens: List = None, verbose: bool = True, out_path: str = None, render_egoposes: bool = True, render_egoposes_range: bool = True, render_legend: bool = True, bitmap: Optional[BitMap] = None) -> Tuple[np.ndarray, Figure, Axes]: """ Renders each ego pose of a list of scenes on the map (around 40 poses per scene). This method is heavily inspired by NuScenes.render_egoposes_on_map(), but uses the map expansion pack maps. Note that the maps are constantly evolving, whereas we only released a single snapshot of the data. 
Therefore for some scenes there is a bad fit between ego poses and maps. :param nusc: The NuScenes instance to load the ego poses from. :param scene_tokens: Optional list of scene tokens corresponding to the current map location. :param verbose: Whether to show status messages and progress bar. :param out_path: Optional path to save the rendered figure to disk. :param render_egoposes: Whether to render ego poses. :param render_egoposes_range: Whether to render a rectangle around all ego poses. :param render_legend: Whether to render the legend of map layers. :param bitmap: Optional BitMap object to render below the other map layers. :return: . Returns a matrix with n ego poses in global map coordinates. """ # Settings patch_margin = 2 min_diff_patch = 30 # Ids of scenes with a bad match between localization and map. scene_blacklist = [499, 515, 517] # Get logs by location. log_location = self.map_api.map_name log_tokens = [log['token'] for log in nusc.log if log['location'] == log_location] assert len(log_tokens) > 0, 'Error: This split has 0 scenes for location %s!' % log_location # Filter scenes. scene_tokens_location = [e['token'] for e in nusc.scene if e['log_token'] in log_tokens] if scene_tokens is not None: scene_tokens_location = [t for t in scene_tokens_location if t in scene_tokens] assert len(scene_tokens_location) > 0, 'Error: Found 0 valid scenes for location %s!' % log_location map_poses = [] if verbose: print('Adding ego poses to map...') for scene_token in tqdm(scene_tokens_location, disable=not verbose): # Check that the scene is from the correct location. scene_record = nusc.get('scene', scene_token) scene_name = scene_record['name'] scene_id = int(scene_name.replace('scene-', '')) log_record = nusc.get('log', scene_record['log_token']) assert log_record['location'] == log_location, \ 'Error: The provided scene_tokens do not correspond to the provided map location!' # Print a warning if the localization is known to be bad. if verbose and scene_id in scene_blacklist: print('Warning: %s is known to have a bad fit between ego pose and map.' % scene_name) # For each sample in the scene, store the ego pose. sample_tokens = nusc.field2token('sample', 'scene_token', scene_token) for sample_token in sample_tokens: sample_record = nusc.get('sample', sample_token) # Poses are associated with the sample_data. Here we use the lidar sample_data. sample_data_record = nusc.get('sample_data', sample_record['data']['LIDAR_TOP']) pose_record = nusc.get('ego_pose', sample_data_record['ego_pose_token']) # Calculate the pose on the map and append. map_poses.append(pose_record['translation']) # Check that ego poses aren't empty. assert len(map_poses) > 0, 'Error: Found 0 ego poses. Please check the inputs.' # Compute number of close ego poses. if verbose: print('Creating plot...') map_poses = np.vstack(map_poses)[:, :2] # Render the map patch with the current ego poses. 
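# The patch below is the axis-aligned bounding box of all ego poses, padded by
# patch_margin (2 m). If either side of that box is shorter than min_diff_patch
# (30 m), it is symmetrically enlarged around its center so that even very
# short scenes are rendered on a reasonably sized map patch.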
min_patch = np.floor(map_poses.min(axis=0) - patch_margin) max_patch = np.ceil(map_poses.max(axis=0) + patch_margin) diff_patch = max_patch - min_patch if any(diff_patch < min_diff_patch): center_patch = (min_patch + max_patch) / 2 diff_patch = np.maximum(diff_patch, min_diff_patch) min_patch = center_patch - diff_patch / 2 max_patch = center_patch + diff_patch / 2 my_patch = (min_patch[0], min_patch[1], max_patch[0], max_patch[1]) fig, ax = self.render_map_patch(my_patch, self.map_api.non_geometric_layers, figsize=(10, 10), render_egoposes_range=render_egoposes_range, render_legend=render_legend, bitmap=bitmap) # Plot in the same axis as the map. # Make sure these are plotted "on top". if render_egoposes: ax.scatter(map_poses[:, 0], map_poses[:, 1], s=20, c='k', alpha=1.0, zorder=2) plt.axis('off') if out_path is not None: plt.savefig(out_path, bbox_inches='tight', pad_inches=0) return map_poses, fig, ax def render_next_roads(self, x: float, y: float, alpha: float = 0.5, figsize: Union[None, float, Tuple[float, float]] = None, bitmap: Optional[BitMap] = None) -> Tuple[Figure, Axes]: """ Renders the possible next roads from a point of interest. :param x: x coordinate of the point of interest. :param y: y coordinate of the point of interest. :param alpha: The opacity of each layer that gets rendered. :param figsize: Size of the whole figure. :param bitmap: Optional BitMap object to render below the other map layers. """ # Get next roads. next_roads = self.map_api.get_next_roads(x, y) layer_names = [] tokens = [] for layer_name, layer_tokens in next_roads.items(): if len(layer_tokens) > 0: layer_names.append(layer_name) tokens.extend(layer_tokens) # Render them. fig, ax = self.render_layers(layer_names, alpha, figsize, tokens=tokens, bitmap=bitmap) # Render current location with an x. ax.plot(x, y, 'x', markersize=12, color='red') return fig, ax @staticmethod def _clip_points_behind_camera(points, near_plane: float): """ Perform clipping on polygons that are partially behind the camera. This method is necessary as the projection does not work for points behind the camera. Hence we compute the line between the point and the camera and follow that line until we hit the near plane of the camera. Then we use that point. :param points: Matrix of points, where each point (x, y, z) is along each column. :param near_plane: If we set the near_plane distance of the camera to 0 then some points will project to infinity. Therefore we need to clip these points at the near plane. :return: The clipped version of the polygon. This may have fewer points than the original polygon if some lines were entirely behind the polygon. """ points_clipped = [] # Loop through each line on the polygon. # For each line where exactly 1 endpoints is behind the camera, move the point along the line until # it hits the near plane of the camera (clipping). assert points.shape[0] == 3 point_count = points.shape[1] for line_1 in range(point_count): line_2 = (line_1 + 1) % point_count point_1 = points[:, line_1] point_2 = points[:, line_2] z_1 = point_1[2] z_2 = point_2[2] if z_1 >= near_plane and z_2 >= near_plane: # Both points are in front. # Add both points unless the first is already added. if len(points_clipped) == 0 or all(points_clipped[-1] != point_1): points_clipped.append(point_1) points_clipped.append(point_2) elif z_1 < near_plane and z_2 < near_plane: # Both points are in behind. # Don't add anything. continue else: # One point is in front, one behind. # By convention pointA is behind the camera and pointB in front. 
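# The clipped point is the intersection of the segment (point_a, point_b) with
# the near plane z = near_plane. With alpha = (near_plane - z_b) / (z_a - z_b),
# the point point_a + (1 - alpha) * (point_b - point_a) has z-coordinate
# z_a + (z_a - near_plane) / (z_a - z_b) * (z_b - z_a) = near_plane,
# which the assert further down verifies up to 1e-6.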
if z_1 <= z_2: point_a = points[:, line_1] point_b = points[:, line_2] else: point_a = points[:, line_2] point_b = points[:, line_1] z_a = point_a[2] z_b = point_b[2] # Clip line along near plane. pointdiff = point_b - point_a alpha = (near_plane - z_b) / (z_a - z_b) clipped = point_a + (1 - alpha) * pointdiff assert np.abs(clipped[2] - near_plane) < 1e-6 # Add the first point (if valid and not duplicate), the clipped point and the second point (if valid). if z_1 >= near_plane and (len(points_clipped) == 0 or all(points_clipped[-1] != point_1)): points_clipped.append(point_1) points_clipped.append(clipped) if z_2 >= near_plane: points_clipped.append(point_2) points_clipped = np.array(points_clipped).transpose() return points_clipped def get_records_in_patch(self, box_coords: Tuple[float, float, float, float], layer_names: List[str] = None, mode: str = 'intersect') -> Dict[str, List[str]]: """ Get all the record token that intersects or within a particular rectangular patch. :param box_coords: The rectangular patch coordinates (x_min, y_min, x_max, y_max). :param layer_names: Names of the layers that we want to retrieve in a particular patch. By default will always look for all non geometric layers. :param mode: "intersect" will return all non geometric records that intersects the patch, "within" will return all non geometric records that are within the patch. :return: Dictionary of layer_name - tokens pairs. """ if mode not in ['intersect', 'within']: raise ValueError("Mode {} is not valid, choice=('intersect', 'within')".format(mode)) if layer_names is None: layer_names = self.map_api.non_geometric_layers records_in_patch = dict() for layer_name in layer_names: layer_records = [] for record in getattr(self.map_api, layer_name): token = record['token'] if self.is_record_in_patch(layer_name, token, box_coords, mode): layer_records.append(token) records_in_patch.update({layer_name: layer_records}) return records_in_patch def is_record_in_patch(self, layer_name: str, token: str, box_coords: Tuple[float, float, float, float], mode: str = 'intersect') -> bool: """ Query whether a particular record is in a rectangular patch. :param layer_name: The layer name of the record. :param token: The record token. :param box_coords: The rectangular patch coordinates (x_min, y_min, x_max, y_max). :param mode: "intersect" means it will return True if the geometric object intersects the patch and False otherwise, "within" will return True if the geometric object is within the patch and False otherwise. :return: Boolean value on whether a particular record intersects or is within a particular patch. """ if mode not in ['intersect', 'within']: raise ValueError("Mode {} is not valid, choice=('intersect', 'within')".format(mode)) if layer_name in self.map_api.lookup_polygon_layers: return self._is_polygon_record_in_patch(token, layer_name, box_coords, mode) elif layer_name in self.map_api.non_geometric_line_layers: return self._is_line_record_in_patch(token, layer_name, box_coords, mode) else: raise ValueError("{} is not a valid layer".format(layer_name)) def layers_on_point(self, x: float, y: float, layer_names: List[str] = None) -> Dict[str, str]: """ Returns all the polygonal layers that a particular point is on. :param x: x coordinate of the point of interest. :param y: y coordinate of the point of interest. :param layer_names: The names of the layers to search for. :return: All the polygonal layers that a particular point is on. """ # Default option. 
if layer_names is None: layer_names = self.map_api.non_geometric_polygon_layers layers_on_point = dict() for layer_name in layer_names: layers_on_point.update({layer_name: self.record_on_point(x, y, layer_name)}) return layers_on_point def record_on_point(self, x: float, y: float, layer_name: str) -> str: """ Query what record of a layer a particular point is on. :param x: x coordinate of the point of interest. :param y: y coordinate of the point of interest. :param layer_name: The non geometric polygonal layer name that we are interested in. :return: The first token of a layer a particular point is on or '' if no layer is found. """ if layer_name not in self.map_api.non_geometric_polygon_layers: raise ValueError("{} is not a polygon layer".format(layer_name)) point = Point(x, y) records = getattr(self.map_api, layer_name) if layer_name == 'drivable_area': for record in records: polygons = [self.map_api.extract_polygon(polygon_token) for polygon_token in record['polygon_tokens']] for polygon in polygons: if point.within(polygon): return record['token'] else: pass else: for record in records: polygon = self.map_api.extract_polygon(record['polygon_token']) if point.within(polygon): return record['token'] else: pass # If nothing is found, return an empty string. return '' def extract_polygon(self, polygon_token: str) -> Polygon: """ Construct a shapely Polygon object out of a polygon token. :param polygon_token: The token of the polygon record. :return: The polygon wrapped in a shapely Polygon object. """ polygon_record = self.map_api.get('polygon', polygon_token) exterior_coords = [(self.map_api.get('node', token)['x'], self.map_api.get('node', token)['y']) for token in polygon_record['exterior_node_tokens']] interiors = [] for hole in polygon_record['holes']: interior_coords = [(self.map_api.get('node', token)['x'], self.map_api.get('node', token)['y']) for token in hole['node_tokens']] if len(interior_coords) > 0: # Add only non-empty holes. interiors.append(interior_coords) return Polygon(exterior_coords, interiors) def extract_line(self, line_token: str) -> LineString: """ Construct a shapely LineString object out of a line token. :param line_token: The token of the line record. :return: The line wrapped in a LineString object. """ line_record = self.map_api.get('line', line_token) line_nodes = [(self.map_api.get('node', token)['x'], self.map_api.get('node', token)['y']) for token in line_record['node_tokens']] return LineString(line_nodes) def get_bounds(self, layer_name: str, token: str) -> Tuple[float, float, float, float]: """ Get the bounds of the geometric object that corresponds to a non geometric record. :param layer_name: Name of the layer that we are interested in. :param token: Token of the record. :return: min_x, min_y, max_x, max_y of the line representation. """ if layer_name in self.map_api.non_geometric_polygon_layers: return self._get_polygon_bounds(layer_name, token) elif layer_name in self.map_api.non_geometric_line_layers: return self._get_line_bounds(layer_name, token) else: raise ValueError("{} is not a valid layer".format(layer_name)) def _get_polygon_bounds(self, layer_name: str, token: str) -> Tuple[float, float, float, float]: """ Get the extremities of the polygon object that corresponds to a non geometric record. :param layer_name: Name of the layer that we are interested in. :param token: Token of the record. :return: min_x, min_y, max_x, max_y of of the polygon or polygons (for drivable_area) representation. 
""" if layer_name not in self.map_api.non_geometric_polygon_layers: raise ValueError("{} is not a record with polygon representation".format(token)) record = self.map_api.get(layer_name, token) if layer_name == 'drivable_area': polygons = [self.map_api.get('polygon', polygon_token) for polygon_token in record['polygon_tokens']] exterior_node_coords = [] for polygon in polygons: nodes = [self.map_api.get('node', node_token) for node_token in polygon['exterior_node_tokens']] node_coords = [(node['x'], node['y']) for node in nodes] exterior_node_coords.extend(node_coords) exterior_node_coords = np.array(exterior_node_coords) else: exterior_nodes = [self.map_api.get('node', token) for token in record['exterior_node_tokens']] exterior_node_coords = np.array([(node['x'], node['y']) for node in exterior_nodes]) xs = exterior_node_coords[:, 0] ys = exterior_node_coords[:, 1] x2 = xs.max() x1 = xs.min() y2 = ys.max() y1 = ys.min() return x1, y1, x2, y2 def _get_line_bounds(self, layer_name: str, token: str) -> Tuple[float, float, float, float]: """ Get the bounds of the line object that corresponds to a non geometric record. :param layer_name: Name of the layer that we are interested in. :param token: Token of the record. :return: min_x, min_y, max_x, max_y of of the line representation. """ if layer_name not in self.map_api.non_geometric_line_layers: raise ValueError("{} is not a record with line representation".format(token)) record = self.map_api.get(layer_name, token) nodes = [self.map_api.get('node', node_token) for node_token in record['node_tokens']] node_coords = [(node['x'], node['y']) for node in nodes] node_coords = np.array(node_coords) xs = node_coords[:, 0] ys = node_coords[:, 1] x2 = xs.max() x1 = xs.min() y2 = ys.max() y1 = ys.min() return x1, y1, x2, y2 def _is_polygon_record_in_patch(self, token: str, layer_name: str, box_coords: Tuple[float, float, float, float], mode: str = 'intersect') -> bool: """ Query whether a particular polygon record is in a rectangular patch. :param layer_name: The layer name of the record. :param token: The record token. :param box_coords: The rectangular patch coordinates (x_min, y_min, x_max, y_max). :param mode: "intersect" means it will return True if the geometric object intersects the patch and False otherwise, "within" will return True if the geometric object is within the patch and False otherwise. :return: Boolean value on whether a particular polygon record intersects or is within a particular patch. """ if layer_name not in self.map_api.lookup_polygon_layers: raise ValueError('{} is not a polygonal layer'.format(layer_name)) x_min, y_min, x_max, y_max = box_coords record = self.map_api.get(layer_name, token) rectangular_patch = box(x_min, y_min, x_max, y_max) if layer_name == 'drivable_area': polygons = [self.map_api.extract_polygon(polygon_token) for polygon_token in record['polygon_tokens']] geom = MultiPolygon(polygons) else: geom = self.map_api.extract_polygon(record['polygon_token']) if mode == 'intersect': return geom.intersects(rectangular_patch) elif mode == 'within': return geom.within(rectangular_patch) def _is_line_record_in_patch(self, token: str, layer_name: str, box_coords: Tuple[float, float, float, float], mode: str = 'intersect') -> bool: """ Query whether a particular line record is in a rectangular patch. :param layer_name: The layer name of the record. :param token: The record token. :param box_coords: The rectangular patch coordinates (x_min, y_min, x_max, y_max). 
:param mode: "intersect" means it will return True if the geometric object intersects the patch and False otherwise, "within" will return True if the geometric object is within the patch and False otherwise. :return: Boolean value on whether a particular line record intersects or is within a particular patch. """ if layer_name not in self.map_api.non_geometric_line_layers: raise ValueError("{} is not a line layer".format(layer_name)) # Retrieve nodes of this line. record = self.map_api.get(layer_name, token) node_recs = [self.map_api.get('node', node_token) for node_token in record['node_tokens']] node_coords = [[node['x'], node['y']] for node in node_recs] node_coords = np.array(node_coords) # A few lines in Queenstown have zero nodes. In this case we return False. if len(node_coords) == 0: return False # Check that nodes fall inside the path. x_min, y_min, x_max, y_max = box_coords cond_x = np.logical_and(node_coords[:, 0] < x_max, node_coords[:, 0] > x_min) cond_y = np.logical_and(node_coords[:, 1] < y_max, node_coords[:, 1] > y_min) cond = np.logical_and(cond_x, cond_y) if mode == 'intersect': return np.any(cond) elif mode == 'within': return np.all(cond) def _render_layer(self, ax: Axes, layer_name: str, alpha: float, tokens: List[str] = None) -> None: """ Wrapper method that renders individual layers on an axis. :param ax: The matplotlib axes where the layer will get rendered. :param layer_name: Name of the layer that we are interested in. :param alpha: The opacity of the layer to be rendered. :param tokens: Optional list of tokens to render. None means all tokens are rendered. """ if layer_name in self.map_api.non_geometric_polygon_layers: self._render_polygon_layer(ax, layer_name, alpha, tokens) elif layer_name in self.map_api.non_geometric_line_layers: self._render_line_layer(ax, layer_name, alpha, tokens) else: raise ValueError("{} is not a valid layer".format(layer_name)) def _render_polygon_layer(self, ax: Axes, layer_name: str, alpha: float, tokens: List[str] = None) -> None: """ Renders an individual non-geometric polygon layer on an axis. :param ax: The matplotlib axes where the layer will get rendered. :param layer_name: Name of the layer that we are interested in. :param alpha: The opacity of the layer to be rendered. :param tokens: Optional list of tokens to render. None means all tokens are rendered. """ if layer_name not in self.map_api.non_geometric_polygon_layers: raise ValueError('{} is not a polygonal layer'.format(layer_name)) first_time = True records = getattr(self.map_api, layer_name) if tokens is not None: records = [r for r in records if r['token'] in tokens] if layer_name == 'drivable_area': for record in records: polygons = [self.map_api.extract_polygon(polygon_token) for polygon_token in record['polygon_tokens']] for polygon in polygons: if first_time: label = layer_name first_time = False else: label = None ax.add_patch(descartes.PolygonPatch(polygon, fc=self.color_map[layer_name], alpha=alpha, label=label)) else: for record in records: polygon = self.map_api.extract_polygon(record['polygon_token']) if first_time: label = layer_name first_time = False else: label = None ax.add_patch(descartes.PolygonPatch(polygon, fc=self.color_map[layer_name], alpha=alpha, label=label)) def _render_line_layer(self, ax: Axes, layer_name: str, alpha: float, tokens: List[str] = None) -> None: """ Renders an individual non-geometric line layer on an axis. :param ax: The matplotlib axes where the layer will get rendered. 
:param layer_name: Name of the layer that we are interested in. :param alpha: The opacity of the layer to be rendered. :param tokens: Optional list of tokens to render. None means all tokens are rendered. """ if layer_name not in self.map_api.non_geometric_line_layers: raise ValueError("{} is not a line layer".format(layer_name)) first_time = True records = getattr(self.map_api, layer_name) if tokens is not None: records = [r for r in records if r['token'] in tokens] for record in records: if first_time: label = layer_name first_time = False else: label = None line = self.map_api.extract_line(record['line_token']) if line.is_empty: # Skip lines without nodes continue xs, ys = line.xy if layer_name == 'traffic_light': # Draws an arrow with the physical traffic light as the starting point, pointing to the direction on # where the traffic light points. ax.add_patch(Arrow(xs[0], ys[0], xs[1]-xs[0], ys[1]-ys[0], color=self.color_map[layer_name], label=label)) else: ax.plot(xs, ys, color=self.color_map[layer_name], alpha=alpha, label=label) def _get_layer_geom(self, patch_box: Tuple[float, float, float, float], patch_angle: float, layer_name: str) -> List[Geometry]: """ Wrapper method that gets the geometries for each layer. :param patch_box: Patch box defined as [x_center, y_center, height, width]. :param patch_angle: Patch orientation in degrees. :param layer_name: Name of map layer to be converted to binary map mask patch. :return: List of geometries for the given layer. """ if layer_name in self.map_api.non_geometric_polygon_layers: return self._get_layer_polygon(patch_box, patch_angle, layer_name) elif layer_name in self.map_api.non_geometric_line_layers: return self._get_layer_line(patch_box, patch_angle, layer_name) else: raise ValueError("{} is not a valid layer".format(layer_name)) def _layer_geom_to_mask(self, layer_name: str, layer_geom: List[Geometry], local_box: Tuple[float, float, float, float], canvas_size: Tuple[int, int]) -> np.ndarray: """ Wrapper method that gets the mask for each layer's geometries. :param layer_name: The name of the layer for which we get the masks. :param layer_geom: List of the geometries of the layer specified in layer_name. :param local_box: The local patch box defined as (x_center, y_center, height, width), where typically x_center = y_center = 0. :param canvas_size: Size of the output mask (h, w). """ if layer_name in self.map_api.non_geometric_polygon_layers: return self._polygon_geom_to_mask(layer_geom, local_box, layer_name, canvas_size) elif layer_name in self.map_api.non_geometric_line_layers: return self._line_geom_to_mask(layer_geom, local_box, layer_name, canvas_size) else: raise ValueError("{} is not a valid layer".format(layer_name)) @staticmethod def mask_for_polygons(polygons: MultiPolygon, mask: np.ndarray) -> np.ndarray: """ Convert a polygon or multipolygon list to an image mask ndarray. :param polygons: List of Shapely polygons to be converted to numpy array. :param mask: Canvas where mask will be generated. :return: Numpy ndarray polygon mask. 
""" if not polygons: return mask def int_coords(x): # function to round and convert to int return np.array(x).round().astype(np.int32) exteriors = [int_coords(poly.exterior.coords) for poly in polygons] interiors = [int_coords(pi.coords) for poly in polygons for pi in poly.interiors] cv2.fillPoly(mask, exteriors, 1) cv2.fillPoly(mask, interiors, 0) return mask @staticmethod def mask_for_lines(lines: LineString, mask: np.ndarray) -> np.ndarray: """ Convert a Shapely LineString back to an image mask ndarray. :param lines: List of shapely LineStrings to be converted to a numpy array. :param mask: Canvas where mask will be generated. :return: Numpy ndarray line mask. """ if lines.geom_type == 'MultiLineString': for line in lines: coords = np.asarray(list(line.coords), np.int32) coords = coords.reshape((-1, 2)) cv2.polylines(mask, [coords], False, 1, 2) else: coords = np.asarray(list(lines.coords), np.int32) coords = coords.reshape((-1, 2)) cv2.polylines(mask, [coords], False, 1, 2) return mask def _polygon_geom_to_mask(self, layer_geom: List[Polygon], local_box: Tuple[float, float, float, float], layer_name: str, canvas_size: Tuple[int, int]) -> np.ndarray: """ Convert polygon inside patch to binary mask and return the map patch. :param layer_geom: list of polygons for each map layer :param local_box: The local patch box defined as (x_center, y_center, height, width), where typically x_center = y_center = 0. :param layer_name: name of map layer to be converted to binary map mask patch. :param canvas_size: Size of the output mask (h, w). :return: Binary map mask patch with the size canvas_size. """ if layer_name not in self.map_api.non_geometric_polygon_layers: raise ValueError('{} is not a polygonal layer'.format(layer_name)) patch_x, patch_y, patch_h, patch_w = local_box patch = self.get_patch_coord(local_box) canvas_h = canvas_size[0] canvas_w = canvas_size[1] scale_height = canvas_h / patch_h scale_width = canvas_w / patch_w trans_x = -patch_x + patch_w / 2.0 trans_y = -patch_y + patch_h / 2.0 map_mask = np.zeros(canvas_size, np.uint8) for polygon in layer_geom: new_polygon = polygon.intersection(patch) if not new_polygon.is_empty: new_polygon = affinity.affine_transform(new_polygon, [1.0, 0.0, 0.0, 1.0, trans_x, trans_y]) new_polygon = affinity.scale(new_polygon, xfact=scale_width, yfact=scale_height, origin=(0, 0)) if new_polygon.geom_type == 'Polygon': new_polygon = MultiPolygon([new_polygon]) # if new_polygon.area < 1000: # continue if not isinstance(new_polygon, MultiPolygon): print(new_polygon) continue map_mask = self.mask_for_polygons(new_polygon, map_mask) return map_mask def _line_geom_to_mask(self, layer_geom: List[LineString], local_box: Tuple[float, float, float, float], layer_name: str, canvas_size: Tuple[int, int]) -> Optional[np.ndarray]: """ Convert line inside patch to binary mask and return the map patch. :param layer_geom: list of LineStrings for each map layer :param local_box: The local patch box defined as (x_center, y_center, height, width), where typically x_center = y_center = 0. :param layer_name: name of map layer to be converted to binary map mask patch. :param canvas_size: Size of the output mask (h, w). :return: Binary map mask patch in a canvas size. 
""" if layer_name not in self.map_api.non_geometric_line_layers: raise ValueError("{} is not a line layer".format(layer_name)) patch_x, patch_y, patch_h, patch_w = local_box patch = self.get_patch_coord(local_box) canvas_h = canvas_size[0] canvas_w = canvas_size[1] scale_height = canvas_h/patch_h scale_width = canvas_w/patch_w trans_x = -patch_x + patch_w / 2.0 trans_y = -patch_y + patch_h / 2.0 map_mask = np.zeros(canvas_size, np.uint8) if layer_name == 'traffic_light': return None for line in layer_geom: new_line = line.intersection(patch) if not new_line.is_empty: new_line = affinity.affine_transform(new_line, [1.0, 0.0, 0.0, 1.0, trans_x, trans_y]) new_line = affinity.scale(new_line, xfact=scale_width, yfact=scale_height, origin=(0, 0)) map_mask = self.mask_for_lines(new_line, map_mask) return map_mask def _get_layer_polygon(self, patch_box: Tuple[float, float, float, float], patch_angle: float, layer_name: str) -> List[Polygon]: """ Retrieve the polygons of a particular layer within the specified patch. :param patch_box: Patch box defined as [x_center, y_center, height, width]. :param patch_angle: Patch orientation in degrees. :param layer_name: name of map layer to be extracted. :return: List of Polygon in a patch box. """ if layer_name not in self.map_api.non_geometric_polygon_layers: raise ValueError('{} is not a polygonal layer'.format(layer_name)) patch_x = patch_box[0] patch_y = patch_box[1] patch = self.get_patch_coord(patch_box, patch_angle) records = getattr(self.map_api, layer_name) polygon_list = [] if layer_name == 'drivable_area': for record in records: polygons = [self.map_api.extract_polygon(polygon_token) for polygon_token in record['polygon_tokens']] for polygon in polygons: new_polygon = polygon.intersection(patch) if not new_polygon.is_empty: new_polygon = affinity.rotate(new_polygon, -patch_angle, origin=(patch_x, patch_y), use_radians=False) new_polygon = affinity.affine_transform(new_polygon, [1.0, 0.0, 0.0, 1.0, -patch_x, -patch_y]) if new_polygon.geom_type == 'Polygon': new_polygon = MultiPolygon([new_polygon]) polygon_list.append(new_polygon) else: for record in records: polygon = self.map_api.extract_polygon(record['polygon_token']) if polygon.is_valid: new_polygon = polygon.intersection(patch) if not new_polygon.is_empty: new_polygon = affinity.rotate(new_polygon, -patch_angle, origin=(patch_x, patch_y), use_radians=False) new_polygon = affinity.affine_transform(new_polygon, [1.0, 0.0, 0.0, 1.0, -patch_x, -patch_y]) if new_polygon.geom_type == 'Polygon': new_polygon = MultiPolygon([new_polygon]) polygon_list.append(new_polygon) return polygon_list def _get_layer_line(self, patch_box: Tuple[float, float, float, float], patch_angle: float, layer_name: str) -> Optional[List[LineString]]: """ Retrieve the lines of a particular layer within the specified patch. :param patch_box: Patch box defined as [x_center, y_center, height, width]. :param patch_angle: Patch orientation in degrees. :param layer_name: name of map layer to be converted to binary map mask patch. :return: List of LineString in a patch box. """ if layer_name not in self.map_api.non_geometric_line_layers: raise ValueError("{} is not a line layer".format(layer_name)) if layer_name == 'traffic_light': return None patch_x = patch_box[0] patch_y = patch_box[1] patch = self.get_patch_coord(patch_box, patch_angle) line_list = [] records = getattr(self.map_api, layer_name) for record in records: line = self.map_api.extract_line(record['line_token']) if line.is_empty: # Skip lines without nodes. 
continue new_line = line.intersection(patch) if not new_line.is_empty: new_line = affinity.rotate(new_line, -patch_angle, origin=(patch_x, patch_y), use_radians=False) new_line = affinity.affine_transform(new_line, [1.0, 0.0, 0.0, 1.0, -patch_x, -patch_y]) line_list.append(new_line) return line_list @staticmethod def get_patch_coord(patch_box: Tuple[float, float, float, float], patch_angle: float = 0.0) -> Polygon: """ Convert patch_box to shapely Polygon coordinates. :param patch_box: Patch box defined as [x_center, y_center, height, width]. :param patch_angle: Patch orientation in degrees. :return: Box Polygon for patch_box. """ patch_x, patch_y, patch_h, patch_w = patch_box x_min = patch_x - patch_w / 2.0 y_min = patch_y - patch_h / 2.0 x_max = patch_x + patch_w / 2.0 y_max = patch_y + patch_h / 2.0 patch = box(x_min, y_min, x_max, y_max) patch = affinity.rotate(patch, patch_angle, origin=(patch_x, patch_y), use_radians=False) return patch def _get_figsize(self, figsize: Union[None, float, Tuple[float, float]]) -> Tuple[float, float]: """ Utility function that scales the figure size by the map canvas size. If figsize is: - None => Return default scale. - Scalar => Scale canvas size. - Two-tuple => Use the specified figure size. :param figsize: The input figure size. :return: The output figure size. """ # Divide canvas size by arbitrary scalar to get into cm range. canvas_size = np.array(self.map_api.canvas_edge)[::-1] / 200 if figsize is None: return tuple(canvas_size) elif type(figsize) in [int, float]: return tuple(canvas_size * figsize) elif type(figsize) == tuple and len(figsize) == 2: return figsize else: raise Exception('Error: Invalid figsize: %s' % figsize) ================================================ FILE: mmdet3d/datasets/evals/metric_utils.py ================================================ import torch import math import numpy as np from typing import List, Dict, Tuple, Callable, Union def min_ade(traj: torch.Tensor, traj_gt: torch.Tensor, masks: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: """ Computes average displacement error for the best trajectory is a set, with respect to ground truth :param traj: predictions, shape [batch_size, num_modes, sequence_length, 2] :param traj_gt: ground truth trajectory, shape [batch_size, sequence_length, 2] :param masks: masks for varying length ground truth, shape [batch_size, sequence_length] :return errs, inds: errors and indices for modes with min error, shape [batch_size] """ num_modes = traj.shape[1] traj_gt_rpt = traj_gt.unsqueeze(1).repeat(1, num_modes, 1, 1) masks_rpt = masks.unsqueeze(1).repeat(1, num_modes, 1) err = traj_gt_rpt - traj[:, :, :, 0:2] err = torch.pow(err, exponent=2) err = torch.sum(err, dim=3) err = torch.pow(err, exponent=0.5) err = torch.sum(err * (1 - masks_rpt), dim=2) / \ torch.clip(torch.sum((1 - masks_rpt), dim=2), min=1) err, inds = torch.min(err, dim=1) return err, inds def min_fde(traj: torch.Tensor, traj_gt: torch.Tensor, masks: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: """ Computes final displacement error for the best trajectory is a set, with respect to ground truth :param traj: predictions, shape [batch_size, num_modes, sequence_length, 2] :param traj_gt: ground truth trajectory, shape [batch_size, sequence_length, 2] :param masks: masks for varying length ground truth, shape [batch_size, sequence_length] :return errs, inds: errors and indices for modes with min error, shape [batch_size] """ num_modes = traj.shape[1] traj_gt_rpt = traj_gt.unsqueeze(1).repeat(1, num_modes, 1, 
1) lengths = torch.sum(1 - masks, dim=1).long() inds = lengths.unsqueeze(1).unsqueeze( 2).unsqueeze(3).repeat(1, num_modes, 1, 2) - 1 traj_last = torch.gather(traj[..., :2], dim=2, index=inds).squeeze(2) traj_gt_last = torch.gather(traj_gt_rpt, dim=2, index=inds).squeeze(2) err = traj_gt_last - traj_last[..., 0:2] err = torch.pow(err, exponent=2) err = torch.sum(err, dim=2) err = torch.pow(err, exponent=0.5) err, inds = torch.min(err, dim=1) return err, inds def miss_rate( traj: torch.Tensor, traj_gt: torch.Tensor, masks: torch.Tensor, dist_thresh: float = 2) -> torch.Tensor: """ Computes miss rate for mini batch of trajectories, with respect to ground truth and given distance threshold :param traj: predictions, shape [batch_size, num_modes, sequence_length, 2] :param traj_gt: ground truth trajectory, shape [batch_size, sequence_length, 2] :param masks: masks for varying length ground truth, shape [batch_size, sequence_length] :param dist_thresh: distance threshold for computing miss rate. :return errs, inds: errors and indices for modes with min error, shape [batch_size] """ num_modes = traj.shape[1] traj_gt_rpt = traj_gt.unsqueeze(1).repeat(1, num_modes, 1, 1) masks_rpt = masks.unsqueeze(1).repeat(1, num_modes, 1) dist = traj_gt_rpt - traj[:, :, :, 0:2] dist = torch.pow(dist, exponent=2) dist = torch.sum(dist, dim=3) dist = torch.pow(dist, exponent=0.5) dist[masks_rpt.bool()] = -math.inf dist, _ = torch.max(dist, dim=2) dist, _ = torch.min(dist, dim=1) m_r = torch.sum(torch.as_tensor(dist > dist_thresh)) / len(dist) return m_r def traj_fde(gt_box, pred_box, final_step): if gt_box.traj.shape[0] <= 0: return np.inf final_step = min(gt_box.traj.shape[0], final_step) gt_final = gt_box.traj[None, final_step-1] pred_final = np.array(pred_box.traj)[:,final_step-1,:] err = gt_final - pred_final err = np.sqrt(np.sum(np.square(gt_final - pred_final), axis=-1)) return np.min(err) ================================================ FILE: mmdet3d/datasets/evals/nuscenes_eval_motion.py ================================================ import argparse import copy import json import os import time from typing import Tuple, Dict, Any import numpy as np from nuscenes import NuScenes from nuscenes.eval.common.config import config_factory from nuscenes.eval.common.data_classes import EvalBoxes from nuscenes.eval.detection.data_classes import DetectionConfig from nuscenes.eval.detection.evaluate import NuScenesEval from pyquaternion import Quaternion from nuscenes import NuScenes from nuscenes.eval.common.data_classes import EvalBoxes from nuscenes.utils.data_classes import Box from nuscenes.eval.common.loaders import add_center_dist, filter_eval_boxes import tqdm from nuscenes.utils.geometry_utils import view_points, BoxVisibility import pycocotools.mask as mask_util import argparse import json import os import random import time from typing import Tuple, Dict, Any import mmcv import numpy as np from nuscenes import NuScenes from nuscenes.eval.common.config import config_factory from nuscenes.eval.common.data_classes import EvalBoxes from nuscenes.eval.common.loaders import add_center_dist, filter_eval_boxes from nuscenes.eval.detection.algo import calc_ap, calc_tp from nuscenes.eval.detection.constants import TP_METRICS from nuscenes.eval.detection.data_classes import DetectionConfig, DetectionMetrics, DetectionBox, \ DetectionMetricDataList from nuscenes.eval.detection.render import summary_plot, class_pr_curve, dist_pr_curve, visualize_sample from nuscenes.eval.common.utils import quaternion_yaw, Quaternion 
from mmdet3d.core.bbox.iou_calculators import BboxOverlaps3D from IPython import embed import json from typing import Any import numpy as np from matplotlib import pyplot as plt from nuscenes import NuScenes from nuscenes.eval.common.data_classes import EvalBoxes from nuscenes.eval.common.render import setup_axis from nuscenes.eval.common.utils import boxes_to_sensor from nuscenes.eval.detection.constants import TP_METRICS, DETECTION_NAMES, DETECTION_COLORS, TP_METRICS_UNITS, \ PRETTY_DETECTION_NAMES, PRETTY_TP_METRICS from nuscenes.eval.detection.data_classes import DetectionMetrics, DetectionMetricData, DetectionMetricDataList from nuscenes.utils.data_classes import LidarPointCloud from nuscenes.utils.geometry_utils import view_points from .eval_utils import load_prediction, load_gt, accumulate, accumulate_motion, \ DetectionMotionBox, DetectionMotionBox_modified, DetectionMotionMetricData, \ DetectionMotionMetrics, DetectionMotionMetricDataList from .metric_utils import traj_fde from prettytable import PrettyTable TP_METRICS = [ 'trans_err', 'scale_err', 'orient_err', 'vel_err', 'attr_err', 'min_ade_err', 'min_fde_err', 'miss_rate_err'] TP_TRAJ_METRICS = ['min_ade_err', 'min_fde_err', 'miss_rate_err'] Axis = Any def class_tp_curve(md_list: DetectionMetricDataList, metrics: DetectionMetrics, detection_name: str, min_recall: float, dist_th_tp: float, savepath: str = None, ax: Axis = None) -> None: """ Plot the true positive curve for the specified class. :param md_list: DetectionMetricDataList instance. :param metrics: DetectionMetrics instance. :param detection_name: :param min_recall: Minimum recall value. :param dist_th_tp: The distance threshold used to determine matches. :param savepath: If given, saves the the rendering here instead of displaying. :param ax: Axes onto which to render. """ # Get metric data for given detection class with tp distance threshold. md = md_list[(detection_name, dist_th_tp)] min_recall_ind = round(100 * min_recall) if min_recall_ind <= md.max_recall_ind: # For traffic_cone and barrier only a subset of the metrics are # plotted. rel_metrics = [ m for m in TP_METRICS if not np.isnan( metrics.get_label_tp( detection_name, m))] ylimit = max([max(getattr(md, metric)[min_recall_ind:md.max_recall_ind + 1]) for metric in rel_metrics]) * 1.1 else: ylimit = 1.0 # Prepare axis. if ax is None: ax = setup_axis( title=PRETTY_DETECTION_NAMES[detection_name], xlabel='Recall', ylabel='Error', xlim=1, min_recall=min_recall) ax.set_ylim(0, ylimit) # Plot the recall vs. error curve for each tp metric. for metric in TP_METRICS: tp = metrics.get_label_tp(detection_name, metric) # Plot only if we have valid data. 
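# The recall/error curve is drawn only when this TP metric is defined for the
# class (tp is not NaN) and the achieved recall reaches min_recall; otherwise
# an empty series is plotted and the legend entry falls back to 'n/a' or 'nan'.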
if tp is not np.nan and min_recall_ind <= md.max_recall_ind: recall, error = md.recall[:md.max_recall_ind + 1], getattr(md, metric)[:md.max_recall_ind + 1] else: recall, error = [], [] # Change legend based on tp value if tp is np.nan: label = '{}: n/a'.format(PRETTY_TP_METRICS[metric]) elif min_recall_ind > md.max_recall_ind: label = '{}: nan'.format(PRETTY_TP_METRICS[metric]) else: label = '{}: {:.2f} ({})'.format( PRETTY_TP_METRICS[metric], tp, TP_METRICS_UNITS[metric]) if metric == 'trans_err': label += f' ({md.max_recall_ind})' # add recall print(f'Recall: {detection_name}: {md.max_recall_ind/100}') ax.plot(recall, error, label=label) ax.axvline(x=md.max_recall, linestyle='-.', color=(0, 0, 0, 0.3)) ax.legend(loc='best') if savepath is not None: plt.savefig(savepath) plt.close() def center_in_image(box, intrinsic: np.ndarray, imsize: Tuple[int, int], vis_level: int = BoxVisibility.ANY) -> bool: """ Check if a box is visible inside an image without accounting for occlusions. :param box: The box to be checked. :param intrinsic: . Intrinsic camera matrix. :param imsize: (width, height). :param vis_level: One of the enumerations of . :return True if visibility condition is satisfied. """ center_3d = box.center.reshape(3, 1) center_img = view_points(center_3d, intrinsic, normalize=True)[:2, :] visible = np.logical_and( center_img[0, :] > 0, center_img[0, :] < imsize[0]) visible = np.logical_and(visible, center_img[1, :] < imsize[1]) visible = np.logical_and(visible, center_img[1, :] > 0) visible = np.logical_and(visible, center_3d[2, :] > 1) # True if a corner is at least 0.1 meter in front of the camera. in_front = center_3d[2, :] > 0.1 if vis_level == BoxVisibility.ALL: return all(visible) and all(in_front) elif vis_level == BoxVisibility.ANY: return any(visible) and all(in_front) elif vis_level == BoxVisibility.NONE: return True else: raise ValueError("vis_level: {} not valid".format(vis_level)) def exist_corners_in_image_but_not_all(box, intrinsic: np.ndarray, imsize: Tuple[int, int], vis_level: int = BoxVisibility.ANY) -> bool: """ Check if a box is visible in images but not all corners in image . :param box: The box to be checked. :param intrinsic: . Intrinsic camera matrix. :param imsize: (width, height). :param vis_level: One of the enumerations of . :return True if visibility condition is satisfied. """ corners_3d = box.corners() corners_img = view_points(corners_3d, intrinsic, normalize=True)[:2, :] visible = np.logical_and( corners_img[0, :] > 0, corners_img[0, :] < imsize[0]) visible = np.logical_and(visible, corners_img[1, :] < imsize[1]) visible = np.logical_and(visible, corners_img[1, :] > 0) visible = np.logical_and(visible, corners_3d[2, :] > 1) # True if a corner is at least 0.1 meter in front of the camera. in_front = corners_3d[2, :] > 0.1 if any(visible) and not all(visible) and all(in_front): return True else: return False def filter_eval_boxes_by_id(nusc: NuScenes, eval_boxes: EvalBoxes, id=None, verbose: bool = False) -> EvalBoxes: """ Applies filtering to boxes. Distance, bike-racks and points per box. :param nusc: An instance of the NuScenes class. :param eval_boxes: An instance of the EvalBoxes class. :param is: the anns token set that used to keep bboxes. :param verbose: Whether to print to stdout. """ # Accumulators for number of filtered boxes. 
total, anns_filter = 0, 0 for ind, sample_token in enumerate(eval_boxes.sample_tokens): # Filter on anns total += len(eval_boxes[sample_token]) filtered_boxes = [] for box in eval_boxes[sample_token]: if box.token in id: filtered_boxes.append(box) anns_filter += len(filtered_boxes) eval_boxes.boxes[sample_token] = filtered_boxes if verbose: print("=> Original number of boxes: %d" % total) print("=> After anns based filtering: %d" % anns_filter) return eval_boxes def filter_eval_boxes_by_visibility( ori_eval_boxes: EvalBoxes, visibility=None, verbose: bool = False) -> EvalBoxes: """ Applies filtering to boxes. Distance, bike-racks and points per box. :param nusc: An instance of the NuScenes class. :param eval_boxes: An instance of the EvalBoxes class. :param is: the anns token set that used to keep bboxes. :param verbose: Whether to print to stdout. """ # Accumulators for number of filtered boxes. eval_boxes = copy.deepcopy(ori_eval_boxes) total, anns_filter = 0, 0 for ind, sample_token in enumerate(eval_boxes.sample_tokens): # Filter on anns total += len(eval_boxes[sample_token]) filtered_boxes = [] for box in eval_boxes[sample_token]: if box.visibility == visibility: filtered_boxes.append(box) anns_filter += len(filtered_boxes) eval_boxes.boxes[sample_token] = filtered_boxes if verbose: print("=> Original number of boxes: %d" % total) print("=> After visibility based filtering: %d" % anns_filter) return eval_boxes def filter_by_sample_token( ori_eval_boxes, valid_sample_tokens=[], verbose=False): eval_boxes = copy.deepcopy(ori_eval_boxes) for sample_token in eval_boxes.sample_tokens: if sample_token not in valid_sample_tokens: eval_boxes.boxes.pop(sample_token) return eval_boxes def filter_eval_boxes_by_overlap(nusc: NuScenes, eval_boxes: EvalBoxes, verbose: bool = False) -> EvalBoxes: """ Applies filtering to boxes. basedon overlap . :param nusc: An instance of the NuScenes class. :param eval_boxes: An instance of the EvalBoxes class. :param verbose: Whether to print to stdout. """ # Accumulators for number of filtered boxes. cams = ['CAM_FRONT', 'CAM_FRONT_RIGHT', 'CAM_BACK_RIGHT', 'CAM_BACK', 'CAM_BACK_LEFT', 'CAM_FRONT_LEFT'] total, anns_filter = 0, 0 for ind, sample_token in enumerate(eval_boxes.sample_tokens): # Filter on anns total += len(eval_boxes[sample_token]) sample_record = nusc.get('sample', sample_token) filtered_boxes = [] for box in eval_boxes[sample_token]: count = 0 for cam in cams: ''' copy-paste form nuscens ''' sample_data_token = sample_record['data'][cam] sd_record = nusc.get('sample_data', sample_data_token) cs_record = nusc.get( 'calibrated_sensor', sd_record['calibrated_sensor_token']) sensor_record = nusc.get('sensor', cs_record['sensor_token']) pose_record = nusc.get('ego_pose', sd_record['ego_pose_token']) cam_intrinsic = np.array(cs_record['camera_intrinsic']) imsize = (sd_record['width'], sd_record['height']) new_box = Box( box.translation, box.size, Quaternion( box.rotation), name=box.detection_name, token='') # Move box to ego vehicle coord system. new_box.translate(-np.array(pose_record['translation'])) new_box.rotate(Quaternion(pose_record['rotation']).inverse) # Move box to sensor coord system. 
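# Same translate-then-inverse-rotate pattern as above, now mapping the box from
# the ego frame into the camera (sensor) frame. center_in_image() then projects
# the box center with the camera intrinsics; the counter below tracks in how
# many of the six cameras the center is visible (count > 1 means the box
# overlaps two camera views).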
new_box.translate(-np.array(cs_record['translation'])) new_box.rotate(Quaternion(cs_record['rotation']).inverse) if center_in_image( new_box, cam_intrinsic, imsize, vis_level=BoxVisibility.ANY): count += 1 # if exist_corners_in_image_but_not_all(new_box, cam_intrinsic, imsize, vis_level=BoxVisibility.ANY): # count += 1 if count > 1: with open('center_overlap.txt', 'a') as f: try: f.write(box.token + '\n') except BaseException: pass filtered_boxes.append(box) anns_filter += len(filtered_boxes) eval_boxes.boxes[sample_token] = filtered_boxes verbose = True if verbose: print("=> Original number of boxes: %d" % total) print("=> After anns based filtering: %d" % anns_filter) return eval_boxes class MotionEval(NuScenesEval): """ Dummy class for backward-compatibility. Same as DetectionEval. """ def __init__(self, nusc: NuScenes, config: DetectionConfig, result_path: str, eval_set: str, output_dir: str = None, verbose: bool = True, eval_mask=False, data_infos=None, ann_file=None, category_convert_type='motion_category', ): """ Initialize a DetectionEval object. :param nusc: A NuScenes object. :param config: A DetectionConfig object. :param result_path: Path of the nuScenes JSON result file. :param eval_set: The dataset split to evaluate on, e.g. train, val or test. :param output_dir: Folder to save plots and results to. :param verbose: Whether to print to stdout. """ self.nusc = nusc self.result_path = result_path self.eval_set = eval_set self.output_dir = output_dir self.verbose = verbose self.cfg = config self.eval_mask = eval_mask self.data_infos = data_infos # Check result file exists. assert os.path.exists( result_path), 'Error: The result file does not exist!' # Make dirs. self.plot_dir = os.path.join(self.output_dir, 'plots') if not os.path.isdir(self.output_dir): os.makedirs(self.output_dir) if not os.path.isdir(self.plot_dir): os.makedirs(self.plot_dir) # Load data. if verbose: print('Initializing nuScenes detection evaluation') self.pred_boxes, self.meta = load_prediction(self.result_path, self.cfg.max_boxes_per_sample, DetectionMotionBox, verbose=verbose, category_convert_type=category_convert_type) # data = mmcv.load(ann_file, file_format='pkl') # data_infos = {} # for info in data['infos']: # data_infos[info['token']] = info self.gt_boxes = load_gt( self.nusc, self.eval_set, DetectionMotionBox_modified, verbose=verbose, category_convert_type=category_convert_type) assert set(self.pred_boxes.sample_tokens) == set(self.gt_boxes.sample_tokens), \ "Samples in split doesn't match samples in predictions." # Add center distances. self.pred_boxes = add_center_dist(nusc, self.pred_boxes) self.gt_boxes = add_center_dist(nusc, self.gt_boxes) # Filter boxes (distance, points per box, etc.). 
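# filter_eval_boxes() is the stock nuScenes devkit filter: it drops boxes whose
# center lies outside the per-class evaluation range in cfg.class_range and,
# for ground truth, additionally applies the devkit's point-count and bike-rack
# filtering.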
if verbose: print('Filtering predictions') self.pred_boxes = filter_eval_boxes( nusc, self.pred_boxes, self.cfg.class_range, verbose=verbose) if verbose: print('Filtering ground truth annotations') self.gt_boxes = filter_eval_boxes( nusc, self.gt_boxes, self.cfg.class_range, verbose=verbose) # if self.overlap_test: # self.pred_boxes = filter_eval_boxes_by_overlap( # self.nusc, self.pred_boxes) # self.gt_boxes = filter_eval_boxes_by_overlap( # self.nusc, self.gt_boxes, verbose=True) self.all_gt = copy.deepcopy(self.gt_boxes) self.all_preds = copy.deepcopy(self.pred_boxes) self.sample_tokens = self.gt_boxes.sample_tokens self.index_map = {} for scene in nusc.scene: first_sample_token = scene['first_sample_token'] sample = nusc.get('sample', first_sample_token) self.index_map[first_sample_token] = 1 index = 2 while sample['next'] != '': sample = nusc.get('sample', sample['next']) self.index_map[sample['token']] = index index += 1 def update_gt(self, type_='vis', visibility='1', index=1): if type_ == 'vis': self.visibility_test = True if self.visibility_test: '''[{'description': 'visibility of whole object is between 0 and 40%', 'token': '1', 'level': 'v0-40'}, {'description': 'visibility of whole object is between 40 and 60%', 'token': '2', 'level': 'v40-60'}, {'description': 'visibility of whole object is between 60 and 80%', 'token': '3', 'level': 'v60-80'}, {'description': 'visibility of whole object is between 80 and 100%', 'token': '4', 'level': 'v80-100'}]''' self.gt_boxes = filter_eval_boxes_by_visibility( self.all_gt, visibility, verbose=True) elif type_ == 'ord': valid_tokens = [ key for ( key, value) in self.index_map.items() if value == index] # from IPython import embed # embed() self.gt_boxes = filter_by_sample_token(self.all_gt, valid_tokens) self.pred_boxes = filter_by_sample_token( self.all_preds, valid_tokens) self.sample_tokens = self.gt_boxes.sample_tokens def evaluate(self) -> Tuple[DetectionMotionMetrics, DetectionMotionMetricDataList]: """ Performs the actual evaluation. :return: A tuple of high-level and the raw metric data. """ start_time = time.time() # ----------------------------------- # Step 1: Accumulate metric data for all classes and distance thresholds. # ----------------------------------- if self.verbose: print('Accumulating metric data...') metric_data_list = DetectionMotionMetricDataList() # print(self.cfg.dist_fcn_callable, self.cfg.dist_ths) # self.cfg.dist_ths = [0.3] # self.cfg.dist_fcn_callable for class_name in self.cfg.class_names: for dist_th in self.cfg.dist_ths: md, _, _, _ = accumulate( self.gt_boxes, self.pred_boxes, class_name, self.cfg.dist_fcn_callable, dist_th) metric_data_list.set(class_name, dist_th, md) # from IPython import embed # embed() # exit() # ----------------------------------- # Step 2: Calculate metrics from the data. # ----------------------------------- if self.verbose: print('Calculating metrics...') metrics = DetectionMotionMetrics(self.cfg) traj_metrics = {} for class_name in self.cfg.class_names: # Compute APs. for dist_th in self.cfg.dist_ths: metric_data = metric_data_list[(class_name, dist_th)] ap = calc_ap( metric_data, self.cfg.min_recall, self.cfg.min_precision) metrics.add_label_ap(class_name, dist_th, ap) # Compute TP metrics. 
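# TP_METRICS here is the extended list defined at the top of this file, so in
# addition to the standard nuScenes TP errors (translation, scale, orientation,
# velocity, attribute) this loop also aggregates the trajectory metrics
# min_ade_err, min_fde_err and miss_rate_err. Metrics that are undefined for a
# class (e.g. velocity for traffic cones) are recorded as NaN.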
for metric_name in TP_METRICS: metric_data = metric_data_list[( class_name, self.cfg.dist_th_tp)] if class_name in ['traffic_cone'] and metric_name in [ 'attr_err', 'vel_err', 'orient_err']: tp = np.nan elif class_name in ['barrier'] and metric_name in ['attr_err', 'vel_err']: tp = np.nan else: tp = calc_tp(metric_data, self.cfg.min_recall, metric_name) if metric_name in TP_TRAJ_METRICS: if class_name not in traj_metrics: traj_metrics[class_name] = {} traj_metrics[class_name][metric_name] = tp metrics.add_label_tp(class_name, metric_name, tp) print_traj_metrics(traj_metrics) # Compute evaluation time. metrics.add_runtime(time.time() - start_time) return metrics, metric_data_list def evaluate_motion( self) -> Tuple[DetectionMotionMetrics, DetectionMotionMetricDataList]: """ Performs the actual evaluation. :return: A tuple of high-level and the raw metric data. """ start_time = time.time() self.cfg.dist_ths = [1.0] self.cfg.dist_th_tp = 1.0 # center dist for detection traj_dist_th = 2.0 # FDE for traj # ----------------------------------- # Step 1: Accumulate metric data for all classes and distance thresholds. # ----------------------------------- if self.verbose: print('Accumulating metric data...') metric_data_list = DetectionMotionMetricDataList() for class_name in self.cfg.class_names: for dist_th in self.cfg.dist_ths: md, _, _, _ = accumulate_motion( self.gt_boxes, self.pred_boxes, class_name, self.cfg.dist_fcn_callable, traj_fde, dist_th, traj_dist_th) metric_data_list.set(class_name, dist_th, md) # ----------------------------------- # Step 2: Calculate metrics from the data. # ----------------------------------- if self.verbose: print('Calculating metrics...') metrics = DetectionMotionMetrics(self.cfg) traj_metrics = {} for class_name in self.cfg.class_names: # Compute APs. for dist_th in self.cfg.dist_ths: metric_data = metric_data_list[(class_name, dist_th)] ap = calc_ap( metric_data, self.cfg.min_recall, self.cfg.min_precision) metrics.add_label_ap(class_name, dist_th, ap) # Compute TP metrics. for metric_name in TP_METRICS: metric_data = metric_data_list[( class_name, self.cfg.dist_th_tp)] if class_name in ['traffic_cone'] and metric_name in [ 'attr_err', 'vel_err', 'orient_err']: tp = np.nan elif class_name in ['barrier'] and metric_name in ['attr_err', 'vel_err']: tp = np.nan else: tp = calc_tp(metric_data, self.cfg.min_recall, metric_name) if metric_name in TP_TRAJ_METRICS: if class_name not in traj_metrics: traj_metrics[class_name] = {} traj_metrics[class_name][metric_name] = tp metrics.add_label_tp(class_name, metric_name, tp) print_traj_metrics(traj_metrics) # Compute evaluation time. metrics.add_runtime(time.time() - start_time) return metrics, metric_data_list def evaluate_epa( self) -> Tuple[DetectionMotionMetrics, DetectionMotionMetricDataList]: """ Performs the actual evaluation. :return: A tuple of high-level and the raw metric data. """ start_time = time.time() self.cfg.dist_ths = [2.0] self.cfg.dist_th_tp = 2.0 # center dist for detection traj_dist_th = 2.0 # FDE for traj # ----------------------------------- # Step 1: Accumulate metric data for all classes and distance thresholds. 
# ----------------------------------- if self.verbose: print('Accumulating metric data...') metric_data_list = DetectionMotionMetricDataList() for class_name in self.cfg.class_names: for dist_th in self.cfg.dist_ths: md, N_det_tp, N_det_fp, N_det_gt = accumulate( self.gt_boxes, self.pred_boxes, class_name, self.cfg.dist_fcn_callable, dist_th) md, N_det_traj_tp, N_det_traj_fp, N_det_traj_gt = accumulate_motion( self.gt_boxes, self.pred_boxes, class_name, self.cfg.dist_fcn_callable, traj_fde, dist_th, traj_dist_th) metric_data_list.set(class_name, dist_th, md) EPA = (N_det_traj_tp - 0.5 * N_det_fp) / (N_det_gt + 1e-5) print(N_det_traj_tp, N_det_fp, N_det_gt) print('EPA ', class_name, EPA) # ----------------------------------- # Step 2: Calculate metrics from the data. # ----------------------------------- if self.verbose: print('Calculating metrics...') metrics = DetectionMotionMetrics(self.cfg) traj_metrics = {} for class_name in self.cfg.class_names: # Compute APs. for dist_th in self.cfg.dist_ths: metric_data = metric_data_list[(class_name, dist_th)] ap = calc_ap( metric_data, self.cfg.min_recall, self.cfg.min_precision) metrics.add_label_ap(class_name, dist_th, ap) # Compute TP metrics. for metric_name in TP_METRICS: metric_data = metric_data_list[( class_name, self.cfg.dist_th_tp)] if class_name in ['traffic_cone'] and metric_name in [ 'attr_err', 'vel_err', 'orient_err']: tp = np.nan elif class_name in ['barrier'] and metric_name in ['attr_err', 'vel_err']: tp = np.nan else: tp = calc_tp(metric_data, self.cfg.min_recall, metric_name) if metric_name in TP_TRAJ_METRICS: if class_name not in traj_metrics: traj_metrics[class_name] = {} traj_metrics[class_name][metric_name] = tp metrics.add_label_tp(class_name, metric_name, tp) print_traj_metrics(traj_metrics) # Compute evaluation time. metrics.add_runtime(time.time() - start_time) return metrics, metric_data_list def main(self, plot_examples: int = 0, render_curves: bool = True, eval_mode: str = 'standard') -> Dict[str, Any]: """ Main function that loads the evaluation code, visualizes samples, runs the evaluation and renders stat plots. :param plot_examples: How many example visualizations to write to disk. :param render_curves: Whether to render PR and TP curves to disk. :return: A dict that stores the high-level metrics and meta data. """ if plot_examples > 0: # Select a random but fixed subset to plot. random.seed(42) sample_tokens = list(self.sample_tokens) random.shuffle(sample_tokens) sample_tokens = sample_tokens[:plot_examples] # Visualize samples. example_dir = os.path.join(self.output_dir, 'examples') if not os.path.isdir(example_dir): os.mkdir(example_dir) for sample_token in sample_tokens: visualize_sample(self.nusc, sample_token, self.gt_boxes if self.eval_set != 'test' else EvalBoxes(), # Don't render test GT. self.pred_boxes, eval_range=max(self.cfg.class_range.values()), savepath=os.path.join(example_dir, '{}.png'.format(sample_token))) # Run evaluation. if eval_mode == 'motion_map': metrics, metric_data_list = self.evaluate_motion() elif eval_mode == 'standard': metrics, metric_data_list = self.evaluate() elif eval_mode == 'epa': metrics, metric_data_list = self.evaluate_epa() else: raise NotImplementedError # Render PR and TP curves. if render_curves: self.render(metrics, metric_data_list) # Dump the metric data, meta and metrics to disk. 
if self.verbose: print('Saving metrics to: %s' % self.output_dir) metrics_summary = metrics.serialize() metrics_summary['meta'] = self.meta.copy() with open(os.path.join(self.output_dir, 'metrics_summary.json'), 'w') as f: json.dump(metrics_summary, f, indent=2) with open(os.path.join(self.output_dir, 'metrics_details.json'), 'w') as f: json.dump(metric_data_list.serialize(), f, indent=2) # Print high-level metrics. print('mAP: %.4f' % (metrics_summary['mean_ap'])) err_name_mapping = { 'trans_err': 'mATE', 'scale_err': 'mASE', 'orient_err': 'mAOE', 'vel_err': 'mAVE', 'attr_err': 'mAAE' } for tp_name, tp_val in metrics_summary['tp_errors'].items(): print('%s: %.4f' % (err_name_mapping[tp_name], tp_val)) print('NDS: %.4f' % (metrics_summary['nd_score'])) print('Eval time: %.1fs' % metrics_summary['eval_time']) # Print per-class metrics. print() print('Per-class results:') print('Object Class\tAP\tATE\tASE\tAOE\tAVE\tAAE') class_aps = metrics_summary['mean_dist_aps'] class_tps = metrics_summary['label_tp_errors'] for class_name in class_aps.keys(): print('%s\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f' % (class_name, class_aps[class_name], class_tps[class_name]['trans_err'], class_tps[class_name]['scale_err'], class_tps[class_name]['orient_err'], class_tps[class_name]['vel_err'], class_tps[class_name]['attr_err'])) return metrics_summary def render(self, metrics: DetectionMetrics, md_list: DetectionMetricDataList) -> None: """ Renders various PR and TP curves. :param metrics: DetectionMetrics instance. :param md_list: DetectionMetricDataList instance. """ if self.verbose: print('Rendering PR and TP curves') def savepath(name): return os.path.join(self.plot_dir, name + '.pdf') summary_plot( md_list, metrics, min_precision=self.cfg.min_precision, min_recall=self.cfg.min_recall, dist_th_tp=self.cfg.dist_th_tp, savepath=savepath('summary')) for detection_name in self.cfg.class_names: class_pr_curve( md_list, metrics, detection_name, self.cfg.min_precision, self.cfg.min_recall, savepath=savepath( detection_name + '_pr')) class_tp_curve( md_list, metrics, detection_name, self.cfg.min_recall, self.cfg.dist_th_tp, savepath=savepath( detection_name + '_tp')) for dist_th in self.cfg.dist_ths: dist_pr_curve( md_list, metrics, dist_th, self.cfg.min_precision, self.cfg.min_recall, savepath=savepath( 'dist_pr_' + str(dist_th))) def print_traj_metrics(metrics): class_names = metrics.keys() x = PrettyTable() x.field_names = ["class names"] + TP_TRAJ_METRICS for class_name in metrics.keys(): row_data = [class_name] for m in TP_TRAJ_METRICS: row_data.append('%.4f' % metrics[class_name][m]) x.add_row(row_data) print(x) if __name__ == "__main__": # Settings. parser = argparse.ArgumentParser( description='Evaluate nuScenes detection results.', formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument( 'result_path', type=str, help='The submission as a JSON file.') parser.add_argument( '--output_dir', type=str, default='~/nuscenes-metrics', help='Folder to store result metrics, graphs and example visualizations.') parser.add_argument( '--eval_set', type=str, default='val', help='Which dataset split to evaluate on, train, val or test.') parser.add_argument('--dataroot', type=str, default='data/nuscenes', help='Default nuScenes data directory.') parser.add_argument( '--version', type=str, default='v1.0-mini', help='Which version of the nuScenes dataset to evaluate on, e.g. v1.0-trainval.') parser.add_argument( '--config_path', type=str, default='', help='Path to the configuration file.' 
        'If no path given, the CVPR 2019 configuration will be used.')
    parser.add_argument(
        '--plot_examples',
        type=int,
        default=0,
        help='How many example visualizations to write to disk.')
    parser.add_argument(
        '--render_curves',
        type=int,
        default=1,
        help='Whether to render PR and TP curves to disk.')
    parser.add_argument(
        '--verbose',
        type=int,
        default=1,
        help='Whether to print to stdout.')
    args = parser.parse_args()

    result_path_ = os.path.expanduser(args.result_path)
    output_dir_ = os.path.expanduser(args.output_dir)
    eval_set_ = args.eval_set
    dataroot_ = args.dataroot
    version_ = args.version
    config_path = args.config_path
    plot_examples_ = args.plot_examples
    render_curves_ = bool(args.render_curves)
    verbose_ = bool(args.verbose)

    if config_path == '':
        cfg_ = config_factory('detection_cvpr_2019')
    else:
        with open(config_path, 'r') as _f:
            cfg_ = DetectionConfig.deserialize(json.load(_f))

    nusc_ = NuScenes(version=version_, verbose=verbose_, dataroot=dataroot_)
    nusc_eval = MotionEval(
        nusc_,
        config=cfg_,
        result_path=result_path_,
        eval_set=eval_set_,
        output_dir=output_dir_,
        verbose=verbose_)
    for vis in ['1', '2', '3', '4']:
        nusc_eval.update_gt(type_='vis', visibility=vis)
        print(f'================ {vis} ===============')
        nusc_eval.main(
            plot_examples=plot_examples_, render_curves=render_curves_)



================================================
FILE: mmdet3d/datasets/evaluation/AP.py
================================================
import numpy as np
from .distance import chamfer_distance, frechet_distance, chamfer_distance_batch
from typing import List, Tuple, Union
from numpy.typing import NDArray
import torch


def average_precision(recalls, precisions, mode='area'):
    """Calculate average precision.

    Args:
        recalls (ndarray): shape (num_dets, )
        precisions (ndarray): shape (num_dets, )
        mode (str): 'area' or '11points'. 'area' means calculating the area
            under the precision-recall curve, '11points' means calculating
            the average precision of recalls at [0, 0.1, ..., 1].

    Returns:
        float: calculated average precision
    """
    recalls = recalls[np.newaxis, :]
    precisions = precisions[np.newaxis, :]
    assert recalls.shape == precisions.shape and recalls.ndim == 2
    num_scales = recalls.shape[0]
    ap = 0.
    if mode == 'area':
        zeros = np.zeros((num_scales, 1), dtype=recalls.dtype)
        ones = np.ones((num_scales, 1), dtype=recalls.dtype)
        mrec = np.hstack((zeros, recalls, ones))
        mpre = np.hstack((zeros, precisions, zeros))
        for i in range(mpre.shape[1] - 1, 0, -1):
            mpre[:, i - 1] = np.maximum(mpre[:, i - 1], mpre[:, i])
        ind = np.where(mrec[0, 1:] != mrec[0, :-1])[0]
        ap = np.sum((mrec[0, ind + 1] - mrec[0, ind]) * mpre[0, ind + 1])
    elif mode == '11points':
        for thr in np.arange(0, 1 + 1e-3, 0.1):
            # bugfix: the original indexed `recalls[i, :]`, but `i` is never
            # bound in this branch (NameError); only a single scale is used,
            # stored in row 0.
            precs = precisions[0, recalls[0, :] >= thr]
            prec = precs.max() if precs.size > 0 else 0
            ap += prec
        ap /= 11
    else:
        raise ValueError(
            'Unrecognized mode, only "area" and "11points" are supported')
    return ap


def instance_match(pred_lines: NDArray,
                   scores: NDArray,
                   gt_lines: NDArray,
                   thresholds: Union[Tuple, List],
                   metric: str = 'chamfer') -> List:
    """Compute whether detected lines are true positive or false positive.

    Args:
        pred_lines (array): Detected lines of a sample, of shape
            (M, INTERP_NUM, 2 or 3).
        scores (array): Confidence score of each line, of shape (M, ).
        gt_lines (array): GT lines of a sample, of shape
            (N, INTERP_NUM, 2 or 3).
        thresholds (tuple or list): Distance thresholds to match at.
        metric (str): Distance function for lines matching.
            Default: 'chamfer'.
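
    Note:
        The matching below is greedy in descending score order, and each
        prediction is only compared against its single closest GT line (by
        Chamfer distance); if that GT has already been claimed by a
        higher-scoring prediction, the prediction is counted as a false
        positive even when another GT lies within the threshold.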
    Returns:
        list_of_tp_fp (list): tp-fp matching result at all thresholds
    """
    if metric == 'chamfer':
        distance_fn = chamfer_distance
    elif metric == 'frechet':
        distance_fn = frechet_distance
    else:
        raise ValueError(f'unknown distance function {metric}')

    num_preds = pred_lines.shape[0]
    num_gts = gt_lines.shape[0]

    # tp and fp
    tp_fp_list = []
    tp = np.zeros((num_preds), dtype=np.float32)
    fp = np.zeros((num_preds), dtype=np.float32)

    # if there are no gt lines in this sample, all pred lines are false positives
    if num_gts == 0:
        fp[...] = 1
        for thr in thresholds:
            tp_fp_list.append((tp.copy(), fp.copy()))
        return tp_fp_list

    if num_preds == 0:
        for thr in thresholds:
            tp_fp_list.append((tp.copy(), fp.copy()))
        return tp_fp_list

    assert pred_lines.shape[1] == gt_lines.shape[1], \
        "sample points num should be the same"

    # distance matrix: M x N
    matrix = np.zeros((num_preds, num_gts))
    # for i in range(num_preds):
    #     for j in range(num_gts):
    #         matrix[i, j] = distance_fn(pred_lines[i], gt_lines[j])
    matrix = chamfer_distance_batch(pred_lines, gt_lines)
    # for each det, the min distance with all gts
    matrix_min = matrix.min(axis=1)
    # for each det, which gt is the closest to it
    matrix_argmin = matrix.argmin(axis=1)
    # sort all dets in descending order by scores
    sort_inds = np.argsort(-scores)

    # match under different thresholds
    for thr in thresholds:
        tp = np.zeros((num_preds), dtype=np.float32)
        fp = np.zeros((num_preds), dtype=np.float32)

        gt_covered = np.zeros(num_gts, dtype=bool)
        for i in sort_inds:
            if matrix_min[i] <= thr:
                matched_gt = matrix_argmin[i]
                if not gt_covered[matched_gt]:
                    gt_covered[matched_gt] = True
                    tp[i] = 1
                else:
                    fp[i] = 1
            else:
                fp[i] = 1

        tp_fp_list.append((tp, fp))

    return tp_fp_list



================================================
FILE: mmdet3d/datasets/evaluation/__init__.py
================================================



================================================
FILE: mmdet3d/datasets/evaluation/distance.py
================================================
from scipy.spatial import distance
from numpy.typing import NDArray
import torch


def chamfer_distance(line1: NDArray, line2: NDArray) -> float:
    ''' Calculate chamfer distance between two lines. Make sure the lines
    are interpolated.

    Args:
        line1 (array): coordinates of line1
        line2 (array): coordinates of line2

    Returns:
        distance (float): chamfer distance
    '''
    dist_matrix = distance.cdist(line1, line2, 'euclidean')
    dist12 = dist_matrix.min(-1).sum() / len(line1)
    dist21 = dist_matrix.min(-2).sum() / len(line2)

    return (dist12 + dist21) / 2


def frechet_distance(line1: NDArray, line2: NDArray) -> float:
    ''' Calculate frechet distance between two lines. Make sure the lines
    are interpolated.

    Args:
        line1 (array): coordinates of line1
        line2 (array): coordinates of line2

    Returns:
        distance (float): frechet distance
    '''
    raise NotImplementedError


def chamfer_distance_batch(pred_lines, gt_lines):
    ''' Calculate chamfer distance between two groups of lines. Make sure
    the lines are interpolated.
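
    Note (formula, identical to ``chamfer_distance`` above): for point sets
    A and B with |A| = |B| = num_pts,
        CD(A, B) = 0.5 * (mean_{a in A} min_{b in B} ||a - b||_2
                          + mean_{b in B} min_{a in A} ||a - b||_2)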
Args: pred_lines (array or tensor): shape (m, num_pts, 2 or 3) gt_lines (array or tensor): shape (n, num_pts, 2 or 3) Returns: distance (array): chamfer distance ''' _, num_pts, coord_dims = pred_lines.shape if not isinstance(pred_lines, torch.Tensor): pred_lines = torch.tensor(pred_lines) if not isinstance(gt_lines, torch.Tensor): gt_lines = torch.tensor(gt_lines) dist_mat = torch.cdist(pred_lines.view(-1, coord_dims), gt_lines.view(-1, coord_dims), p=2) # (num_query*num_points, num_gt*num_points) dist_mat = torch.stack(torch.split(dist_mat, num_pts)) # (num_query, num_points, num_gt*num_points) dist_mat = torch.stack(torch.split(dist_mat, num_pts, dim=-1)) # (num_gt, num_q, num_pts, num_pts) dist1 = dist_mat.min(-1)[0].sum(-1) dist2 = dist_mat.min(-2)[0].sum(-1) dist_matrix = (dist1 + dist2).transpose(0, 1) / (2 * num_pts) return dist_matrix.numpy() ================================================ FILE: mmdet3d/datasets/evaluation/raster_eval.py ================================================ import torch from mmdet3d.datasets import build_dataset, build_dataloader import mmcv from functools import cached_property import prettytable from numpy.typing import NDArray from typing import Dict, Optional from logging import Logger from mmcv import Config from copy import deepcopy N_WORKERS = 16 class RasterEvaluate(object): """Evaluator for rasterized map. Args: dataset_cfg (Config): dataset cfg for gt n_workers (int): num workers to parallel """ def __init__(self, dataset_cfg: Config, n_workers: int=N_WORKERS): self.dataset = build_dataset(dataset_cfg) self.dataloader = build_dataloader( self.dataset, samples_per_gpu=1, workers_per_gpu=n_workers, shuffle=False, dist=False) self.cat2id = self.dataset.cat2id self.id2cat = {v: k for k, v in self.cat2id.items()} self.n_workers = n_workers @cached_property def gts(self) -> Dict[str, NDArray]: print('collecting gts...') gts = {} for data in mmcv.track_iter_progress(self.dataloader): token = deepcopy(data['img_metas'].data[0][0]['token']) gt = deepcopy(data['semantic_mask'].data[0][0]) gts[token] = gt del data # avoid dataloader memory crash return gts def evaluate(self, result_path: str, logger: Optional[Logger]=None) -> Dict[str, float]: ''' Do evaluation for a submission file and print evalution results to `logger` if specified. The submission will be aligned by tokens before evaluation. Args: result_path (str): path to submission file logger (Logger): logger to print evaluation result, Default: None Returns: result_dict (Dict): evaluation results. IoU by categories. 
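        Note:
            IoU is computed jointly over all samples: prediction and GT
            semantic masks are stacked into boolean tensors, and for each
            category IoU = |pred AND gt| / (|pred OR gt| + 1e-7); ``mIoU``
            is the mean over categories. Samples missing from the
            submission are scored against an all-zero mask.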
''' results = mmcv.load(result_path) meta = results['meta'] results = results['results'] result_dict = {} gts = [] preds = [] for token, gt in self.gts.items(): gts.append(gt) if token in results: pred = results[token]['semantic_mask'] else: pred = torch.zeros((len(self.cat2id), self.canvas_size[1], self.canvas_size[0])).bool() preds.append(pred) preds = torch.stack(preds).bool() gts = torch.stack(gts).bool() # for every label total = 0 for i in range(gts.shape[1]): category = self.id2cat[i] pred = preds[:, i] gt = gts[:, i] intersect = (pred & gt).sum().float().item() union = (pred | gt).sum().float().item() result_dict[category] = intersect / (union + 1e-7) total += result_dict[category] mIoU = total / gts.shape[1] result_dict['mIoU'] = mIoU categories = list(self.cat2id.keys()) table = prettytable.PrettyTable([' ', *categories, 'mean']) table.add_row(['IoU', *[round(result_dict[cat], 4) for cat in categories], round(mIoU, 4)]) if logger: from mmcv.utils import print_log print_log('\n'+str(table), logger=logger) print_log(f'mIoU = {mIoU:.4f}\n', logger=logger) return result_dict ================================================ FILE: mmdet3d/datasets/evaluation/vector_eval.py ================================================ from functools import partial import numpy as np from multiprocessing import Pool from mmdet3d.datasets import build_dataset, build_dataloader import mmcv from .AP import instance_match, average_precision import prettytable from time import time from functools import cached_property from shapely.geometry import LineString from numpy.typing import NDArray from typing import Dict, List, Optional from logging import Logger from mmcv import Config from copy import deepcopy import os from IPython import embed INTERP_NUM = 200 # number of points to interpolate during evaluation THRESHOLDS = [0.5, 1.0, 1.5] # AP thresholds N_WORKERS = 16 # num workers to parallel SAMPLE_DIST = 0.15 class VectorEvaluate(object): """Evaluator for vectorized map. 
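
    Evaluation protocol, as implemented below: every predicted and GT
    polyline is resampled to ``INTERP_NUM`` points, predictions are matched
    to GT instances per class by ``instance_match`` (greedy in descending
    score, Chamfer distance) at each threshold in ``self.thresholds``, and
    AP is computed per class and threshold with ``average_precision``, then
    averaged into a per-class AP and an overall mAP.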
Args: dataset_cfg (Config): dataset cfg for gt n_workers (int): num workers to parallel """ def __init__(self, dataset_cfg: Config, n_workers: int=N_WORKERS) -> None: self.dataset = build_dataset(dataset_cfg) self.cat2id = self.dataset.cat2id self.id2cat = {v: k for k, v in self.cat2id.items()} self.n_workers = n_workers self.new_split = 'newsplit' in self.dataset.ann_file self.roi_size = self.dataset.roi_size if self.roi_size == (60, 30): self.thresholds = [0.5, 1.0, 1.5] elif self.roi_size == (100, 50): self.thresholds = [1.0, 1.5, 2.0] @cached_property def gts(self) -> Dict[str, Dict[int, List[NDArray]]]: roi_size = self.dataset.roi_size if 'av2' in self.dataset.ann_file: dataset = 'av2' else: dataset = 'nusc' if self.new_split: tmp_file = f'./tmp_gts_{dataset}_{roi_size[0]}x{roi_size[1]}_newsplit.pkl' else: tmp_file = f'./tmp_gts_{dataset}_{roi_size[0]}x{roi_size[1]}.pkl' if os.path.exists(tmp_file): print(f'loading cached gts from {tmp_file}') gts = mmcv.load(tmp_file) return gts print('collecting gts...') gts = {} self.dataloader = build_dataloader( self.dataset, samples_per_gpu=1, workers_per_gpu=self.n_workers, shuffle=False, dist=False) pbar = mmcv.ProgressBar(len(self.dataloader)) for data in self.dataloader: token = deepcopy(data['img_metas'].data[0][0]['token']) gt = deepcopy(data['vectors'].data[0][0]) gts[token] = gt pbar.update() del data # avoid dataloader memory crash if not os.path.exists(tmp_file): print(f"saving gt to {tmp_file}") mmcv.dump(gts, tmp_file) return gts def interp_fixed_num(self, vector: NDArray, num_pts: int) -> NDArray: ''' Interpolate a polyline. Args: vector (array): line coordinates, shape (M, 2) num_pts (int): Returns: sampled_points (array): interpolated coordinates ''' line = LineString(vector) distances = np.linspace(0, line.length, num_pts) sampled_points = np.array([list(line.interpolate(distance).coords) for distance in distances]).squeeze() return sampled_points def interp_fixed_dist(self, vector: NDArray, sample_dist: float) -> NDArray: ''' Interpolate a line at fixed interval. Args: vector (LineString): vector sample_dist (float): sample interval Returns: points (array): interpolated points, shape (N, 2) ''' line = LineString(vector) distances = list(np.arange(sample_dist, line.length, sample_dist)) # make sure to sample at least two points when sample_dist > line.length distances = [0,] + distances + [line.length,] sampled_points = np.array([list(line.interpolate(distance).coords) for distance in distances]).squeeze() return sampled_points def _evaluate_single(self, pred_vectors: List, scores: List, groundtruth: List, thresholds: List, metric: str='metric') -> Dict[int, NDArray]: ''' Do single-frame matching for one class. Args: pred_vectors (List): List[vector(ndarray) (different length)], scores (List): List[score(float)] groundtruth (List): List of vectors thresholds (List): List of thresholds Returns: tp_fp_score_by_thr (Dict): matching results at different thresholds e.g. 
{0.5: (M, 2), 1.0: (M, 2), 1.5: (M, 2)} ''' pred_lines = [] # interpolate predictions for vector in pred_vectors: vector = np.array(vector) vector_interp = self.interp_fixed_num(vector, INTERP_NUM) pred_lines.append(vector_interp) if pred_lines: pred_lines = np.stack(pred_lines) else: pred_lines = np.zeros((0, INTERP_NUM, 2)) # interpolate groundtruth gt_lines = [] for vector in groundtruth: vector_interp = self.interp_fixed_num(vector, INTERP_NUM) gt_lines.append(vector_interp) if gt_lines: gt_lines = np.stack(gt_lines) else: gt_lines = np.zeros((0, INTERP_NUM, 2)) scores = np.array(scores) tp_fp_list = instance_match(pred_lines, scores, gt_lines, thresholds, metric) # (M, 2) tp_fp_score_by_thr = {} for i, thr in enumerate(thresholds): tp, fp = tp_fp_list[i] tp_fp_score = np.hstack([tp[:, None], fp[:, None], scores[:, None]]) tp_fp_score_by_thr[thr] = tp_fp_score return tp_fp_score_by_thr # {0.5: (M, 2), 1.0: (M, 2), 1.5: (M, 2)} def evaluate(self, result_path: str, metric: str='chamfer', logger: Optional[Logger]=None) -> Dict[str, float]: ''' Do evaluation for a submission file and print evalution results to `logger` if specified. The submission will be aligned by tokens before evaluation. We use multi-worker to speed up. Args: result_path (str): path to submission file metric (str): distance metric. Default: 'chamfer' logger (Logger): logger to print evaluation result, Default: None Returns: new_result_dict (Dict): evaluation results. AP by categories. ''' results = mmcv.load(result_path) results = results['results'] # re-group samples and gt by label samples_by_cls = {label: [] for label in self.id2cat.keys()} num_gts = {label: 0 for label in self.id2cat.keys()} num_preds = {label: 0 for label in self.id2cat.keys()} # align by token for token, gt in self.gts.items(): if token in results.keys(): pred = results[token] else: pred = {'vectors': [], 'scores': [], 'labels': []} # for every sample vectors_by_cls = {label: [] for label in self.id2cat.keys()} scores_by_cls = {label: [] for label in self.id2cat.keys()} for i in range(len(pred['labels'])): # i-th pred line in sample label = pred['labels'][i] vector = pred['vectors'][i] score = pred['scores'][i] vectors_by_cls[label].append(vector) scores_by_cls[label].append(score) for label in self.id2cat.keys(): new_sample = (vectors_by_cls[label], scores_by_cls[label], gt[label]) num_gts[label] += len(gt[label]) num_preds[label] += len(scores_by_cls[label]) samples_by_cls[label].append(new_sample) result_dict = {} print(f'\nevaluating {len(self.id2cat)} categories...') start = time() if self.n_workers > 0: pool = Pool(self.n_workers) sum_mAP = 0 pbar = mmcv.ProgressBar(len(self.id2cat)) for label in self.id2cat.keys(): samples = samples_by_cls[label] # List[(pred_lines, scores, gts)] result_dict[self.id2cat[label]] = { 'num_gts': num_gts[label], 'num_preds': num_preds[label] } sum_AP = 0 fn = partial(self._evaluate_single, thresholds=self.thresholds, metric=metric) if self.n_workers > 0: tpfp_score_list = pool.starmap(fn, samples) else: tpfp_score_list = [] for sample in samples: tpfp_score_list.append(fn(*sample)) for thr in self.thresholds: tp_fp_score = [i[thr] for i in tpfp_score_list] tp_fp_score = np.vstack(tp_fp_score) # (num_dets, 3) sort_inds = np.argsort(-tp_fp_score[:, -1]) tp = tp_fp_score[sort_inds, 0] # (num_dets,) fp = tp_fp_score[sort_inds, 1] # (num_dets,) tp = np.cumsum(tp, axis=0) fp = np.cumsum(fp, axis=0) eps = np.finfo(np.float32).eps recalls = tp / np.maximum(num_gts[label], eps) precisions = tp / np.maximum((tp + fp), 
eps) AP = average_precision(recalls, precisions, 'area') sum_AP += AP result_dict[self.id2cat[label]].update({f'AP@{thr}': AP}) pbar.update() AP = sum_AP / len(self.thresholds) sum_mAP += AP result_dict[self.id2cat[label]].update({f'AP': AP}) if self.n_workers > 0: pool.close() mAP = sum_mAP / len(self.id2cat.keys()) result_dict.update({'mAP': mAP}) print(f"finished in {time() - start:.2f}s") # print results table = prettytable.PrettyTable(['category', 'num_preds', 'num_gts'] + [f'AP@{thr}' for thr in self.thresholds] + ['AP']) for label in self.id2cat.keys(): table.add_row([ self.id2cat[label], result_dict[self.id2cat[label]]['num_preds'], result_dict[self.id2cat[label]]['num_gts'], *[round(result_dict[self.id2cat[label]][f'AP@{thr}'], 4) for thr in self.thresholds], round(result_dict[self.id2cat[label]]['AP'], 4), ]) from mmcv.utils import print_log print_log('\n'+str(table), logger=logger) mAP_normal = 0 for label in self.id2cat.keys(): for thr in self.thresholds: mAP_normal += result_dict[self.id2cat[label]][f'AP@{thr}'] mAP_normal = mAP_normal / 9 print_log(f'mAP_normal = {mAP_normal:.4f}\n', logger=logger) # print_log(f'mAP_hard = {mAP_easy:.4f}\n', logger=logger) new_result_dict = {} for name in self.cat2id: new_result_dict[name] = result_dict[name]['AP'] return new_result_dict ================================================ FILE: mmdet3d/datasets/kitti2d_dataset.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import mmcv import numpy as np from mmdet.datasets import CustomDataset from .builder import DATASETS @DATASETS.register_module() class Kitti2DDataset(CustomDataset): r"""KITTI 2D Dataset. This class serves as the API for experiments on the `KITTI Dataset `_. Args: data_root (str): Path of dataset root. ann_file (str): Path of annotation file. pipeline (list[dict], optional): Pipeline used for data processing. Defaults to None. classes (tuple[str], optional): Classes used in the dataset. Defaults to None. modality (dict, optional): Modality to specify the sensor data used as input. Defaults to None. box_type_3d (str, optional): Type of 3D box of this dataset. Based on the `box_type_3d`, the dataset will encapsulate the box to its original format then converted them to `box_type_3d`. Defaults to 'LiDAR'. Available options includes - 'LiDAR': Box in LiDAR coordinates. - 'Depth': Box in depth coordinates, usually for indoor dataset. - 'Camera': Box in camera coordinates. filter_empty_gt (bool, optional): Whether to filter empty GT. Defaults to True. test_mode (bool, optional): Whether the dataset is in test mode. Defaults to False. """ CLASSES = ('car', 'pedestrian', 'cyclist') """ Annotation format: [ { 'image': { 'image_idx': 0, 'image_path': 'training/image_2/000000.png', 'image_shape': array([ 370, 1224], dtype=int32) }, 'point_cloud': { 'num_features': 4, 'velodyne_path': 'training/velodyne/000000.bin' }, 'calib': { 'P0': (4, 4), 'P1': (4, 4), 'P2': (4, 4), 'P3': (4, 4), 'R0_rect':4x4 np.array, 'Tr_velo_to_cam': 4x4 np.array, 'Tr_imu_to_velo': 4x4 np.array }, 'annos': { 'name': (n), 'truncated': (n), 'occluded': (n), 'alpha': (n), 'bbox': (n, 4), 'dimensions': (n, 3), 'location': (n, 3), 'rotation_y': (n), 'score': (n), 'index': array([0], dtype=int32), 'group_ids': array([0], dtype=int32), 'difficulty': array([0], dtype=int32), 'num_points_in_gt': (n), } } ] """ def load_annotations(self, ann_file): """Load annotations from ann_file. Args: ann_file (str): Path of the annotation file. 
Returns: list[dict]: List of annotations. """ self.data_infos = mmcv.load(ann_file) self.cat2label = { cat_name: i for i, cat_name in enumerate(self.CLASSES) } return self.data_infos def _filter_imgs(self, min_size=32): """Filter images without ground truths.""" valid_inds = [] for i, img_info in enumerate(self.data_infos): if len(img_info['annos']['name']) > 0: valid_inds.append(i) return valid_inds def get_ann_info(self, index): """Get annotation info according to the given index. Args: index (int): Index of the annotation data to get. Returns: dict: Annotation information consists of the following keys: - bboxes (np.ndarray): Ground truth bboxes. - labels (np.ndarray): Labels of ground truths. """ # Use index to get the annos, thus the evalhook could also use this api info = self.data_infos[index] annos = info['annos'] gt_names = annos['name'] gt_bboxes = annos['bbox'] difficulty = annos['difficulty'] # remove classes that is not needed selected = self.keep_arrays_by_name(gt_names, self.CLASSES) gt_bboxes = gt_bboxes[selected] gt_names = gt_names[selected] difficulty = difficulty[selected] gt_labels = np.array([self.cat2label[n] for n in gt_names]) anns_results = dict( bboxes=gt_bboxes.astype(np.float32), labels=gt_labels, ) return anns_results def prepare_train_img(self, idx): """Training image preparation. Args: index (int): Index for accessing the target image data. Returns: dict: Training image data dict after preprocessing corresponding to the index. """ img_raw_info = self.data_infos[idx]['image'] img_info = dict(filename=img_raw_info['image_path']) ann_info = self.get_ann_info(idx) if len(ann_info['bboxes']) == 0: return None results = dict(img_info=img_info, ann_info=ann_info) if self.proposals is not None: results['proposals'] = self.proposals[idx] self.pre_pipeline(results) return self.pipeline(results) def prepare_test_img(self, idx): """Prepare data for testing. Args: index (int): Index for accessing the target image data. Returns: dict: Testing image data dict after preprocessing corresponding to the index. """ img_raw_info = self.data_infos[idx]['image'] img_info = dict(filename=img_raw_info['image_path']) results = dict(img_info=img_info) if self.proposals is not None: results['proposals'] = self.proposals[idx] self.pre_pipeline(results) return self.pipeline(results) def drop_arrays_by_name(self, gt_names, used_classes): """Drop irrelevant ground truths by name. Args: gt_names (list[str]): Names of ground truths. used_classes (list[str]): Classes of interest. Returns: np.ndarray: Indices of ground truths that will be dropped. """ inds = [i for i, x in enumerate(gt_names) if x not in used_classes] inds = np.array(inds, dtype=np.int64) return inds def keep_arrays_by_name(self, gt_names, used_classes): """Keep useful ground truths by name. Args: gt_names (list[str]): Names of ground truths. used_classes (list[str]): Classes of interest. Returns: np.ndarray: Indices of ground truths that will be keeped. """ inds = [i for i, x in enumerate(gt_names) if x in used_classes] inds = np.array(inds, dtype=np.int64) return inds def reformat_bbox(self, outputs, out=None): """Reformat bounding boxes to KITTI 2D styles. Args: outputs (list[np.ndarray]): List of arrays storing the inferenced bounding boxes and scores. out (str, optional): The prefix of output file. Default: None. Returns: list[dict]: A list of dictionaries with the kitti 2D format. 
""" from mmdet3d.core.bbox.transforms import bbox2result_kitti2d sample_idx = [info['image']['image_idx'] for info in self.data_infos] result_files = bbox2result_kitti2d(outputs, self.CLASSES, sample_idx, out) return result_files def evaluate(self, result_files, eval_types=None): """Evaluation in KITTI protocol. Args: result_files (str): Path of result files. eval_types (str, optional): Types of evaluation. Default: None. KITTI dataset only support 'bbox' evaluation type. Returns: tuple (str, dict): Average precision results in str format and average precision results in dict format. """ from mmdet3d.core.evaluation import kitti_eval eval_types = ['bbox'] if not eval_types else eval_types assert eval_types in ('bbox', ['bbox' ]), 'KITTI data set only evaluate bbox' gt_annos = [info['annos'] for info in self.data_infos] ap_result_str, ap_dict = kitti_eval( gt_annos, result_files, self.CLASSES, eval_types=['bbox']) return ap_result_str, ap_dict ================================================ FILE: mmdet3d/datasets/kitti_dataset.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import copy import os import tempfile from os import path as osp import mmcv import numpy as np import torch from mmcv.utils import print_log from ..core import show_multi_modality_result, show_result from ..core.bbox import (Box3DMode, CameraInstance3DBoxes, Coord3DMode, LiDARInstance3DBoxes, points_cam2img) from .builder import DATASETS from .custom_3d import Custom3DDataset from .pipelines import Compose @DATASETS.register_module() class KittiDataset(Custom3DDataset): r"""KITTI Dataset. This class serves as the API for experiments on the `KITTI Dataset `_. Args: data_root (str): Path of dataset root. ann_file (str): Path of annotation file. split (str): Split of input data. pts_prefix (str, optional): Prefix of points files. Defaults to 'velodyne'. pipeline (list[dict], optional): Pipeline used for data processing. Defaults to None. classes (tuple[str], optional): Classes used in the dataset. Defaults to None. modality (dict, optional): Modality to specify the sensor data used as input. Defaults to None. box_type_3d (str, optional): Type of 3D box of this dataset. Based on the `box_type_3d`, the dataset will encapsulate the box to its original format then converted them to `box_type_3d`. Defaults to 'LiDAR' in this dataset. Available options includes - 'LiDAR': Box in LiDAR coordinates. - 'Depth': Box in depth coordinates, usually for indoor dataset. - 'Camera': Box in camera coordinates. filter_empty_gt (bool, optional): Whether to filter empty GT. Defaults to True. test_mode (bool, optional): Whether the dataset is in test mode. Defaults to False. pcd_limit_range (list, optional): The range of point cloud used to filter invalid predicted boxes. Default: [0, -40, -3, 70.4, 40, 0.0]. 
""" CLASSES = ('car', 'pedestrian', 'cyclist') def __init__(self, data_root, ann_file, split, pts_prefix='velodyne', pipeline=None, classes=None, modality=None, box_type_3d='LiDAR', filter_empty_gt=True, test_mode=False, pcd_limit_range=[0, -40, -3, 70.4, 40, 0.0], **kwargs): super().__init__( data_root=data_root, ann_file=ann_file, pipeline=pipeline, classes=classes, modality=modality, box_type_3d=box_type_3d, filter_empty_gt=filter_empty_gt, test_mode=test_mode, **kwargs) self.split = split self.root_split = os.path.join(self.data_root, split) assert self.modality is not None self.pcd_limit_range = pcd_limit_range self.pts_prefix = pts_prefix def _get_pts_filename(self, idx): """Get point cloud filename according to the given index. Args: index (int): Index of the point cloud file to get. Returns: str: Name of the point cloud file. """ pts_filename = osp.join(self.root_split, self.pts_prefix, f'{idx:06d}.bin') return pts_filename def get_data_info(self, index): """Get data info according to the given index. Args: index (int): Index of the sample data to get. Returns: dict: Data information that will be passed to the data preprocessing pipelines. It includes the following keys: - sample_idx (str): Sample index. - pts_filename (str): Filename of point clouds. - img_prefix (str): Prefix of image files. - img_info (dict): Image info. - lidar2img (list[np.ndarray], optional): Transformations from lidar to different cameras. - ann_info (dict): Annotation info. """ info = self.data_infos[index] sample_idx = info['image']['image_idx'] img_filename = os.path.join(self.data_root, info['image']['image_path']) # TODO: consider use torch.Tensor only rect = info['calib']['R0_rect'].astype(np.float32) Trv2c = info['calib']['Tr_velo_to_cam'].astype(np.float32) P2 = info['calib']['P2'].astype(np.float32) lidar2img = P2 @ rect @ Trv2c pts_filename = self._get_pts_filename(sample_idx) input_dict = dict( sample_idx=sample_idx, pts_filename=pts_filename, img_prefix=None, img_info=dict(filename=img_filename), lidar2img=lidar2img) if not self.test_mode: annos = self.get_ann_info(index) input_dict['ann_info'] = annos return input_dict def get_ann_info(self, index): """Get annotation info according to the given index. Args: index (int): Index of the annotation data to get. Returns: dict: annotation information consists of the following keys: - gt_bboxes_3d (:obj:`LiDARInstance3DBoxes`): 3D ground truth bboxes. - gt_labels_3d (np.ndarray): Labels of ground truths. - gt_bboxes (np.ndarray): 2D ground truth bboxes. - gt_labels (np.ndarray): Labels of ground truths. - gt_names (list[str]): Class names of ground truths. - difficulty (int): Difficulty defined by KITTI. 0, 1, 2 represent xxxxx respectively. 
""" # Use index to get the annos, thus the evalhook could also use this api info = self.data_infos[index] rect = info['calib']['R0_rect'].astype(np.float32) Trv2c = info['calib']['Tr_velo_to_cam'].astype(np.float32) if 'plane' in info: # convert ground plane to velodyne coordinates reverse = np.linalg.inv(rect @ Trv2c) (plane_norm_cam, plane_off_cam) = (info['plane'][:3], -info['plane'][:3] * info['plane'][3]) plane_norm_lidar = \ (reverse[:3, :3] @ plane_norm_cam[:, None])[:, 0] plane_off_lidar = ( reverse[:3, :3] @ plane_off_cam[:, None][:, 0] + reverse[:3, 3]) plane_lidar = np.zeros_like(plane_norm_lidar, shape=(4, )) plane_lidar[:3] = plane_norm_lidar plane_lidar[3] = -plane_norm_lidar.T @ plane_off_lidar else: plane_lidar = None difficulty = info['annos']['difficulty'] annos = info['annos'] # we need other objects to avoid collision when sample annos = self.remove_dontcare(annos) loc = annos['location'] dims = annos['dimensions'] rots = annos['rotation_y'] gt_names = annos['name'] gt_bboxes_3d = np.concatenate([loc, dims, rots[..., np.newaxis]], axis=1).astype(np.float32) # convert gt_bboxes_3d to velodyne coordinates gt_bboxes_3d = CameraInstance3DBoxes(gt_bboxes_3d).convert_to( self.box_mode_3d, np.linalg.inv(rect @ Trv2c)) gt_bboxes = annos['bbox'] selected = self.drop_arrays_by_name(gt_names, ['DontCare']) gt_bboxes = gt_bboxes[selected].astype('float32') gt_names = gt_names[selected] gt_labels = [] for cat in gt_names: if cat in self.CLASSES: gt_labels.append(self.CLASSES.index(cat)) else: gt_labels.append(-1) gt_labels = np.array(gt_labels).astype(np.int64) gt_labels_3d = copy.deepcopy(gt_labels) anns_results = dict( gt_bboxes_3d=gt_bboxes_3d, gt_labels_3d=gt_labels_3d, bboxes=gt_bboxes, labels=gt_labels, gt_names=gt_names, plane=plane_lidar, difficulty=difficulty) return anns_results def drop_arrays_by_name(self, gt_names, used_classes): """Drop irrelevant ground truths by name. Args: gt_names (list[str]): Names of ground truths. used_classes (list[str]): Classes of interest. Returns: np.ndarray: Indices of ground truths that will be dropped. """ inds = [i for i, x in enumerate(gt_names) if x not in used_classes] inds = np.array(inds, dtype=np.int64) return inds def keep_arrays_by_name(self, gt_names, used_classes): """Keep useful ground truths by name. Args: gt_names (list[str]): Names of ground truths. used_classes (list[str]): Classes of interest. Returns: np.ndarray: Indices of ground truths that will be keeped. """ inds = [i for i, x in enumerate(gt_names) if x in used_classes] inds = np.array(inds, dtype=np.int64) return inds def remove_dontcare(self, ann_info): """Remove annotations that do not need to be cared. Args: ann_info (dict): Dict of annotation infos. The ``'DontCare'`` annotations will be removed according to ann_file['name']. Returns: dict: Annotations after filtering. """ img_filtered_annotations = {} relevant_annotation_indices = [ i for i, x in enumerate(ann_info['name']) if x != 'DontCare' ] for key in ann_info.keys(): img_filtered_annotations[key] = ( ann_info[key][relevant_annotation_indices]) return img_filtered_annotations def format_results(self, outputs, pklfile_prefix=None, submission_prefix=None): """Format the results to pkl file. Args: outputs (list[dict]): Testing results of the dataset. pklfile_prefix (str): The prefix of pkl files. It includes the file path and the prefix of filename, e.g., "a/b/prefix". If not specified, a temp file will be created. Default: None. submission_prefix (str): The prefix of submitted files. 
It includes the file path and the prefix of filename, e.g., "a/b/prefix". If not specified, a temp file will be created. Default: None. Returns: tuple: (result_files, tmp_dir), result_files is a dict containing the json filepaths, tmp_dir is the temporal directory created for saving json files when jsonfile_prefix is not specified. """ if pklfile_prefix is None: tmp_dir = tempfile.TemporaryDirectory() pklfile_prefix = osp.join(tmp_dir.name, 'results') else: tmp_dir = None if not isinstance(outputs[0], dict): result_files = self.bbox2result_kitti2d(outputs, self.CLASSES, pklfile_prefix, submission_prefix) elif 'pts_bbox' in outputs[0] or 'img_bbox' in outputs[0]: result_files = dict() for name in outputs[0]: results_ = [out[name] for out in outputs] pklfile_prefix_ = pklfile_prefix + name if submission_prefix is not None: submission_prefix_ = submission_prefix + name else: submission_prefix_ = None if 'img' in name: result_files = self.bbox2result_kitti2d( results_, self.CLASSES, pklfile_prefix_, submission_prefix_) else: result_files_ = self.bbox2result_kitti( results_, self.CLASSES, pklfile_prefix_, submission_prefix_) result_files[name] = result_files_ else: result_files = self.bbox2result_kitti(outputs, self.CLASSES, pklfile_prefix, submission_prefix) return result_files, tmp_dir def evaluate(self, results, metric=None, logger=None, pklfile_prefix=None, submission_prefix=None, show=False, out_dir=None, pipeline=None): """Evaluation in KITTI protocol. Args: results (list[dict]): Testing results of the dataset. metric (str | list[str], optional): Metrics to be evaluated. Default: None. logger (logging.Logger | str, optional): Logger used for printing related information during evaluation. Default: None. pklfile_prefix (str, optional): The prefix of pkl files, including the file path and the prefix of filename, e.g., "a/b/prefix". If not specified, a temp file will be created. Default: None. submission_prefix (str, optional): The prefix of submission data. If not specified, the submission data will not be generated. Default: None. show (bool, optional): Whether to visualize. Default: False. out_dir (str, optional): Path to save the visualization results. Default: None. pipeline (list[dict], optional): raw data loading for showing. Default: None. Returns: dict[str, float]: Results of each evaluation metric. """ result_files, tmp_dir = self.format_results(results, pklfile_prefix) from mmdet3d.core.evaluation import kitti_eval gt_annos = [info['annos'] for info in self.data_infos] if isinstance(result_files, dict): ap_dict = dict() for name, result_files_ in result_files.items(): eval_types = ['bbox', 'bev', '3d'] if 'img' in name: eval_types = ['bbox'] ap_result_str, ap_dict_ = kitti_eval( gt_annos, result_files_, self.CLASSES, eval_types=eval_types) for ap_type, ap in ap_dict_.items(): ap_dict[f'{name}/{ap_type}'] = float('{:.4f}'.format(ap)) print_log( f'Results of {name}:\n' + ap_result_str, logger=logger) else: if metric == 'img_bbox': ap_result_str, ap_dict = kitti_eval( gt_annos, result_files, self.CLASSES, eval_types=['bbox']) else: ap_result_str, ap_dict = kitti_eval(gt_annos, result_files, self.CLASSES) print_log('\n' + ap_result_str, logger=logger) if tmp_dir is not None: tmp_dir.cleanup() if show or out_dir: self.show(results, out_dir, show=show, pipeline=pipeline) return ap_dict def bbox2result_kitti(self, net_outputs, class_names, pklfile_prefix=None, submission_prefix=None): """Convert 3D detection results to kitti format for evaluation and test submission. 
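
        When ``submission_prefix`` is given, one ``<sample_idx>.txt`` file is
        written per frame, one object per line in the standard KITTI label
        layout: name, truncated and occluded (both written as -1 here),
        alpha, 2D bbox (x1 y1 x2 y2), 3D dimensions (h, w, l), location
        (x, y, z in the camera frame), rotation_y and score.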
Args: net_outputs (list[np.ndarray]): List of array storing the inferenced bounding boxes and scores. class_names (list[String]): A list of class names. pklfile_prefix (str): The prefix of pkl file. submission_prefix (str): The prefix of submission file. Returns: list[dict]: A list of dictionaries with the kitti format. """ assert len(net_outputs) == len(self.data_infos), \ 'invalid list length of network outputs' if submission_prefix is not None: mmcv.mkdir_or_exist(submission_prefix) det_annos = [] print('\nConverting prediction to KITTI format') for idx, pred_dicts in enumerate( mmcv.track_iter_progress(net_outputs)): annos = [] info = self.data_infos[idx] sample_idx = info['image']['image_idx'] image_shape = info['image']['image_shape'][:2] box_dict = self.convert_valid_bboxes(pred_dicts, info) anno = { 'name': [], 'truncated': [], 'occluded': [], 'alpha': [], 'bbox': [], 'dimensions': [], 'location': [], 'rotation_y': [], 'score': [] } if len(box_dict['bbox']) > 0: box_2d_preds = box_dict['bbox'] box_preds = box_dict['box3d_camera'] scores = box_dict['scores'] box_preds_lidar = box_dict['box3d_lidar'] label_preds = box_dict['label_preds'] for box, box_lidar, bbox, score, label in zip( box_preds, box_preds_lidar, box_2d_preds, scores, label_preds): bbox[2:] = np.minimum(bbox[2:], image_shape[::-1]) bbox[:2] = np.maximum(bbox[:2], [0, 0]) anno['name'].append(class_names[int(label)]) anno['truncated'].append(0.0) anno['occluded'].append(0) anno['alpha'].append( -np.arctan2(-box_lidar[1], box_lidar[0]) + box[6]) anno['bbox'].append(bbox) anno['dimensions'].append(box[3:6]) anno['location'].append(box[:3]) anno['rotation_y'].append(box[6]) anno['score'].append(score) anno = {k: np.stack(v) for k, v in anno.items()} annos.append(anno) else: anno = { 'name': np.array([]), 'truncated': np.array([]), 'occluded': np.array([]), 'alpha': np.array([]), 'bbox': np.zeros([0, 4]), 'dimensions': np.zeros([0, 3]), 'location': np.zeros([0, 3]), 'rotation_y': np.array([]), 'score': np.array([]), } annos.append(anno) if submission_prefix is not None: curr_file = f'{submission_prefix}/{sample_idx:06d}.txt' with open(curr_file, 'w') as f: bbox = anno['bbox'] loc = anno['location'] dims = anno['dimensions'] # lhw -> hwl for idx in range(len(bbox)): print( '{} -1 -1 {:.4f} {:.4f} {:.4f} {:.4f} ' '{:.4f} {:.4f} {:.4f} ' '{:.4f} {:.4f} {:.4f} {:.4f} {:.4f} {:.4f}'.format( anno['name'][idx], anno['alpha'][idx], bbox[idx][0], bbox[idx][1], bbox[idx][2], bbox[idx][3], dims[idx][1], dims[idx][2], dims[idx][0], loc[idx][0], loc[idx][1], loc[idx][2], anno['rotation_y'][idx], anno['score'][idx]), file=f) annos[-1]['sample_idx'] = np.array( [sample_idx] * len(annos[-1]['score']), dtype=np.int64) det_annos += annos if pklfile_prefix is not None: if not pklfile_prefix.endswith(('.pkl', '.pickle')): out = f'{pklfile_prefix}.pkl' mmcv.dump(det_annos, out) print(f'Result is saved to {out}.') return det_annos def bbox2result_kitti2d(self, net_outputs, class_names, pklfile_prefix=None, submission_prefix=None): """Convert 2D detection results to kitti format for evaluation and test submission. Args: net_outputs (list[np.ndarray]): List of array storing the inferenced bounding boxes and scores. class_names (list[String]): A list of class names. pklfile_prefix (str): The prefix of pkl file. submission_prefix (str): The prefix of submission file. 
Returns: list[dict]: A list of dictionaries have the kitti format """ assert len(net_outputs) == len(self.data_infos), \ 'invalid list length of network outputs' det_annos = [] print('\nConverting prediction to KITTI format') for i, bboxes_per_sample in enumerate( mmcv.track_iter_progress(net_outputs)): annos = [] anno = dict( name=[], truncated=[], occluded=[], alpha=[], bbox=[], dimensions=[], location=[], rotation_y=[], score=[]) sample_idx = self.data_infos[i]['image']['image_idx'] num_example = 0 for label in range(len(bboxes_per_sample)): bbox = bboxes_per_sample[label] for i in range(bbox.shape[0]): anno['name'].append(class_names[int(label)]) anno['truncated'].append(0.0) anno['occluded'].append(0) anno['alpha'].append(0.0) anno['bbox'].append(bbox[i, :4]) # set dimensions (height, width, length) to zero anno['dimensions'].append( np.zeros(shape=[3], dtype=np.float32)) # set the 3D translation to (-1000, -1000, -1000) anno['location'].append( np.ones(shape=[3], dtype=np.float32) * (-1000.0)) anno['rotation_y'].append(0.0) anno['score'].append(bbox[i, 4]) num_example += 1 if num_example == 0: annos.append( dict( name=np.array([]), truncated=np.array([]), occluded=np.array([]), alpha=np.array([]), bbox=np.zeros([0, 4]), dimensions=np.zeros([0, 3]), location=np.zeros([0, 3]), rotation_y=np.array([]), score=np.array([]), )) else: anno = {k: np.stack(v) for k, v in anno.items()} annos.append(anno) annos[-1]['sample_idx'] = np.array( [sample_idx] * num_example, dtype=np.int64) det_annos += annos if pklfile_prefix is not None: # save file in pkl format pklfile_path = ( pklfile_prefix[:-4] if pklfile_prefix.endswith( ('.pkl', '.pickle')) else pklfile_prefix) mmcv.dump(det_annos, pklfile_path) if submission_prefix is not None: # save file in submission format mmcv.mkdir_or_exist(submission_prefix) print(f'Saving KITTI submission to {submission_prefix}') for i, anno in enumerate(det_annos): sample_idx = self.data_infos[i]['image']['image_idx'] cur_det_file = f'{submission_prefix}/{sample_idx:06d}.txt' with open(cur_det_file, 'w') as f: bbox = anno['bbox'] loc = anno['location'] dims = anno['dimensions'][::-1] # lhw -> hwl for idx in range(len(bbox)): print( '{} -1 -1 {:4f} {:4f} {:4f} {:4f} {:4f} {:4f} ' '{:4f} {:4f} {:4f} {:4f} {:4f} {:4f} {:4f}'.format( anno['name'][idx], anno['alpha'][idx], *bbox[idx], # 4 float *dims[idx], # 3 float *loc[idx], # 3 float anno['rotation_y'][idx], anno['score'][idx]), file=f, ) print(f'Result is saved to {submission_prefix}') return det_annos def convert_valid_bboxes(self, box_dict, info): """Convert the predicted boxes into valid ones. Args: box_dict (dict): Box dictionaries to be converted. - boxes_3d (:obj:`LiDARInstance3DBoxes`): 3D bounding boxes. - scores_3d (torch.Tensor): Scores of boxes. - labels_3d (torch.Tensor): Class labels of boxes. info (dict): Data info. Returns: dict: Valid predicted boxes. - bbox (np.ndarray): 2D bounding boxes. - box3d_camera (np.ndarray): 3D bounding boxes in camera coordinate. - box3d_lidar (np.ndarray): 3D bounding boxes in LiDAR coordinate. - scores (np.ndarray): Scores of boxes. - label_preds (np.ndarray): Class label predictions. - sample_idx (int): Sample index. 
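
        Note:
            A prediction is kept only if its projected 2D box overlaps the
            image and its 3D center lies inside ``self.pcd_limit_range``;
            boxes failing either check are dropped before KITTI formatting.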
""" # TODO: refactor this function box_preds = box_dict['boxes_3d'] scores = box_dict['scores_3d'] labels = box_dict['labels_3d'] sample_idx = info['image']['image_idx'] box_preds.limit_yaw(offset=0.5, period=np.pi * 2) if len(box_preds) == 0: return dict( bbox=np.zeros([0, 4]), box3d_camera=np.zeros([0, 7]), box3d_lidar=np.zeros([0, 7]), scores=np.zeros([0]), label_preds=np.zeros([0, 4]), sample_idx=sample_idx) rect = info['calib']['R0_rect'].astype(np.float32) Trv2c = info['calib']['Tr_velo_to_cam'].astype(np.float32) P2 = info['calib']['P2'].astype(np.float32) img_shape = info['image']['image_shape'] P2 = box_preds.tensor.new_tensor(P2) box_preds_camera = box_preds.convert_to(Box3DMode.CAM, rect @ Trv2c) box_corners = box_preds_camera.corners box_corners_in_image = points_cam2img(box_corners, P2) # box_corners_in_image: [N, 8, 2] minxy = torch.min(box_corners_in_image, dim=1)[0] maxxy = torch.max(box_corners_in_image, dim=1)[0] box_2d_preds = torch.cat([minxy, maxxy], dim=1) # Post-processing # check box_preds_camera image_shape = box_preds.tensor.new_tensor(img_shape) valid_cam_inds = ((box_2d_preds[:, 0] < image_shape[1]) & (box_2d_preds[:, 1] < image_shape[0]) & (box_2d_preds[:, 2] > 0) & (box_2d_preds[:, 3] > 0)) # check box_preds limit_range = box_preds.tensor.new_tensor(self.pcd_limit_range) valid_pcd_inds = ((box_preds.center > limit_range[:3]) & (box_preds.center < limit_range[3:])) valid_inds = valid_cam_inds & valid_pcd_inds.all(-1) if valid_inds.sum() > 0: return dict( bbox=box_2d_preds[valid_inds, :].numpy(), box3d_camera=box_preds_camera[valid_inds].tensor.numpy(), box3d_lidar=box_preds[valid_inds].tensor.numpy(), scores=scores[valid_inds].numpy(), label_preds=labels[valid_inds].numpy(), sample_idx=sample_idx) else: return dict( bbox=np.zeros([0, 4]), box3d_camera=np.zeros([0, 7]), box3d_lidar=np.zeros([0, 7]), scores=np.zeros([0]), label_preds=np.zeros([0, 4]), sample_idx=sample_idx) def _build_default_pipeline(self): """Build the default pipeline for this dataset.""" pipeline = [ dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=4, use_dim=4, file_client_args=dict(backend='disk')), dict( type='DefaultFormatBundle3D', class_names=self.CLASSES, with_label=False), dict(type='Collect3D', keys=['points']) ] if self.modality['use_camera']: pipeline.insert(0, dict(type='LoadImageFromFile')) return Compose(pipeline) def show(self, results, out_dir, show=True, pipeline=None): """Results visualization. Args: results (list[dict]): List of bounding boxes results. out_dir (str): Output directory of visualization result. show (bool): Whether to visualize the results online. Default: False. pipeline (list[dict], optional): raw data loading for showing. Default: None. """ assert out_dir is not None, 'Expect out_dir, got none.' 
pipeline = self._get_pipeline(pipeline) for i, result in enumerate(results): if 'pts_bbox' in result.keys(): result = result['pts_bbox'] data_info = self.data_infos[i] pts_path = data_info['point_cloud']['velodyne_path'] file_name = osp.split(pts_path)[-1].split('.')[0] points, img_metas, img = self._extract_data( i, pipeline, ['points', 'img_metas', 'img']) points = points.numpy() # for now we convert points into depth mode points = Coord3DMode.convert_point(points, Coord3DMode.LIDAR, Coord3DMode.DEPTH) gt_bboxes = self.get_ann_info(i)['gt_bboxes_3d'].tensor.numpy() show_gt_bboxes = Box3DMode.convert(gt_bboxes, Box3DMode.LIDAR, Box3DMode.DEPTH) pred_bboxes = result['boxes_3d'].tensor.numpy() show_pred_bboxes = Box3DMode.convert(pred_bboxes, Box3DMode.LIDAR, Box3DMode.DEPTH) show_result(points, show_gt_bboxes, show_pred_bboxes, out_dir, file_name, show) # multi-modality visualization if self.modality['use_camera'] and 'lidar2img' in img_metas.keys(): img = img.numpy() # need to transpose channel to first dim img = img.transpose(1, 2, 0) show_pred_bboxes = LiDARInstance3DBoxes( pred_bboxes, origin=(0.5, 0.5, 0)) show_gt_bboxes = LiDARInstance3DBoxes( gt_bboxes, origin=(0.5, 0.5, 0)) show_multi_modality_result( img, show_gt_bboxes, show_pred_bboxes, img_metas['lidar2img'], out_dir, file_name, box_mode='lidar', show=show) ================================================ FILE: mmdet3d/datasets/kitti_mono_dataset.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import copy import tempfile from os import path as osp import mmcv import numpy as np import torch from mmcv.utils import print_log from ..core.bbox import Box3DMode, CameraInstance3DBoxes, points_cam2img from .builder import DATASETS from .nuscenes_mono_dataset import NuScenesMonoDataset @DATASETS.register_module() class KittiMonoDataset(NuScenesMonoDataset): """Monocular 3D detection on KITTI Dataset. Args: data_root (str): Path of dataset root. info_file (str): Path of info file. load_interval (int, optional): Interval of loading the dataset. It is used to uniformly sample the dataset. Defaults to 1. with_velocity (bool, optional): Whether include velocity prediction into the experiments. Defaults to False. eval_version (str, optional): Configuration version of evaluation. Defaults to None. version (str, optional): Dataset version. Defaults to None. kwargs (dict): Other arguments are the same of NuScenesMonoDataset. """ CLASSES = ('Pedestrian', 'Cyclist', 'Car') def __init__(self, data_root, info_file, ann_file, pipeline, load_interval=1, with_velocity=False, eval_version=None, version=None, **kwargs): super().__init__( data_root=data_root, ann_file=ann_file, pipeline=pipeline, load_interval=load_interval, with_velocity=with_velocity, eval_version=eval_version, version=version, **kwargs) self.anno_infos = mmcv.load(info_file) self.bbox_code_size = 7 def _parse_ann_info(self, img_info, ann_info): """Parse bbox and mask annotation. Args: ann_info (list[dict]): Annotation info of an image. with_mask (bool): Whether to parse mask annotations. Returns: dict: A dict containing the following keys: bboxes, bboxes_ignore, labels, masks, seg_map. "masks" are raw annotations and not decoded into binary masks. 
""" gt_bboxes = [] gt_labels = [] gt_bboxes_ignore = [] gt_masks_ann = [] gt_bboxes_cam3d = [] centers2d = [] depths = [] for i, ann in enumerate(ann_info): if ann.get('ignore', False): continue x1, y1, w, h = ann['bbox'] inter_w = max(0, min(x1 + w, img_info['width']) - max(x1, 0)) inter_h = max(0, min(y1 + h, img_info['height']) - max(y1, 0)) if inter_w * inter_h == 0: continue if ann['area'] <= 0 or w < 1 or h < 1: continue if ann['category_id'] not in self.cat_ids: continue bbox = [x1, y1, x1 + w, y1 + h] if ann.get('iscrowd', False): gt_bboxes_ignore.append(bbox) else: gt_bboxes.append(bbox) gt_labels.append(self.cat2label[ann['category_id']]) gt_masks_ann.append(ann.get('segmentation', None)) # 3D annotations in camera coordinates bbox_cam3d = np.array(ann['bbox_cam3d']).reshape(-1, ) gt_bboxes_cam3d.append(bbox_cam3d) # 2.5D annotations in camera coordinates center2d = ann['center2d'][:2] depth = ann['center2d'][2] centers2d.append(center2d) depths.append(depth) if gt_bboxes: gt_bboxes = np.array(gt_bboxes, dtype=np.float32) gt_labels = np.array(gt_labels, dtype=np.int64) else: gt_bboxes = np.zeros((0, 4), dtype=np.float32) gt_labels = np.array([], dtype=np.int64) if gt_bboxes_cam3d: gt_bboxes_cam3d = np.array(gt_bboxes_cam3d, dtype=np.float32) centers2d = np.array(centers2d, dtype=np.float32) depths = np.array(depths, dtype=np.float32) else: gt_bboxes_cam3d = np.zeros((0, self.bbox_code_size), dtype=np.float32) centers2d = np.zeros((0, 2), dtype=np.float32) depths = np.zeros((0), dtype=np.float32) gt_bboxes_cam3d = CameraInstance3DBoxes( gt_bboxes_cam3d, box_dim=gt_bboxes_cam3d.shape[-1], origin=(0.5, 0.5, 0.5)) gt_labels_3d = copy.deepcopy(gt_labels) if gt_bboxes_ignore: gt_bboxes_ignore = np.array(gt_bboxes_ignore, dtype=np.float32) else: gt_bboxes_ignore = np.zeros((0, 4), dtype=np.float32) seg_map = img_info['filename'].replace('jpg', 'png') ann = dict( bboxes=gt_bboxes, labels=gt_labels, gt_bboxes_3d=gt_bboxes_cam3d, gt_labels_3d=gt_labels_3d, centers2d=centers2d, depths=depths, bboxes_ignore=gt_bboxes_ignore, masks=gt_masks_ann, seg_map=seg_map) return ann def format_results(self, outputs, pklfile_prefix=None, submission_prefix=None): """Format the results to pkl file. Args: outputs (list[dict]): Testing results of the dataset. pklfile_prefix (str): The prefix of pkl files. It includes the file path and the prefix of filename, e.g., "a/b/prefix". If not specified, a temp file will be created. Default: None. submission_prefix (str): The prefix of submitted files. It includes the file path and the prefix of filename, e.g., "a/b/prefix". If not specified, a temp file will be created. Default: None. Returns: tuple: (result_files, tmp_dir), result_files is a dict containing the json filepaths, tmp_dir is the temporal directory created for saving json files when jsonfile_prefix is not specified. 
""" if pklfile_prefix is None: tmp_dir = tempfile.TemporaryDirectory() pklfile_prefix = osp.join(tmp_dir.name, 'results') else: tmp_dir = None if not isinstance(outputs[0], dict): result_files = self.bbox2result_kitti2d(outputs, self.CLASSES, pklfile_prefix, submission_prefix) elif 'pts_bbox' in outputs[0] or 'img_bbox' in outputs[0] or \ 'img_bbox2d' in outputs[0]: result_files = dict() for name in outputs[0]: results_ = [out[name] for out in outputs] pklfile_prefix_ = pklfile_prefix + name if submission_prefix is not None: submission_prefix_ = submission_prefix + name else: submission_prefix_ = None if '2d' in name: result_files_ = self.bbox2result_kitti2d( results_, self.CLASSES, pklfile_prefix_, submission_prefix_) else: result_files_ = self.bbox2result_kitti( results_, self.CLASSES, pklfile_prefix_, submission_prefix_) result_files[name] = result_files_ else: result_files = self.bbox2result_kitti(outputs, self.CLASSES, pklfile_prefix, submission_prefix) return result_files, tmp_dir def evaluate(self, results, metric=None, logger=None, pklfile_prefix=None, submission_prefix=None, show=False, out_dir=None, pipeline=None): """Evaluation in KITTI protocol. Args: results (list[dict]): Testing results of the dataset. metric (str | list[str], optional): Metrics to be evaluated. Defaults to None. logger (logging.Logger | str, optional): Logger used for printing related information during evaluation. Default: None. pklfile_prefix (str, optional): The prefix of pkl files, including the file path and the prefix of filename, e.g., "a/b/prefix". If not specified, a temp file will be created. Default: None. submission_prefix (str, optional): The prefix of submission data. If not specified, the submission data will not be generated. show (bool, optional): Whether to visualize. Default: False. out_dir (str, optional): Path to save the visualization results. Default: None. pipeline (list[dict], optional): raw data loading for showing. Default: None. Returns: dict[str, float]: Results of each evaluation metric. """ result_files, tmp_dir = self.format_results(results, pklfile_prefix) from mmdet3d.core.evaluation import kitti_eval gt_annos = [info['annos'] for info in self.anno_infos] if isinstance(result_files, dict): ap_dict = dict() for name, result_files_ in result_files.items(): eval_types = ['bbox', 'bev', '3d'] if '2d' in name: eval_types = ['bbox'] ap_result_str, ap_dict_ = kitti_eval( gt_annos, result_files_, self.CLASSES, eval_types=eval_types) for ap_type, ap in ap_dict_.items(): ap_dict[f'{name}/{ap_type}'] = float('{:.4f}'.format(ap)) print_log( f'Results of {name}:\n' + ap_result_str, logger=logger) else: if metric == 'img_bbox2d': ap_result_str, ap_dict = kitti_eval( gt_annos, result_files, self.CLASSES, eval_types=['bbox']) else: ap_result_str, ap_dict = kitti_eval(gt_annos, result_files, self.CLASSES) print_log('\n' + ap_result_str, logger=logger) if tmp_dir is not None: tmp_dir.cleanup() if show or out_dir: self.show(results, out_dir, show=show, pipeline=pipeline) return ap_dict def bbox2result_kitti(self, net_outputs, class_names, pklfile_prefix=None, submission_prefix=None): """Convert 3D detection results to kitti format for evaluation and test submission. Args: net_outputs (list[np.ndarray]): List of array storing the inferenced bounding boxes and scores. class_names (list[String]): A list of class names. pklfile_prefix (str): The prefix of pkl file. submission_prefix (str): The prefix of submission file. Returns: list[dict]: A list of dictionaries with the kitti format. 
""" assert len(net_outputs) == len(self.anno_infos) if submission_prefix is not None: mmcv.mkdir_or_exist(submission_prefix) det_annos = [] print('\nConverting prediction to KITTI format') for idx, pred_dicts in enumerate( mmcv.track_iter_progress(net_outputs)): annos = [] info = self.anno_infos[idx] sample_idx = info['image']['image_idx'] image_shape = info['image']['image_shape'][:2] box_dict = self.convert_valid_bboxes(pred_dicts, info) anno = { 'name': [], 'truncated': [], 'occluded': [], 'alpha': [], 'bbox': [], 'dimensions': [], 'location': [], 'rotation_y': [], 'score': [] } if len(box_dict['bbox']) > 0: box_2d_preds = box_dict['bbox'] box_preds = box_dict['box3d_camera'] scores = box_dict['scores'] box_preds_lidar = box_dict['box3d_lidar'] label_preds = box_dict['label_preds'] for box, box_lidar, bbox, score, label in zip( box_preds, box_preds_lidar, box_2d_preds, scores, label_preds): bbox[2:] = np.minimum(bbox[2:], image_shape[::-1]) bbox[:2] = np.maximum(bbox[:2], [0, 0]) anno['name'].append(class_names[int(label)]) anno['truncated'].append(0.0) anno['occluded'].append(0) anno['alpha'].append(-np.arctan2(box[0], box[2]) + box[6]) anno['bbox'].append(bbox) anno['dimensions'].append(box[3:6]) anno['location'].append(box[:3]) anno['rotation_y'].append(box[6]) anno['score'].append(score) anno = {k: np.stack(v) for k, v in anno.items()} annos.append(anno) else: anno = { 'name': np.array([]), 'truncated': np.array([]), 'occluded': np.array([]), 'alpha': np.array([]), 'bbox': np.zeros([0, 4]), 'dimensions': np.zeros([0, 3]), 'location': np.zeros([0, 3]), 'rotation_y': np.array([]), 'score': np.array([]), } annos.append(anno) if submission_prefix is not None: curr_file = f'{submission_prefix}/{sample_idx:06d}.txt' with open(curr_file, 'w') as f: bbox = anno['bbox'] loc = anno['location'] dims = anno['dimensions'] # lhw -> hwl for idx in range(len(bbox)): print( '{} -1 -1 {:.4f} {:.4f} {:.4f} {:.4f} ' '{:.4f} {:.4f} {:.4f} ' '{:.4f} {:.4f} {:.4f} {:.4f} {:.4f} {:.4f}'.format( anno['name'][idx], anno['alpha'][idx], bbox[idx][0], bbox[idx][1], bbox[idx][2], bbox[idx][3], dims[idx][1], dims[idx][2], dims[idx][0], loc[idx][0], loc[idx][1], loc[idx][2], anno['rotation_y'][idx], anno['score'][idx]), file=f) annos[-1]['sample_idx'] = np.array( [sample_idx] * len(annos[-1]['score']), dtype=np.int64) det_annos += annos if pklfile_prefix is not None: if not pklfile_prefix.endswith(('.pkl', '.pickle')): out = f'{pklfile_prefix}.pkl' mmcv.dump(det_annos, out) print('Result is saved to %s' % out) return det_annos def bbox2result_kitti2d(self, net_outputs, class_names, pklfile_prefix=None, submission_prefix=None): """Convert 2D detection results to kitti format for evaluation and test submission. Args: net_outputs (list[np.ndarray]): List of array storing the inferenced bounding boxes and scores. class_names (list[String]): A list of class names. pklfile_prefix (str): The prefix of pkl file. submission_prefix (str): The prefix of submission file. 
Returns: list[dict]: A list of dictionaries have the kitti format """ assert len(net_outputs) == len(self.anno_infos) det_annos = [] print('\nConverting prediction to KITTI format') for i, bboxes_per_sample in enumerate( mmcv.track_iter_progress(net_outputs)): annos = [] anno = dict( name=[], truncated=[], occluded=[], alpha=[], bbox=[], dimensions=[], location=[], rotation_y=[], score=[]) sample_idx = self.anno_infos[i]['image']['image_idx'] num_example = 0 for label in range(len(bboxes_per_sample)): bbox = bboxes_per_sample[label] for i in range(bbox.shape[0]): anno['name'].append(class_names[int(label)]) anno['truncated'].append(0.0) anno['occluded'].append(0) anno['alpha'].append(-10) anno['bbox'].append(bbox[i, :4]) # set dimensions (height, width, length) to zero anno['dimensions'].append( np.zeros(shape=[3], dtype=np.float32)) # set the 3D translation to (-1000, -1000, -1000) anno['location'].append( np.ones(shape=[3], dtype=np.float32) * (-1000.0)) anno['rotation_y'].append(0.0) anno['score'].append(bbox[i, 4]) num_example += 1 if num_example == 0: annos.append( dict( name=np.array([]), truncated=np.array([]), occluded=np.array([]), alpha=np.array([]), bbox=np.zeros([0, 4]), dimensions=np.zeros([0, 3]), location=np.zeros([0, 3]), rotation_y=np.array([]), score=np.array([]), )) else: anno = {k: np.stack(v) for k, v in anno.items()} annos.append(anno) annos[-1]['sample_idx'] = np.array( [sample_idx] * num_example, dtype=np.int64) det_annos += annos if pklfile_prefix is not None: if not pklfile_prefix.endswith(('.pkl', '.pickle')): out = f'{pklfile_prefix}.pkl' mmcv.dump(det_annos, out) print('Result is saved to %s' % out) if submission_prefix is not None: # save file in submission format mmcv.mkdir_or_exist(submission_prefix) print(f'Saving KITTI submission to {submission_prefix}') for i, anno in enumerate(det_annos): sample_idx = self.anno_infos[i]['image']['image_idx'] cur_det_file = f'{submission_prefix}/{sample_idx:06d}.txt' with open(cur_det_file, 'w') as f: bbox = anno['bbox'] loc = anno['location'] dims = anno['dimensions'][::-1] # lhw -> hwl for idx in range(len(bbox)): print( '{} -1 -1 {:4f} {:4f} {:4f} {:4f} {:4f} {:4f} ' '{:4f} {:4f} {:4f} {:4f} {:4f} {:4f} {:4f}'.format( anno['name'][idx], anno['alpha'][idx], *bbox[idx], # 4 float *dims[idx], # 3 float *loc[idx], # 3 float anno['rotation_y'][idx], anno['score'][idx]), file=f, ) print(f'Result is saved to {submission_prefix}') return det_annos def convert_valid_bboxes(self, box_dict, info): """Convert the predicted boxes into valid ones. Args: box_dict (dict): Box dictionaries to be converted. - boxes_3d (:obj:`CameraInstance3DBoxes`): 3D bounding boxes. - scores_3d (torch.Tensor): Scores of boxes. - labels_3d (torch.Tensor): Class labels of boxes. info (dict): Data info. Returns: dict: Valid predicted boxes. - bbox (np.ndarray): 2D bounding boxes. - box3d_camera (np.ndarray): 3D bounding boxes in camera coordinate. - scores (np.ndarray): Scores of boxes. - label_preds (np.ndarray): Class label predictions. - sample_idx (int): Sample index. 
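        Shapes of the returned fields when ``N`` predictions survive the
        image-boundary check (empty arrays are returned otherwise):

            - bbox: (N, 4) array, 3D boxes projected to the image plane
            - box3d_camera: (N, 7) array, boxes in camera coordinates
            - box3d_lidar: (N, 7) array, boxes in LiDAR coordinates
            - scores: (N,) array
            - label_preds: (N,) array of class indices
            - sample_idx: int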
""" box_preds = box_dict['boxes_3d'] scores = box_dict['scores_3d'] labels = box_dict['labels_3d'] sample_idx = info['image']['image_idx'] if len(box_preds) == 0: return dict( bbox=np.zeros([0, 4]), box3d_camera=np.zeros([0, 7]), scores=np.zeros([0]), label_preds=np.zeros([0, 4]), sample_idx=sample_idx) rect = info['calib']['R0_rect'].astype(np.float32) Trv2c = info['calib']['Tr_velo_to_cam'].astype(np.float32) P2 = info['calib']['P2'].astype(np.float32) img_shape = info['image']['image_shape'] P2 = box_preds.tensor.new_tensor(P2) box_preds_camera = box_preds box_preds_lidar = box_preds.convert_to(Box3DMode.LIDAR, np.linalg.inv(rect @ Trv2c)) box_corners = box_preds_camera.corners box_corners_in_image = points_cam2img(box_corners, P2) # box_corners_in_image: [N, 8, 2] minxy = torch.min(box_corners_in_image, dim=1)[0] maxxy = torch.max(box_corners_in_image, dim=1)[0] box_2d_preds = torch.cat([minxy, maxxy], dim=1) # Post-processing # check box_preds_camera image_shape = box_preds.tensor.new_tensor(img_shape) valid_cam_inds = ((box_2d_preds[:, 0] < image_shape[1]) & (box_2d_preds[:, 1] < image_shape[0]) & (box_2d_preds[:, 2] > 0) & (box_2d_preds[:, 3] > 0)) # check box_preds valid_inds = valid_cam_inds if valid_inds.sum() > 0: return dict( bbox=box_2d_preds[valid_inds, :].numpy(), box3d_camera=box_preds_camera[valid_inds].tensor.numpy(), box3d_lidar=box_preds_lidar[valid_inds].tensor.numpy(), scores=scores[valid_inds].numpy(), label_preds=labels[valid_inds].numpy(), sample_idx=sample_idx) else: return dict( bbox=np.zeros([0, 4]), box3d_camera=np.zeros([0, 7]), box3d_lidar=np.zeros([0, 7]), scores=np.zeros([0]), label_preds=np.zeros([0, 4]), sample_idx=sample_idx) ================================================ FILE: mmdet3d/datasets/lyft_dataset.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import os import tempfile from os import path as osp import mmcv import numpy as np import pandas as pd from lyft_dataset_sdk.lyftdataset import LyftDataset as Lyft from lyft_dataset_sdk.utils.data_classes import Box as LyftBox from pyquaternion import Quaternion from mmdet3d.core.evaluation.lyft_eval import lyft_eval from ..core import show_result from ..core.bbox import Box3DMode, Coord3DMode, LiDARInstance3DBoxes from .builder import DATASETS from .custom_3d import Custom3DDataset from .pipelines import Compose @DATASETS.register_module() class LyftDataset(Custom3DDataset): r"""Lyft Dataset. This class serves as the API for experiments on the Lyft Dataset. Please refer to ``_ for data downloading. Args: ann_file (str): Path of annotation file. pipeline (list[dict], optional): Pipeline used for data processing. Defaults to None. data_root (str): Path of dataset root. classes (tuple[str], optional): Classes used in the dataset. Defaults to None. load_interval (int, optional): Interval of loading the dataset. It is used to uniformly sample the dataset. Defaults to 1. modality (dict, optional): Modality to specify the sensor data used as input. Defaults to None. box_type_3d (str, optional): Type of 3D box of this dataset. Based on the `box_type_3d`, the dataset will encapsulate the box to its original format then converted them to `box_type_3d`. Defaults to 'LiDAR' in this dataset. Available options includes - 'LiDAR': Box in LiDAR coordinates. - 'Depth': Box in depth coordinates, usually for indoor dataset. - 'Camera': Box in camera coordinates. filter_empty_gt (bool, optional): Whether to filter empty GT. Defaults to True. 
test_mode (bool, optional): Whether the dataset is in test mode. Defaults to False. """ # noqa: E501 NameMapping = { 'bicycle': 'bicycle', 'bus': 'bus', 'car': 'car', 'emergency_vehicle': 'emergency_vehicle', 'motorcycle': 'motorcycle', 'other_vehicle': 'other_vehicle', 'pedestrian': 'pedestrian', 'truck': 'truck', 'animal': 'animal' } DefaultAttribute = { 'car': 'is_stationary', 'truck': 'is_stationary', 'bus': 'is_stationary', 'emergency_vehicle': 'is_stationary', 'other_vehicle': 'is_stationary', 'motorcycle': 'is_stationary', 'bicycle': 'is_stationary', 'pedestrian': 'is_stationary', 'animal': 'is_stationary' } CLASSES = ('car', 'truck', 'bus', 'emergency_vehicle', 'other_vehicle', 'motorcycle', 'bicycle', 'pedestrian', 'animal') def __init__(self, ann_file, pipeline=None, data_root=None, classes=None, load_interval=1, modality=None, box_type_3d='LiDAR', filter_empty_gt=True, test_mode=False, **kwargs): self.load_interval = load_interval super().__init__( data_root=data_root, ann_file=ann_file, pipeline=pipeline, classes=classes, modality=modality, box_type_3d=box_type_3d, filter_empty_gt=filter_empty_gt, test_mode=test_mode, **kwargs) if self.modality is None: self.modality = dict( use_camera=False, use_lidar=True, use_radar=False, use_map=False, use_external=False, ) def load_annotations(self, ann_file): """Load annotations from ann_file. Args: ann_file (str): Path of the annotation file. Returns: list[dict]: List of annotations sorted by timestamps. """ # loading data from a file-like object needs file format data = mmcv.load(ann_file, file_format='pkl') data_infos = list(sorted(data['infos'], key=lambda e: e['timestamp'])) data_infos = data_infos[::self.load_interval] self.metadata = data['metadata'] self.version = self.metadata['version'] return data_infos def get_data_info(self, index): """Get data info according to the given index. Args: index (int): Index of the sample data to get. Returns: dict: Data information that will be passed to the data preprocessing pipelines. It includes the following keys: - sample_idx (str): sample index - pts_filename (str): filename of point clouds - sweeps (list[dict]): infos of sweeps - timestamp (float): sample timestamp - img_filename (str, optional): image filename - lidar2img (list[np.ndarray], optional): transformations from lidar to different cameras - ann_info (dict): annotation info """ info = self.data_infos[index] # standard protocol modified from SECOND.Pytorch input_dict = dict( sample_idx=info['token'], pts_filename=info['lidar_path'], sweeps=info['sweeps'], timestamp=info['timestamp'] / 1e6, ) if self.modality['use_camera']: image_paths = [] lidar2img_rts = [] for cam_type, cam_info in info['cams'].items(): image_paths.append(cam_info['data_path']) # obtain lidar to image transformation matrix lidar2cam_r = np.linalg.inv(cam_info['sensor2lidar_rotation']) lidar2cam_t = cam_info[ 'sensor2lidar_translation'] @ lidar2cam_r.T lidar2cam_rt = np.eye(4) lidar2cam_rt[:3, :3] = lidar2cam_r.T lidar2cam_rt[3, :3] = -lidar2cam_t intrinsic = cam_info['cam_intrinsic'] viewpad = np.eye(4) viewpad[:intrinsic.shape[0], :intrinsic.shape[1]] = intrinsic lidar2img_rt = (viewpad @ lidar2cam_rt.T) lidar2img_rts.append(lidar2img_rt) input_dict.update( dict( img_filename=image_paths, lidar2img=lidar2img_rts, )) if not self.test_mode: annos = self.get_ann_info(index) input_dict['ann_info'] = annos return input_dict def get_ann_info(self, index): """Get annotation info according to the given index. Args: index (int): Index of the annotation data to get. 
Returns: dict: Annotation information consists of the following keys: - gt_bboxes_3d (:obj:`LiDARInstance3DBoxes`): 3D ground truth bboxes. - gt_labels_3d (np.ndarray): Labels of ground truths. - gt_names (list[str]): Class names of ground truths. """ info = self.data_infos[index] gt_bboxes_3d = info['gt_boxes'] gt_names_3d = info['gt_names'] gt_labels_3d = [] for cat in gt_names_3d: if cat in self.CLASSES: gt_labels_3d.append(self.CLASSES.index(cat)) else: gt_labels_3d.append(-1) gt_labels_3d = np.array(gt_labels_3d) if 'gt_shape' in info: gt_shape = info['gt_shape'] gt_bboxes_3d = np.concatenate([gt_bboxes_3d, gt_shape], axis=-1) # the lyft box center is [0.5, 0.5, 0.5], we change it to be # the same as KITTI (0.5, 0.5, 0) gt_bboxes_3d = LiDARInstance3DBoxes( gt_bboxes_3d, box_dim=gt_bboxes_3d.shape[-1], origin=(0.5, 0.5, 0.5)).convert_to(self.box_mode_3d) anns_results = dict( gt_bboxes_3d=gt_bboxes_3d, gt_labels_3d=gt_labels_3d, ) return anns_results def _format_bbox(self, results, jsonfile_prefix=None): """Convert the results to the standard format. Args: results (list[dict]): Testing results of the dataset. jsonfile_prefix (str): The prefix of the output jsonfile. You can specify the output directory/filename by modifying the jsonfile_prefix. Default: None. Returns: str: Path of the output json file. """ lyft_annos = {} mapped_class_names = self.CLASSES print('Start to convert detection format...') for sample_id, det in enumerate(mmcv.track_iter_progress(results)): annos = [] boxes = output_to_lyft_box(det) sample_token = self.data_infos[sample_id]['token'] boxes = lidar_lyft_box_to_global(self.data_infos[sample_id], boxes) for i, box in enumerate(boxes): name = mapped_class_names[box.label] lyft_anno = dict( sample_token=sample_token, translation=box.center.tolist(), size=box.wlh.tolist(), rotation=box.orientation.elements.tolist(), name=name, score=box.score) annos.append(lyft_anno) lyft_annos[sample_token] = annos lyft_submissions = { 'meta': self.modality, 'results': lyft_annos, } mmcv.mkdir_or_exist(jsonfile_prefix) res_path = osp.join(jsonfile_prefix, 'results_lyft.json') print('Results writes to', res_path) mmcv.dump(lyft_submissions, res_path) return res_path def _evaluate_single(self, result_path, logger=None, metric='bbox', result_name='pts_bbox'): """Evaluation for a single model in Lyft protocol. Args: result_path (str): Path of the result file. logger (logging.Logger | str, optional): Logger used for printing related information during evaluation. Default: None. metric (str, optional): Metric name used for evaluation. Default: 'bbox'. result_name (str, optional): Result name in the metric prefix. Default: 'pts_bbox'. Returns: dict: Dictionary of evaluation details. """ output_dir = osp.join(*osp.split(result_path)[:-1]) lyft = Lyft( data_path=osp.join(self.data_root, self.version), json_path=osp.join(self.data_root, self.version, self.version), verbose=True) eval_set_map = { 'v1.01-train': 'val', } metrics = lyft_eval(lyft, self.data_root, result_path, eval_set_map[self.version], output_dir, logger) # record metrics detail = dict() metric_prefix = f'{result_name}_Lyft' for i, name in enumerate(metrics['class_names']): AP = float(metrics['mAPs_cate'][i]) detail[f'{metric_prefix}/{name}_AP'] = AP detail[f'{metric_prefix}/mAP'] = metrics['Final mAP'] return detail def format_results(self, results, jsonfile_prefix=None, csv_savepath=None): """Format the results to json (standard format for COCO evaluation). Args: results (list[dict]): Testing results of the dataset. 
jsonfile_prefix (str): The prefix of json files. It includes the file path and the prefix of filename, e.g., "a/b/prefix". If not specified, a temp file will be created. Default: None. csv_savepath (str): The path for saving csv files. It includes the file path and the csv filename, e.g., "a/b/filename.csv". If not specified, the result will not be converted to csv file. Returns: tuple: Returns (result_files, tmp_dir), where `result_files` is a dict containing the json filepaths, `tmp_dir` is the temporal directory created for saving json files when `jsonfile_prefix` is not specified. """ assert isinstance(results, list), 'results must be a list' assert len(results) == len(self), ( 'The length of results is not equal to the dataset len: {} != {}'. format(len(results), len(self))) if jsonfile_prefix is None: tmp_dir = tempfile.TemporaryDirectory() jsonfile_prefix = osp.join(tmp_dir.name, 'results') else: tmp_dir = None # currently the output prediction results could be in two formats # 1. list of dict('boxes_3d': ..., 'scores_3d': ..., 'labels_3d': ...) # 2. list of dict('pts_bbox' or 'img_bbox': # dict('boxes_3d': ..., 'scores_3d': ..., 'labels_3d': ...)) # this is a workaround to enable evaluation of both formats on Lyft # refer to https://github.com/open-mmlab/mmdetection3d/issues/449 if not ('pts_bbox' in results[0] or 'img_bbox' in results[0]): result_files = self._format_bbox(results, jsonfile_prefix) else: # should take the inner dict out of 'pts_bbox' or 'img_bbox' dict result_files = dict() for name in results[0]: print(f'\nFormating bboxes of {name}') results_ = [out[name] for out in results] tmp_file_ = osp.join(jsonfile_prefix, name) result_files.update( {name: self._format_bbox(results_, tmp_file_)}) if csv_savepath is not None: self.json2csv(result_files['pts_bbox'], csv_savepath) return result_files, tmp_dir def evaluate(self, results, metric='bbox', logger=None, jsonfile_prefix=None, csv_savepath=None, result_names=['pts_bbox'], show=False, out_dir=None, pipeline=None): """Evaluation in Lyft protocol. Args: results (list[dict]): Testing results of the dataset. metric (str | list[str], optional): Metrics to be evaluated. Default: 'bbox'. logger (logging.Logger | str, optional): Logger used for printing related information during evaluation. Default: None. jsonfile_prefix (str, optional): The prefix of json files including the file path and the prefix of filename, e.g., "a/b/prefix". If not specified, a temp file will be created. Default: None. csv_savepath (str, optional): The path for saving csv files. It includes the file path and the csv filename, e.g., "a/b/filename.csv". If not specified, the result will not be converted to csv file. result_names (list[str], optional): Result names in the metric prefix. Default: ['pts_bbox']. show (bool, optional): Whether to visualize. Default: False. out_dir (str, optional): Path to save the visualization results. Default: None. pipeline (list[dict], optional): raw data loading for showing. Default: None. Returns: dict[str, float]: Evaluation results. 
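        Example (illustrative usage; ``results`` is assumed to come from a
        finished test run and the output paths are placeholders):

            metrics = dataset.evaluate(
                results,
                jsonfile_prefix='work_dirs/lyft/results',
                csv_savepath='work_dirs/lyft/submission.csv')
            # Per-class APs and the overall mAP are returned under keys such
            # as 'pts_bbox_Lyft/car_AP' and 'pts_bbox_Lyft/mAP'.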
""" result_files, tmp_dir = self.format_results(results, jsonfile_prefix, csv_savepath) if isinstance(result_files, dict): results_dict = dict() for name in result_names: print(f'Evaluating bboxes of {name}') ret_dict = self._evaluate_single(result_files[name]) results_dict.update(ret_dict) elif isinstance(result_files, str): results_dict = self._evaluate_single(result_files) if tmp_dir is not None: tmp_dir.cleanup() if show or out_dir: self.show(results, out_dir, show=show, pipeline=pipeline) return results_dict def _build_default_pipeline(self): """Build the default pipeline for this dataset.""" pipeline = [ dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=5, use_dim=5, file_client_args=dict(backend='disk')), dict( type='LoadPointsFromMultiSweeps', sweeps_num=10, file_client_args=dict(backend='disk')), dict( type='DefaultFormatBundle3D', class_names=self.CLASSES, with_label=False), dict(type='Collect3D', keys=['points']) ] return Compose(pipeline) def show(self, results, out_dir, show=False, pipeline=None): """Results visualization. Args: results (list[dict]): List of bounding boxes results. out_dir (str): Output directory of visualization result. show (bool): Whether to visualize the results online. Default: False. pipeline (list[dict], optional): raw data loading for showing. Default: None. """ assert out_dir is not None, 'Expect out_dir, got none.' pipeline = self._get_pipeline(pipeline) for i, result in enumerate(results): if 'pts_bbox' in result.keys(): result = result['pts_bbox'] data_info = self.data_infos[i] pts_path = data_info['lidar_path'] file_name = osp.split(pts_path)[-1].split('.')[0] points = self._extract_data(i, pipeline, 'points').numpy() points = Coord3DMode.convert_point(points, Coord3DMode.LIDAR, Coord3DMode.DEPTH) inds = result['scores_3d'] > 0.1 gt_bboxes = self.get_ann_info(i)['gt_bboxes_3d'].tensor.numpy() show_gt_bboxes = Box3DMode.convert(gt_bboxes, Box3DMode.LIDAR, Box3DMode.DEPTH) pred_bboxes = result['boxes_3d'][inds].tensor.numpy() show_pred_bboxes = Box3DMode.convert(pred_bboxes, Box3DMode.LIDAR, Box3DMode.DEPTH) show_result(points, show_gt_bboxes, show_pred_bboxes, out_dir, file_name, show) def json2csv(self, json_path, csv_savepath): """Convert the json file to csv format for submission. Args: json_path (str): Path of the result json file. csv_savepath (str): Path to save the csv file. """ results = mmcv.load(json_path)['results'] sample_list_path = osp.join(self.data_root, 'sample_submission.csv') data = pd.read_csv(sample_list_path) Id_list = list(data['Id']) pred_list = list(data['PredictionString']) cnt = 0 print('Converting the json to csv...') for token in results.keys(): cnt += 1 predictions = results[token] prediction_str = '' for i in range(len(predictions)): prediction_str += \ str(predictions[i]['score']) + ' ' + \ str(predictions[i]['translation'][0]) + ' ' + \ str(predictions[i]['translation'][1]) + ' ' + \ str(predictions[i]['translation'][2]) + ' ' + \ str(predictions[i]['size'][0]) + ' ' + \ str(predictions[i]['size'][1]) + ' ' + \ str(predictions[i]['size'][2]) + ' ' + \ str(Quaternion(list(predictions[i]['rotation'])) .yaw_pitch_roll[0]) + ' ' + \ predictions[i]['name'] + ' ' prediction_str = prediction_str[:-1] idx = Id_list.index(token) pred_list[idx] = prediction_str df = pd.DataFrame({'Id': Id_list, 'PredictionString': pred_list}) mmcv.mkdir_or_exist(os.path.dirname(csv_savepath)) df.to_csv(csv_savepath, index=False) def output_to_lyft_box(detection): """Convert the output to the box class in the Lyft. 
Args: detection (dict): Detection results. Returns: list[:obj:`LyftBox`]: List of standard LyftBoxes. """ box3d = detection['boxes_3d'] scores = detection['scores_3d'].numpy() labels = detection['labels_3d'].numpy() box_gravity_center = box3d.gravity_center.numpy() box_dims = box3d.dims.numpy() box_yaw = box3d.yaw.numpy() # our LiDAR coordinate system -> Lyft box coordinate system lyft_box_dims = box_dims[:, [1, 0, 2]] box_list = [] for i in range(len(box3d)): quat = Quaternion(axis=[0, 0, 1], radians=box_yaw[i]) box = LyftBox( box_gravity_center[i], lyft_box_dims[i], quat, label=labels[i], score=scores[i]) box_list.append(box) return box_list def lidar_lyft_box_to_global(info, boxes): """Convert the box from ego to global coordinate. Args: info (dict): Info for a specific sample data, including the calibration information. boxes (list[:obj:`LyftBox`]): List of predicted LyftBoxes. Returns: list: List of standard LyftBoxes in the global coordinate. """ box_list = [] for box in boxes: # Move box to ego vehicle coord system box.rotate(Quaternion(info['lidar2ego_rotation'])) box.translate(np.array(info['lidar2ego_translation'])) # Move box to global coord system box.rotate(Quaternion(info['ego2global_rotation'])) box.translate(np.array(info['ego2global_translation'])) box_list.append(box) return box_list ================================================ FILE: mmdet3d/datasets/map_utils/mean_ap.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. from multiprocessing import Pool from shapely.geometry import LineString, Polygon import mmcv import numpy as np from mmcv.utils import print_log from terminaltables import AsciiTable import json from os import path as osp import os from functools import partial from .tpfp import tpfp_gen, custom_tpfp_gen def average_precision(recalls, precisions, mode='area'): """Calculate average precision (for single or multiple scales). 
Args: recalls (ndarray): shape (num_scales, num_dets) or (num_dets, ) precisions (ndarray): shape (num_scales, num_dets) or (num_dets, ) mode (str): 'area' or '11points', 'area' means calculating the area under precision-recall curve, '11points' means calculating the average precision of recalls at [0, 0.1, ..., 1] Returns: float or ndarray: calculated average precision """ no_scale = False if recalls.ndim == 1: no_scale = True recalls = recalls[np.newaxis, :] precisions = precisions[np.newaxis, :] assert recalls.shape == precisions.shape and recalls.ndim == 2 num_scales = recalls.shape[0] ap = np.zeros(num_scales, dtype=np.float32) if mode == 'area': zeros = np.zeros((num_scales, 1), dtype=recalls.dtype) ones = np.ones((num_scales, 1), dtype=recalls.dtype) mrec = np.hstack((zeros, recalls, ones)) mpre = np.hstack((zeros, precisions, zeros)) for i in range(mpre.shape[1] - 1, 0, -1): mpre[:, i - 1] = np.maximum(mpre[:, i - 1], mpre[:, i]) for i in range(num_scales): ind = np.where(mrec[i, 1:] != mrec[i, :-1])[0] ap[i] = np.sum( (mrec[i, ind + 1] - mrec[i, ind]) * mpre[i, ind + 1]) elif mode == '11points': for i in range(num_scales): for thr in np.arange(0, 1 + 1e-3, 0.1): precs = precisions[i, recalls[i, :] >= thr] prec = precs.max() if precs.size > 0 else 0 ap[i] += prec ap /= 11 else: raise ValueError( 'Unrecognized mode, only "area" and "11points" are supported') if no_scale: ap = ap[0] return ap def get_cls_results(gen_results, annotations, num_sample=100, num_pred_pts_per_instance=30, eval_use_same_gt_sample_num_flag=False, class_id=0, fix_interval=False): """Get det results and gt information of a certain class. Args: gen_results (list[list]): Same as `eval_map()`. annotations (list[dict]): Same as `eval_map()`. class_id (int): ID of a specific class. 
Returns: tuple[list[np.ndarray]]: detected bboxes, gt bboxes """ # if len(gen_results) == 0 or cls_gens, cls_scores = [], [] for res in gen_results['vectors']: if res['type'] == class_id: if len(res['pts']) < 2: continue if not eval_use_same_gt_sample_num_flag: sampled_points = np.array(res['pts']) else: line = res['pts'] line = LineString(line) if fix_interval: distances = list(np.arange(1., line.length, 1.)) distances = [0,] + distances + [line.length,] sampled_points = np.array([list(line.interpolate(distance).coords) for distance in distances]).reshape(-1, 2) else: distances = np.linspace(0, line.length, num_sample) sampled_points = np.array([list(line.interpolate(distance).coords) for distance in distances]).reshape(-1, 2) cls_gens.append(sampled_points) cls_scores.append(res['confidence_level']) num_res = len(cls_gens) if num_res > 0: cls_gens = np.stack(cls_gens).reshape(num_res,-1) cls_scores = np.array(cls_scores)[:,np.newaxis] cls_gens = np.concatenate([cls_gens,cls_scores],axis=-1) # print(f'for class {i}, cls_gens has shape {cls_gens.shape}') else: if not eval_use_same_gt_sample_num_flag: cls_gens = np.zeros((0,num_pred_pts_per_instance*2+1)) else: cls_gens = np.zeros((0,num_sample*2+1)) # print(f'for class {i}, cls_gens has shape {cls_gens.shape}') cls_gts = [] for ann in annotations['vectors']: if ann['type'] == class_id: # line = ann['pts'] + np.array((1,1)) # for hdmapnet line = ann['pts'] # line = ann['pts'].cumsum(0) line = LineString(line) distances = np.linspace(0, line.length, num_sample) sampled_points = np.array([list(line.interpolate(distance).coords) for distance in distances]).reshape(-1, 2) cls_gts.append(sampled_points) num_gts = len(cls_gts) if num_gts > 0: cls_gts = np.stack(cls_gts).reshape(num_gts,-1) else: cls_gts = np.zeros((0,num_sample*2)) return cls_gens, cls_gts # ones = np.ones((num_gts,1)) # tmp_cls_gens = np.concatenate([cls_gts,ones],axis=-1) # return tmp_cls_gens, cls_gts def format_res_gt_by_classes(result_path, gen_results, annotations, cls_names=None, num_pred_pts_per_instance=30, eval_use_same_gt_sample_num_flag=False, pc_range=[-15.0, -30.0, -5.0, 15.0, 30.0, 3.0], nproc=24): assert cls_names is not None timer = mmcv.Timer() num_fixed_sample_pts = 100 fix_interval = False print('results path: {}'.format(result_path)) output_dir = osp.join(*osp.split(result_path)[:-1]) assert len(gen_results) == len(annotations) gen_results = [gen_results[each['sample_token']] for each in annotations] pool = Pool(nproc) cls_gens, cls_gts = {}, {} print('Formatting ...') formatting_file = 'cls_formatted.pkl' formatting_file = osp.join(output_dir,formatting_file) # for vis if False: from PIL import Image import matplotlib.pyplot as plt from matplotlib import transforms from matplotlib.patches import Rectangle show_dir = osp.join(output_dir,'vis_json') mmcv.mkdir_or_exist(osp.abspath(show_dir)) # import pdb;pdb.set_trace() car_img = Image.open('./figs/lidar_car.png') colors_plt = ['r', 'b', 'g'] for i in range(20): plt.figure(figsize=(2, 4)) plt.xlim(pc_range[0], pc_range[3]) plt.ylim(pc_range[1], pc_range[4]) plt.axis('off') for line in gen_results[i]['vectors']: l = np.array(line['pts']) plt.plot(l[:,0],l[:,1],'-', # color=colors[line['type']] color = 'red', ) for line in annotations[i]['vectors']: # l = np.array(line['pts']) + np.array((1,1)) l = np.array(line['pts']) # l = line['pts'] plt.plot(l[:,0],l[:,1],'-', # color=colors[line['type']], color = 'blue', ) plt.imshow(car_img, extent=[-1.2, 1.2, -1.5, 1.5]) map_path = osp.join(show_dir, 
'COMPARE_MAP_{}.jpg'.format(i)) plt.savefig(map_path, bbox_inches='tight', dpi=400) plt.close() for i, clsname in enumerate(cls_names): gengts = pool.starmap( partial(get_cls_results, num_sample=num_fixed_sample_pts, num_pred_pts_per_instance=num_pred_pts_per_instance, eval_use_same_gt_sample_num_flag=eval_use_same_gt_sample_num_flag,class_id=i,fix_interval=fix_interval), zip(gen_results, annotations)) # gengts = map(partial(get_cls_results, num_sample=num_fixed_sample_pts, class_id=i,fix_interval=fix_interval), # zip(gen_results, annotations)) # import pdb;pdb.set_trace() gens, gts = tuple(zip(*gengts)) cls_gens[clsname] = gens cls_gts[clsname] = gts mmcv.dump([cls_gens, cls_gts],formatting_file) print('Cls data formatting done in {:2f}s!! with {}'.format(float(timer.since_start()),formatting_file)) pool.close() return cls_gens, cls_gts def eval_map(gen_results, annotations, cls_gens, cls_gts, threshold=0.5, cls_names=None, logger=None, tpfp_fn=None, pc_range=[-15.0, -30.0, -5.0, 15.0, 30.0, 3.0], metric=None, num_pred_pts_per_instance=30, nproc=24): timer = mmcv.Timer() pool = Pool(nproc) eval_results = [] for i, clsname in enumerate(cls_names): # get gt and det bboxes of this class cls_gen = cls_gens[clsname] cls_gt = cls_gts[clsname] # choose proper function according to datasets to compute tp and fp # XXX # func_name = cls2func[clsname] # tpfp_fn = tpfp_fn_dict[tpfp_fn_name] tpfp_fn = custom_tpfp_gen # Trick for serialized # only top-level function can be serized # somehow use partitial the return function is defined # at the top level. # tpfp = tpfp_fn(cls_gen[i], cls_gt[i],threshold=threshold, metric=metric) # import pdb; pdb.set_trace() # TODO this is a hack tpfp_fn = partial(tpfp_fn, threshold=threshold, metric=metric) args = [] # compute tp and fp for each image with multiple processes tpfp = pool.starmap( tpfp_fn, zip(cls_gen, cls_gt, *args)) # import pdb;pdb.set_trace() tp, fp = tuple(zip(*tpfp)) # map_results = map( # tpfp_fn, # cls_gen, cls_gt) # tp, fp = tuple(map(list, zip(*map_results))) # debug and testing # for i in range(len(cls_gen)): # # print(i) # tpfp = tpfp_fn(cls_gen[i], cls_gt[i],threshold=threshold) # print(i) # tpfp = (tpfp,) # print(tpfp) # i = 0 # tpfp = tpfp_fn(cls_gen[i], cls_gt[i],threshold=threshold) # import pdb; pdb.set_trace() # XXX num_gts = 0 for j, bbox in enumerate(cls_gt): num_gts += bbox.shape[0] # sort all det bboxes by score, also sort tp and fp # import pdb;pdb.set_trace() cls_gen = np.vstack(cls_gen) num_dets = cls_gen.shape[0] sort_inds = np.argsort(-cls_gen[:, -1]) #descending, high score front tp = np.hstack(tp)[sort_inds] fp = np.hstack(fp)[sort_inds] # calculate recall and precision with tp and fp # num_det*num_res tp = np.cumsum(tp, axis=0) fp = np.cumsum(fp, axis=0) eps = np.finfo(np.float32).eps recalls = tp / np.maximum(num_gts, eps) precisions = tp / np.maximum((tp + fp), eps) # calculate AP # if dataset != 'voc07' else '11points' mode = 'area' ap = average_precision(recalls, precisions, mode) eval_results.append({ 'num_gts': num_gts, 'num_dets': num_dets, 'recall': recalls, 'precision': precisions, 'ap': ap }) print('cls:{} done in {:2f}s!!'.format(clsname,float(timer.since_last_check()))) pool.close() aps = [] for cls_result in eval_results: if cls_result['num_gts'] > 0: aps.append(cls_result['ap']) mean_ap = np.array(aps).mean().item() if len(aps) else 0.0 print_map_summary( mean_ap, eval_results, class_name=cls_names, logger=logger) return mean_ap, eval_results def print_map_summary(mean_ap, results, class_name=None, 
scale_ranges=None, logger=None): """Print mAP and results of each class. A table will be printed to show the gts/dets/recall/AP of each class and the mAP. Args: mean_ap (float): Calculated from `eval_map()`. results (list[dict]): Calculated from `eval_map()`. dataset (list[str] | str | None): Dataset name or dataset classes. scale_ranges (list[tuple] | None): Range of scales to be evaluated. logger (logging.Logger | str | None): The way to print the mAP summary. See `mmcv.utils.print_log()` for details. Default: None. """ if logger == 'silent': return if isinstance(results[0]['ap'], np.ndarray): num_scales = len(results[0]['ap']) else: num_scales = 1 if scale_ranges is not None: assert len(scale_ranges) == num_scales num_classes = len(results) recalls = np.zeros((num_scales, num_classes), dtype=np.float32) aps = np.zeros((num_scales, num_classes), dtype=np.float32) num_gts = np.zeros((num_scales, num_classes), dtype=int) for i, cls_result in enumerate(results): if cls_result['recall'].size > 0: recalls[:, i] = np.array(cls_result['recall'], ndmin=2)[:, -1] aps[:, i] = cls_result['ap'] num_gts[:, i] = cls_result['num_gts'] label_names = class_name if not isinstance(mean_ap, list): mean_ap = [mean_ap] header = ['class', 'gts', 'dets', 'recall', 'ap'] for i in range(num_scales): if scale_ranges is not None: print_log(f'Scale range {scale_ranges[i]}', logger=logger) table_data = [header] for j in range(num_classes): row_data = [ label_names[j], num_gts[i, j], results[j]['num_dets'], f'{recalls[i, j]:.3f}', f'{aps[i, j]:.3f}' ] table_data.append(row_data) table_data.append(['mAP', '', '', '', f'{mean_ap[i]:.3f}']) table = AsciiTable(table_data) table.inner_footing_row_border = True print_log('\n' + table.table, logger=logger) ================================================ FILE: mmdet3d/datasets/map_utils/tpfp.py ================================================ import mmcv import numpy as np from mmdet.core.evaluation.bbox_overlaps import bbox_overlaps from .tpfp_chamfer import vec_iou, convex_iou, rbbox_iou, polyline_score, custom_polyline_score from shapely.geometry import LineString, Polygon # from vecmapnet_ops.ops.iou import convex_iou def tpfp_bbox(det_bboxes, gt_bboxes, gt_bbox_masks, threshold=0.5): """Check if detected bboxes are true positive or false positive. Args: det_bbox (ndarray): Detected bboxes of this image, of shape (m, 5). gt_bboxes (ndarray): GT bboxes of this image, of shape (n, 4). gt_bboxes_ignore (ndarray): Ignored gt bboxes of this image, of shape (k, 4). Default: None iou_thr (float): IoU threshold to be considered as matched. Default: 0.5. use_legacy_coordinate (bool): Whether to use coordinate system in mmdet v1.x. which means width, height should be calculated as 'x2 - x1 + 1` and 'y2 - y1 + 1' respectively. Default: False. Returns: tuple[np.ndarray]: (tp, fp) whose elements are 0 and 1. The shape of each array is (num_scales, m). """ num_dets = len(det_bboxes) num_gts = len(gt_bboxes) # tp and fp tp = np.zeros((num_dets), dtype=np.float32) fp = np.zeros((num_dets), dtype=np.float32) # if there is no gt bboxes in this image, then all det bboxes # within area range are false positives # XXX if num_gts == 0: fp[...] 
= 1 return tp, fp if num_dets == 0: return tp, fp # # distance matrix: n x m bbox_p = det_bboxes[:, :-1].reshape(num_dets,-1,2) bbox_g = gt_bboxes.reshape(num_gts,-1,2) bbox_gm = gt_bbox_masks.reshape(num_gts,-1,2) matrix = convex_iou(bbox_p,bbox_g,bbox_gm) # for each det, the max iou with all gts matrix_max = matrix.max(axis=1) # for each det, which gt overlaps most with it matrix_argmax = matrix.argmax(axis=1) # sort all dets in descending order by scores sort_inds = np.argsort(-det_bboxes[:, -1]) gt_covered = np.zeros(num_gts, dtype=bool) # tp = 0 and fp = 0 means ignore this detected bbox, for i in sort_inds: if matrix_max[i] >= threshold: matched_gt = matrix_argmax[i] if not gt_covered[matched_gt]: gt_covered[matched_gt] = True tp[i] = 1 else: fp[i] = 1 else: fp[i] = 1 return tp, fp def tpfp_rbbox(det_bboxes, gt_bboxes, gt_bbox_masks, threshold=0.5): """Check if detected bboxes are true positive or false positive. Args: det_bbox (ndarray): Detected bboxes of this image, of shape (m, 5). gt_bboxes (ndarray): GT bboxes of this image, of shape (n, 4). gt_bboxes_ignore (ndarray): Ignored gt bboxes of this image, of shape (k, 4). Default: None iou_thr (float): IoU threshold to be considered as matched. Default: 0.5. use_legacy_coordinate (bool): Whether to use coordinate system in mmdet v1.x. which means width, height should be calculated as 'x2 - x1 + 1` and 'y2 - y1 + 1' respectively. Default: False. Returns: tuple[np.ndarray]: (tp, fp) whose elements are 0 and 1. The shape of each array is (num_scales, m). """ num_dets = len(det_bboxes) num_gts = len(gt_bboxes) # tp and fp tp = np.zeros((num_dets), dtype=np.float32) fp = np.zeros((num_dets), dtype=np.float32) # if there is no gt bboxes in this image, then all det bboxes # within area range are false positives # XXX if num_gts == 0: fp[...] = 1 return tp, fp if num_dets == 0: return tp, fp # # distance matrix: n x m bbox_p = det_bboxes[:, :-1].reshape(num_dets,-1,2) bbox_g = gt_bboxes.reshape(num_gts,-1,2) bbox_gm = gt_bbox_masks.reshape(num_gts,-1,2) matrix = rbbox_iou(bbox_p,bbox_g,bbox_gm) # for each det, the max iou with all gts matrix_max = matrix.max(axis=1) # for each det, which gt overlaps most with it matrix_argmax = matrix.argmax(axis=1) # sort all dets in descending order by scores sort_inds = np.argsort(-det_bboxes[:, -1]) gt_covered = np.zeros(num_gts, dtype=bool) # tp = 0 and fp = 0 means ignore this detected bbox, for i in sort_inds: if matrix_max[i] >= threshold: matched_gt = matrix_argmax[i] if not gt_covered[matched_gt]: gt_covered[matched_gt] = True tp[i] = 1 else: fp[i] = 1 else: fp[i] = 1 return tp, fp def tpfp_det(det_bboxes, gt_bboxes, threshold=0.5): """Check if detected bboxes are true positive or false positive. Args: det_bbox (ndarray): Detected bboxes of this image, of shape (m, 5). gt_bboxes (ndarray): GT bboxes of this image, of shape (n, 4). gt_bboxes_ignore (ndarray): Ignored gt bboxes of this image, of shape (k, 4). Default: None iou_thr (float): IoU threshold to be considered as matched. Default: 0.5. use_legacy_coordinate (bool): Whether to use coordinate system in mmdet v1.x. which means width, height should be calculated as 'x2 - x1 + 1` and 'y2 - y1 + 1' respectively. Default: False. Returns: tuple[np.ndarray]: (tp, fp) whose elements are 0 and 1. The shape of each array is (num_scales, m). 
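        Example (illustrative; two 3-point polylines with made-up
        coordinates, so the resulting tp/fp values are not asserted here):

            det = np.array([[0., 0., 1., 0., 2., 0., 0.9]])  # 3 points + score
            gt = np.array([[0., 0.2, 1., 0.2, 2., 0.2]])     # 3 points
            tp, fp = tpfp_det(det, gt, threshold=0.5)
            # The highest-scoring detection that overlaps an unclaimed GT with
            # IoU >= threshold gets tp = 1; every other detection (below the
            # threshold, or matching an already-claimed GT) gets fp = 1.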
""" num_dets = det_bboxes.shape[0] num_gts = gt_bboxes.shape[0] # tp and fp tp = np.zeros((num_dets), dtype=np.float32) fp = np.zeros((num_dets), dtype=np.float32) # if there is no gt bboxes in this image, then all det bboxes # within area range are false positives # XXX if num_gts == 0: fp[...] = 1 return tp, fp if num_dets == 0: return tp, fp # # distance matrix: n x m matrix = vec_iou( det_bboxes[:, :-1].reshape(num_dets,-1,2), gt_bboxes.reshape(num_gts,-1,2)) # for each det, the max iou with all gts matrix_max = matrix.max(axis=1) # for each det, which gt overlaps most with it matrix_argmax = matrix.argmax(axis=1) # sort all dets in descending order by scores sort_inds = np.argsort(-det_bboxes[:, -1]) gt_covered = np.zeros(num_gts, dtype=bool) # tp = 0 and fp = 0 means ignore this detected bbox, for i in sort_inds: if matrix_max[i] >= threshold: matched_gt = matrix_argmax[i] if not gt_covered[matched_gt]: gt_covered[matched_gt] = True tp[i] = 1 else: fp[i] = 1 else: fp[i] = 1 return tp, fp def tpfp_gen(gen_lines, gt_lines, threshold=0.5, metric='POR'): """Check if detected bboxes are true positive or false positive. Args: det_bbox (ndarray): Detected bboxes of this image, of shape (m, 5). gt_bboxes (ndarray): GT bboxes of this image, of shape (n, 4). gt_bboxes_ignore (ndarray): Ignored gt bboxes of this image, of shape (k, 4). Default: None iou_thr (float): IoU threshold to be considered as matched. Default: 0.5. use_legacy_coordinate (bool): Whether to use coordinate system in mmdet v1.x. which means width, height should be calculated as 'x2 - x1 + 1` and 'y2 - y1 + 1' respectively. Default: False. Returns: tuple[np.ndarray]: (tp, fp) whose elements are 0 and 1. The shape of each array is (num_scales, m). """ num_gens = gen_lines.shape[0] num_gts = gt_lines.shape[0] # tp and fp tp = np.zeros((num_gens), dtype=np.float32) fp = np.zeros((num_gens), dtype=np.float32) # if there is no gt bboxes in this image, then all det bboxes # within area range are false positives if num_gts == 0: fp[...] = 1 return tp, fp if num_gens == 0: return tp, fp gen_scores = gen_lines[:,-1] # n # distance matrix: n x m # matrix = custom_polyline_score( # gen_lines[:,:-1].reshape(num_gens,-1,2), # gt_lines.reshape(num_gts,-1,2),linewidth=2.,metric=metric) # TODO MAY bug here matrix = polyline_score( gen_lines[:,:-1].reshape(num_gens,-1,2), gt_lines.reshape(num_gts,-1,2),linewidth=2.,metric=metric) # for each det, the max iou with all gts matrix_max = matrix.max(axis=1) # for each det, which gt overlaps most with it matrix_argmax = matrix.argmax(axis=1) # sort all dets in descending order by scores sort_inds = np.argsort(-gen_scores) gt_covered = np.zeros(num_gts, dtype=bool) # tp = 0 and fp = 0 means ignore this detected bbox, for i in sort_inds: if matrix_max[i] >= threshold: matched_gt = matrix_argmax[i] if not gt_covered[matched_gt]: gt_covered[matched_gt] = True tp[i] = 1 else: fp[i] = 1 else: fp[i] = 1 return tp, fp def custom_tpfp_gen(gen_lines, gt_lines, threshold=0.5, metric='chamfer'): """Check if detected bboxes are true positive or false positive. Args: det_bbox (ndarray): Detected bboxes of this image, of shape (m, 5). gt_bboxes (ndarray): GT bboxes of this image, of shape (n, 4). gt_bboxes_ignore (ndarray): Ignored gt bboxes of this image, of shape (k, 4). Default: None iou_thr (float): IoU threshold to be considered as matched. Default: 0.5. use_legacy_coordinate (bool): Whether to use coordinate system in mmdet v1.x. 
which means width, height should be calculated as 'x2 - x1 + 1` and 'y2 - y1 + 1' respectively. Default: False. Returns: tuple[np.ndarray]: (tp, fp) whose elements are 0 and 1. The shape of each array is (num_scales, m). """ if metric == 'chamfer': if threshold >0: threshold= -threshold # else: # raise NotImplementedError # import pdb;pdb.set_trace() num_gens = gen_lines.shape[0] num_gts = gt_lines.shape[0] # tp and fp tp = np.zeros((num_gens), dtype=np.float32) fp = np.zeros((num_gens), dtype=np.float32) # if there is no gt bboxes in this image, then all det bboxes # within area range are false positives if num_gts == 0: fp[...] = 1 return tp, fp if num_gens == 0: return tp, fp gen_scores = gen_lines[:,-1] # n # distance matrix: n x m matrix = custom_polyline_score( gen_lines[:,:-1].reshape(num_gens,-1,2), gt_lines.reshape(num_gts,-1,2),linewidth=2.,metric=metric) # for each det, the max iou with all gts matrix_max = matrix.max(axis=1) # for each det, which gt overlaps most with it matrix_argmax = matrix.argmax(axis=1) # sort all dets in descending order by scores sort_inds = np.argsort(-gen_scores) gt_covered = np.zeros(num_gts, dtype=bool) # tp = 0 and fp = 0 means ignore this detected bbox, for i in sort_inds: if matrix_max[i] >= threshold: matched_gt = matrix_argmax[i] if not gt_covered[matched_gt]: gt_covered[matched_gt] = True tp[i] = 1 else: fp[i] = 1 else: fp[i] = 1 return tp, fp ================================================ FILE: mmdet3d/datasets/map_utils/tpfp_chamfer.py ================================================ # from ..chamfer_dist import ChamferDistance import numpy as np from shapely.geometry import LineString, Polygon from shapely.strtree import STRtree from shapely.geometry import CAP_STYLE, JOIN_STYLE from scipy.spatial import distance import similaritymeasures # def chamfer_distance(pred_bbox, gt_bbox): # cd_dist_func = ChamferDistance.vec_cd_dist( # pred, pred_mask, tgt, tgt_mask)() def vec_iou(pred_lines, gt_lines): ''' each line with 1 meter width pred_lines: num_preds, npts, 2 gt_lines: num_gts, npts, 2 ''' num_preds = pred_lines.shape[0] num_gts = gt_lines.shape[0] pred_lines_shapely = \ [LineString(i).buffer(1., cap_style=CAP_STYLE.round, join_style=JOIN_STYLE.round) for i in pred_lines] gt_lines_shapely =\ [LineString(i).buffer(1., cap_style=CAP_STYLE.round, join_style=JOIN_STYLE.round) for i in gt_lines] # construct tree tree = STRtree(gt_lines_shapely) index_by_id = dict((id(pt), i) for i, pt in enumerate(gt_lines_shapely)) iou_matrix = np.zeros((num_preds, num_gts)) for i, pline in enumerate(pred_lines_shapely): for o in tree.query(pline): if o.intersects(pline): gt_id = index_by_id[id(o)] inter = o.intersection(pline).area union = o.union(pline).area iou_matrix[i, gt_id] = inter / union return iou_matrix def convex_iou(pred_lines, gt_lines, gt_mask): ''' each line with 1 meter width pred_lines: num_preds, List [npts, 2] gt_lines: num_gts, npts, 2 gt_mask: num_gts, npts, 2 ''' num_preds = len(pred_lines) num_gts = len(gt_lines) pred_lines_shapely = \ [Polygon(i).convex_hull for i in pred_lines] gt_lines_shapely =\ [Polygon(i[m].reshape(-1,2)).convex_hull for i,m in zip(gt_lines,gt_mask)] # construct tree tree = STRtree(pred_lines_shapely) index_by_id = dict((id(pt), i) for i, pt in enumerate(pred_lines_shapely)) iou_matrix = np.zeros((num_preds, num_gts)) for i, pline in enumerate(gt_lines_shapely): for o in tree.query(pline): if o.intersects(pline): pred_id = index_by_id[id(o)] inter = o.intersection(pline).area union = o.union(pline).area 
iou_matrix[pred_id, i] = inter / union return iou_matrix def rbbox_iou(pred_lines, gt_lines, gt_mask): ''' each line with 1 meter width pred_lines: num_preds, List [npts, 2] gt_lines: num_gts, npts, 2 gt_mask: num_gts, npts, 2 ''' num_preds = len(pred_lines) num_gts = len(gt_lines) pred_lines_shapely = \ [Polygon(i).minimum_rotated_rectangle for i in pred_lines] gt_lines_shapely =\ [Polygon(i[m].reshape(-1,2)) for i,m in zip(gt_lines,gt_mask)] # construct tree tree = STRtree(pred_lines_shapely) index_by_id = dict((id(pt), i) for i, pt in enumerate(pred_lines_shapely)) iou_matrix = np.zeros((num_preds, num_gts)) for i, pline in enumerate(gt_lines_shapely): for o in tree.query(pline): if o.intersects(pline): pred_id = index_by_id[id(o)] inter = o.intersection(pline).area union = o.union(pline).area iou_matrix[pred_id, i] = inter / union return iou_matrix def polyline_score(pred_lines, gt_lines, linewidth=1., metric='POR'): ''' each line with 1 meter width pred_lines: num_preds, List [npts, 2] gt_lines: num_gts, npts, 2 gt_mask: num_gts, npts, 2 ''' positive_threshold = 1. num_preds = len(pred_lines) num_gts = len(gt_lines) line_length = pred_lines.shape[1] # gt_lines = gt_lines + np.array((1.,1.)) pred_lines_shapely = \ [LineString(i).buffer(linewidth, cap_style=CAP_STYLE.flat, join_style=JOIN_STYLE.mitre) for i in pred_lines] gt_lines_shapely =\ [LineString(i).buffer(linewidth, cap_style=CAP_STYLE.flat, join_style=JOIN_STYLE.mitre) for i in gt_lines] # construct tree tree = STRtree(pred_lines_shapely) index_by_id = dict((id(pt), i) for i, pt in enumerate(pred_lines_shapely)) if metric=='POR': iou_matrix = np.zeros((num_preds, num_gts),dtype=np.float64) elif metric=='frechet': iou_matrix = np.full((num_preds, num_gts), -100.) elif metric=='chamfer': iou_matrix = np.full((num_preds, num_gts), -100.) elif metric=='chamfer_v2': iou_matrix = np.full((num_preds, num_gts), -100.) for i, pline in enumerate(gt_lines_shapely): for o in tree.query(pline): if o.intersects(pline): pred_id = index_by_id[id(o)] if metric=='POR': dist_mat = distance.cdist( pred_lines[pred_id], gt_lines[i], 'euclidean') valid_ab = (dist_mat.min(-1) < positive_threshold).sum() valid_ba = (dist_mat.min(-2) < positive_threshold).sum() iou_matrix[pred_id, i] = min(valid_ba,valid_ab) / line_length # iou_matrix[pred_id, i] = ((valid_ba+valid_ab)/2) / line_length # assert iou_matrix[pred_id, i] <= 1. and iou_matrix[pred_id, i] >= 0. 
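                    # For the 'POR' metric (the acronym is not expanded in this
                    # code), the score of an intersecting pred/GT pair is read
                    # off the pointwise distance matrix above: count the
                    # prediction points whose nearest GT point is closer than
                    # `positive_threshold` (1 m), count the GT points whose
                    # nearest prediction point is closer than 1 m, then take
                    # the smaller of the two counts and divide by the number of
                    # sampled points per line, giving a score in [0, 1].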
elif metric=='frechet': fdistance_1 = \ -similaritymeasures.frechet_dist(pred_lines[pred_id], gt_lines[i]) fdistance_2 = \ -similaritymeasures.frechet_dist(pred_lines[pred_id][::-1], gt_lines[i]) fdistance = max(fdistance_1,fdistance_2) iou_matrix[pred_id, i] = fdistance elif metric=='chamfer': dist_mat = distance.cdist( pred_lines[pred_id], gt_lines[i], 'euclidean') valid_ab = dist_mat.min(-1).sum() valid_ba = dist_mat.min(-2).sum() iou_matrix[pred_id, i] = -(valid_ba+valid_ab)/(2*line_length) # if iou_matrix[pred_id, i] == 0: # import ipdb; ipdb.set_trace() elif metric=='chamfer_v2': dist_mat = distance.cdist( pred_lines[pred_id], gt_lines[i], 'euclidean') valid_ab = dist_mat.min(-1).sum() valid_ba = dist_mat.min(-2).sum() iou_matrix[pred_id, i] = -(valid_ba/pred_lines[pred_id].shape[0] +valid_ab/gt_lines[i].shape[0])/2 # if iou_matrix[pred_id, i] == 0: # import ipdb; ipdb.set_trace() # if True: # import matplotlib.pyplot as plt # print('pred num', num_preds) # print('gt num', num_gts) # for i in range(num_preds): # plt.plot(pred_lines[i][:,0],pred_lines[i][:,1],'-',color='red',alpha=0.5) # for i in range(num_gts): # plt.plot(gt_lines[i][:,0],gt_lines[i][:,1],'-',color='blue',alpha=0.5) # plt.savefig('test.png') # plt.close() return iou_matrix def custom_polyline_score(pred_lines, gt_lines, linewidth=1., metric='chamfer'): ''' each line with 1 meter width pred_lines: num_preds, List [npts, 2] gt_lines: num_gts, npts, 2 gt_mask: num_gts, npts, 2 ''' if metric == 'iou': linewidth = 1.0 positive_threshold = 1. num_preds = len(pred_lines) num_gts = len(gt_lines) line_length = pred_lines.shape[1] # gt_lines = gt_lines + np.array((1.,1.)) pred_lines_shapely = \ [LineString(i).buffer(linewidth, cap_style=CAP_STYLE.flat, join_style=JOIN_STYLE.mitre) for i in pred_lines] gt_lines_shapely =\ [LineString(i).buffer(linewidth, cap_style=CAP_STYLE.flat, join_style=JOIN_STYLE.mitre) for i in gt_lines] # construct tree tree = STRtree(pred_lines_shapely) index_by_id = dict((id(pt), i) for i, pt in enumerate(pred_lines_shapely)) if metric=='chamfer': iou_matrix = np.full((num_preds, num_gts), -100.) 
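        # Chamfer scores are stored as negative distances so that "larger is
        # better" holds for every metric: pairs whose buffered polylines never
        # intersect keep the -100 sentinel and can never pass a (negated)
        # distance threshold. For an intersecting pred/GT pair, the loop below
        # assigns
        #     -(mean_p min_g ||p - g|| + mean_g min_p ||p - g||) / 2,
        # i.e. the negated symmetric Chamfer distance between the two sampled
        # point sets.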
elif metric=='iou': iou_matrix = np.zeros((num_preds, num_gts),dtype=np.float64) else: raise NotImplementedError for i, pline in enumerate(gt_lines_shapely): for o in tree.query(pline): if o.intersects(pline): pred_id = index_by_id[id(o)] if metric=='chamfer': dist_mat = distance.cdist( pred_lines[pred_id], gt_lines[i], 'euclidean') # import pdb;pdb.set_trace() valid_ab = dist_mat.min(-1).mean() valid_ba = dist_mat.min(-2).mean() iou_matrix[pred_id, i] = -(valid_ba+valid_ab)/2 elif metric=='iou': inter = o.intersection(pline).area union = o.union(pline).area iou_matrix[pred_id, i] = inter / union return iou_matrix if __name__ == '__main__': import torch line1 = torch.tensor([ [1, 5], [3, 5], [5, 5] ]) line0 = torch.tensor([ [3, 6], [4, 8], [5, 6] ]) line2 = torch.tensor([ [1, 4], [3, 4], [5, 4] ]) line3 = torch.tensor([ [4, 4], [3, 3], [5, 3] ]) gt = torch.stack((line2, line3), dim=0).type(torch.float32) pred = torch.stack((line0, line1), dim=0).type(torch.float32) # import ipdb; ipdb.set_trace() import mmcv # with mmcv.Timer(): # gt = upsampler(gt, pts=10) # pred = upsampler(pred, pts=10) import matplotlib.pyplot as plt from shapely.geometry import LineString from descartes import PolygonPatch iou_matrix = vec_iou(pred,gt) print(iou_matrix) # import pdb;pdb.set_trace() score_matrix = custom_polyline_score(pred, gt, linewidth=1., metric='chamfer') print(score_matrix) fig, ax = plt.subplots() for i in gt: i = i.numpy() plt.plot(i[:, 0], i[:, 1], 'o', color='red') plt.plot(i[:, 0], i[:, 1], '-', color='red') dilated = LineString(i).buffer(1, cap_style=CAP_STYLE.round, join_style=JOIN_STYLE.round) patch1 = PolygonPatch(dilated, fc='red', ec='red', alpha=0.5, zorder=-1) ax.add_patch(patch1) for i in pred: i = i.numpy() plt.plot(i[:, 0], i[:, 1], 'o', color='blue') plt.plot(i[:, 0], i[:, 1], '-', color='blue') dilated = LineString(i).buffer(1, cap_style=CAP_STYLE.flat, join_style=JOIN_STYLE.mitre) patch1 = PolygonPatch(dilated, fc='blue', ec='blue', alpha=0.5, zorder=-1) ax.add_patch(patch1) ax.axis('equal') plt.savefig('test3.png') ================================================ FILE: mmdet3d/datasets/nuscenes_dataset.py ================================================ # Copyright (c) 2022-2023, NVIDIA Corporation & Affiliates. All rights reserved. # # This work is made available under the Nvidia Source Code License-NC. # To view a copy of this license, visit # https://github.com/NVlabs/FB-BEV/blob/main/LICENSE # Copyright (c) OpenMMLab. All rights reserved. 
import tempfile import copy from os import path as osp import os import mmcv import sys import numpy as np import pyquaternion from nuscenes.utils.data_classes import Box as NuScenesBox from .utils import nuscenes_get_rt_matrix from ..core import show_result from ..core.bbox import Box3DMode, Coord3DMode, LiDARInstance3DBoxes from .builder import DATASETS from .custom_3d import Custom3DDataset from .pipelines import Compose from tqdm import tqdm import csv import math import torch from nuscenes.eval.common.utils import quaternion_yaw, Quaternion # from .vad_custom_nuscenes_eval import NuScenesEval_custom from nuscenes.eval.common.utils import center_distance # from projects.mmdet3d_plugin.models.utils.visual import save_tensor from mmcv.parallel import DataContainer as DC import random from mmdet3d.core import LiDARInstance3DBoxes from nuscenes.utils.data_classes import Box as NuScenesBox # from projects.mmdet3d_plugin.core.bbox.structures.nuscenes_box import CustomNuscenesBox from shapely import affinity, ops from shapely.geometry import LineString, box, MultiPolygon, MultiLineString from mmdet.datasets.pipelines import to_tensor from nuscenes.map_expansion.map_api import NuScenesMap, NuScenesMapExplorer from nuscenes.eval.detection.constants import DETECTION_NAMES from .vector_map import VectorizedLocalMap @DATASETS.register_module() class NuScenesDataset(Custom3DDataset): r"""NuScenes Dataset. This class serves as the API for experiments on the NuScenes Dataset. Please refer to `NuScenes Dataset `_ for data downloading. Args: ann_file (str): Path of annotation file. pipeline (list[dict], optional): Pipeline used for data processing. Defaults to None. data_root (str): Path of dataset root. classes (tuple[str], optional): Classes used in the dataset. Defaults to None. load_interval (int, optional): Interval of loading the dataset. It is used to uniformly sample the dataset. Defaults to 1. with_velocity (bool, optional): Whether include velocity prediction into the experiments. Defaults to True. modality (dict, optional): Modality to specify the sensor data used as input. Defaults to None. box_type_3d (str, optional): Type of 3D box of this dataset. Based on the `box_type_3d`, the dataset will encapsulate the box to its original format then converted them to `box_type_3d`. Defaults to 'LiDAR' in this dataset. Available options includes. - 'LiDAR': Box in LiDAR coordinates. - 'Depth': Box in depth coordinates, usually for indoor dataset. - 'Camera': Box in camera coordinates. filter_empty_gt (bool, optional): Whether to filter empty GT. Defaults to True. test_mode (bool, optional): Whether the dataset is in test mode. Defaults to False. eval_version (bool, optional): Configuration version of evaluation. Defaults to 'detection_cvpr_2019'. use_valid_flag (bool, optional): Whether to use `use_valid_flag` key in the info file as mask to filter gt_boxes and gt_names. Defaults to False. img_info_prototype (str, optional): Type of img information. Based on 'img_info_prototype', the dataset will prepare the image data info in the type of 'mmcv' for official image infos, 'bevdet' for BEVDet, and 'bevdet4d' for BEVDet4D. Defaults to 'mmcv'. multi_adj_frame_id_cfg (tuple[int]): Define the selected index of reference adjcacent frames. ego_cam (str): Specify the ego coordinate relative to a specified camera by its name defined in NuScenes. Defaults to None, which use the mean of all cameras. 
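    Example (illustrative config snippet; ``train_pipeline``, ``class_names``
    and the data paths are placeholders rather than values shipped with this
    repository):

        data_train = dict(
            type='NuScenesDataset',
            data_root='data/nuscenes/',
            ann_file='data/nuscenes/nuscenes_infos_train.pkl',
            pipeline=train_pipeline,
            classes=class_names,
            modality=dict(use_camera=True, use_lidar=False, use_radar=False,
                          use_map=False, use_external=False),
            img_info_prototype='bevdet',
            use_valid_flag=True,
            test_mode=False)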
""" NameMapping = { 'movable_object.barrier': 'barrier', 'vehicle.bicycle': 'bicycle', 'vehicle.bus.bendy': 'bus', 'vehicle.bus.rigid': 'bus', 'vehicle.car': 'car', 'vehicle.construction': 'construction_vehicle', 'vehicle.motorcycle': 'motorcycle', 'human.pedestrian.adult': 'pedestrian', 'human.pedestrian.child': 'pedestrian', 'human.pedestrian.construction_worker': 'pedestrian', 'human.pedestrian.police_officer': 'pedestrian', 'movable_object.trafficcone': 'traffic_cone', 'vehicle.trailer': 'trailer', 'vehicle.truck': 'truck' } DefaultAttribute = { 'car': 'vehicle.parked', 'pedestrian': 'pedestrian.moving', 'trailer': 'vehicle.parked', 'truck': 'vehicle.parked', 'bus': 'vehicle.moving', 'motorcycle': 'cycle.without_rider', 'construction_vehicle': 'vehicle.parked', 'bicycle': 'cycle.without_rider', 'barrier': '', 'traffic_cone': '', } AttrMapping = { 'cycle.with_rider': 0, 'cycle.without_rider': 1, 'pedestrian.moving': 2, 'pedestrian.standing': 3, 'pedestrian.sitting_lying_down': 4, 'vehicle.moving': 5, 'vehicle.parked': 6, 'vehicle.stopped': 7, } AttrMapping_rev = [ 'cycle.with_rider', 'cycle.without_rider', 'pedestrian.moving', 'pedestrian.standing', 'pedestrian.sitting_lying_down', 'vehicle.moving', 'vehicle.parked', 'vehicle.stopped', ] # https://github.com/nutonomy/nuscenes-devkit/blob/57889ff20678577025326cfc24e57424a829be0a/python-sdk/nuscenes/eval/detection/evaluate.py#L222 # noqa ErrNameMapping = { 'trans_err': 'mATE', 'scale_err': 'mASE', 'orient_err': 'mAOE', 'vel_err': 'mAVE', 'attr_err': 'mAAE' } CLASSES = ('car', 'truck', 'trailer', 'bus', 'construction_vehicle', 'bicycle', 'motorcycle', 'pedestrian', 'traffic_cone', 'barrier') TRACKING_CLASSES = ['car', 'truck', 'bus', 'trailer', 'motorcycle', 'bicycle', 'pedestrian'] def __init__(self, ann_file=None, pipeline=None, data_root=None, classes=None, load_interval=1, with_velocity=True, modality=None, box_type_3d='LiDAR', filter_empty_gt=True, test_mode=False, eval_version='detection_cvpr_2019', use_valid_flag=False, img_info_prototype='mmcv', multi_adj_frame_id_cfg=None, occupancy_path='/mount/dnn_data/occupancy_2023/gts', ego_cam='CAM_FRONT', # SOLLOFusion use_sequence_group_flag=False, sequences_split_num=1, # MAP map_classes = ['divider', 'ped_crossing', 'boundary'], map_ann_file= '', point_cloud_range=[-51.2, -51.2, -5.0, 51.2, 51.2, 3.0], map_eval_cfg=dict(), load_fut_bbox_info=False, ): self.load_interval = load_interval self.use_valid_flag = use_valid_flag super().__init__( data_root=data_root, ann_file=ann_file, pipeline=pipeline, classes=classes, modality=modality, box_type_3d=box_type_3d, filter_empty_gt=filter_empty_gt, test_mode=test_mode) self.load_fut_bbox_info = load_fut_bbox_info self.occupancy_path = occupancy_path self.with_velocity = with_velocity self.eval_version = eval_version from nuscenes.eval.detection.config import config_factory self.eval_detection_configs = config_factory(self.eval_version) if self.modality is None: self.modality = dict( use_camera=False, use_lidar=True, use_radar=False, use_map=False, use_external=False, ) self.map_eval_cfg = map_eval_cfg self.map_ann_file = map_ann_file self.MAPCLASSES = self.get_map_classes(map_classes) self.NUM_MAPCLASSES = len(self.MAPCLASSES) self.pc_range = point_cloud_range self.img_info_prototype = img_info_prototype self.multi_adj_frame_id_cfg = multi_adj_frame_id_cfg self.ego_cam = ego_cam self.nusc = None # SOLOFusion self.use_sequence_group_flag = use_sequence_group_flag self.sequences_split_num = sequences_split_num # sequences_split_num splits eacgh 
sequence into sequences_split_num parts. # if self.test_mode: # assert self.sequences_split_num == 1 if self.use_sequence_group_flag: self._set_sequence_group_flag() # Must be called after load_annotations b/c load_annotations does sorting. def get_cat_ids(self, idx): """Get category distribution of single scene. Args: idx (int): Index of the data_info. Returns: dict[list]: for each category, if the current scene contains such boxes, store a list containing idx, otherwise, store empty list. """ info = self.data_infos[idx] if self.use_valid_flag: mask = info['valid_flag'] gt_names = set(info['gt_names'][mask]) else: gt_names = set(info['gt_names']) cat_ids = [] for name in gt_names: if name in self.CLASSES: cat_ids.append(self.cat2id[name]) return cat_ids def load_annotations(self, ann_file): """Load annotations from ann_file. Args: ann_file (str): Path of the annotation file. Returns: list[dict]: List of annotations sorted by timestamps. """ data = mmcv.load(ann_file, file_format='pkl') data_infos = data['infos'][::self.load_interval] self.metadata = data['metadata'] self.version = self.metadata['version'] if len(data_infos) < 100: self.version = 'v1.0-mini' return data_infos def _set_sequence_group_flag(self): """ Set each sequence to be a different group """ res = [] curr_sequence = 0 for idx in range(len(self.data_infos)): if idx != 0 and len(self.data_infos[idx]['prev']) == 0: # Not first frame and # of sweeps is 0 -> new sequence curr_sequence += 1 res.append(curr_sequence) self.flag = np.array(res, dtype=np.int64) if self.sequences_split_num != 1: if self.sequences_split_num == 'all': self.flag = np.array(range(len(self.data_infos)), dtype=np.int64) else: bin_counts = np.bincount(self.flag) new_flags = [] curr_new_flag = 0 for curr_flag in range(len(bin_counts)): curr_sequence_length = np.array( list(range(0, bin_counts[curr_flag], math.ceil(bin_counts[curr_flag] / self.sequences_split_num))) + [bin_counts[curr_flag]]) for sub_seq_idx in (curr_sequence_length[1:] - curr_sequence_length[:-1]): for _ in range(sub_seq_idx): new_flags.append(curr_new_flag) curr_new_flag += 1 assert len(new_flags) == len(self.flag) assert len(np.bincount(new_flags)) == len(np.bincount(self.flag)) * self.sequences_split_num self.flag = np.array(new_flags, dtype=np.int64) def get_data_info(self, index): """Get data info according to the given index. Args: index (int): Index of the sample data to get. Returns: dict: Data information that will be passed to the data preprocessing pipelines. It includes the following keys: - sample_idx (str): Sample index. - pts_filename (str): Filename of point clouds. - sweeps (list[dict]): Infos of sweeps. - timestamp (float): Sample timestamp. - img_filename (str, optional): Image filename. - lidar2img (list[np.ndarray], optional): Transformations from lidar to different cameras. - ann_info (dict): Annotation info. 
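        Note:
            When ``use_sequence_group_flag`` is enabled, the returned dict also
            carries sequence bookkeeping and 4x4 relative-pose tensors (see the
            implementation below); illustrative access::

                info = dataset.get_data_info(0)         # schematic usage
                info['sequence_group_idx']              # int, sequence id of this frame
                info['start_of_sequence']               # bool, True for the first frame of a sequence
                info['curr_to_prev_ego_rt']             # 4x4 torch.FloatTensor, relative ego pose
                info['ego_pose'], info['ego_pose_inv']  # ego <-> global 4x4 transforms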
""" info = copy.deepcopy(self.data_infos[index]) # standard protocol modified from SECOND.Pytorch input_dict = dict( index=index, sample_idx=info['token'], pts_filename=info['lidar_path'], sweeps=info['sweeps'], scene_name=info['scene_name'], timestamp=info['timestamp'] / 1e6, lidarseg_filename=info.get('lidarseg_filename', 'None') ) if 'instance_inds' in info.keys(): assert len(info['instance_inds']) == len(info['valid_flag']) if len(info['instance_inds'])>0: input_dict['instance_inds'] = np.array(info['instance_inds'])[info['valid_flag']] else: input_dict['instance_inds'] = np.array(info['instance_inds']) if 'ann_infos' in info: input_dict['ann_infos'] = info['ann_infos'] if self.modality['use_camera']: if self.img_info_prototype == 'mmcv': image_paths = [] lidar2img_rts = [] for cam_type, cam_info in info['cams'].items(): image_paths.append(cam_info['data_path']) # obtain lidar to image transformation matrix lidar2cam_r = np.linalg.inv( cam_info['sensor2lidar_rotation']) lidar2cam_t = cam_info[ 'sensor2lidar_translation'] @ lidar2cam_r.T lidar2cam_rt = np.eye(4) lidar2cam_rt[:3, :3] = lidar2cam_r.T lidar2cam_rt[3, :3] = -lidar2cam_t intrinsic = cam_info['cam_intrinsic'] viewpad = np.eye(4) viewpad[:intrinsic.shape[0], :intrinsic. shape[1]] = intrinsic lidar2img_rt = (viewpad @ lidar2cam_rt.T) lidar2img_rts.append(lidar2img_rt) cam_position = np.linalg.inv(lidar2cam_rt.T) @ np.array([0., 0., 0., 1.]).reshape([4, 1]) cam_positions.append(cam_position.flatten()[:3]) input_dict.update( dict( img_filename=image_paths, lidar2img=lidar2img_rts, )) if not self.test_mode: annos = self.get_ann_info(index) input_dict['ann_info'] = annos else: assert 'bevdet' in self.img_info_prototype input_dict.update(dict(curr=info)) if '4d' in self.img_info_prototype: info_adj_list = self.get_adj_info(info, index) input_dict.update(dict(adjacent=info_adj_list)) if self.use_sequence_group_flag: input_dict['sample_index'] = index input_dict['sequence_group_idx'] = self.flag[index] input_dict['start_of_sequence'] = index == 0 or self.flag[index - 1] != self.flag[index] # Get a transformation matrix from current keyframe lidar to previous keyframe lidar # if they belong to same sequence. can_bus_info = info['gt_ego_lcf_feat'] input_dict['can_bus_info'] = can_bus_info input_dict['nuscenes_get_rt_matrix'] = dict( lidar2ego_rotation = info['lidar2ego_rotation'], lidar2ego_translation = info['lidar2ego_translation'], ego2global_rotation = info['ego2global_rotation'], ego2global_translation = info['ego2global_translation'], ) input_dict['ego_pose_inv'] = torch.FloatTensor(nuscenes_get_rt_matrix( info, info, "global", "ego")) input_dict['ego_pose'] = torch.FloatTensor(nuscenes_get_rt_matrix( info, info, "ego", "global")) if not input_dict['start_of_sequence']: input_dict['curr_to_prev_lidar_rt'] = torch.FloatTensor(nuscenes_get_rt_matrix( info, self.data_infos[index - 1], "lidar", "lidar")) input_dict['prev_lidar_to_global_rt'] = torch.FloatTensor(nuscenes_get_rt_matrix( self.data_infos[index - 1], info, "lidar", "global")) # TODO: Note that global is same for all. 
input_dict['curr_to_prev_ego_rt'] = torch.FloatTensor(nuscenes_get_rt_matrix( info, self.data_infos[index - 1], "ego", "ego")) else: input_dict['curr_to_prev_lidar_rt'] = torch.eye(4).float() input_dict['prev_lidar_to_global_rt'] = torch.FloatTensor(nuscenes_get_rt_matrix( info, info, "lidar", "global") ) input_dict['curr_to_prev_ego_rt'] = torch.FloatTensor(nuscenes_get_rt_matrix( info, info, "ego", "ego")) input_dict['global_to_curr_lidar_rt'] = torch.FloatTensor(nuscenes_get_rt_matrix( info, info, "global", "lidar")) if self.load_fut_bbox_info: fut_boxes_info, fut_labels_info = self.get_fut_bbox_info(info, index) input_dict['fut_boxes_info'] = fut_boxes_info input_dict['fut_labels_info'] = fut_labels_info return input_dict def get_fut_bbox_info(self, info, index): fut_boxes_info = [] fut_labels_info = [] for select_id in range(1, 7): select_id = min(index + select_id, len(self.data_infos)-1) if not self.data_infos[select_id]['scene_token'] == info[ 'scene_token']: fut_boxes_info.append([]) fut_labels_info.append([]) else: fut_boxes_info.append(self.data_infos[select_id]['ann_infos']['gt_boxes_3d_in_global']) fut_labels_info.append(self.data_infos[select_id]['ann_infos']['gt_labels_3d']) return fut_boxes_info, fut_labels_info def get_adj_info(self, info, index): info_adj_list = [] for select_id in range(*self.multi_adj_frame_id_cfg): if select_id == 0: continue select_id = min(max(index - select_id, 0), len(self.data_infos)-1) if not self.data_infos[select_id]['scene_token'] == info[ 'scene_token']: info_adj_list.append(info) else: info_adj_list.append(self.data_infos[select_id]) return info_adj_list def get_ann_info(self, index): """Get annotation info according to the given index. Args: index (int): Index of the annotation data to get. Returns: dict: Annotation information consists of the following keys: - gt_bboxes_3d (:obj:`LiDARInstance3DBoxes`): 3D ground truth bboxes - gt_labels_3d (np.ndarray): Labels of ground truths. - gt_names (list[str]): Class names of ground truths. """ info = self.data_infos[index] # filter out bbox containing no points if self.use_valid_flag: mask = info['valid_flag'] else: mask = info['num_lidar_pts'] > 0 gt_bboxes_3d = info['gt_boxes'][mask] gt_names_3d = info['gt_names'][mask] gt_labels_3d = [] for cat in gt_names_3d: if cat in self.CLASSES: gt_labels_3d.append(self.CLASSES.index(cat)) else: gt_labels_3d.append(-1) gt_labels_3d = np.array(gt_labels_3d) if self.with_velocity: gt_velocity = info['gt_velocity'][mask] nan_mask = np.isnan(gt_velocity[:, 0]) gt_velocity[nan_mask] = [0.0, 0.0] gt_bboxes_3d = np.concatenate([gt_bboxes_3d, gt_velocity], axis=-1) # the nuscenes box center is [0.5, 0.5, 0.5], we change it to be # the same as KITTI (0.5, 0.5, 0) gt_bboxes_3d = LiDARInstance3DBoxes( gt_bboxes_3d, box_dim=gt_bboxes_3d.shape[-1], origin=(0.5, 0.5, 0.5)).convert_to(self.box_mode_3d) anns_results = dict( gt_bboxes_3d=gt_bboxes_3d, gt_labels_3d=gt_labels_3d, gt_names=gt_names_3d) return anns_results def format_map_results(self, results, jsonfile_prefix=None): """Format the results to json (standard format for COCO evaluation). Args: results (list[dict]): Testing results of the dataset. jsonfile_prefix (str | None): The prefix of json files. It includes the file path and the prefix of filename, e.g., "a/b/prefix". If not specified, a temp file will be created. Default: None. 
Returns: tuple: Returns (result_files, tmp_dir), where `result_files` is a \ dict containing the json filepaths, `tmp_dir` is the temporal \ directory created for saving json files when \ `jsonfile_prefix` is not specified. """ if isinstance(results, dict): results = results['map_results'] assert isinstance(results, list) assert len(results) >= len(self), ( 'The length of results is not equal to the dataset len: {} != {}'. format(len(results), len(self))) if jsonfile_prefix is None: tmp_dir = tempfile.TemporaryDirectory() jsonfile_prefix = osp.join(tmp_dir.name, 'results') else: tmp_dir = None # currently the output prediction results could be in two formats # 1. list of dict('boxes_3d': ..., 'scores_3d': ..., 'labels_3d': ...) # 2. list of dict('pts_bbox' or 'img_bbox': # dict('boxes_3d': ..., 'scores_3d': ..., 'labels_3d': ...)) # this is a workaround to enable evaluation of both formats on nuScenes # refer to https://github.com/open-mmlab/mmdetection3d/issues/449 if not ('pred_map' in results[0]): result_files = self._format_map(results, jsonfile_prefix) else: # should take the inner dict out of 'pts_bbox' or 'img_bbox' dict result_files = dict() for name in ['pred_map']: print(f'\nFormating {name}') results_ = [out[name] for out in results] tmp_file_ = osp.join(jsonfile_prefix, name) result_files.update( {name: self._format_map(results_, tmp_file_)}) return result_files, tmp_dir @classmethod def get_map_classes(cls, map_classes=None): """Get class names of current dataset. Args: classes (Sequence[str] | str | None): If classes is None, use default CLASSES defined by builtin dataset. If classes is a string, take it as a file name. The file contains the name of classes where each line contains one class name. If classes is a tuple or list, override the CLASSES defined by the dataset. Return: list[str]: A list of class names. """ if map_classes is None: return cls.MAPCLASSES if isinstance(map_classes, str): # take it as a file path class_names = mmcv.list_from_file(map_classes) elif isinstance(map_classes, (tuple, list)): class_names = map_classes else: raise ValueError(f'Unsupported type {type(map_classes)} of map classes.') return class_names def _format_map(self, results, jsonfile_prefix=None, score_thresh=0.2): """Convert the results to the standard format. Args: results (list[dict]): Testing results of the dataset. jsonfile_prefix (str): The prefix of the output jsonfile. You can specify the output directory/filename by modifying the jsonfile_prefix. Default: None. Returns: str: Path of the output json file. 
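        Example:
            Schematic layout of the dumped ``map_results_nusc.json`` (values
            are illustrative, not outputs of this code)::

                {
                    'meta': {...},                       # the dataset modality dict
                    'map_results': {
                        '<sample_token>': {
                            'sample_token': '<sample_token>',
                            'vectors': [
                                {'pts': [[x, y], ...], 'pts_num': 20,
                                 'cls_name': 'divider', 'type': 0,
                                 'confidence_level': 0.87},
                                ...
                            ]
                        }
                    }
                }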
""" # assert self.map_ann_file is not None map_pred_annos = {} map_mapped_class_names = self.MAPCLASSES processed_set = set() for sample_id, det in enumerate(mmcv.track_iter_progress(results)): sample_id = det.get('index', sample_id) if sample_id in processed_set: continue processed_set.add(sample_id) map_pred_anno = {} vecs = output_to_vecs(det) sample_token = self.data_infos[sample_id]['token'] map_pred_anno['sample_token'] = sample_token pred_vec_list=[] for i, vec in enumerate(vecs): name = map_mapped_class_names[vec['label']] anno = dict( sample_token=sample_token, pts=vec['pts'], pts_num=len(vec['pts']), cls_name=name, type=vec['label'], confidence_level=vec['score']) pred_vec_list.append(anno) # annos.append(nusc_anno) # nusc_annos[sample_token] = annos map_pred_anno['vectors'] = pred_vec_list map_pred_annos[sample_token] = map_pred_anno # self._format_map_gt() if not os.path.exists(self.map_ann_file): self._format_map_gt() else: print(f'{self.map_ann_file} exist, not update') # with open(self.map_ann_file,'r') as f: # GT_anns = json.load(f) # gt_annos = GT_anns['GTs'] nusc_submissions = { 'meta': self.modality, 'map_results': map_pred_annos, } mmcv.mkdir_or_exist(jsonfile_prefix) res_path = osp.join(jsonfile_prefix, 'map_results_nusc.json') print('Map Results writes to', res_path) mmcv.dump(nusc_submissions, res_path) return res_path def vectormap_pipeline(self, location, ego2global_translation, patch_angle, flip_dx=False, flip_dy=False): ''' `example` type: keys: 'img_metas', 'gt_bboxes_3d', 'gt_labels_3d', 'img'; all keys type is 'DataContainer'; 'img_metas' cpu_only=True, type is dict, others are false; 'gt_labels_3d' shape torch.size([num_samples]), stack=False, padding_value=0, cpu_only=False 'gt_bboxes_3d': stack=False, cpu_only=True ''' anns_results = self.vector_map.gen_vectorized_samples( location, ego2global_translation, patch_angle, flip_dx, flip_dy ) ''' anns_results, type: dict 'gt_vecs_pts_loc': list[num_vecs], vec with num_points*2 coordinates 'gt_vecs_pts_num': list[num_vecs], vec with num_points 'gt_vecs_label': list[num_vecs], vec with cls index ''' gt_vecs_label = to_tensor(anns_results['gt_vecs_label']) from .vector_map import LiDARInstanceLines if isinstance(anns_results['gt_vecs_pts_loc'], LiDARInstanceLines): gt_vecs_pts_loc = anns_results['gt_vecs_pts_loc'] else: gt_vecs_pts_loc = to_tensor(anns_results['gt_vecs_pts_loc']) try: gt_vecs_pts_loc = gt_vecs_pts_loc.flatten(1).to(dtype=torch.float32) except: # empty tensor, will be passed in train, # but we preserve it for test gt_vecs_pts_loc = gt_vecs_pts_loc return dict( map_gt_labels_3d = DC(gt_vecs_label, cpu_only=False), map_gt_bboxes_3d = DC(gt_vecs_pts_loc, cpu_only=True), ) def _format_map_gt(self): gt_annos = [] print('Start to convert gt map format...') # assert self.map_ann_file is not None if (not os.path.exists(self.map_ann_file)) : patch_h, patch_w = self.map_eval_cfg['region'] patch_h = min(patch_h, 50) self.vector_map = VectorizedLocalMap(self.data_root, patch_size=(patch_h, patch_w), map_classes=self.MAPCLASSES, fixed_ptsnum_per_line=20, padding_value=-10000) dataset_length = len(self) prog_bar = mmcv.ProgressBar(dataset_length) mapped_class_names = self.MAPCLASSES for sample_id in range(dataset_length): sample_token = self.data_infos[sample_id]['token'] gt_anno = {} gt_anno['sample_token'] = sample_token # gt_sample_annos = [] gt_sample_dict = {} ego_pose = torch.FloatTensor(nuscenes_get_rt_matrix( self.data_infos[sample_id], self.data_infos[sample_id], "ego", "global")) ego2global_translation 
= list(ego_pose[:3,3].numpy()) v = np.dot(ego_pose[:3,:3].numpy(), np.array([1, 0, 0])) yaw = np.arctan2(v[1], v[0]) patch_angle = yaw / np.pi * 180 location = self.data_infos[sample_id]['map_location'] gt_sample_dict = self.vectormap_pipeline(location, ego2global_translation, patch_angle) gt_labels = gt_sample_dict['map_gt_labels_3d'].data.numpy() gt_vecs = gt_sample_dict['map_gt_bboxes_3d'].data.instance_list gt_vec_list = [] for i, (gt_label, gt_vec) in enumerate(zip(gt_labels, gt_vecs)): name = mapped_class_names[gt_label] anno = dict( pts=np.array(list(gt_vec.coords)), pts_num=len(list(gt_vec.coords)), cls_name=name, type=gt_label, ) gt_vec_list.append(anno) gt_anno['vectors']=gt_vec_list gt_annos.append(gt_anno) prog_bar.update() nusc_submissions = { 'GTs': gt_annos } print('\n GT anns writes to', self.map_ann_file) mmcv.dump(nusc_submissions, self.map_ann_file) else: print(f'{self.map_ann_file} exist, not update') def _evaluate_single(self, result_path, logger=None, metric='bbox', result_name='pts_bbox'): """Evaluation for a single model in nuScenes protocol. Args: result_path (str): Path of the result file. logger (logging.Logger | str, optional): Logger used for printing related information during evaluation. Default: None. metric (str, optional): Metric name used for evaluation. Default: 'bbox'. result_name (str, optional): Result name in the metric prefix. Default: 'pts_bbox'. Returns: dict: Dictionary of evaluation details. """ from nuscenes import NuScenes from nuscenes.eval.detection.evaluate import NuScenesEval output_dir = osp.join(*osp.split(result_path)[:-1]) self.nusc = NuScenes( version=self.version, dataroot=self.data_root, verbose=False) eval_set_map = { 'v1.0-mini': 'mini_val', 'v1.0-trainval': 'val', } nusc_eval = NuScenesEval( self.nusc, config=self.eval_detection_configs, result_path=result_path, eval_set=eval_set_map[self.version], output_dir=output_dir, verbose=False) nusc_eval.main(render_curves=False) # record metrics metrics = mmcv.load(osp.join(output_dir, 'metrics_summary.json')) detail = dict() metric_prefix = f'{result_name}_NuScenes' for name in self.CLASSES: for k, v in metrics['label_aps'][name].items(): val = float('{:.4f}'.format(v)) detail['{}/{}_AP_dist_{}'.format(metric_prefix, name, k)] = val for k, v in metrics['label_tp_errors'][name].items(): val = float('{:.4f}'.format(v)) detail['{}/{}_{}'.format(metric_prefix, name, k)] = val for k, v in metrics['tp_errors'].items(): val = float('{:.4f}'.format(v)) detail['{}/{}'.format(metric_prefix, self.ErrNameMapping[k])] = val detail['{}/NDS'.format(metric_prefix)] = metrics['nd_score'] detail['{}/mAP'.format(metric_prefix)] = metrics['mean_ap'] return detail def format_results(self, results, jsonfile_prefix=None): """Format the results to json (standard format for COCO evaluation). Args: results (list[dict]): Testing results of the dataset. jsonfile_prefix (str): The prefix of json files. It includes the file path and the prefix of filename, e.g., "a/b/prefix". If not specified, a temp file will be created. Default: None. Returns: tuple: Returns (result_files, tmp_dir), where `result_files` is a dict containing the json filepaths, `tmp_dir` is the temporal directory created for saving json files when `jsonfile_prefix` is not specified. """ assert isinstance(results, list), 'results must be a list' assert len(results) >= len(self), ( 'The length of results is not equal to the dataset len: {} != {}'. 
format(len(results), len(self))) if jsonfile_prefix is None: tmp_dir = tempfile.TemporaryDirectory() jsonfile_prefix = osp.join(tmp_dir.name, 'results') else: tmp_dir = None # currently the output prediction results could be in two formats # 1. list of dict('boxes_3d': ..., 'scores_3d': ..., 'labels_3d': ...) # 2. list of dict('pts_bbox' or 'img_bbox': # dict('boxes_3d': ..., 'scores_3d': ..., 'labels_3d': ...)) # this is a workaround to enable evaluation of both formats on nuScenes # refer to https://github.com/open-mmlab/mmdetection3d/issues/449 if not ('pts_bbox' in results[0] or 'img_bbox' in results[0]): result_files = self._format_bbox(results, jsonfile_prefix) else: # should take the inner dict out of 'pts_bbox' or 'img_bbox' dict result_files = dict() for name in ['pts_bbox']: print(f'\nFormating bboxes of {name}') results_ = [out[name] for out in results] tmp_file_ = osp.join(jsonfile_prefix, name) result_files.update( {name: self._format_bbox(results_, tmp_file_)}) return result_files, tmp_dir # def format_motion_results(self, results, jsonfile_prefix=None): # """Format the results to json (standard format for COCO evaluation). # Args: # results (list[dict]): Testing results of the dataset. # jsonfile_prefix (str): The prefix of json files. It includes # the file path and the prefix of filename, e.g., "a/b/prefix". # If not specified, a temp file will be created. Default: None. # Returns: # tuple: Returns (result_files, tmp_dir), where `result_files` is a # dict containing the json filepaths, `tmp_dir` is the temporal # directory created for saving json files when # `jsonfile_prefix` is not specified. # """ # assert isinstance(results, list), 'results must be a list' # assert len(results) >= len(self), ( # 'The length of results is not equal to the dataset len: {} != {}'. # format(len(results), len(self))) # if jsonfile_prefix is None: # tmp_dir = tempfile.TemporaryDirectory() # jsonfile_prefix = osp.join(tmp_dir.name, 'results') # else: # tmp_dir = None # # currently the output prediction results could be in two formats # # 1. list of dict('boxes_3d': ..., 'scores_3d': ..., 'labels_3d': ...) # # 2. 
list of dict('pts_bbox' or 'img_bbox': # # dict('boxes_3d': ..., 'scores_3d': ..., 'labels_3d': ...)) # # this is a workaround to enable evaluation of both formats on nuScenes # # refer to https://github.com/open-mmlab/mmdetection3d/issues/449 # if not ('pred_motion' in results[0]): # result_files = self._format_motion_bbox(results, jsonfile_prefix) # else: # # should take the inner dict out of 'pts_bbox' or 'img_bbox' dict # result_files = dict() # for name in ['pred_motion']: # print(f'\nFormating bboxes of {name}') # results_ = [out[name] for out in results] # tmp_file_ = osp.join(jsonfile_prefix, name) # result_files.update( # {name: self._format_motion_bbox(results_, tmp_file_)}) # return result_files, tmp_dir def evaluate(self, results, logger=None, metric='bbox', jsonfile_prefix='test', result_names=['pts_bbox'], show=False, out_dir=None, pipeline=None, save=False, ): results_dict = {} mmcv.mkdir_or_exist(jsonfile_prefix) if results[0].get('pred_ego_traj', None) is not None: results_dict.update( self.evaluate_ego_traj( results, jsonfile_prefix=jsonfile_prefix, logger=logger ) ) if results[0].get('pred_occupancy', None) is not None: results_dict.update(self.evaluate_occupancy(results, show_dir=jsonfile_prefix, save=save)) if results[0].get('iou', None) is not None: results_dict.update(self.evaluate_mask(results)) if results[0].get('pred_map', None) is not None: results_dict.update(self.evaluate_map(results, jsonfile_prefix=jsonfile_prefix, out_dir=out_dir)) if results[0].get('pts_bbox', None) is not None: results_dict.update(self.evaluate_bbox(results, logger=logger, metric=metric, jsonfile_prefix=jsonfile_prefix, result_names=result_names, show=show, out_dir=out_dir, pipeline=pipeline)) """if the output information has no tracking info, this func dose nothing""" results_dict.update(self.evaluate_tracking(results, logger=logger, metric=metric, jsonfile_prefix=jsonfile_prefix, result_names=result_names, show=show, out_dir=out_dir, pipeline=pipeline)) with open(osp.join(jsonfile_prefix, 'results.csv'), 'w', newline='') as f: writer = csv.writer(f) for key in results_dict.keys(): writer.writerow([key, results_dict[key]]) return results_dict def evaluate_ego_traj(self, results, jsonfile_prefix=None, logger=None): print('Start to convert traj format...') l2_dist_list = [] res = torch.zeros(1, 6) res_c = torch.zeros(1, 6) processed_set = set() ego_trajs_in_global_dict = dict( trajs=dict(), map_lane=dict(), map_label=dict(), ) c = 0 gen_global_map = False if gen_global_map: self.vector_map = VectorizedLocalMap(self.data_root, patch_size=(400, 400), map_classes=self.MAPCLASSES, fixed_ptsnum_per_line=200, padding_value=-10000) for sample_id, traj in enumerate(mmcv.track_iter_progress(results)): sample_id = traj['pred_ego_traj']['index'] l2_dist = traj['pred_ego_traj']['metric_dict'].pop('l2_dist') if sample_id in processed_set: continue # if traj['pred_ego_traj']['gt_ego_fut_cmd'][-1] == 1: continue processed_set.add(sample_id) c += 1 ego_trajs_in_global = traj['pred_ego_traj']['ego_trajs_in_global'].numpy() ego_trajs_in_global_dict['trajs'][traj['pred_ego_traj']['index_w_scene']] = ego_trajs_in_global mask = l2_dist >= 0 res[mask] += l2_dist[mask] res_c[mask] += 1 info = self.data_infos[sample_id] # print(traj['pred_ego_traj']['index_w_scene'], info['prev']=='', sample_id, traj['pred_ego_traj']['index']) if gen_global_map and info['prev']=='': ego_pose = torch.FloatTensor(nuscenes_get_rt_matrix( self.data_infos[sample_id], self.data_infos[sample_id], "ego", "global")) ego2global_translation 
= list(ego_pose[:3,3].numpy()) map_res = self.vectormap_pipeline(info['map_location'], ego2global_translation, 0) lanes = map_res['map_gt_bboxes_3d'].data.fixed_num_sampled_points.cpu().numpy() lanes = lanes + ego2global_translation[:2] lanes_label = map_res['map_gt_labels_3d'].data.cpu().numpy() ego_trajs_in_global_dict['map_lane'][traj['pred_ego_traj']['index_w_scene']] = lanes # results[sample_id]['pred_map']['gt_lane_in_global'] ego_trajs_in_global_dict['map_label'][traj['pred_ego_traj']['index_w_scene']] = lanes_label # results[sample_id]['pred_map']['gt_lane_label'] print('valid: ', c) l2_dist = (res/res_c).cpu().numpy() print('++++++++++++++') print('l2_dist') print(l2_dist) print('--------------') metric_dict = [None, None, None] for i in range(3): num_valid = 0 processed_set = set() for sample_id, traj in enumerate(mmcv.track_iter_progress(results)): sample_id = traj['pred_ego_traj']['index'] if sample_id in processed_set: continue if i == 1 and traj['pred_ego_traj']['gt_ego_fut_cmd'][-1] == 1: continue if i == 2 and traj['pred_ego_traj']['gt_ego_fut_cmd'][-1] != 1: continue processed_set.add(sample_id) if not traj['pred_ego_traj']['metric_dict']['fut_valid_flag']: continue else: num_valid += 1 if metric_dict[i] is None: metric_dict[i] = copy.deepcopy(traj['pred_ego_traj']['metric_dict']) else: for k in traj['pred_ego_traj']['metric_dict'].keys(): metric_dict[i][k] += traj['pred_ego_traj']['metric_dict'][k] print('valid_after: ', num_valid, i) for k in metric_dict[i]: metric_dict[i][k] = str(metric_dict[i][k] / num_valid) print("{}:{}:{}".format(i, k, metric_dict[i][k])) res_path = osp.join(jsonfile_prefix, 'results_nusc_planning.json') print('Results writes to', res_path) mmcv.dump(ego_trajs_in_global_dict, res_path) metric_dict[0].update(self.smoothness(ego_trajs_in_global_dict['trajs'])) # l2_dist_1s = traj['pred_ego_traj']['metric_dict']['plan_L2_1s'] # l2_dist_2s = traj['pred_ego_traj']['metric_dict']['plan_L2_2s'] # l2_dist_3s = traj['pred_ego_traj']['metric_dict']['plan_L2_3s'] # res2[0] = res2[0] + l2_dist_1s # res2[1] = res2[1] + l2_dist_2s # res2[2] = res2[2] + l2_dist_3s # res2_c += 1 # l2_dist_v2 = res2/res2_c # print('++++++++++++++') # print('l2_dist_v2') # print(l2_dist_v2) avg_l2 = 0 avg_col = 0 for i in range(1,4): avg_l2 += float(metric_dict[0][f'plan_L2_{i}s']) avg_col += float(metric_dict[0][f'plan_obj_box_col_{i}s']) avg_l2 /= 3 avg_col /= 3 print(f'avg_l2 {avg_l2}, avg_col {avg_col}') print('--------------') # metric_dict['l2_dist'] = l2_dist metric_dict[0]['avg_l2'] = avg_l2 metric_dict[0]['avg_col'] = avg_col return metric_dict[0] def smoothness(self, data): keys = list(data.keys()) # print(keys) new_keys = [] for key in keys: s = key.split("-") new_keys.append([int(s[1]),int(s[2])]) new_keys=sorted(new_keys,key=(lambda x:(x[0], x[1]))) sorted_keys = [] for key in new_keys: v = ['scene', str(key[0]).zfill(4), str(key[1]) ] k='-'.join(v) sorted_keys.append(k) all_scene_keys=[] key='-'.join(sorted_keys[0].split("-")[:2]) scene=[] for k in sorted_keys: if(key in k): # print(True) scene.append(k) else: s =k.split("-") key='-'.join(s[:2]) all_scene_keys.append(scene) scene=[k] #tranform raw data new_data={} for keys in all_scene_keys: l = len(keys) for i in range(l): val = [] index = i for j in range(i+1): if index>6: index-=1 else: val.append(data[keys[j]][index]) index-=1 new_data[keys[i]]=val #compute mean and var res = { 'stable_mean_distance_1s': [], 'stable_variance_distance_1s': [], 'stable_mean_distance_2s': [], 'stable_variance_distance_2s': [], 
'stable_mean_distance_3s': [], 'stable_variance_distance_3s': [], } for key, value in new_data.items(): #filter unstable data if(len(value)!=7): continue assert len(value)==7 #compute mean for window in [1, 2, 3]: gt = value[-1] pred = value[6-window*2:-1] #compute var data_array = np.array(pred) distances = np.linalg.norm(data_array - gt, axis=1) mean_distance = np.mean(distances) variance_distance = np.var(distances) res[f'stable_mean_distance_{window}s'].append(mean_distance) res[f'stable_variance_distance_{window}s'].append(variance_distance) for key in res.keys(): res[key] = np.mean(res[key]) print(res) return res def _format_bbox(self, results, jsonfile_prefix=None): """Convert the results to the standard format. Args: results (list[dict]): Testing results of the dataset. jsonfile_prefix (str): The prefix of the output jsonfile. You can specify the output directory/filename by modifying the jsonfile_prefix. Default: None. Returns: str: Path of the output json file. """ nusc_annos = {} mapped_class_names = self.CLASSES print('Start to convert detection format...') for sample_id, det in enumerate(mmcv.track_iter_progress(results)): boxes = det['boxes_3d'].tensor.numpy() scores = det['scores_3d'].numpy() labels = det['labels_3d'].numpy() sample_id = det.get('index', sample_id) sample_token = self.data_infos[sample_id]['token'] trans = self.data_infos[sample_id]['cams'][ self.ego_cam]['ego2global_translation'] rot = self.data_infos[sample_id]['cams'][ self.ego_cam]['ego2global_rotation'] rot = pyquaternion.Quaternion(rot) annos = list() for i, box in enumerate(boxes): name = mapped_class_names[labels[i]] center = box[:3] wlh = box[[4, 3, 5]] box_yaw = box[6] box_vel = box[7:].tolist() box_vel.append(0) quat = pyquaternion.Quaternion(axis=[0, 0, 1], radians=box_yaw) nusc_box = NuScenesBox(center, wlh, quat, velocity=box_vel) nusc_box.rotate(rot) nusc_box.translate(trans) if np.sqrt(nusc_box.velocity[0]**2 + nusc_box.velocity[1]**2) > 0.2: if name in [ 'car', 'construction_vehicle', 'bus', 'truck', 'trailer', ]: attr = 'vehicle.moving' elif name in ['bicycle', 'motorcycle']: attr = 'cycle.with_rider' else: attr = self.DefaultAttribute[name] else: if name in ['pedestrian']: attr = 'pedestrian.standing' elif name in ['bus']: attr = 'vehicle.stopped' else: attr = self.DefaultAttribute[name] nusc_anno = dict( sample_token=sample_token, translation=nusc_box.center.tolist(), size=nusc_box.wlh.tolist(), rotation=nusc_box.orientation.elements.tolist(), velocity=nusc_box.velocity[:2], detection_name=name, detection_score=float(scores[i]), attribute_name=attr, ) annos.append(nusc_anno) # other views results of the same frame should be concatenated if sample_token in nusc_annos: pass # nusc_annos[sample_token].extend(annos) else: nusc_annos[sample_token] = annos nusc_submissions = { 'meta': self.modality, 'results': nusc_annos, } mmcv.mkdir_or_exist(jsonfile_prefix) res_path = osp.join(jsonfile_prefix, 'results_nusc.json') print('Results writes to', res_path) mmcv.dump(nusc_submissions, res_path) return res_path def evaluate_tracking(self, results, metric='bbox', logger=None, jsonfile_prefix=None, result_names=['pts_bbox'], show=False, out_dir=None, pipeline=None): """Evaluation in nuScenes protocol. Args: results (list[dict]): Testing results of the dataset. metric (str | list[str]): Metrics to be evaluated. logger (logging.Logger | str | None): Logger used for printing related information during evaluation. Default: None. jsonfile_prefix (str | None): The prefix of json files. 
It includes the file path and the prefix of filename, e.g., "a/b/prefix". If not specified, a temp file will be created. Default: None. show (bool): Whether to visualize. Default: False. out_dir (str): Path to save the visualization results. Default: None. pipeline (list[dict], optional): raw data loading for showing. Default: None. Returns: dict[str, float]: Results of each evaluation metric. """ result_files, tmp_dir, with_motion = self.format_tracking_results(results, jsonfile_prefix) if isinstance(result_files, dict): results_dict = dict() for name in result_names: print('Evaluating tracking bboxes of {}'.format(name)) ret_dict = self._evaluate_tracking_single(result_files[name]) results_dict.update(ret_dict) if with_motion: print('Evaluating motion bboxes of {}'.format(name)) ret_dict = self._evaluate_motion_single(result_files[name]) results_dict.update(ret_dict) elif isinstance(result_files, str): results_dict = self._evaluate_tracking_single(result_files) if with_motion: print('Evaluating motion bboxes of') ret_dict = self._evaluate_motion_single(result_files) results_dict.update(ret_dict) if tmp_dir is not None: tmp_dir.cleanup() if show: self.show(results, out_dir, pipeline=pipeline) return results_dict def format_tracking_results(self, results, jsonfile_prefix=None): """Format the results to json (standard format for COCO evaluation). Args: results (list[dict]): Testing results of the dataset. jsonfile_prefix (str): The prefix of json files. It includes the file path and the prefix of filename, e.g., "a/b/prefix". If not specified, a temp file will be created. Default: None. Returns: tuple: Returns (result_files, tmp_dir), where `result_files` is a dict containing the json filepaths, `tmp_dir` is the temporal directory created for saving json files when `jsonfile_prefix` is not specified. """ assert isinstance(results, list), 'results must be a list' assert len(results) >= len(self), ( 'The length of results is not equal to the dataset len: {} != {}'. format(len(results), len(self))) if jsonfile_prefix is None: tmp_dir = tempfile.TemporaryDirectory() jsonfile_prefix = osp.join(tmp_dir.name, 'results') else: tmp_dir = None # currently the output prediction results could be in two formats # 1. list of dict('boxes_3d': ..., 'scores_3d': ..., 'labels_3d': ...) # 2. list of dict('pts_bbox' or 'img_bbox': # dict('boxes_3d': ..., 'scores_3d': ..., 'labels_3d': ...)) # this is a workaround to enable evaluation of both formats on nuScenes # refer to https://github.com/open-mmlab/mmdetection3d/issues/449 if not ('pts_bbox' in results[0] or 'img_bbox' in results[0]): result_files, with_motion = self._format_tracking_bbox(results, jsonfile_prefix) else: # should take the inner dict out of 'pts_bbox' or 'img_bbox' dict result_files = dict() for name in ['pts_bbox']: print(f'\nFormating tracking bboxes of {name}') results_ = [out[name] for out in results] tmp_file_ = osp.join(jsonfile_prefix, name) result_file, with_motion = self._format_tracking_bbox(results_, tmp_file_) result_files.update( {name: result_file}) return result_files, tmp_dir, with_motion def _format_tracking_bbox(self, results, jsonfile_prefix=None): """Convert the results to the standard format. Args: results (list[dict]): Testing results of the dataset. jsonfile_prefix (str): The prefix of the output jsonfile. You can specify the output directory/filename by modifying the jsonfile_prefix. Default: None. Returns: str: Path of the output json file. 
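        Example:
            One entry of the dumped ``results_nusc_tracking.json`` looks
            roughly like this (values are illustrative)::

                {'sample_token': '<token>',
                 'translation': [x, y, z], 'size': [w, l, h],
                 'rotation': [qw, qx, qy, qz], 'velocity': [vx, vy],
                 'tracking_name': 'car', 'tracking_score': 0.91,
                 'tracking_id': 17, 'attribute_name': 'vehicle.moving'}

            When motion forecasts are present, each entry additionally carries
            ``'traj'`` and ``'traj_scores'``.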
""" nusc_annos = {} mapped_class_names = self.CLASSES print('Start to convert tracking format...') processed_set = set() with_motion = False for sample_id, det in enumerate(mmcv.track_iter_progress(results)): boxes = det['boxes_3d'].tensor.numpy() # scores = det['scores_3d'].numpy() labels = det['labels_3d'].numpy() sample_id = det.get('index', sample_id) if 'track_scores' not in det: print('no tracking info') return None, with_motion tracking_scores = det['track_scores'].numpy() obj_idxes = det['obj_idxes'].numpy() if sample_id in processed_set: continue processed_set.add(sample_id) sample_token = self.data_infos[sample_id]['token'] trans = self.data_infos[sample_id]['cams'][ self.ego_cam]['ego2global_translation'] rot = self.data_infos[sample_id]['cams'][ self.ego_cam]['ego2global_rotation'] rot = pyquaternion.Quaternion(rot) annos = list() for i, box in enumerate(boxes): if tracking_scores[i] < 0: continue name = mapped_class_names[labels[i]] if name not in self.TRACKING_CLASSES: continue center = box[:3] wlh = box[[4, 3, 5]] box_yaw = box[6] box_vel = box[7:].tolist() box_vel.append(0) quat = pyquaternion.Quaternion(axis=[0, 0, 1], radians=box_yaw) nusc_box = NuScenesBox(center, wlh, quat, velocity=box_vel) nusc_box.rotate(rot) nusc_box.translate(trans) if np.sqrt(nusc_box.velocity[0]**2 + nusc_box.velocity[1]**2) > 0.2: if name in [ 'car', 'construction_vehicle', 'bus', 'truck', 'trailer', ]: attr = 'vehicle.moving' elif name in ['bicycle', 'motorcycle']: attr = 'cycle.with_rider' else: attr = self.DefaultAttribute[name] else: if name in ['pedestrian']: attr = 'pedestrian.standing' elif name in ['bus']: attr = 'vehicle.stopped' else: attr = self.DefaultAttribute[name] nusc_anno = dict( sample_token=sample_token, translation=nusc_box.center.tolist(), size=nusc_box.wlh.tolist(), rotation=nusc_box.orientation.elements.tolist(), velocity=nusc_box.velocity[:2], tracking_name=name, detection_name=name, detection_score=float(tracking_scores[i]), attribute_name=attr, tracking_score=float(tracking_scores[i]), tracking_id=obj_idxes[i] ) if 'motion_traj' in det: with_motion = True nusc_anno['traj'] = det['motion_traj'][i] nusc_anno['traj_scores'] = det['motion_cls'][i] annos.append(nusc_anno) # other views results of the same frame should be concatenated if sample_token in nusc_annos: pass # nusc_annos[sample_token].extend(annos) else: nusc_annos[sample_token] = annos nusc_submissions = { 'meta': self.modality, 'results': nusc_annos, } mmcv.mkdir_or_exist(jsonfile_prefix) res_path = osp.join(jsonfile_prefix, 'results_nusc_tracking.json') print('Results writes to', res_path) mmcv.dump(nusc_submissions, res_path) return res_path, with_motion def _evaluate_motion_single(self, result_path, logger=None, metric='bbox', result_name='pts_bbox'): """Evaluation for a single model in nuScenes protocol. Args: result_path (str): Path of the result file. logger (logging.Logger | str | None): Logger used for printing related information during evaluation. Default: None. metric (str): Metric name used for evaluation. Default: 'bbox'. result_name (str): Result name in the metric prefix. Default: 'pts_bbox'. Returns: dict: Dictionary of evaluation details. 
""" if result_path is None: return {} from nuscenes import NuScenes output_dir = osp.join(*osp.split(result_path)[:-1]) eval_set_map = { 'v1.0-mini': 'mini_val', 'v1.0-trainval': 'val', } from .evals.nuscenes_eval_motion import MotionEval if self.nusc is None: self.nusc = NuScenes(version=self.version, dataroot=self.data_root, verbose=False) self.nusc_eval_motion = MotionEval( self.nusc, config=self.eval_detection_configs, result_path=result_path, eval_set=eval_set_map[self.version], output_dir=output_dir, verbose=True, data_infos=self.data_infos, ann_file=self.ann_file, category_convert_type='motion_category' ) print('-'*50) print( 'Evaluate on motion category, merge class for vehicles and pedestrians...') print('evaluate standard motion metrics...') self.nusc_eval_motion.main( plot_examples=0, render_curves=False, eval_mode='standard') print('evaluate motion mAP-minFDE metrics...') self.nusc_eval_motion.main( plot_examples=0, render_curves=False, eval_mode='motion_map') print('evaluate EPA motion metrics...') self.nusc_eval_motion.main( plot_examples=0, render_curves=False, eval_mode='epa') print('-'*50) print('Evaluate on detection category...') self.nusc_eval_motion = MotionEval( self.nusc, config=self.eval_detection_configs, result_path=result_path, eval_set=eval_set_map[self.version], output_dir=output_dir, verbose=True, data_infos=self.data_infos, category_convert_type='detection_category' ) print('evaluate standard motion metrics...') self.nusc_eval_motion.main( plot_examples=0, render_curves=False, eval_mode='standard') print('evaluate EPA motion metrics...') self.nusc_eval_motion.main( plot_examples=0, render_curves=False, eval_mode='motion_map') print('evaluate EPA motion metrics...') self.nusc_eval_motion.main( plot_examples=0, render_curves=False, eval_mode='epa') return {} def _evaluate_tracking_single(self, result_path, logger=None, metric='bbox', result_name='pts_bbox'): """Evaluation for a single model in nuScenes protocol. Args: result_path (str): Path of the result file. logger (logging.Logger | str | None): Logger used for printing related information during evaluation. Default: None. metric (str): Metric name used for evaluation. Default: 'bbox'. result_name (str): Result name in the metric prefix. Default: 'pts_bbox'. Returns: dict: Dictionary of evaluation details. 
""" if result_path is None: return {} from nuscenes import NuScenes output_dir = osp.join(*osp.split(result_path)[:-1]) eval_set_map = { 'v1.0-mini': 'mini_val', 'v1.0-trainval': 'val', } from nuscenes.eval.tracking.evaluate import TrackingEval from nuscenes.eval.common.config import config_factory as track_configs cfg = track_configs("tracking_nips_2019") nusc_eval = TrackingEval( config=cfg, result_path=result_path, eval_set=eval_set_map[self.version], output_dir=output_dir, verbose=True, nusc_version=self.version, nusc_dataroot=self.data_root ) metrics = nusc_eval.main() # record metrics metrics = mmcv.load(osp.join(output_dir, 'metrics_summary.json')) print(metrics) detail = dict() metric_prefix = f'{result_name}_NuScenes' keys = ['amota', 'amotp', 'recall', 'motar', 'gt', 'mota', 'motp', 'mt', 'ml', 'faf', 'tp', 'fp', 'fn', 'ids', 'frag', 'tid', 'lgd'] for key in keys: detail['{}/{}'.format(metric_prefix, key)] = metrics[key] return detail def evaluate_occupancy(self, occ_results, runner=None, show_dir=None, save=False, **eval_kwargs): from .occ_metrics import Metric_mIoU, Metric_FScore if show_dir is not None: # import os # if not os.path.exists(show_dir): mmcv.mkdir_or_exist(show_dir) mmcv.mkdir_or_exist(os.path.join(show_dir, 'occupancy_pred')) print('\nSaving output and gt in {} for visualization.'.format(show_dir)) begin= 0 # eval_kwargs.get('begin',None) end=1 if not save else len(occ_results) # eval_kwargs.get('end',None) self.occ_eval_metrics = Metric_mIoU( num_classes=18, use_lidar_mask=False, use_image_mask=True) self.eval_fscore = False if self.eval_fscore: self.fscore_eval_metrics = Metric_FScore( leaf_size=10, threshold_acc=0.4, threshold_complete=0.4, voxel_size=[0.4, 0.4, 0.4], range=[-40, -40, -1, 40, 40, 5.4], void=[17, 255], use_lidar_mask=False, use_image_mask=True, ) count = 0 print('\nStarting Evaluation...') processed_set = set() for occ_pred_w_index in tqdm(occ_results): index = occ_pred_w_index['index'] if index in processed_set: continue processed_set.add(index) occ_pred = occ_pred_w_index['pred_occupancy'] info = self.data_infos[index] scene_name = info['scene_name'] sample_token = info['token'] occupancy_file_path = osp.join(self.occupancy_path, scene_name, sample_token, 'labels.npz') occ_gt = np.load(occupancy_file_path) gt_semantics = occ_gt['semantics'] mask_lidar = occ_gt['mask_lidar'].astype(bool) mask_camera = occ_gt['mask_camera'].astype(bool) # if show_dir is not None: # if begin is not None and end is not None: # if index>= begin and index=0).all() & (each<2000).all())] colors = [(255, 255, 0), (255, 0, 0), (0, 255, 0)] for i, corner in enumerate(corners[:-1]): bev_img = cv2.circle(bev_img, corners[i], 2, (61, 102, 255)) bev_img = cv2.line(bev_img, corners[i], corners[i+1], color=colors[label], thickness=1) mmcv.imwrite(bev_img, f'map_{index}_{t}.png') print('saved') def _evaluate_map_single(self, result_path, logger=None, metric='bbox', map_metric='chamfer', result_name='pts_bbox'): """Evaluation for a single model in nuScenes protocol. Args: result_path (str): Path of the result file. logger (logging.Logger | str | None): Logger used for printing related information during evaluation. Default: None. metric (str): Metric name used for evaluation. Default: 'bbox'. result_name (str): Result name in the metric prefix. Default: 'pts_bbox'. Returns: dict: Dictionary of evaluation details. 
""" detail = dict() output_dir = osp.join(*osp.split(result_path)[:-1]) from .map_utils.mean_ap import eval_map from .map_utils.mean_ap import format_res_gt_by_classes result_path = osp.abspath(result_path) print('Formating results & gts by classes') pred_results = mmcv.load(result_path) map_results = pred_results['map_results'] gt_anns = mmcv.load(self.map_ann_file) map_annotations = gt_anns['GTs'] cls_gens, cls_gts = format_res_gt_by_classes(result_path, map_results, map_annotations, cls_names=self.MAPCLASSES, num_pred_pts_per_instance=20, eval_use_same_gt_sample_num_flag=True, pc_range=self.pc_range) # for i in range(10): # self.__map_visual__(map_annotations[i]['vectors'], map_results[map_annotations[i]['sample_token']]['vectors'], index=i) map_metrics = map_metric if isinstance(map_metric, list) else [map_metric] allowed_metrics = ['chamfer', 'iou'] for metric in map_metrics: if metric not in allowed_metrics: raise KeyError(f'metric {metric} is not supported') for metric in map_metrics: print('-*'*10+f'use metric:{metric}'+'-*'*10) if metric == 'chamfer': thresholds = [0.5,1.0,1.5] elif metric == 'iou': thresholds= np.linspace(.5, 0.95, int(np.round((0.95 - .5) / .05)) + 1, endpoint=True) cls_aps = np.zeros((len(thresholds),self.NUM_MAPCLASSES)) for i, thr in enumerate(thresholds): print('-*'*10+f'threshhold:{thr}'+'-*'*10) mAP, cls_ap = eval_map( map_results, map_annotations, cls_gens, cls_gts, threshold=thr, cls_names=self.MAPCLASSES, logger=logger, num_pred_pts_per_instance=20, pc_range=self.pc_range, metric=metric) for j in range(self.NUM_MAPCLASSES): cls_aps[i, j] = cls_ap[j]['ap'] for i, name in enumerate(self.MAPCLASSES): print('{}: {}'.format(name, cls_aps.mean(0)[i])) detail['NuscMap_{}/{}_AP'.format(metric,name)] = cls_aps.mean(0)[i] print('map: {}'.format(cls_aps.mean(0).mean())) detail['NuscMap_{}/mAP'.format(metric)] = cls_aps.mean(0).mean() for i, name in enumerate(self.MAPCLASSES): for j, thr in enumerate(thresholds): if metric == 'chamfer': detail['NuscMap_{}/{}_AP_thr_{}'.format(metric,name,thr)]=cls_aps[j][i] elif metric == 'iou': if thr == 0.5 or thr == 0.75: detail['NuscMap_{}/{}_AP_thr_{}'.format(metric,name,thr)]=cls_aps[j][i] return detail def evaluate_bbox(self, results, metric='bbox', logger=None, jsonfile_prefix='test', result_names=['pts_bbox'], show=False, out_dir=None, pipeline=None): """Evaluation in nuScenes protocol. Args: results (list[dict]): Testing results of the dataset. metric (str | list[str], optional): Metrics to be evaluated. Default: 'bbox'. logger (logging.Logger | str, optional): Logger used for printing related information during evaluation. Default: None. jsonfile_prefix (str, optional): The prefix of json files including the file path and the prefix of filename, e.g., "a/b/prefix". If not specified, a temp file will be created. Default: None. show (bool, optional): Whether to visualize. Default: False. out_dir (str, optional): Path to save the visualization results. Default: None. pipeline (list[dict], optional): raw data loading for showing. Default: None. Returns: dict[str, float]: Results of each evaluation metric. 
""" result_files, tmp_dir = self.format_results(results, jsonfile_prefix) if isinstance(result_files, dict): results_dict = dict() for name in result_names: print('Evaluating bboxes of {}'.format(name)) ret_dict = self._evaluate_single(result_files[name]) results_dict.update(ret_dict) elif isinstance(result_files, str): results_dict = self._evaluate_single(result_files) if tmp_dir is not None: tmp_dir.cleanup() if show or out_dir: self.show(results, out_dir, show=show, pipeline=pipeline) return results_dict def _build_default_pipeline(self): """Build the default pipeline for this dataset.""" pipeline = [ dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=5, use_dim=5, file_client_args=dict(backend='disk')), dict( type='LoadPointsFromMultiSweeps', sweeps_num=10, file_client_args=dict(backend='disk')), dict( type='DefaultFormatBundle3D', class_names=self.CLASSES, with_label=False), dict(type='Collect3D', keys=['points']) ] return Compose(pipeline) def show(self, results, out_dir, show=False, pipeline=None): """Results visualization. Args: results (list[dict]): List of bounding boxes results. out_dir (str): Output directory of visualization result. show (bool): Whether to visualize the results online. Default: False. pipeline (list[dict], optional): raw data loading for showing. Default: None. """ assert out_dir is not None, 'Expect out_dir, got none.' pipeline = self._get_pipeline(pipeline) for i, result in enumerate(results): if 'pts_bbox' in result.keys(): result = result['pts_bbox'] data_info = self.data_infos[i] pts_path = data_info['lidar_path'] file_name = osp.split(pts_path)[-1].split('.')[0] points = self._extract_data(i, pipeline, 'points').numpy() # for now we convert points into depth mode points = Coord3DMode.convert_point(points, Coord3DMode.LIDAR, Coord3DMode.DEPTH) inds = result['scores_3d'] > 0.1 gt_bboxes = self.get_ann_info(i)['gt_bboxes_3d'].tensor.numpy() show_gt_bboxes = Box3DMode.convert(gt_bboxes, Box3DMode.LIDAR, Box3DMode.DEPTH) pred_bboxes = result['boxes_3d'][inds].tensor.numpy() show_pred_bboxes = Box3DMode.convert(pred_bboxes, Box3DMode.LIDAR, Box3DMode.DEPTH) show_result(points, show_gt_bboxes, show_pred_bboxes, out_dir, file_name, show) def output_to_nusc_box(detection, with_velocity=True): """Convert the output to the box class in the nuScenes. Args: detection (dict): Detection results. - boxes_3d (:obj:`BaseInstance3DBoxes`): Detection bbox. - scores_3d (torch.Tensor): Detection scores. - labels_3d (torch.Tensor): Predicted box labels. Returns: list[:obj:`NuScenesBox`]: List of standard NuScenesBoxes. 
""" box3d = detection['boxes_3d'] scores = detection['scores_3d'].numpy() labels = detection['labels_3d'].numpy() box_gravity_center = box3d.gravity_center.numpy() box_dims = box3d.dims.numpy() box_yaw = box3d.yaw.numpy() # our LiDAR coordinate system -> nuScenes box coordinate system nus_box_dims = box_dims[:, [1, 0, 2]] box_list = [] for i in range(len(box3d)): quat = pyquaternion.Quaternion(axis=[0, 0, 1], radians=box_yaw[i]) if with_velocity: velocity = (*box3d.tensor[i, 7:9], 0.0) else: velocity = (0, 0, 0) # velo_val = np.linalg.norm(box3d[i, 7:9]) # velo_ori = box3d[i, 6] # velocity = ( # velo_val * np.cos(velo_ori), velo_val * np.sin(velo_ori), 0.0) box = NuScenesBox( box_gravity_center[i], nus_box_dims[i], quat, label=labels[i], score=scores[i], velocity=velocity) box_list.append(box) return box_list @DATASETS.register_module() class NuscenesOccupancy(NuScenesDataset): CLASSES = [ "empty", "barrier", "bicycle", "bus", "car", "construction", "motorcycle", "pedestrian", "trafficcone", "trailer", "truck", "driveable_surface", "other", "sidewalk", "terrain", "mannade", "vegetation", ] def __init__(self, occupancy_info='data/nuscenes/occupancy_category.json', **kwargs): super().__init__(**kwargs) self.CLASSES = [ "empty", "barrier", "bicycle", "bus", "car", "construction", "motorcycle", "pedestrian", "trafficcone", "trailer", "truck", "driveable_surface", "other", "sidewalk", "terrain", "mannade", "vegetation", ] self.occupancy_info = mmcv.load(occupancy_info) def get_cat_ids(self, idx): """Get category distribution of single scene. Args: idx (int): Index of the data_info. Returns: dict[list]: for each category, if the current scene contains such boxes, store a list containing idx, otherwise, store empty list. """ info = self.data_infos[idx] token = info['token'] category = self.occupancy_info[token] cat_ids = [] for k, v in category.items(): k = int(k) if k == 17: continue logv = max((np.log(v)/np.log(100)).round(),1) cat_ids.extend([k] * int(logv)) return cat_ids def lidar_nusc_box_to_global(info, boxes, classes, eval_configs, eval_version='detection_cvpr_2019'): """Convert the box from ego to global coordinate. Args: info (dict): Info for a specific sample data, including the calibration information. boxes (list[:obj:`NuScenesBox`]): List of predicted NuScenesBoxes. classes (list[str]): Mapped classes in the evaluation. eval_configs (object): Evaluation configuration object. eval_version (str, optional): Evaluation version. Default: 'detection_cvpr_2019' Returns: list: List of standard NuScenesBoxes in the global coordinate. """ box_list = [] for box in boxes: # Move box to ego vehicle coord system box.rotate(pyquaternion.Quaternion(info['lidar2ego_rotation'])) box.translate(np.array(info['lidar2ego_translation'])) # filter det in ego. 
cls_range_map = eval_configs.class_range radius = np.linalg.norm(box.center[:2], 2) det_range = cls_range_map[classes[box.label]] if radius > det_range: continue # Move box to global coord system box.rotate(pyquaternion.Quaternion(info['ego2global_rotation'])) box.translate(np.array(info['ego2global_translation'])) box_list.append(box) return box_list def invert_matrix_egopose_numpy(egopose): """ Compute the inverse transformation of a 4x4 egopose numpy matrix.""" inverse_matrix = np.zeros((4, 4), dtype=np.float32) rotation = egopose[:3, :3] translation = egopose[:3, 3] inverse_matrix[:3, :3] = rotation.T inverse_matrix[:3, 3] = -np.dot(rotation.T, translation) inverse_matrix[3, 3] = 1.0 return inverse_matrix def convert_egopose_to_matrix_numpy(rotation, translation): transformation_matrix = np.zeros((4, 4), dtype=np.float32) transformation_matrix[:3, :3] = rotation transformation_matrix[:3, 3] = translation transformation_matrix[3, 3] = 1.0 return transformation_matrix def output_to_vecs(detection): # box3d = detection['map_boxes_3d'].numpy() scores = detection['map_scores_3d'].numpy() labels = detection['map_labels_3d'].numpy() pts = detection['map_pts_3d'].numpy() vec_list = [] # import pdb;pdb.set_trace() for i in range(pts.shape[0]): vec = dict( bbox =[], # box3d[i], # xyxy label=labels[i], score=scores[i], pts=pts[i], ) vec_list.append(vec) return vec_list ================================================ FILE: mmdet3d/datasets/nuscenes_eval.py ================================================ import argparse import copy import json import os import time from typing import Tuple, Dict, Any import torch import numpy as np from nuscenes import NuScenes from nuscenes.eval.common.config import config_factory from nuscenes.eval.common.data_classes import EvalBoxes from nuscenes.eval.detection.data_classes import DetectionConfig from nuscenes.eval.detection.evaluate import NuScenesEval from pyquaternion import Quaternion from nuscenes import NuScenes from nuscenes.eval.common.data_classes import EvalBoxes from nuscenes.eval.detection.data_classes import DetectionBox from nuscenes.eval.detection.utils import category_to_detection_name from nuscenes.eval.tracking.data_classes import TrackingBox from nuscenes.utils.data_classes import Box from nuscenes.utils.geometry_utils import points_in_box from nuscenes.utils.splits import create_splits_scenes from nuscenes.eval.common.loaders import load_prediction, add_center_dist, filter_eval_boxes import tqdm from nuscenes.utils.geometry_utils import view_points, box_in_image, BoxVisibility, transform_matrix from torchvision.transforms.functional import rotate import pycocotools.mask as mask_util # from projects.mmdet3d_plugin.models.utils.visual import save_tensor from torchvision.transforms.functional import rotate import cv2 import argparse import json import os import random import time from typing import Tuple, Dict, Any import math import numpy as np from nuscenes import NuScenes from nuscenes.eval.common.config import config_factory from nuscenes.eval.common.data_classes import EvalBoxes from nuscenes.eval.common.loaders import load_prediction, load_gt, add_center_dist, filter_eval_boxes from nuscenes.eval.detection.algo import accumulate, calc_ap, calc_tp from nuscenes.eval.detection.constants import TP_METRICS from nuscenes.eval.detection.data_classes import DetectionConfig, DetectionMetrics, DetectionBox, \ DetectionMetricDataList from nuscenes.eval.detection.render import summary_plot, class_pr_curve, dist_pr_curve, visualize_sample from 
nuscenes.eval.common.utils import quaternion_yaw, Quaternion from mmdet3d.core.bbox.iou_calculators import BboxOverlaps3D from IPython import embed import json from typing import Any import numpy as np from matplotlib import pyplot as plt from nuscenes import NuScenes from nuscenes.eval.common.data_classes import EvalBoxes from nuscenes.eval.common.render import setup_axis from nuscenes.eval.common.utils import boxes_to_sensor from nuscenes.eval.detection.constants import TP_METRICS, DETECTION_NAMES, DETECTION_COLORS, TP_METRICS_UNITS, \ PRETTY_DETECTION_NAMES, PRETTY_TP_METRICS from nuscenes.eval.detection.data_classes import DetectionMetrics, DetectionMetricData, DetectionMetricDataList from nuscenes.utils.data_classes import LidarPointCloud from nuscenes.utils.geometry_utils import view_points Axis = Any def class_tp_curve(md_list: DetectionMetricDataList, metrics: DetectionMetrics, detection_name: str, min_recall: float, dist_th_tp: float, savepath: str = None, ax: Axis = None) -> None: """ Plot the true positive curve for the specified class. :param md_list: DetectionMetricDataList instance. :param metrics: DetectionMetrics instance. :param detection_name: :param min_recall: Minimum recall value. :param dist_th_tp: The distance threshold used to determine matches. :param savepath: If given, saves the the rendering here instead of displaying. :param ax: Axes onto which to render. """ # Get metric data for given detection class with tp distance threshold. md = md_list[(detection_name, dist_th_tp)] min_recall_ind = round(100 * min_recall) if min_recall_ind <= md.max_recall_ind: # For traffic_cone and barrier only a subset of the metrics are plotted. rel_metrics = [m for m in TP_METRICS if not np.isnan(metrics.get_label_tp(detection_name, m))] ylimit = max([max(getattr(md, metric)[min_recall_ind:md.max_recall_ind + 1]) for metric in rel_metrics]) * 1.1 else: ylimit = 1.0 # Prepare axis. if ax is None: ax = setup_axis(title=PRETTY_DETECTION_NAMES[detection_name], xlabel='Recall', ylabel='Error', xlim=1, min_recall=min_recall) ax.set_ylim(0, ylimit) # Plot the recall vs. error curve for each tp metric. for metric in TP_METRICS: tp = metrics.get_label_tp(detection_name, metric) # Plot only if we have valid data. if tp is not np.nan and min_recall_ind <= md.max_recall_ind: recall, error = md.recall[:md.max_recall_ind + 1], getattr(md, metric)[:md.max_recall_ind + 1] else: recall, error = [], [] # Change legend based on tp value if tp is np.nan: label = '{}: n/a'.format(PRETTY_TP_METRICS[metric]) elif min_recall_ind > md.max_recall_ind: label = '{}: nan'.format(PRETTY_TP_METRICS[metric]) else: label = '{}: {:.2f} ({})'.format(PRETTY_TP_METRICS[metric], tp, TP_METRICS_UNITS[metric]) if metric == 'trans_err': label += f' ({md.max_recall_ind})' # add recall print(f'Recall: {detection_name}: {md.max_recall_ind/100}') ax.plot(recall, error, label=label) ax.axvline(x=md.max_recall, linestyle='-.', color=(0, 0, 0, 0.3)) ax.legend(loc='best') if savepath is not None: plt.savefig(savepath) plt.close() class DetectionBox_modified(DetectionBox): def __init__(self, *args, token=None, visibility=None, index=None, **kwargs): ''' add annotation token ''' super().__init__(*args, **kwargs) self.token = token self.visibility = visibility self.index = index def serialize(self) -> dict: """ Serialize instance into json-friendly format. 
""" return { 'token': self.token, 'sample_token': self.sample_token, 'translation': self.translation, 'size': self.size, 'rotation': self.rotation, 'velocity': self.velocity, 'ego_translation': self.ego_translation, 'num_pts': self.num_pts, 'detection_name': self.detection_name, 'detection_score': self.detection_score, 'attribute_name': self.attribute_name, 'visibility': self.visibility, 'index': self.index } @classmethod def deserialize(cls, content: dict): """ Initialize from serialized content. """ return cls( token=content['token'], sample_token=content['sample_token'], translation=tuple(content['translation']), size=tuple(content['size']), rotation=tuple(content['rotation']), velocity=tuple(content['velocity']), ego_translation=(0.0, 0.0, 0.0) if 'ego_translation' not in content else tuple(content['ego_translation']), num_pts=-1 if 'num_pts' not in content else int(content['num_pts']), detection_name=content['detection_name'], detection_score=-1.0 if 'detection_score' not in content else float(content['detection_score']), attribute_name=content['attribute_name'], visibility=content['visibility'], index=content['index'], ) def center_in_image(box, intrinsic: np.ndarray, imsize: Tuple[int, int], vis_level: int = BoxVisibility.ANY) -> bool: """ Check if a box is visible inside an image without accounting for occlusions. :param box: The box to be checked. :param intrinsic: . Intrinsic camera matrix. :param imsize: (width, height). :param vis_level: One of the enumerations of . :return True if visibility condition is satisfied. """ center_3d = box.center.reshape(3, 1) center_img = view_points(center_3d, intrinsic, normalize=True)[:2, :] visible = np.logical_and(center_img[0, :] > 0, center_img[0, :] < imsize[0]) visible = np.logical_and(visible, center_img[1, :] < imsize[1]) visible = np.logical_and(visible, center_img[1, :] > 0) visible = np.logical_and(visible, center_3d[2, :] > 1) in_front = center_3d[2, :] > 0.1 # True if a corner is at least 0.1 meter in front of the camera. if vis_level == BoxVisibility.ALL: return all(visible) and all(in_front) elif vis_level == BoxVisibility.ANY: return any(visible) and all(in_front) elif vis_level == BoxVisibility.NONE: return True else: raise ValueError("vis_level: {} not valid".format(vis_level)) def exist_corners_in_image_but_not_all(box, intrinsic: np.ndarray, imsize: Tuple[int, int], vis_level: int = BoxVisibility.ANY) -> bool: """ Check if a box is visible in images but not all corners in image . :param box: The box to be checked. :param intrinsic: . Intrinsic camera matrix. :param imsize: (width, height). :param vis_level: One of the enumerations of . :return True if visibility condition is satisfied. """ corners_3d = box.corners() corners_img = view_points(corners_3d, intrinsic, normalize=True)[:2, :] visible = np.logical_and(corners_img[0, :] > 0, corners_img[0, :] < imsize[0]) visible = np.logical_and(visible, corners_img[1, :] < imsize[1]) visible = np.logical_and(visible, corners_img[1, :] > 0) visible = np.logical_and(visible, corners_3d[2, :] > 1) in_front = corners_3d[2, :] > 0.1 # True if a corner is at least 0.1 meter in front of the camera. if any(visible) and not all(visible) and all(in_front): return True else: return False def load_gt(nusc: NuScenes, eval_split: str, box_cls, verbose: bool = False): """ Loads ground truth boxes from DB. :param nusc: A NuScenes instance. :param eval_split: The evaluation split for which we load GT boxes. :param box_cls: Type of box to load, e.g. DetectionBox or TrackingBox. 
:param verbose: Whether to print messages to stdout. :return: The GT boxes. """ # Init. if box_cls == DetectionBox_modified: attribute_map = {a['token']: a['name'] for a in nusc.attribute} if verbose: print('Loading annotations for {} split from nuScenes version: {}'.format(eval_split, nusc.version)) # Read out all sample_tokens in DB. sample_tokens_all = [s['token'] for s in nusc.sample] assert len(sample_tokens_all) > 0, "Error: Database has no samples!" # Only keep samples from this split. splits = create_splits_scenes() # Check compatibility of split with nusc_version. version = nusc.version if eval_split in {'train', 'val', 'train_detect', 'train_track'}: assert version.endswith('trainval'), \ 'Error: Requested split {} which is not compatible with NuScenes version {}'.format(eval_split, version) elif eval_split in {'mini_train', 'mini_val'}: assert version.endswith('mini'), \ 'Error: Requested split {} which is not compatible with NuScenes version {}'.format(eval_split, version) elif eval_split == 'test': assert version.endswith('test'), \ 'Error: Requested split {} which is not compatible with NuScenes version {}'.format(eval_split, version) else: raise ValueError('Error: Requested split {} which this function cannot map to the correct NuScenes version.' .format(eval_split)) if eval_split == 'test': # Check that you aren't trying to cheat :). assert len(nusc.sample_annotation) > 0, \ 'Error: You are trying to evaluate on the test set but you do not have the annotations!' index_map = {} for scene in nusc.scene: first_sample_token = scene['first_sample_token'] sample = nusc.get('sample', first_sample_token) index_map[first_sample_token] = 1 index = 2 while sample['next'] != '': sample = nusc.get('sample', sample['next']) index_map[sample['token']] = index index += 1 sample_tokens = [] for sample_token in sample_tokens_all: scene_token = nusc.get('sample', sample_token)['scene_token'] scene_record = nusc.get('scene', scene_token) if scene_record['name'] in splits[eval_split]: sample_tokens.append(sample_token) all_annotations = EvalBoxes() # Load annotations and filter predictions and annotations. tracking_id_set = set() for sample_token in tqdm.tqdm(sample_tokens, leave=verbose): sample = nusc.get('sample', sample_token) sample_annotation_tokens = sample['anns'] sample_boxes = [] for sample_annotation_token in sample_annotation_tokens: sample_annotation = nusc.get('sample_annotation', sample_annotation_token) if box_cls == DetectionBox_modified: # Get label name in detection task and filter unused labels. detection_name = category_to_detection_name(sample_annotation['category_name']) if detection_name is None: continue # Get attribute_name. attr_tokens = sample_annotation['attribute_tokens'] attr_count = len(attr_tokens) if attr_count == 0: attribute_name = '' elif attr_count == 1: attribute_name = attribute_map[attr_tokens[0]] else: raise Exception('Error: GT annotations must not have more than one attribute!') sample_boxes.append( box_cls( token=sample_annotation_token, sample_token=sample_token, translation=sample_annotation['translation'], size=sample_annotation['size'], rotation=sample_annotation['rotation'], velocity=nusc.box_velocity(sample_annotation['token'])[:2], num_pts=sample_annotation['num_lidar_pts'] + sample_annotation['num_radar_pts'], detection_name=detection_name, detection_score=-1.0, # GT samples do not have a score. 
attribute_name=attribute_name, visibility=sample_annotation['visibility_token'], index=index_map[sample_token] ) ) elif box_cls == TrackingBox: assert False else: raise NotImplementedError('Error: Invalid box_cls %s!' % box_cls) all_annotations.add_boxes(sample_token, sample_boxes) if verbose: print("Loaded ground truth annotations for {} samples.".format(len(all_annotations.sample_tokens))) return all_annotations def filter_eval_boxes_by_id(nusc: NuScenes, eval_boxes: EvalBoxes, id=None, verbose: bool = False) -> EvalBoxes: """ Applies filtering to boxes. Distance, bike-racks and points per box. :param nusc: An instance of the NuScenes class. :param eval_boxes: An instance of the EvalBoxes class. :param is: the anns token set that used to keep bboxes. :param verbose: Whether to print to stdout. """ # Accumulators for number of filtered boxes. total, anns_filter = 0, 0 for ind, sample_token in enumerate(eval_boxes.sample_tokens): # Filter on anns total += len(eval_boxes[sample_token]) filtered_boxes = [] for box in eval_boxes[sample_token]: if box.token in id: filtered_boxes.append(box) anns_filter += len(filtered_boxes) eval_boxes.boxes[sample_token] = filtered_boxes if verbose: print("=> Original number of boxes: %d" % total) print("=> After anns based filtering: %d" % anns_filter) return eval_boxes def filter_eval_boxes_by_visibility( ori_eval_boxes: EvalBoxes, visibility=None, verbose: bool = False) -> EvalBoxes: """ Applies filtering to boxes. Distance, bike-racks and points per box. :param nusc: An instance of the NuScenes class. :param eval_boxes: An instance of the EvalBoxes class. :param is: the anns token set that used to keep bboxes. :param verbose: Whether to print to stdout. """ # Accumulators for number of filtered boxes. eval_boxes = copy.deepcopy(ori_eval_boxes) total, anns_filter = 0, 0 for ind, sample_token in enumerate(eval_boxes.sample_tokens): # Filter on anns total += len(eval_boxes[sample_token]) filtered_boxes = [] for box in eval_boxes[sample_token]: if box.visibility == visibility: filtered_boxes.append(box) anns_filter += len(filtered_boxes) eval_boxes.boxes[sample_token] = filtered_boxes if verbose: print("=> Original number of boxes: %d" % total) print("=> After visibility based filtering: %d" % anns_filter) return eval_boxes def filter_by_sample_token(ori_eval_boxes, valid_sample_tokens=[], verbose=False): eval_boxes = copy.deepcopy(ori_eval_boxes) for sample_token in eval_boxes.sample_tokens: if sample_token not in valid_sample_tokens: eval_boxes.boxes.pop(sample_token) return eval_boxes def filter_eval_boxes_by_overlap(nusc: NuScenes, eval_boxes: EvalBoxes, verbose: bool = False) -> EvalBoxes: """ Applies filtering to boxes. basedon overlap . :param nusc: An instance of the NuScenes class. :param eval_boxes: An instance of the EvalBoxes class. :param verbose: Whether to print to stdout. """ # Accumulators for number of filtered boxes. 
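# --- Illustrative sketch (not part of the original file) --------------------
# The overlap filter below relies on center_in_image(), which projects a box
# centre through the camera intrinsic and keeps it if the pixel lands inside
# the image. A standalone version of that projection with a toy pinhole
# intrinsic (numbers are only in the ballpark of a nuScenes front camera):
import numpy as np

def project_center(center_xyz, intrinsic, imsize):
    """Return the (u, v) pixel and whether it falls inside the image."""
    x, y, z = center_xyz
    if z <= 0.1:  # behind or too close to the camera plane
        return None, False
    u = intrinsic[0, 0] * x / z + intrinsic[0, 2]
    v = intrinsic[1, 1] * y / z + intrinsic[1, 2]
    inside = (0 < u < imsize[0]) and (0 < v < imsize[1]) and z > 1
    return (u, v), inside

# K = np.array([[1266.0, 0.0, 816.0], [0.0, 1266.0, 491.0], [0.0, 0.0, 1.0]])
# project_center((2.0, 0.5, 10.0), K, (1600, 900)) -> ((~1069.2, ~554.3), True)
# --- end of sketch -----------------------------------------------------------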
cams = ['CAM_FRONT', 'CAM_FRONT_RIGHT', 'CAM_BACK_RIGHT', 'CAM_BACK', 'CAM_BACK_LEFT', 'CAM_FRONT_LEFT'] total, anns_filter = 0, 0 for ind, sample_token in enumerate(eval_boxes.sample_tokens): # Filter on anns total += len(eval_boxes[sample_token]) sample_record = nusc.get('sample', sample_token) filtered_boxes = [] for box in eval_boxes[sample_token]: count = 0 for cam in cams: ''' copy-paste from nuscenes ''' sample_data_token = sample_record['data'][cam] sd_record = nusc.get('sample_data', sample_data_token) cs_record = nusc.get('calibrated_sensor', sd_record['calibrated_sensor_token']) sensor_record = nusc.get('sensor', cs_record['sensor_token']) pose_record = nusc.get('ego_pose', sd_record['ego_pose_token']) cam_intrinsic = np.array(cs_record['camera_intrinsic']) imsize = (sd_record['width'], sd_record['height']) new_box = Box(box.translation, box.size, Quaternion(box.rotation), name=box.detection_name, token='') # Move box to ego vehicle coord system. new_box.translate(-np.array(pose_record['translation'])) new_box.rotate(Quaternion(pose_record['rotation']).inverse) # Move box to sensor coord system. new_box.translate(-np.array(cs_record['translation'])) new_box.rotate(Quaternion(cs_record['rotation']).inverse) if center_in_image(new_box, cam_intrinsic, imsize, vis_level=BoxVisibility.ANY): count += 1 # if exist_corners_in_image_but_not_all(new_box, cam_intrinsic, imsize, vis_level=BoxVisibility.ANY): # count += 1 if count > 1: with open('center_overlap.txt', 'a') as f: try: f.write(box.token + '\n') except: pass filtered_boxes.append(box) anns_filter += len(filtered_boxes) eval_boxes.boxes[sample_token] = filtered_boxes verbose = True if verbose: print("=> Original number of boxes: %d" % total) print("=> After anns based filtering: %d" % anns_filter) return eval_boxes def filter_eval_boxes_by_range(nusc: NuScenes, eval_boxes: EvalBoxes, verbose: bool = True, min_=0, max_=60, ) -> EvalBoxes: """ Applies filtering to boxes based on range. :param nusc: An instance of the NuScenes class. :param eval_boxes: An instance of the EvalBoxes class. :param verbose: Whether to print to stdout. """ # Accumulators for number of filtered boxes. cams = ['CAM_FRONT', 'CAM_FRONT_RIGHT', 'CAM_BACK_RIGHT', 'CAM_BACK', 'CAM_BACK_LEFT', 'CAM_FRONT_LEFT'] total, anns_filter = 0, 0 for ind, sample_token in enumerate(eval_boxes.sample_tokens): # Filter on anns total += len(eval_boxes[sample_token]) sample_record = nusc.get('sample', sample_token) filtered_boxes = [] for box in eval_boxes[sample_token]: count = 0 sample_data_token = sample_record['data'][cams[0]] sd_record = nusc.get('sample_data', sample_data_token) cs_record = nusc.get('calibrated_sensor', sd_record['calibrated_sensor_token']) sensor_record = nusc.get('sensor', cs_record['sensor_token']) pose_record = nusc.get('ego_pose', sd_record['ego_pose_token']) cam_intrinsic = np.array(cs_record['camera_intrinsic']) imsize = (sd_record['width'], sd_record['height']) new_box = Box(box.translation, box.size, Quaternion(box.rotation), name=box.detection_name, token='') # Move box to ego vehicle coord system.
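# --- Illustrative sketch (not part of the original file) --------------------
# The translate(-t) / rotate(R^-1) pair applied just below is the same
# operation as multiplying by the inverse 4x4 ego pose, i.e. what
# invert_matrix_egopose_numpy() in nuscenes_dataset.py computes. A quick
# self-contained check with a made-up pose:
import numpy as np
from pyquaternion import Quaternion

rotation = Quaternion(axis=[0, 0, 1], angle=np.pi / 6).rotation_matrix
translation = np.array([10.0, -4.0, 1.5])

pose = np.eye(4)
pose[:3, :3] = rotation
pose[:3, 3] = translation

inverse = np.eye(4)
inverse[:3, :3] = rotation.T
inverse[:3, 3] = -rotation.T @ translation  # same formula as the helper

assert np.allclose(pose @ inverse, np.eye(4), atol=1e-8)  # composes to identity
# --- end of sketch -----------------------------------------------------------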
new_box.translate(-np.array(pose_record['translation'])) new_box.rotate(Quaternion(pose_record['rotation']).inverse) x, y = new_box.center[:2] dist = math.sqrt(x**2+y**2) if dist < max_ and dist > min_: filtered_boxes.append(box) anns_filter += len(filtered_boxes) eval_boxes.boxes[sample_token] = filtered_boxes if verbose: print("=> Original number of boxes: %d" % total) print("=> After range filtering: %d" % anns_filter) return eval_boxes class NuScenesEval_custom(NuScenesEval): """ Dummy class for backward-compatibility. Same as DetectionEval. """ def __init__(self, nusc: NuScenes, config: DetectionConfig, result_path: str, eval_set: str, output_dir: str = None, verbose: bool = True, overlap_test=False, eval_mask=False, data_infos=None ): """ Initialize a DetectionEval object. :param nusc: A NuScenes object. :param config: A DetectionConfig object. :param result_path: Path of the nuScenes JSON result file. :param eval_set: The dataset split to evaluate on, e.g. train, val or test. :param output_dir: Folder to save plots and results to. :param verbose: Whether to print to stdout. """ self.nusc = nusc self.result_path = result_path self.eval_set = eval_set self.output_dir = output_dir self.verbose = verbose self.cfg = config self.overlap_test = overlap_test self.eval_mask = eval_mask self.data_infos = data_infos # Check result file exists. assert os.path.exists(result_path), 'Error: The result file does not exist!' # Make dirs. self.plot_dir = os.path.join(self.output_dir, 'plots') if not os.path.isdir(self.output_dir): os.makedirs(self.output_dir) if not os.path.isdir(self.plot_dir): os.makedirs(self.plot_dir) # Load data. if verbose: print('Initializing nuScenes detection evaluation') self.pred_boxes, self.meta = load_prediction(self.result_path, self.cfg.max_boxes_per_sample, DetectionBox, verbose=verbose) self.gt_boxes = load_gt(self.nusc, self.eval_set, DetectionBox_modified, verbose=verbose) assert set(self.pred_boxes.sample_tokens) == set(self.gt_boxes.sample_tokens), \ "Samples in split doesn't match samples in predictions." # Add center distances. self.pred_boxes = add_center_dist(nusc, self.pred_boxes) self.gt_boxes = add_center_dist(nusc, self.gt_boxes) # Filter boxes (distance, points per box, etc.).
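# --- Illustrative sketch (not part of the original file) --------------------
# filter_eval_boxes_by_range() above keeps a box only if its ego-frame centre
# lies inside the open annulus (min_, max_). A standalone version of just the
# distance test, using the min_=25 / max_=60 values set further below:
import math

def in_eval_annulus(center_xy, min_=25.0, max_=60.0):
    dist = math.sqrt(center_xy[0] ** 2 + center_xy[1] ** 2)
    return dist < max_ and dist > min_

# in_eval_annulus((10.0, 10.0)) -> False  (~14.1 m, closer than min_)
# in_eval_annulus((30.0, 20.0)) -> True   (~36.1 m)
# in_eval_annulus((50.0, 40.0)) -> False  (~64.0 m, beyond max_)
# --- end of sketch -----------------------------------------------------------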
if verbose: print('Filtering predictions') self.pred_boxes = filter_eval_boxes(nusc, self.pred_boxes, self.cfg.class_range, verbose=verbose) if verbose: print('Filtering ground truth annotations') self.gt_boxes = filter_eval_boxes(nusc, self.gt_boxes, self.cfg.class_range, verbose=verbose) if self.overlap_test: self.pred_boxes = filter_eval_boxes_by_overlap(self.nusc, self.pred_boxes) self.gt_boxes = filter_eval_boxes_by_overlap(self.nusc, self.gt_boxes, verbose=True) min_ = 25 max_ = 60 self.pred_boxes = filter_eval_boxes_by_range(self.nusc, self.pred_boxes, min_=min_-2, max_=max_+2) self.gt_boxes = filter_eval_boxes_by_range(self.nusc, self.gt_boxes, min_=min_, max_=max_) self.all_gt = copy.deepcopy(self.gt_boxes) self.all_preds = copy.deepcopy(self.pred_boxes) self.sample_tokens = self.gt_boxes.sample_tokens self.index_map = {} for scene in nusc.scene: first_sample_token = scene['first_sample_token'] sample = nusc.get('sample', first_sample_token) self.index_map[first_sample_token] = 1 index = 2 while sample['next'] != '': sample = nusc.get('sample', sample['next']) self.index_map[sample['token']] = index index += 1 def update_gt(self, type_='vis', visibility='1', index=1): if type_ == 'vis': self.visibility_test = True if self.visibility_test: '''[{'description': 'visibility of whole object is between 0 and 40%', 'token': '1', 'level': 'v0-40'}, {'description': 'visibility of whole object is between 40 and 60%', 'token': '2', 'level': 'v40-60'}, {'description': 'visibility of whole object is between 60 and 80%', 'token': '3', 'level': 'v60-80'}, {'description': 'visibility of whole object is between 80 and 100%', 'token': '4', 'level': 'v80-100'}]''' self.gt_boxes = filter_eval_boxes_by_visibility(self.all_gt, visibility, verbose=True) elif type_ == 'ord': valid_tokens = [key for (key, value) in self.index_map.items() if value == index] # from IPython import embed # embed() self.gt_boxes = filter_by_sample_token(self.all_gt, valid_tokens) self.pred_boxes = filter_by_sample_token(self.all_preds, valid_tokens) self.sample_tokens = self.gt_boxes.sample_tokens def evaluate(self) -> Tuple[DetectionMetrics, DetectionMetricDataList]: """ Performs the actual evaluation. :return: A tuple of high-level and the raw metric data. """ start_time = time.time() # ----------------------------------- # Step 1: Accumulate metric data for all classes and distance thresholds. # ----------------------------------- if self.verbose: print('Accumulating metric data...') metric_data_list = DetectionMetricDataList() # print(self.cfg.dist_fcn_callable, self.cfg.dist_ths) # self.cfg.dist_ths = [0.3] # self.cfg.dist_fcn_callable for class_name in self.cfg.class_names: for dist_th in self.cfg.dist_ths: md = accumulate(self.gt_boxes, self.pred_boxes, class_name, self.cfg.dist_fcn_callable, dist_th) metric_data_list.set(class_name, dist_th, md) # ----------------------------------- # Step 2: Calculate metrics from the data. # ----------------------------------- if self.verbose: print('Calculating metrics...') metrics = DetectionMetrics(self.cfg) for class_name in self.cfg.class_names: # Compute APs. for dist_th in self.cfg.dist_ths: metric_data = metric_data_list[(class_name, dist_th)] ap = calc_ap(metric_data, self.cfg.min_recall, self.cfg.min_precision) metrics.add_label_ap(class_name, dist_th, ap) # Compute TP metrics. 
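# --- Illustrative sketch (not part of the original file) --------------------
# The AP and TP loops here fill a DetectionMetrics object that the devkit
# later reduces to mAP and NDS. The nuScenes composite score is, per the
# devkit documentation, a weighted sum of mAP and the clipped TP errors; a
# rough standalone helper, assuming the mean TP errors are already known:
def nds_from_summary(mean_ap, mean_tp_errors):
    """mean_tp_errors: e.g. {'trans_err': 0.5, 'scale_err': 0.3, ...}."""
    tp_scores = [1.0 - min(1.0, err) for err in mean_tp_errors.values()]
    return (5.0 * mean_ap + sum(tp_scores)) / (5.0 + len(tp_scores))

# nds_from_summary(0.40, {'trans_err': 0.5, 'scale_err': 0.3, 'orient_err': 0.4,
#                         'vel_err': 0.8, 'attr_err': 0.2}) -> 0.48
# --- end of sketch -----------------------------------------------------------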
for metric_name in TP_METRICS: metric_data = metric_data_list[(class_name, self.cfg.dist_th_tp)] if class_name in ['traffic_cone'] and metric_name in ['attr_err', 'vel_err', 'orient_err']: tp = np.nan elif class_name in ['barrier'] and metric_name in ['attr_err', 'vel_err']: tp = np.nan else: tp = calc_tp(metric_data, self.cfg.min_recall, metric_name) metrics.add_label_tp(class_name, metric_name, tp) # Compute evaluation time. metrics.add_runtime(time.time() - start_time) return metrics, metric_data_list def render(self, metrics: DetectionMetrics, md_list: DetectionMetricDataList) -> None: """ Renders various PR and TP curves. :param metrics: DetectionMetrics instance. :param md_list: DetectionMetricDataList instance. """ if self.verbose: print('Rendering PR and TP curves') def savepath(name): return os.path.join(self.plot_dir, name + '.pdf') summary_plot(md_list, metrics, min_precision=self.cfg.min_precision, min_recall=self.cfg.min_recall, dist_th_tp=self.cfg.dist_th_tp, savepath=savepath('summary')) for detection_name in self.cfg.class_names: class_pr_curve(md_list, metrics, detection_name, self.cfg.min_precision, self.cfg.min_recall, savepath=savepath(detection_name + '_pr')) class_tp_curve(md_list, metrics, detection_name, self.cfg.min_recall, self.cfg.dist_th_tp, savepath=savepath(detection_name + '_tp')) for dist_th in self.cfg.dist_ths: dist_pr_curve(md_list, metrics, dist_th, self.cfg.min_precision, self.cfg.min_recall, savepath=savepath('dist_pr_' + str(dist_th))) def evaluate_mask(self, preds, HDMap, Bbox_mask): if preds is None: return {} self.HDMap = HDMap self.Bbox_mask = Bbox_mask tokens = [each['token'] for each in self.data_infos] mask_shape_flag = f'{HDMap.canvas_size[0]}_{HDMap.canvas_size[1]}_{HDMap.grid_length}' # print(mask_shape) try: masks_gt = np.load(f'.cache/mask_gt_{mask_shape_flag}.npy') except: map_masks_gt = self.prepare_map_mask_gt(tokens) bbox_masks_gt = self.prepare_bbox_mask_gt(tokens) masks_gt = np.concatenate([bbox_masks_gt, map_masks_gt], 1) np.save(f'.cache/mask_gt_{mask_shape_flag}.npy', masks_gt) gt_list = [[], [], [], [], [], []] pred_list = [[], [], [], [], [], []] for i, (token, pred, gt) in enumerate(zip(tokens, preds, masks_gt)): preds_mask = [] for mask in pred: preds_mask.append(mask_util.decode(mask)) preds_mask = np.stack(preds_mask) gt = torch.tensor(gt) gt = torch.flip(gt, [1, 2]) preds_mask = torch.tensor(preds_mask) preds_mask = torch.flip(preds_mask, [1]) for j in range(6): gt_list[j].append(gt[j].reshape(-1)) pred_list[j].append(preds_mask[j].reshape(-1)) # ti = time.time() # save_tensor(preds_mask, f'masks/{token}preds.png') # save_tensor(gt, f'masks/{token}gt.png') # bbox_masks = self.repare_bbox_mask_gt() class_names = [ 'car', 'vehicle', 'ped', 'divider', 'boundary', 'drivable', 'lane' ] results = {} for i, name in enumerate(class_names[:-1]): results[name] = self.get_batch_iou(torch.stack(gt_list[i]), torch.stack(pred_list[i])) # embed() # exit() results['lane'] = self.get_batch_iou(torch.stack(gt_list[-3]) | torch.stack(gt_list[-2]), torch.stack(pred_list[-2]) | torch.stack(pred_list[-3])) return results if __name__ == "__main__": # Settings. 
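# --- Illustrative sketch (not part of the original file) --------------------
# evaluate_mask() above compares flattened binary BEV masks class by class via
# self.get_batch_iou (defined elsewhere in the project). An assumed, minimal
# equivalent of that intersection-over-union computation:
import torch

def batch_iou(gt, pred, eps=1e-6):
    """gt, pred: 0/1 tensors of shape (N, H*W); returns a scalar IoU."""
    gt, pred = gt.bool(), pred.bool()
    inter = (gt & pred).sum().float()
    union = (gt | pred).sum().float()
    return (inter / (union + eps)).item()

# batch_iou(torch.tensor([[1, 1, 0, 0]]), torch.tensor([[1, 0, 1, 0]])) -> ~0.33
# --- end of sketch -----------------------------------------------------------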
parser = argparse.ArgumentParser(description='Evaluate nuScenes detection results.', formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument('result_path', type=str, help='The submission as a JSON file.') parser.add_argument('--output_dir', type=str, default='~/nuscenes-metrics', help='Folder to store result metrics, graphs and example visualizations.') parser.add_argument('--eval_set', type=str, default='val', help='Which dataset split to evaluate on, train, val or test.') parser.add_argument('--dataroot', type=str, default='data/nuscenes', help='Default nuScenes data directory.') parser.add_argument('--version', type=str, default='v1.0-trainval', help='Which version of the nuScenes dataset to evaluate on, e.g. v1.0-trainval.') parser.add_argument('--config_path', type=str, default='', help='Path to the configuration file.' 'If no path given, the CVPR 2019 configuration will be used.') parser.add_argument('--plot_examples', type=int, default=0, help='How many example visualizations to write to disk.') parser.add_argument('--render_curves', type=int, default=1, help='Whether to render PR and TP curves to disk.') parser.add_argument('--verbose', type=int, default=1, help='Whether to print to stdout.') args = parser.parse_args() result_path_ = os.path.expanduser(args.result_path) output_dir_ = os.path.expanduser(args.output_dir) eval_set_ = args.eval_set dataroot_ = args.dataroot version_ = args.version config_path = args.config_path plot_examples_ = args.plot_examples render_curves_ = bool(args.render_curves) verbose_ = bool(args.verbose) if config_path == '': cfg_ = config_factory('detection_cvpr_2019') else: with open(config_path, 'r') as _f: cfg_ = DetectionConfig.deserialize(json.load(_f)) nusc_ = NuScenes(version=version_, verbose=verbose_, dataroot=dataroot_) nusc_eval = NuScenesEval_custom(nusc_, config=cfg_, result_path=result_path_, eval_set=eval_set_, output_dir=output_dir_, verbose=verbose_) for vis in ['1', '2', '3', '4']: nusc_eval.update_gt(type_='vis', visibility=vis) print(f'================ {vis} ===============') nusc_eval.main(plot_examples=plot_examples_, render_curves=render_curves_) #for index in range(1, 41): # nusc_eval.update_gt(type_='ord', index=index) # ================================================ FILE: mmdet3d/datasets/nuscenes_mono_dataset.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import copy import tempfile import warnings from os import path as osp import mmcv import numpy as np import pyquaternion import torch from nuscenes.utils.data_classes import Box as NuScenesBox from mmdet3d.core import bbox3d2result, box3d_multiclass_nms, xywhr2xyxyr from mmdet.datasets import CocoDataset from ..core import show_multi_modality_result from ..core.bbox import CameraInstance3DBoxes, get_box_type from .builder import DATASETS from .pipelines import Compose from .utils import extract_result_dict, get_loading_pipeline @DATASETS.register_module() class NuScenesMonoDataset(CocoDataset): r"""Monocular 3D detection on NuScenes Dataset. This class serves as the API for experiments on the NuScenes Dataset. Please refer to `NuScenes Dataset `_ for data downloading. Args: ann_file (str): Path of annotation file. data_root (str): Path of dataset root. load_interval (int, optional): Interval of loading the dataset. It is used to uniformly sample the dataset. Defaults to 1. with_velocity (bool, optional): Whether include velocity prediction into the experiments. Defaults to True. 
modality (dict, optional): Modality to specify the sensor data used as input. Defaults to None. box_type_3d (str, optional): Type of 3D box of this dataset. Based on the `box_type_3d`, the dataset will encapsulate the box to its original format then converted them to `box_type_3d`. Defaults to 'Camera' in this class. Available options includes. - 'LiDAR': Box in LiDAR coordinates. - 'Depth': Box in depth coordinates, usually for indoor dataset. - 'Camera': Box in camera coordinates. eval_version (str, optional): Configuration version of evaluation. Defaults to 'detection_cvpr_2019'. use_valid_flag (bool, optional): Whether to use `use_valid_flag` key in the info file as mask to filter gt_boxes and gt_names. Defaults to False. version (str, optional): Dataset version. Defaults to 'v1.0-trainval'. """ CLASSES = ('car', 'truck', 'trailer', 'bus', 'construction_vehicle', 'bicycle', 'motorcycle', 'pedestrian', 'traffic_cone', 'barrier') DefaultAttribute = { 'car': 'vehicle.parked', 'pedestrian': 'pedestrian.moving', 'trailer': 'vehicle.parked', 'truck': 'vehicle.parked', 'bus': 'vehicle.moving', 'motorcycle': 'cycle.without_rider', 'construction_vehicle': 'vehicle.parked', 'bicycle': 'cycle.without_rider', 'barrier': '', 'traffic_cone': '', } # https://github.com/nutonomy/nuscenes-devkit/blob/57889ff20678577025326cfc24e57424a829be0a/python-sdk/nuscenes/eval/detection/evaluate.py#L222 # noqa ErrNameMapping = { 'trans_err': 'mATE', 'scale_err': 'mASE', 'orient_err': 'mAOE', 'vel_err': 'mAVE', 'attr_err': 'mAAE' } def __init__(self, data_root, ann_file, pipeline, load_interval=1, with_velocity=True, modality=None, box_type_3d='Camera', eval_version='detection_cvpr_2019', use_valid_flag=False, version='v1.0-trainval', classes=None, img_prefix='', seg_prefix=None, proposal_file=None, test_mode=False, filter_empty_gt=True, file_client_args=dict(backend='disk')): self.ann_file = ann_file self.data_root = data_root self.img_prefix = img_prefix self.seg_prefix = seg_prefix self.proposal_file = proposal_file self.test_mode = test_mode self.filter_empty_gt = filter_empty_gt self.CLASSES = self.get_classes(classes) self.file_client = mmcv.FileClient(**file_client_args) # load annotations (and proposals) with self.file_client.get_local_path(self.ann_file) as local_path: self.data_infos = self.load_annotations(local_path) if self.proposal_file is not None: with self.file_client.get_local_path( self.proposal_file) as local_path: self.proposals = self.load_proposals(local_path) else: self.proposals = None # filter images too small and containing no annotations if not test_mode: valid_inds = self._filter_imgs() self.data_infos = [self.data_infos[i] for i in valid_inds] if self.proposals is not None: self.proposals = [self.proposals[i] for i in valid_inds] # set group flag for the sampler self._set_group_flag() # processing pipeline self.pipeline = Compose(pipeline) self.load_interval = load_interval self.with_velocity = with_velocity self.modality = modality self.box_type_3d, self.box_mode_3d = get_box_type(box_type_3d) self.eval_version = eval_version self.use_valid_flag = use_valid_flag self.bbox_code_size = 9 self.version = version if self.eval_version is not None: from nuscenes.eval.detection.config import config_factory self.eval_detection_configs = config_factory(self.eval_version) if self.modality is None: self.modality = dict( use_camera=True, use_lidar=False, use_radar=False, use_map=False, use_external=False) def pre_pipeline(self, results): """Initialization before data preparation. 
Args: results (dict): Dict before data preprocessing. - img_fields (list): Image fields. - bbox3d_fields (list): 3D bounding boxes fields. - pts_mask_fields (list): Mask fields of points. - pts_seg_fields (list): Mask fields of point segments. - bbox_fields (list): Fields of bounding boxes. - mask_fields (list): Fields of masks. - seg_fields (list): Segment fields. - box_type_3d (str): 3D box type. - box_mode_3d (str): 3D box mode. """ results['img_prefix'] = self.img_prefix results['seg_prefix'] = self.seg_prefix results['proposal_file'] = self.proposal_file results['img_fields'] = [] results['bbox3d_fields'] = [] results['pts_mask_fields'] = [] results['pts_seg_fields'] = [] results['bbox_fields'] = [] results['mask_fields'] = [] results['seg_fields'] = [] results['box_type_3d'] = self.box_type_3d results['box_mode_3d'] = self.box_mode_3d def _parse_ann_info(self, img_info, ann_info): """Parse bbox annotation. Args: img_info (list[dict]): Image info. ann_info (list[dict]): Annotation info of an image. Returns: dict: A dict containing the following keys: bboxes, labels, gt_bboxes_3d, gt_labels_3d, attr_labels, centers2d, depths, bboxes_ignore, masks, seg_map """ gt_bboxes = [] gt_labels = [] attr_labels = [] gt_bboxes_ignore = [] gt_masks_ann = [] gt_bboxes_cam3d = [] centers2d = [] depths = [] for i, ann in enumerate(ann_info): if ann.get('ignore', False): continue x1, y1, w, h = ann['bbox'] inter_w = max(0, min(x1 + w, img_info['width']) - max(x1, 0)) inter_h = max(0, min(y1 + h, img_info['height']) - max(y1, 0)) if inter_w * inter_h == 0: continue if ann['area'] <= 0 or w < 1 or h < 1: continue if ann['category_id'] not in self.cat_ids: continue bbox = [x1, y1, x1 + w, y1 + h] if ann.get('iscrowd', False): gt_bboxes_ignore.append(bbox) else: gt_bboxes.append(bbox) gt_labels.append(self.cat2label[ann['category_id']]) attr_labels.append(ann['attribute_id']) gt_masks_ann.append(ann.get('segmentation', None)) # 3D annotations in camera coordinates bbox_cam3d = np.array(ann['bbox_cam3d']).reshape(1, -1) velo_cam3d = np.array(ann['velo_cam3d']).reshape(1, 2) nan_mask = np.isnan(velo_cam3d[:, 0]) velo_cam3d[nan_mask] = [0.0, 0.0] bbox_cam3d = np.concatenate([bbox_cam3d, velo_cam3d], axis=-1) gt_bboxes_cam3d.append(bbox_cam3d.squeeze()) # 2.5D annotations in camera coordinates center2d = ann['center2d'][:2] depth = ann['center2d'][2] centers2d.append(center2d) depths.append(depth) if gt_bboxes: gt_bboxes = np.array(gt_bboxes, dtype=np.float32) gt_labels = np.array(gt_labels, dtype=np.int64) attr_labels = np.array(attr_labels, dtype=np.int64) else: gt_bboxes = np.zeros((0, 4), dtype=np.float32) gt_labels = np.array([], dtype=np.int64) attr_labels = np.array([], dtype=np.int64) if gt_bboxes_cam3d: gt_bboxes_cam3d = np.array(gt_bboxes_cam3d, dtype=np.float32) centers2d = np.array(centers2d, dtype=np.float32) depths = np.array(depths, dtype=np.float32) else: gt_bboxes_cam3d = np.zeros((0, self.bbox_code_size), dtype=np.float32) centers2d = np.zeros((0, 2), dtype=np.float32) depths = np.zeros((0), dtype=np.float32) gt_bboxes_cam3d = CameraInstance3DBoxes( gt_bboxes_cam3d, box_dim=gt_bboxes_cam3d.shape[-1], origin=(0.5, 0.5, 0.5)) gt_labels_3d = copy.deepcopy(gt_labels) if gt_bboxes_ignore: gt_bboxes_ignore = np.array(gt_bboxes_ignore, dtype=np.float32) else: gt_bboxes_ignore = np.zeros((0, 4), dtype=np.float32) seg_map = img_info['filename'].replace('jpg', 'png') ann = dict( bboxes=gt_bboxes, labels=gt_labels, gt_bboxes_3d=gt_bboxes_cam3d, gt_labels_3d=gt_labels_3d, attr_labels=attr_labels, 
centers2d=centers2d, depths=depths, bboxes_ignore=gt_bboxes_ignore, masks=gt_masks_ann, seg_map=seg_map) return ann def get_attr_name(self, attr_idx, label_name): """Get attribute from predicted index. This is a workaround to predict attribute when the predicted velocity is not reliable. We map the predicted attribute index to the one in the attribute set. If it is consistent with the category, we will keep it. Otherwise, we will use the default attribute. Args: attr_idx (int): Attribute index. label_name (str): Predicted category name. Returns: str: Predicted attribute name. """ # TODO: Simplify the variable name AttrMapping_rev2 = [ 'cycle.with_rider', 'cycle.without_rider', 'pedestrian.moving', 'pedestrian.standing', 'pedestrian.sitting_lying_down', 'vehicle.moving', 'vehicle.parked', 'vehicle.stopped', 'None' ] if label_name == 'car' or label_name == 'bus' \ or label_name == 'truck' or label_name == 'trailer' \ or label_name == 'construction_vehicle': if AttrMapping_rev2[attr_idx] == 'vehicle.moving' or \ AttrMapping_rev2[attr_idx] == 'vehicle.parked' or \ AttrMapping_rev2[attr_idx] == 'vehicle.stopped': return AttrMapping_rev2[attr_idx] else: return NuScenesMonoDataset.DefaultAttribute[label_name] elif label_name == 'pedestrian': if AttrMapping_rev2[attr_idx] == 'pedestrian.moving' or \ AttrMapping_rev2[attr_idx] == 'pedestrian.standing' or \ AttrMapping_rev2[attr_idx] == \ 'pedestrian.sitting_lying_down': return AttrMapping_rev2[attr_idx] else: return NuScenesMonoDataset.DefaultAttribute[label_name] elif label_name == 'bicycle' or label_name == 'motorcycle': if AttrMapping_rev2[attr_idx] == 'cycle.with_rider' or \ AttrMapping_rev2[attr_idx] == 'cycle.without_rider': return AttrMapping_rev2[attr_idx] else: return NuScenesMonoDataset.DefaultAttribute[label_name] else: return NuScenesMonoDataset.DefaultAttribute[label_name] def _format_bbox(self, results, jsonfile_prefix=None): """Convert the results to the standard format. Args: results (list[dict]): Testing results of the dataset. jsonfile_prefix (str): The prefix of the output jsonfile. You can specify the output directory/filename by modifying the jsonfile_prefix. Default: None. Returns: str: Path of the output json file. 
""" nusc_annos = {} mapped_class_names = self.CLASSES print('Start to convert detection format...') CAM_NUM = 6 for sample_id, det in enumerate(mmcv.track_iter_progress(results)): if sample_id % CAM_NUM == 0: boxes_per_frame = [] attrs_per_frame = [] # need to merge results from images of the same sample annos = [] boxes, attrs = output_to_nusc_box(det) sample_token = self.data_infos[sample_id]['token'] boxes, attrs = cam_nusc_box_to_global(self.data_infos[sample_id], boxes, attrs, mapped_class_names, self.eval_detection_configs, self.eval_version) boxes_per_frame.extend(boxes) attrs_per_frame.extend(attrs) # Remove redundant predictions caused by overlap of images if (sample_id + 1) % CAM_NUM != 0: continue boxes = global_nusc_box_to_cam( self.data_infos[sample_id + 1 - CAM_NUM], boxes_per_frame, mapped_class_names, self.eval_detection_configs, self.eval_version) cam_boxes3d, scores, labels = nusc_box_to_cam_box3d(boxes) # box nms 3d over 6 images in a frame # TODO: move this global setting into config nms_cfg = dict( use_rotate_nms=True, nms_across_levels=False, nms_pre=4096, nms_thr=0.05, score_thr=0.01, min_bbox_size=0, max_per_frame=500) from mmcv import Config nms_cfg = Config(nms_cfg) cam_boxes3d_for_nms = xywhr2xyxyr(cam_boxes3d.bev) boxes3d = cam_boxes3d.tensor # generate attr scores from attr labels attrs = labels.new_tensor([attr for attr in attrs_per_frame]) boxes3d, scores, labels, attrs = box3d_multiclass_nms( boxes3d, cam_boxes3d_for_nms, scores, nms_cfg.score_thr, nms_cfg.max_per_frame, nms_cfg, mlvl_attr_scores=attrs) cam_boxes3d = CameraInstance3DBoxes(boxes3d, box_dim=9) det = bbox3d2result(cam_boxes3d, scores, labels, attrs) boxes, attrs = output_to_nusc_box(det) boxes, attrs = cam_nusc_box_to_global( self.data_infos[sample_id + 1 - CAM_NUM], boxes, attrs, mapped_class_names, self.eval_detection_configs, self.eval_version) for i, box in enumerate(boxes): name = mapped_class_names[box.label] attr = self.get_attr_name(attrs[i], name) nusc_anno = dict( sample_token=sample_token, translation=box.center.tolist(), size=box.wlh.tolist(), rotation=box.orientation.elements.tolist(), velocity=box.velocity[:2].tolist(), detection_name=name, detection_score=box.score, attribute_name=attr) annos.append(nusc_anno) # other views results of the same frame should be concatenated if sample_token in nusc_annos: nusc_annos[sample_token].extend(annos) else: nusc_annos[sample_token] = annos nusc_submissions = { 'meta': self.modality, 'results': nusc_annos, } mmcv.mkdir_or_exist(jsonfile_prefix) res_path = osp.join(jsonfile_prefix, 'results_nusc.json') print('Results writes to', res_path) mmcv.dump(nusc_submissions, res_path) return res_path def _evaluate_single(self, result_path, logger=None, metric='bbox', result_name='img_bbox'): """Evaluation for a single model in nuScenes protocol. Args: result_path (str): Path of the result file. logger (logging.Logger | str, optional): Logger used for printing related information during evaluation. Default: None. metric (str, optional): Metric name used for evaluation. Default: 'bbox'. result_name (str, optional): Result name in the metric prefix. Default: 'img_bbox'. Returns: dict: Dictionary of evaluation details. 
""" from nuscenes import NuScenes from nuscenes.eval.detection.evaluate import NuScenesEval output_dir = osp.join(*osp.split(result_path)[:-1]) nusc = NuScenes( version=self.version, dataroot=self.data_root, verbose=False) eval_set_map = { 'v1.0-mini': 'mini_val', 'v1.0-trainval': 'val', } nusc_eval = NuScenesEval( nusc, config=self.eval_detection_configs, result_path=result_path, eval_set=eval_set_map[self.version], output_dir=output_dir, verbose=False) nusc_eval.main(render_curves=True) # record metrics metrics = mmcv.load(osp.join(output_dir, 'metrics_summary.json')) detail = dict() metric_prefix = f'{result_name}_NuScenes' for name in self.CLASSES: for k, v in metrics['label_aps'][name].items(): val = float('{:.4f}'.format(v)) detail['{}/{}_AP_dist_{}'.format(metric_prefix, name, k)] = val for k, v in metrics['label_tp_errors'][name].items(): val = float('{:.4f}'.format(v)) detail['{}/{}_{}'.format(metric_prefix, name, k)] = val for k, v in metrics['tp_errors'].items(): val = float('{:.4f}'.format(v)) detail['{}/{}'.format(metric_prefix, self.ErrNameMapping[k])] = val detail['{}/NDS'.format(metric_prefix)] = metrics['nd_score'] detail['{}/mAP'.format(metric_prefix)] = metrics['mean_ap'] return detail def format_results(self, results, jsonfile_prefix=None, **kwargs): """Format the results to json (standard format for COCO evaluation). Args: results (list[tuple | numpy.ndarray]): Testing results of the dataset. jsonfile_prefix (str): The prefix of json files. It includes the file path and the prefix of filename, e.g., "a/b/prefix". If not specified, a temp file will be created. Default: None. Returns: tuple: (result_files, tmp_dir), result_files is a dict containing the json filepaths, tmp_dir is the temporal directory created for saving json files when jsonfile_prefix is not specified. """ assert isinstance(results, list), 'results must be a list' assert len(results) == len(self), ( 'The length of results is not equal to the dataset len: {} != {}'. format(len(results), len(self))) if jsonfile_prefix is None: tmp_dir = tempfile.TemporaryDirectory() jsonfile_prefix = osp.join(tmp_dir.name, 'results') else: tmp_dir = None # currently the output prediction results could be in two formats # 1. list of dict('boxes_3d': ..., 'scores_3d': ..., 'labels_3d': ...) # 2. list of dict('pts_bbox' or 'img_bbox': # dict('boxes_3d': ..., 'scores_3d': ..., 'labels_3d': ...)) # this is a workaround to enable evaluation of both formats on nuScenes # refer to https://github.com/open-mmlab/mmdetection3d/issues/449 if not ('pts_bbox' in results[0] or 'img_bbox' in results[0]): result_files = self._format_bbox(results, jsonfile_prefix) else: # should take the inner dict out of 'pts_bbox' or 'img_bbox' dict result_files = dict() for name in results[0]: # not evaluate 2D predictions on nuScenes if '2d' in name: continue print(f'\nFormating bboxes of {name}') results_ = [out[name] for out in results] tmp_file_ = osp.join(jsonfile_prefix, name) result_files.update( {name: self._format_bbox(results_, tmp_file_)}) return result_files, tmp_dir def evaluate(self, results, metric='bbox', logger=None, jsonfile_prefix=None, result_names=['img_bbox'], show=False, out_dir=None, pipeline=None): """Evaluation in nuScenes protocol. Args: results (list[dict]): Testing results of the dataset. metric (str | list[str], optional): Metrics to be evaluated. Default: 'bbox'. logger (logging.Logger | str, optional): Logger used for printing related information during evaluation. Default: None. 
jsonfile_prefix (str): The prefix of json files. It includes the file path and the prefix of filename, e.g., "a/b/prefix". If not specified, a temp file will be created. Default: None. result_names (list[str], optional): Result names in the metric prefix. Default: ['img_bbox']. show (bool, optional): Whether to visualize. Default: False. out_dir (str, optional): Path to save the visualization results. Default: None. pipeline (list[dict], optional): raw data loading for showing. Default: None. Returns: dict[str, float]: Results of each evaluation metric. """ result_files, tmp_dir = self.format_results(results, jsonfile_prefix) if isinstance(result_files, dict): results_dict = dict() for name in result_names: print('Evaluating bboxes of {}'.format(name)) ret_dict = self._evaluate_single(result_files[name]) results_dict.update(ret_dict) elif isinstance(result_files, str): results_dict = self._evaluate_single(result_files) if tmp_dir is not None: tmp_dir.cleanup() if show or out_dir: self.show(results, out_dir, pipeline=pipeline) return results_dict def _extract_data(self, index, pipeline, key, load_annos=False): """Load data using input pipeline and extract data according to key. Args: index (int): Index for accessing the target data. pipeline (:obj:`Compose`): Composed data loading pipeline. key (str | list[str]): One single or a list of data key. load_annos (bool): Whether to load data annotations. If True, need to set self.test_mode as False before loading. Returns: np.ndarray | torch.Tensor | list[np.ndarray | torch.Tensor]: A single or a list of loaded data. """ assert pipeline is not None, 'data loading pipeline is not provided' img_info = self.data_infos[index] input_dict = dict(img_info=img_info) if load_annos: ann_info = self.get_ann_info(index) input_dict.update(dict(ann_info=ann_info)) self.pre_pipeline(input_dict) example = pipeline(input_dict) # extract data items according to keys if isinstance(key, str): data = extract_result_dict(example, key) else: data = [extract_result_dict(example, k) for k in key] return data def _get_pipeline(self, pipeline): """Get data loading pipeline in self.show/evaluate function. Args: pipeline (list[dict]): Input pipeline. If None is given, get from self.pipeline. """ if pipeline is None: if not hasattr(self, 'pipeline') or self.pipeline is None: warnings.warn( 'Use default pipeline for data loading, this may cause ' 'errors when data is on ceph') return self._build_default_pipeline() loading_pipeline = get_loading_pipeline(self.pipeline.transforms) return Compose(loading_pipeline) return Compose(pipeline) def _build_default_pipeline(self): """Build the default pipeline for this dataset.""" pipeline = [ dict(type='LoadImageFromFileMono3D'), dict( type='DefaultFormatBundle3D', class_names=self.CLASSES, with_label=False), dict(type='Collect3D', keys=['img']) ] return Compose(pipeline) def show(self, results, out_dir, show=False, pipeline=None): """Results visualization. Args: results (list[dict]): List of bounding boxes results. out_dir (str): Output directory of visualization result. show (bool): Whether to visualize the results online. Default: False. pipeline (list[dict], optional): raw data loading for showing. Default: None. """ assert out_dir is not None, 'Expect out_dir, got none.' 
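# --- Illustrative sketch (not part of the original file) --------------------
# _format_bbox() above walks the per-camera results in groups of CAM_NUM=6 and
# only converts/merges once the last camera of a frame has been seen; that is
# what the (sample_id + 1) % CAM_NUM and sample_id + 1 - CAM_NUM indexing does.
# A stripped-down view of that grouping on dummy indices:
CAM_NUM = 6

def group_by_frame(num_results):
    """Yield (first index of the frame, indices of its 6 camera results)."""
    for sample_id in range(num_results):
        if sample_id % CAM_NUM == 0:
            frame = []
        frame.append(sample_id)
        if (sample_id + 1) % CAM_NUM == 0:
            yield sample_id + 1 - CAM_NUM, frame

# list(group_by_frame(12)) -> [(0, [0, 1, 2, 3, 4, 5]), (6, [6, 7, 8, 9, 10, 11])]
# --- end of sketch -----------------------------------------------------------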
pipeline = self._get_pipeline(pipeline) for i, result in enumerate(results): if 'img_bbox' in result.keys(): result = result['img_bbox'] data_info = self.data_infos[i] img_path = data_info['file_name'] file_name = osp.split(img_path)[-1].split('.')[0] img, img_metas = self._extract_data(i, pipeline, ['img', 'img_metas']) # need to transpose channel to first dim img = img.numpy().transpose(1, 2, 0) gt_bboxes = self.get_ann_info(i)['gt_bboxes_3d'] pred_bboxes = result['boxes_3d'] show_multi_modality_result( img, gt_bboxes, pred_bboxes, img_metas['cam2img'], out_dir, file_name, box_mode='camera', show=show) def output_to_nusc_box(detection): """Convert the output to the box class in the nuScenes. Args: detection (dict): Detection results. - boxes_3d (:obj:`BaseInstance3DBoxes`): Detection bbox. - scores_3d (torch.Tensor): Detection scores. - labels_3d (torch.Tensor): Predicted box labels. - attrs_3d (torch.Tensor, optional): Predicted attributes. Returns: list[:obj:`NuScenesBox`]: List of standard NuScenesBoxes. """ box3d = detection['boxes_3d'] scores = detection['scores_3d'].numpy() labels = detection['labels_3d'].numpy() attrs = None if 'attrs_3d' in detection: attrs = detection['attrs_3d'].numpy() box_gravity_center = box3d.gravity_center.numpy() box_dims = box3d.dims.numpy() box_yaw = box3d.yaw.numpy() # convert the dim/rot to nuscbox convention box_dims[:, [0, 1, 2]] = box_dims[:, [2, 0, 1]] box_yaw = -box_yaw box_list = [] for i in range(len(box3d)): q1 = pyquaternion.Quaternion(axis=[0, 0, 1], radians=box_yaw[i]) q2 = pyquaternion.Quaternion(axis=[1, 0, 0], radians=np.pi / 2) quat = q2 * q1 velocity = (box3d.tensor[i, 7], 0.0, box3d.tensor[i, 8]) box = NuScenesBox( box_gravity_center[i], box_dims[i], quat, label=labels[i], score=scores[i], velocity=velocity) box_list.append(box) return box_list, attrs def cam_nusc_box_to_global(info, boxes, attrs, classes, eval_configs, eval_version='detection_cvpr_2019'): """Convert the box from camera to global coordinate. Args: info (dict): Info for a specific sample data, including the calibration information. boxes (list[:obj:`NuScenesBox`]): List of predicted NuScenesBoxes. classes (list[str]): Mapped classes in the evaluation. eval_configs (object): Evaluation configuration object. eval_version (str, optional): Evaluation version. Default: 'detection_cvpr_2019' Returns: list: List of standard NuScenesBoxes in the global coordinate. """ box_list = [] attr_list = [] for (box, attr) in zip(boxes, attrs): # Move box to ego vehicle coord system box.rotate(pyquaternion.Quaternion(info['cam2ego_rotation'])) box.translate(np.array(info['cam2ego_translation'])) # filter det in ego. cls_range_map = eval_configs.class_range radius = np.linalg.norm(box.center[:2], 2) det_range = cls_range_map[classes[box.label]] if radius > det_range: continue # Move box to global coord system box.rotate(pyquaternion.Quaternion(info['ego2global_rotation'])) box.translate(np.array(info['ego2global_translation'])) box_list.append(box) attr_list.append(attr) return box_list, attr_list def global_nusc_box_to_cam(info, boxes, classes, eval_configs, eval_version='detection_cvpr_2019'): """Convert the box from global to camera coordinate. Args: info (dict): Info for a specific sample data, including the calibration information. boxes (list[:obj:`NuScenesBox`]): List of predicted NuScenesBoxes. classes (list[str]): Mapped classes in the evaluation. eval_configs (object): Evaluation configuration object. eval_version (str, optional): Evaluation version. 
Default: 'detection_cvpr_2019' Returns: list: List of standard NuScenesBoxes in the global coordinate. """ box_list = [] for box in boxes: # Move box to ego vehicle coord system box.translate(-np.array(info['ego2global_translation'])) box.rotate( pyquaternion.Quaternion(info['ego2global_rotation']).inverse) # filter det in ego. cls_range_map = eval_configs.class_range radius = np.linalg.norm(box.center[:2], 2) det_range = cls_range_map[classes[box.label]] if radius > det_range: continue # Move box to camera coord system box.translate(-np.array(info['cam2ego_translation'])) box.rotate(pyquaternion.Quaternion(info['cam2ego_rotation']).inverse) box_list.append(box) return box_list def nusc_box_to_cam_box3d(boxes): """Convert boxes from :obj:`NuScenesBox` to :obj:`CameraInstance3DBoxes`. Args: boxes (list[:obj:`NuScenesBox`]): List of predicted NuScenesBoxes. Returns: tuple (:obj:`CameraInstance3DBoxes` | torch.Tensor | torch.Tensor): Converted 3D bounding boxes, scores and labels. """ locs = torch.Tensor([b.center for b in boxes]).view(-1, 3) dims = torch.Tensor([b.wlh for b in boxes]).view(-1, 3) rots = torch.Tensor([b.orientation.yaw_pitch_roll[0] for b in boxes]).view(-1, 1) velocity = torch.Tensor([b.velocity[0::2] for b in boxes]).view(-1, 2) # convert nusbox to cambox convention dims[:, [0, 1, 2]] = dims[:, [1, 2, 0]] rots = -rots boxes_3d = torch.cat([locs, dims, rots, velocity], dim=1).cuda() cam_boxes3d = CameraInstance3DBoxes( boxes_3d, box_dim=9, origin=(0.5, 0.5, 0.5)) scores = torch.Tensor([b.score for b in boxes]).cuda() labels = torch.LongTensor([b.label for b in boxes]).cuda() nms_scores = scores.new_zeros(scores.shape[0], 10 + 1) indices = labels.new_tensor(list(range(scores.shape[0]))) nms_scores[indices, labels] = scores return cam_boxes3d, nms_scores, labels ================================================ FILE: mmdet3d/datasets/occ_metrics.py ================================================ import numpy as np import os from pathlib import Path from tqdm import tqdm import pickle as pkl import argparse import time import torch import sys, platform from sklearn.neighbors import KDTree from termcolor import colored from pathlib import Path from copy import deepcopy from functools import reduce np.seterr(divide='ignore', invalid='ignore') os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE" def pcolor(string, color, on_color=None, attrs=None): """ Produces a colored string for printing Parameters ---------- string : str String that will be colored color : str Color to use on_color : str Background color to use attrs : list of str Different attributes for the string Returns ------- string: str Colored string """ return colored(string, color, on_color, attrs) def getCellCoordinates(points, voxelSize): return (points / voxelSize).astype(np.int) def getNumUniqueCells(cells): M = cells.max() + 1 return np.unique(cells[:, 0] + M * cells[:, 1] + M ** 2 * cells[:, 2]).shape[0] class Metric_mIoU(): def __init__(self, save_dir='.', num_classes=18, use_lidar_mask=False, use_image_mask=False, min_d = -1, max_d = 100, ): self.class_names = ['others','barrier', 'bicycle', 'bus', 'car', 'construction_vehicle', 'motorcycle', 'pedestrian', 'traffic_cone', 'trailer', 'truck', 'driveable_surface', 'other_flat', 'sidewalk', 'terrain', 'manmade', 'vegetation','free'] self.save_dir = save_dir self.use_lidar_mask = use_lidar_mask self.use_image_mask = use_image_mask self.num_classes = num_classes self.point_cloud_range = [-40.0, -40.0, -1.0, 40.0, 40.0, 5.4] self.occupancy_size = [0.4, 0.4, 0.4] 
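# --- Illustrative sketch (not part of the original file) --------------------
# With point_cloud_range = [-40, -40, -1, 40, 40, 5.4] and 0.4 m voxels, the
# occupancy grid dimensions computed just below come out to 200 x 200 x 16:
point_cloud_range = [-40.0, -40.0, -1.0, 40.0, 40.0, 5.4]
occupancy_size = [0.4, 0.4, 0.4]

occ_xdim = int((point_cloud_range[3] - point_cloud_range[0]) / occupancy_size[0])
occ_ydim = int((point_cloud_range[4] - point_cloud_range[1]) / occupancy_size[1])
occ_zdim = int((point_cloud_range[5] - point_cloud_range[2]) / occupancy_size[2])

assert (occ_xdim, occ_ydim, occ_zdim) == (200, 200, 16)  # 640,000 voxels in total
# --- end of sketch -----------------------------------------------------------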
self.voxel_size = 0.4 self.occ_xdim = int((self.point_cloud_range[3] - self.point_cloud_range[0]) / self.occupancy_size[0]) self.occ_ydim = int((self.point_cloud_range[4] - self.point_cloud_range[1]) / self.occupancy_size[1]) self.occ_zdim = int((self.point_cloud_range[5] - self.point_cloud_range[2]) / self.occupancy_size[2]) self.voxel_num = self.occ_xdim * self.occ_ydim * self.occ_zdim self.hist = np.zeros((self.num_classes, self.num_classes)) self.cnt = 0 self.max_d = max_d self.min_d = min_d def hist_info(self, n_cl, pred, gt): """ build confusion matrix # empty classes:0 non-empty class: 0-16 free voxel class: 17 Args: n_cl (int): num_classes_occupancy pred (1-d array): pred_occupancy_label gt (1-d array): gt_occupancu_label Returns: tuple:(hist, correctly number_predicted_labels, num_labelled_sample) """ assert pred.shape == gt.shape k = (gt >= 0) & (gt < n_cl) # exclude 255 labeled = np.sum(k) correct = np.sum((pred[k] == gt[k])) return ( np.bincount( n_cl * gt[k].astype(int) + pred[k].astype(int), minlength=n_cl ** 2 ).reshape(n_cl, n_cl), correct, labeled, ) def per_class_iu(self, hist): return np.diag(hist) / (hist.sum(1) + hist.sum(0) - np.diag(hist)) def compute_mIoU(self, pred, label, n_classes): hist = np.zeros((n_classes, n_classes)) new_hist, correct, labeled = self.hist_info(n_classes, pred.flatten(), label.flatten()) hist += new_hist mIoUs = self.per_class_iu(hist) # for ind_class in range(n_classes): # print(str(round(mIoUs[ind_class] * 100, 2))) # print('===> mIoU: ' + str(round(np.nanmean(mIoUs) * 100, 2))) return round(np.nanmean(mIoUs) * 100, 2), hist def add_batch(self,semantics_pred,semantics_gt,mask_lidar,mask_camera): self.cnt += 1 if len(semantics_pred.shape) == 4 or len(semantics_pred.shape) == 2: semantics_pred = semantics_pred.argmax(-1) if len(semantics_pred.shape) == 1: semantics_pred_ = deepcopy(semantics_gt) semantics_pred_[mask_camera] = semantics_pred semantics_pred = semantics_pred_ xx, yy = np.meshgrid(np.arange(200), np.arange(200)) mask = (np.stack([yy, xx], -1) -100) * 0.4 distance_map = np.linalg.norm(mask, 2, -1) distance_map = (distance_map<=self.max_d) & (distance_map>=self.min_d) # print(semantics_pred.shape) # from IPython import embed # embed() # exit() # semantics_pred = semantics_pred[distance_map] # semantics_gt = semantics_gt[distance_map] # mask_camera = mask_camera[distance_map] mask_camera = mask_camera & distance_map[:,:, None] assert self.use_image_mask if self.use_image_mask: masked_semantics_gt = semantics_gt[mask_camera] if len(semantics_pred.shape) == 3: masked_semantics_pred = semantics_pred[mask_camera] elif len(semantics_pred.shape) == 1: masked_semantics_pred = semantics_pred elif self.use_lidar_mask: masked_semantics_gt = semantics_gt[mask_lidar] masked_semantics_pred = semantics_pred[mask_lidar] else: masked_semantics_gt = semantics_gt masked_semantics_pred = semantics_pred # # pred = np.random.randint(low=0, high=17, size=masked_semantics.shape) _, _hist = self.compute_mIoU(masked_semantics_pred, masked_semantics_gt, self.num_classes) self.hist += _hist def count_miou(self): res = {} mIoU = self.per_class_iu(self.hist) # assert cnt == num_samples, 'some samples are not included in the miou calculation' print(f'===> per class IoU of {self.cnt} samples:') for ind_class in range(self.num_classes-1): print(f'===> {self.class_names[ind_class]} - IoU = ' + str(round(mIoU[ind_class] * 100, 4))) res[self.class_names[ind_class]] = round(mIoU[ind_class] * 100, 2) print(f'===> mIoU of {self.cnt} samples: ' + 
str(round(np.nanmean(mIoU[:self.num_classes-1]) * 100, 2))) res['Overall'] = round(np.nanmean(mIoU[:self.num_classes-1]) * 100, 2) # print(f'===> sample-wise averaged mIoU of {cnt} samples: ' + str(round(np.nanmean(mIoU_avg), 2))) return res class Metric_FScore(): def __init__(self, leaf_size=10, threshold_acc=0.6, threshold_complete=0.6, voxel_size=[0.4, 0.4, 0.4], range=[-40, -40, -1, 40, 40, 5.4], void=[17, 255], use_lidar_mask=False, use_image_mask=False, ) -> None: self.leaf_size = leaf_size self.threshold_acc = threshold_acc self.threshold_complete = threshold_complete self.voxel_size = voxel_size self.range = range self.void = void self.use_lidar_mask = use_lidar_mask self.use_image_mask = use_image_mask self.cnt=0 self.tot_acc = 0. self.tot_cmpl = 0. self.tot_f1_mean = 0. self.eps = 1e-8 def voxel2points(self, voxel): # occIdx = torch.where(torch.logical_and(voxel != FREE, voxel != NOT_OBSERVED)) # if isinstance(voxel, np.ndarray): voxel = torch.from_numpy(voxel) mask = np.logical_not(reduce(np.logical_or, [voxel == self.void[i] for i in range(len(self.void))])) occIdx = np.where(mask) points = np.concatenate((occIdx[0][:, None] * self.voxel_size[0] + self.voxel_size[0] / 2 + self.range[0], \ occIdx[1][:, None] * self.voxel_size[1] + self.voxel_size[1] / 2 + self.range[1], \ occIdx[2][:, None] * self.voxel_size[2] + self.voxel_size[2] / 2 + self.range[2]), axis=1) return points def add_batch(self,semantics_pred,semantics_gt,mask_lidar,mask_camera ): # for scene_token in tqdm(preds_dict.keys()): self.cnt += 1 if len(semantics_pred.shape) == 4 or len(semantics_pred.shape) == 2: semantics_pred = semantics_pred.argmax(-1) assert self.use_image_mask if self.use_image_mask: semantics_gt[mask_camera == False] = 255 if len(semantics_pred.shape) == 1: semantics_pred_ = deepcopy(semantics_gt) semantics_pred_[mask_camera] = semantics_pred semantics_pred = semantics_pred_ else: semantics_pred[mask_camera == False] = 255 elif self.use_lidar_mask: semantics_gt[mask_lidar == False] = 255 semantics_pred[mask_lidar == False] = 255 else: pass ground_truth = self.voxel2points(semantics_gt) prediction = self.voxel2points(semantics_pred) if prediction.shape[0] == 0: accuracy=0 completeness=0 fmean=0 else: prediction_tree = KDTree(prediction, leaf_size=self.leaf_size) ground_truth_tree = KDTree(ground_truth, leaf_size=self.leaf_size) complete_distance, _ = prediction_tree.query(ground_truth) complete_distance = complete_distance.flatten() accuracy_distance, _ = ground_truth_tree.query(prediction) accuracy_distance = accuracy_distance.flatten() # evaluate completeness complete_mask = complete_distance < self.threshold_complete completeness = complete_mask.mean() # evalute accuracy accuracy_mask = accuracy_distance < self.threshold_acc accuracy = accuracy_mask.mean() fmean = 2.0 / (1 / (accuracy+self.eps) + 1 / (completeness+self.eps)) self.tot_acc += accuracy self.tot_cmpl += completeness self.tot_f1_mean += fmean def count_fscore(self,): res = {} base_color, attrs = 'red', ['bold', 'dark'] print(pcolor('\n######## F score: {} #######'.format(self.tot_f1_mean / self.cnt), base_color, attrs=attrs)) res['f-score'] = round(self.tot_f1_mean / self.cnt, 4) return res import argparse import os import sys import numpy as np from tqdm import tqdm import time def parse_args(): parser = argparse.ArgumentParser( description='eval occupancy') parser.add_argument('pred_path', help='pred_path') parser.add_argument('--gt_path', default='/mount/data/occupancy_cvpr2023/gts', help='checkpoint file') 
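    # Example invocation of this script (the paths are illustrative placeholders):
    #   python mmdet3d/datasets/occ_metrics.py work_dirs/occ_preds \
    #       --gt_path data/occupancy/gts --max_d 50 --eval_fscore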
parser.add_argument('--min_d', default=-1, type=int, help='min range') parser.add_argument('--max_d', default=100, type=int, help='max range') parser.add_argument( '--eval_fscore', action='store_true', help='whether to eval f-score.') args = parser.parse_args() return args def eval(args): occ_eval_metrics = Metric_mIoU( num_classes=18, use_lidar_mask=False, min_d = args.min_d, max_d = args.max_d, use_image_mask=True) if args.eval_fscore: fscore_eval_metrics = Metric_FScore( leaf_size=10, threshold_acc=0.4, threshold_complete=0.4, voxel_size=[0.4, 0.4, 0.4], range=[-40, -40, -1, 40, 40, 5.4], void=[17, 255], use_lidar_mask=False, use_image_mask=True,) # print(len(os.listdir(args.pred_path))) pred_files = os.listdir(args.pred_path) val_splits = ['scene-0003', 'scene-0012', 'scene-0013', 'scene-0014', 'scene-0015', 'scene-0016', 'scene-0017', 'scene-0018', 'scene-0035', 'scene-0036', 'scene-0038', 'scene-0039', 'scene-0092', 'scene-0093', 'scene-0094', 'scene-0095', 'scene-0096', 'scene-0097', 'scene-0098', 'scene-0099', 'scene-0100', 'scene-0101', 'scene-0102', 'scene-0103', 'scene-0104', 'scene-0105', 'scene-0106', 'scene-0107', 'scene-0108', 'scene-0109', 'scene-0110', 'scene-0221', 'scene-0268', 'scene-0269', 'scene-0270', 'scene-0271', 'scene-0272', 'scene-0273', 'scene-0274', 'scene-0275', 'scene-0276', 'scene-0277', 'scene-0278', 'scene-0329', 'scene-0330', 'scene-0331', 'scene-0332', 'scene-0344', 'scene-0345', 'scene-0346', 'scene-0519', 'scene-0520', 'scene-0521', 'scene-0522', 'scene-0523', 'scene-0524', 'scene-0552', 'scene-0553', 'scene-0554', 'scene-0555', 'scene-0556', 'scene-0557', 'scene-0558', 'scene-0559', 'scene-0560', 'scene-0561', 'scene-0562', 'scene-0563', 'scene-0564', 'scene-0565', 'scene-0625', 'scene-0626', 'scene-0627', 'scene-0629', 'scene-0630', 'scene-0632', 'scene-0633', 'scene-0634', 'scene-0635', 'scene-0636', 'scene-0637', 'scene-0638', 'scene-0770', 'scene-0771', 'scene-0775', 'scene-0777', 'scene-0778', 'scene-0780', 'scene-0781', 'scene-0782', 'scene-0783', 'scene-0784', 'scene-0794', 'scene-0795', 'scene-0796', 'scene-0797', 'scene-0798', 'scene-0799', 'scene-0800', 'scene-0802', 'scene-0904', 'scene-0905', 'scene-0906', 'scene-0907', 'scene-0908', 'scene-0909', 'scene-0910', 'scene-0911', 'scene-0912', 'scene-0913', 'scene-0914', 'scene-0915', 'scene-0916', 'scene-0917', 'scene-0919', 'scene-0920', 'scene-0921', 'scene-0922', 'scene-0923', 'scene-0924', 'scene-0925', 'scene-0926', 'scene-0927', 'scene-0928', 'scene-0929', 'scene-0930', 'scene-0931', 'scene-0962', 'scene-0963', 'scene-0966', 'scene-0967', 'scene-0968', 'scene-0969', 'scene-0971', 'scene-0972', 'scene-1059', 'scene-1060', 'scene-1061', 'scene-1062', 'scene-1063', 'scene-1064', 'scene-1065', 'scene-1066', 'scene-1067', 'scene-1068', 'scene-1069', 'scene-1070', 'scene-1071', 'scene-1072', 'scene-1073'] mini_splits = ['scene-0103', 'scene-0916'] for scene_name in tqdm(val_splits): if scene_name not in val_splits:continue for sample_token in os.listdir(os.path.join(args.gt_path, scene_name)): occ_gt = np.load(os.path.join(args.gt_path, scene_name, sample_token, 'labels.npz')) occ_pred = np.load(os.path.join(args.pred_path, scene_name+'_'+sample_token+'.npz'))['pred'] gt_semantics = occ_gt['semantics'] mask_lidar = occ_gt['mask_lidar'].astype(bool) mask_camera = occ_gt['mask_camera'].astype(bool) occ_eval_metrics.add_batch(occ_pred, gt_semantics, mask_lidar, mask_camera) if args.eval_fscore: fscore_eval_metrics.add_batch(occ_pred, gt_semantics, mask_lidar, mask_camera) res = 
occ_eval_metrics.count_miou() if args.eval_fscore: fscore_eval_metrics.count_fscore() if __name__ == '__main__': args = parse_args() eval(args) ================================================ FILE: mmdet3d/datasets/occupancy_eval.py ================================================
from .occ_metrics import Metric_mIoU, Metric_FScore
import argparse
import os
import sys
import numpy as np


def parse_args():
    parser = argparse.ArgumentParser(description='eval occupancy')
    parser.add_argument('pred_path', help='pred_path')
    parser.add_argument(
        '--gt_path',
        default='/mount/data/occupancy_cvpr2023/gts',
        help='directory with the ground-truth occupancy labels')
    parser.add_argument(
        '--eval_fscore',
        action='store_true',
        help='whether to eval f-score.')
    args = parser.parse_args()
    return args


def eval(args):
    occ_eval_metrics = Metric_mIoU(
        num_classes=18,
        use_lidar_mask=False,
        use_image_mask=True)
    if args.eval_fscore:
        fscore_eval_metrics = Metric_FScore(
            leaf_size=10,
            threshold_acc=0.4,
            threshold_complete=0.4,
            voxel_size=[0.4, 0.4, 0.4],
            range=[-40, -40, -1, 40, 40, 5.4],
            void=[17, 255],
            use_lidar_mask=False,
            use_image_mask=True)
    for pred_path in os.listdir(args.pred_path):
        occ_pred = np.load(os.path.join(args.pred_path, pred_path))['pred']
        occ_gt = np.load(
            os.path.join(args.gt_path, pred_path.split('.')[0], 'labels.npz'))
        gt_semantics = occ_gt['semantics']
        mask_lidar = occ_gt['mask_lidar'].astype(bool)
        mask_camera = occ_gt['mask_camera'].astype(bool)
        occ_eval_metrics.add_batch(occ_pred, gt_semantics, mask_lidar, mask_camera)
        if args.eval_fscore:
            fscore_eval_metrics.add_batch(occ_pred, gt_semantics, mask_lidar, mask_camera)
    res = occ_eval_metrics.count_miou()
    if args.eval_fscore:
        fscore_eval_metrics.count_fscore()


if __name__ == '__main__':
    args = parse_args()
    eval(args)
================================================ FILE: mmdet3d/datasets/pipelines/__init__.py ================================================ # Copyright (c) OpenMMLab. All rights reserved.
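# The transforms re-exported below are all registered in the PIPELINES registry
# and are chained in dataset configs. A minimal sketch of such a chain (the
# chosen transforms and values are illustrative, not a config from this repo):
#
#   train_pipeline = [
#       dict(type='LoadMultiViewImageFromFiles', to_float32=True),
#       dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True),
#       dict(type='DefaultFormatBundle3D', class_names=class_names),
#       dict(type='Collect3D', keys=['img', 'gt_bboxes_3d', 'gt_labels_3d']),
#   ]
#
# ``Compose`` (see compose.py) builds each dict through the registry and
# applies the transforms in order.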
from .compose import Compose from .dbsampler import DataBaseSampler from .formating import Collect3D, DefaultFormatBundle, DefaultFormatBundle3D from .loading import (LoadAnnotations3D, LoadAnnotationsBEVDepth, LoadImageFromFileMono3D, LoadMultiViewImageFromFiles, LoadPointsFromDict, LoadPointsFromFile, LoadPointsFromMultiSweeps, NormalizePointsColor, PointSegClassMapping, PointToMultiViewDepth, PrepareImageInputs, LoadVectorMap) from .test_time_aug import MultiScaleFlipAug3D # yapf: disable from .transforms_3d import (AffineResize, BackgroundPointsFilter, GlobalAlignment, GlobalRotScaleTrans, IndoorPatchPointSample, IndoorPointSample, MultiViewWrapper, ObjectNameFilter, ObjectNoise, ObjectRangeFilter, ObjectSample, PointSample, PointShuffle, PointsRangeFilter, RandomDropPointsColor, RandomFlip3D, RandomJitterPoints, RandomRotate, RandomShiftScale, RangeLimitedRandomCrop, VoxelBasedPointSampler) __all__ = [ 'ObjectSample', 'RandomFlip3D', 'ObjectNoise', 'GlobalRotScaleTrans', 'PointShuffle', 'ObjectRangeFilter', 'PointsRangeFilter', 'Collect3D', 'Compose', 'LoadMultiViewImageFromFiles', 'LoadPointsFromFile', 'DefaultFormatBundle', 'DefaultFormatBundle3D', 'DataBaseSampler', 'NormalizePointsColor', 'LoadAnnotations3D', 'IndoorPointSample', 'PointSample', 'PointSegClassMapping', 'MultiScaleFlipAug3D', 'LoadPointsFromMultiSweeps', 'BackgroundPointsFilter', 'VoxelBasedPointSampler', 'GlobalAlignment', 'IndoorPatchPointSample', 'LoadImageFromFileMono3D', 'ObjectNameFilter', 'RandomDropPointsColor', 'RandomJitterPoints', 'AffineResize', 'RandomShiftScale', 'LoadPointsFromDict', 'MultiViewWrapper', 'RandomRotate', 'RangeLimitedRandomCrop', 'PrepareImageInputs', 'LoadAnnotationsBEVDepth', 'PointToMultiViewDepth', 'LoadVectorMap' ] ================================================ FILE: mmdet3d/datasets/pipelines/compose.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import collections from mmcv.utils import build_from_cfg from mmdet.datasets.builder import PIPELINES as MMDET_PIPELINES from ..builder import PIPELINES @PIPELINES.register_module() class Compose: """Compose multiple transforms sequentially. The pipeline registry of mmdet3d separates with mmdet, however, sometimes we may need to use mmdet's pipeline. So the class is rewritten to be able to use pipelines from both mmdet3d and mmdet. Args: transforms (Sequence[dict | callable]): Sequence of transform object or config dict to be composed. """ def __init__(self, transforms): assert isinstance(transforms, collections.abc.Sequence) self.transforms = [] for transform in transforms: if isinstance(transform, dict): _, key = PIPELINES.split_scope_key(transform['type']) if key in PIPELINES._module_dict.keys(): transform = build_from_cfg(transform, PIPELINES) else: transform = build_from_cfg(transform, MMDET_PIPELINES) self.transforms.append(transform) elif callable(transform): self.transforms.append(transform) else: raise TypeError('transform must be callable or a dict') def __call__(self, data): """Call function to apply transforms sequentially. Args: data (dict): A result dict contains the data to transform. Returns: dict: Transformed data. 
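        Example:
            A minimal sketch (the transform configs are illustrative)::

                >>> pipeline = Compose([
                ...     dict(type='LoadMultiViewImageFromFiles'),
                ...     dict(type='DefaultFormatBundle3D', class_names=['car']),
                ... ])
                >>> results = pipeline(dict(img_filename=list_of_image_paths))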
""" for t in self.transforms: data = t(data) if data is None: return None return data def __repr__(self): format_string = self.__class__.__name__ + '(' for t in self.transforms: format_string += '\n' format_string += f' {t}' format_string += '\n)' return format_string ================================================ FILE: mmdet3d/datasets/pipelines/data_augment_utils.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import warnings import numba import numpy as np from numba.core.errors import NumbaPerformanceWarning from mmdet3d.core.bbox import box_np_ops warnings.filterwarnings('ignore', category=NumbaPerformanceWarning) @numba.njit def _rotation_box2d_jit_(corners, angle, rot_mat_T): """Rotate 2D boxes. Args: corners (np.ndarray): Corners of boxes. angle (float): Rotation angle. rot_mat_T (np.ndarray): Transposed rotation matrix. """ rot_sin = np.sin(angle) rot_cos = np.cos(angle) rot_mat_T[0, 0] = rot_cos rot_mat_T[0, 1] = rot_sin rot_mat_T[1, 0] = -rot_sin rot_mat_T[1, 1] = rot_cos corners[:] = corners @ rot_mat_T @numba.jit(nopython=True) def box_collision_test(boxes, qboxes, clockwise=True): """Box collision test. Args: boxes (np.ndarray): Corners of current boxes. qboxes (np.ndarray): Boxes to be avoid colliding. clockwise (bool, optional): Whether the corners are in clockwise order. Default: True. """ N = boxes.shape[0] K = qboxes.shape[0] ret = np.zeros((N, K), dtype=np.bool_) slices = np.array([1, 2, 3, 0]) lines_boxes = np.stack((boxes, boxes[:, slices, :]), axis=2) # [N, 4, 2(line), 2(xy)] lines_qboxes = np.stack((qboxes, qboxes[:, slices, :]), axis=2) # vec = np.zeros((2,), dtype=boxes.dtype) boxes_standup = box_np_ops.corner_to_standup_nd_jit(boxes) qboxes_standup = box_np_ops.corner_to_standup_nd_jit(qboxes) for i in range(N): for j in range(K): # calculate standup first iw = ( min(boxes_standup[i, 2], qboxes_standup[j, 2]) - max(boxes_standup[i, 0], qboxes_standup[j, 0])) if iw > 0: ih = ( min(boxes_standup[i, 3], qboxes_standup[j, 3]) - max(boxes_standup[i, 1], qboxes_standup[j, 1])) if ih > 0: for k in range(4): for box_l in range(4): A = lines_boxes[i, k, 0] B = lines_boxes[i, k, 1] C = lines_qboxes[j, box_l, 0] D = lines_qboxes[j, box_l, 1] acd = (D[1] - A[1]) * (C[0] - A[0]) > (C[1] - A[1]) * ( D[0] - A[0]) bcd = (D[1] - B[1]) * (C[0] - B[0]) > (C[1] - B[1]) * ( D[0] - B[0]) if acd != bcd: abc = (C[1] - A[1]) * (B[0] - A[0]) > ( B[1] - A[1]) * ( C[0] - A[0]) abd = (D[1] - A[1]) * (B[0] - A[0]) > ( B[1] - A[1]) * ( D[0] - A[0]) if abc != abd: ret[i, j] = True # collision. break if ret[i, j] is True: break if ret[i, j] is False: # now check complete overlap. # box overlap qbox: box_overlap_qbox = True for box_l in range(4): # point l in qboxes for k in range(4): # corner k in boxes vec = boxes[i, k] - boxes[i, (k + 1) % 4] if clockwise: vec = -vec cross = vec[1] * ( boxes[i, k, 0] - qboxes[j, box_l, 0]) cross -= vec[0] * ( boxes[i, k, 1] - qboxes[j, box_l, 1]) if cross >= 0: box_overlap_qbox = False break if box_overlap_qbox is False: break if box_overlap_qbox is False: qbox_overlap_box = True for box_l in range(4): # point box_l in boxes for k in range(4): # corner k in qboxes vec = qboxes[j, k] - qboxes[j, (k + 1) % 4] if clockwise: vec = -vec cross = vec[1] * ( qboxes[j, k, 0] - boxes[i, box_l, 0]) cross -= vec[0] * ( qboxes[j, k, 1] - boxes[i, box_l, 1]) if cross >= 0: # qbox_overlap_box = False break if qbox_overlap_box is False: break if qbox_overlap_box: ret[i, j] = True # collision. 
else: ret[i, j] = True # collision. return ret @numba.njit def noise_per_box(boxes, valid_mask, loc_noises, rot_noises): """Add noise to every box (only on the horizontal plane). Args: boxes (np.ndarray): Input boxes with shape (N, 5). valid_mask (np.ndarray): Mask to indicate which boxes are valid with shape (N). loc_noises (np.ndarray): Location noises with shape (N, M, 3). rot_noises (np.ndarray): Rotation noises with shape (N, M). Returns: np.ndarray: Mask to indicate whether the noise is added successfully (pass the collision test). """ num_boxes = boxes.shape[0] num_tests = loc_noises.shape[1] box_corners = box_np_ops.box2d_to_corner_jit(boxes) current_corners = np.zeros((4, 2), dtype=boxes.dtype) rot_mat_T = np.zeros((2, 2), dtype=boxes.dtype) success_mask = -np.ones((num_boxes, ), dtype=np.int64) # print(valid_mask) for i in range(num_boxes): if valid_mask[i]: for j in range(num_tests): current_corners[:] = box_corners[i] current_corners -= boxes[i, :2] _rotation_box2d_jit_(current_corners, rot_noises[i, j], rot_mat_T) current_corners += boxes[i, :2] + loc_noises[i, j, :2] coll_mat = box_collision_test( current_corners.reshape(1, 4, 2), box_corners) coll_mat[0, i] = False # print(coll_mat) if not coll_mat.any(): success_mask[i] = j box_corners[i] = current_corners break return success_mask @numba.njit def noise_per_box_v2_(boxes, valid_mask, loc_noises, rot_noises, global_rot_noises): """Add noise to every box (only on the horizontal plane). Version 2 used when enable global rotations. Args: boxes (np.ndarray): Input boxes with shape (N, 5). valid_mask (np.ndarray): Mask to indicate which boxes are valid with shape (N). loc_noises (np.ndarray): Location noises with shape (N, M, 3). rot_noises (np.ndarray): Rotation noises with shape (N, M). Returns: np.ndarray: Mask to indicate whether the noise is added successfully (pass the collision test). 
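        Note:
            ``global_rot_noises`` (np.ndarray with shape (N, M)) is applied on
            top of the arguments above: each candidate first rotates the box
            centre about the scene origin by the sampled global angle (with a
            matching heading change) before the local location and rotation
            noise is added.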
""" num_boxes = boxes.shape[0] num_tests = loc_noises.shape[1] box_corners = box_np_ops.box2d_to_corner_jit(boxes) current_corners = np.zeros((4, 2), dtype=boxes.dtype) current_box = np.zeros((1, 5), dtype=boxes.dtype) rot_mat_T = np.zeros((2, 2), dtype=boxes.dtype) dst_pos = np.zeros((2, ), dtype=boxes.dtype) success_mask = -np.ones((num_boxes, ), dtype=np.int64) corners_norm = np.zeros((4, 2), dtype=boxes.dtype) corners_norm[1, 1] = 1.0 corners_norm[2] = 1.0 corners_norm[3, 0] = 1.0 corners_norm -= np.array([0.5, 0.5], dtype=boxes.dtype) corners_norm = corners_norm.reshape(4, 2) for i in range(num_boxes): if valid_mask[i]: for j in range(num_tests): current_box[0, :] = boxes[i] current_radius = np.sqrt(boxes[i, 0]**2 + boxes[i, 1]**2) current_grot = np.arctan2(boxes[i, 0], boxes[i, 1]) dst_grot = current_grot + global_rot_noises[i, j] dst_pos[0] = current_radius * np.sin(dst_grot) dst_pos[1] = current_radius * np.cos(dst_grot) current_box[0, :2] = dst_pos current_box[0, -1] += (dst_grot - current_grot) rot_sin = np.sin(current_box[0, -1]) rot_cos = np.cos(current_box[0, -1]) rot_mat_T[0, 0] = rot_cos rot_mat_T[0, 1] = rot_sin rot_mat_T[1, 0] = -rot_sin rot_mat_T[1, 1] = rot_cos current_corners[:] = current_box[ 0, 2:4] * corners_norm @ rot_mat_T + current_box[0, :2] current_corners -= current_box[0, :2] _rotation_box2d_jit_(current_corners, rot_noises[i, j], rot_mat_T) current_corners += current_box[0, :2] + loc_noises[i, j, :2] coll_mat = box_collision_test( current_corners.reshape(1, 4, 2), box_corners) coll_mat[0, i] = False if not coll_mat.any(): success_mask[i] = j box_corners[i] = current_corners loc_noises[i, j, :2] += (dst_pos - boxes[i, :2]) rot_noises[i, j] += (dst_grot - current_grot) break return success_mask def _select_transform(transform, indices): """Select transform. Args: transform (np.ndarray): Transforms to select from. indices (np.ndarray): Mask to indicate which transform to select. Returns: np.ndarray: Selected transforms. """ result = np.zeros((transform.shape[0], *transform.shape[2:]), dtype=transform.dtype) for i in range(transform.shape[0]): if indices[i] != -1: result[i] = transform[i, indices[i]] return result @numba.njit def _rotation_matrix_3d_(rot_mat_T, angle, axis): """Get the 3D rotation matrix. Args: rot_mat_T (np.ndarray): Transposed rotation matrix. angle (float): Rotation angle. axis (int): Rotation axis. """ rot_sin = np.sin(angle) rot_cos = np.cos(angle) rot_mat_T[:] = np.eye(3) if axis == 1: rot_mat_T[0, 0] = rot_cos rot_mat_T[0, 2] = rot_sin rot_mat_T[2, 0] = -rot_sin rot_mat_T[2, 2] = rot_cos elif axis == 2 or axis == -1: rot_mat_T[0, 0] = rot_cos rot_mat_T[0, 1] = rot_sin rot_mat_T[1, 0] = -rot_sin rot_mat_T[1, 1] = rot_cos elif axis == 0: rot_mat_T[1, 1] = rot_cos rot_mat_T[1, 2] = rot_sin rot_mat_T[2, 1] = -rot_sin rot_mat_T[2, 2] = rot_cos @numba.njit def points_transform_(points, centers, point_masks, loc_transform, rot_transform, valid_mask): """Apply transforms to points and box centers. Args: points (np.ndarray): Input points. centers (np.ndarray): Input box centers. point_masks (np.ndarray): Mask to indicate which points need to be transformed. loc_transform (np.ndarray): Location transform to be applied. rot_transform (np.ndarray): Rotation transform to be applied. valid_mask (np.ndarray): Mask to indicate which boxes are valid. 
""" num_box = centers.shape[0] num_points = points.shape[0] rot_mat_T = np.zeros((num_box, 3, 3), dtype=points.dtype) for i in range(num_box): _rotation_matrix_3d_(rot_mat_T[i], rot_transform[i], 2) for i in range(num_points): for j in range(num_box): if valid_mask[j]: if point_masks[i, j] == 1: points[i, :3] -= centers[j, :3] points[i:i + 1, :3] = points[i:i + 1, :3] @ rot_mat_T[j] points[i, :3] += centers[j, :3] points[i, :3] += loc_transform[j] break # only apply first box's transform @numba.njit def box3d_transform_(boxes, loc_transform, rot_transform, valid_mask): """Transform 3D boxes. Args: boxes (np.ndarray): 3D boxes to be transformed. loc_transform (np.ndarray): Location transform to be applied. rot_transform (np.ndarray): Rotation transform to be applied. valid_mask (np.ndarray): Mask to indicate which boxes are valid. """ num_box = boxes.shape[0] for i in range(num_box): if valid_mask[i]: boxes[i, :3] += loc_transform[i] boxes[i, 6] += rot_transform[i] def noise_per_object_v3_(gt_boxes, points=None, valid_mask=None, rotation_perturb=np.pi / 4, center_noise_std=1.0, global_random_rot_range=np.pi / 4, num_try=100): """Random rotate or remove each groundtruth independently. use kitti viewer to test this function points_transform_ Args: gt_boxes (np.ndarray): Ground truth boxes with shape (N, 7). points (np.ndarray, optional): Input point cloud with shape (M, 4). Default: None. valid_mask (np.ndarray, optional): Mask to indicate which boxes are valid. Default: None. rotation_perturb (float, optional): Rotation perturbation. Default: pi / 4. center_noise_std (float, optional): Center noise standard deviation. Default: 1.0. global_random_rot_range (float, optional): Global random rotation range. Default: pi/4. num_try (int, optional): Number of try. Default: 100. """ num_boxes = gt_boxes.shape[0] if not isinstance(rotation_perturb, (list, tuple, np.ndarray)): rotation_perturb = [-rotation_perturb, rotation_perturb] if not isinstance(global_random_rot_range, (list, tuple, np.ndarray)): global_random_rot_range = [ -global_random_rot_range, global_random_rot_range ] enable_grot = np.abs(global_random_rot_range[0] - global_random_rot_range[1]) >= 1e-3 if not isinstance(center_noise_std, (list, tuple, np.ndarray)): center_noise_std = [ center_noise_std, center_noise_std, center_noise_std ] if valid_mask is None: valid_mask = np.ones((num_boxes, ), dtype=np.bool_) center_noise_std = np.array(center_noise_std, dtype=gt_boxes.dtype) loc_noises = np.random.normal( scale=center_noise_std, size=[num_boxes, num_try, 3]) rot_noises = np.random.uniform( rotation_perturb[0], rotation_perturb[1], size=[num_boxes, num_try]) gt_grots = np.arctan2(gt_boxes[:, 0], gt_boxes[:, 1]) grot_lowers = global_random_rot_range[0] - gt_grots grot_uppers = global_random_rot_range[1] - gt_grots global_rot_noises = np.random.uniform( grot_lowers[..., np.newaxis], grot_uppers[..., np.newaxis], size=[num_boxes, num_try]) origin = (0.5, 0.5, 0) gt_box_corners = box_np_ops.center_to_corner_box3d( gt_boxes[:, :3], gt_boxes[:, 3:6], gt_boxes[:, 6], origin=origin, axis=2) # TODO: rewrite this noise box function? 
if not enable_grot: selected_noise = noise_per_box(gt_boxes[:, [0, 1, 3, 4, 6]], valid_mask, loc_noises, rot_noises) else: selected_noise = noise_per_box_v2_(gt_boxes[:, [0, 1, 3, 4, 6]], valid_mask, loc_noises, rot_noises, global_rot_noises) loc_transforms = _select_transform(loc_noises, selected_noise) rot_transforms = _select_transform(rot_noises, selected_noise) surfaces = box_np_ops.corner_to_surfaces_3d_jit(gt_box_corners) if points is not None: # TODO: replace this points_in_convex function by my tools? point_masks = box_np_ops.points_in_convex_polygon_3d_jit( points[:, :3], surfaces) points_transform_(points, gt_boxes[:, :3], point_masks, loc_transforms, rot_transforms, valid_mask) box3d_transform_(gt_boxes, loc_transforms, rot_transforms, valid_mask) ================================================ FILE: mmdet3d/datasets/pipelines/dbsampler.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import copy import os import warnings import mmcv import numpy as np from mmdet3d.core.bbox import box_np_ops from mmdet3d.datasets.pipelines import data_augment_utils from ..builder import OBJECTSAMPLERS, PIPELINES class BatchSampler: """Class for sampling specific category of ground truths. Args: sample_list (list[dict]): List of samples. name (str, optional): The category of samples. Default: None. epoch (int, optional): Sampling epoch. Default: None. shuffle (bool, optional): Whether to shuffle indices. Default: False. drop_reminder (bool, optional): Drop reminder. Default: False. """ def __init__(self, sampled_list, name=None, epoch=None, shuffle=True, drop_reminder=False): self._sampled_list = sampled_list self._indices = np.arange(len(sampled_list)) if shuffle: np.random.shuffle(self._indices) self._idx = 0 self._example_num = len(sampled_list) self._name = name self._shuffle = shuffle self._epoch = epoch self._epoch_counter = 0 self._drop_reminder = drop_reminder def _sample(self, num): """Sample specific number of ground truths and return indices. Args: num (int): Sampled number. Returns: list[int]: Indices of sampled ground truths. """ if self._idx + num >= self._example_num: ret = self._indices[self._idx:].copy() self._reset() else: ret = self._indices[self._idx:self._idx + num] self._idx += num return ret def _reset(self): """Reset the index of batchsampler to zero.""" assert self._name is not None # print("reset", self._name) if self._shuffle: np.random.shuffle(self._indices) self._idx = 0 def sample(self, num): """Sample specific number of ground truths. Args: num (int): Sampled number. Returns: list[dict]: Sampled ground truths. """ indices = self._sample(num) return [self._sampled_list[i] for i in indices] @OBJECTSAMPLERS.register_module() class DataBaseSampler(object): """Class for sampling data from the ground truth database. Args: info_path (str): Path of groundtruth database info. data_root (str): Path of groundtruth database. rate (float): Rate of actual sampled over maximum sampled number. prepare (dict): Name of preparation functions and the input value. sample_groups (dict): Sampled classes and numbers. classes (list[str], optional): List of classes. Default: None. bbox_code_size (int, optional): The number of bbox dimensions. Default: None. points_loader(dict, optional): Config of points loader. 
Default: dict(type='LoadPointsFromFile', load_dim=4, use_dim=[0,1,2,3]) """ def __init__(self, info_path, data_root, rate, prepare, sample_groups, classes=None, bbox_code_size=None, points_loader=dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=4, use_dim=[0, 1, 2, 3]), file_client_args=dict(backend='disk')): super().__init__() self.data_root = data_root self.info_path = info_path self.rate = rate self.prepare = prepare self.classes = classes self.cat2label = {name: i for i, name in enumerate(classes)} self.label2cat = {i: name for i, name in enumerate(classes)} self.points_loader = mmcv.build_from_cfg(points_loader, PIPELINES) self.file_client = mmcv.FileClient(**file_client_args) # load data base infos if hasattr(self.file_client, 'get_local_path'): with self.file_client.get_local_path(info_path) as local_path: # loading data from a file-like object needs file format db_infos = mmcv.load(open(local_path, 'rb'), file_format='pkl') else: warnings.warn( 'The used MMCV version does not have get_local_path. ' f'We treat the {info_path} as local paths and it ' 'might cause errors if the path is not a local path. ' 'Please use MMCV>= 1.3.16 if you meet errors.') db_infos = mmcv.load(info_path) # filter database infos from mmdet3d.utils import get_root_logger logger = get_root_logger() for k, v in db_infos.items(): logger.info(f'load {len(v)} {k} database infos') for prep_func, val in prepare.items(): db_infos = getattr(self, prep_func)(db_infos, val) logger.info('After filter database:') for k, v in db_infos.items(): logger.info(f'load {len(v)} {k} database infos') self.db_infos = db_infos self.bbox_code_size = bbox_code_size if bbox_code_size is not None: for k, info_cls in self.db_infos.items(): for info in info_cls: info['box3d_lidar'] = info['box3d_lidar'][:self. bbox_code_size] # load sample groups # TODO: more elegant way to load sample groups self.sample_groups = [] for name, num in sample_groups.items(): self.sample_groups.append({name: int(num)}) self.group_db_infos = self.db_infos # just use db_infos self.sample_classes = [] self.sample_max_nums = [] for group_info in self.sample_groups: self.sample_classes += list(group_info.keys()) self.sample_max_nums += list(group_info.values()) self.sampler_dict = {} for k, v in self.group_db_infos.items(): self.sampler_dict[k] = BatchSampler(v, k, shuffle=True) # TODO: No group_sampling currently @staticmethod def filter_by_difficulty(db_infos, removed_difficulty): """Filter ground truths by difficulties. Args: db_infos (dict): Info of groundtruth database. removed_difficulty (list): Difficulties that are not qualified. Returns: dict: Info of database after filtering. """ new_db_infos = {} for key, dinfos in db_infos.items(): new_db_infos[key] = [ info for info in dinfos if info.get('difficulty', 0) not in removed_difficulty ] return new_db_infos @staticmethod def filter_by_min_points(db_infos, min_gt_points_dict): """Filter ground truths by number of points in the bbox. Args: db_infos (dict): Info of groundtruth database. min_gt_points_dict (dict): Different number of minimum points needed for different categories of ground truths. Returns: dict: Info of database after filtering. 
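        Example:
            An illustrative call (class names and thresholds depend on the
            dataset config)::

                >>> db_infos = DataBaseSampler.filter_by_min_points(
                ...     db_infos, dict(car=5, pedestrian=10))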
""" for name, min_num in min_gt_points_dict.items(): min_num = int(min_num) if min_num > 0: filtered_infos = [] for info in db_infos[name]: if info['num_points_in_gt'] >= min_num: filtered_infos.append(info) db_infos[name] = filtered_infos return db_infos def sample_all(self, gt_bboxes, gt_labels, img=None, ground_plane=None): """Sampling all categories of bboxes. Args: gt_bboxes (np.ndarray): Ground truth bounding boxes. gt_labels (np.ndarray): Ground truth labels of boxes. Returns: dict: Dict of sampled 'pseudo ground truths'. - gt_labels_3d (np.ndarray): ground truths labels of sampled objects. - gt_bboxes_3d (:obj:`BaseInstance3DBoxes`): sampled ground truth 3D bounding boxes - points (np.ndarray): sampled points - group_ids (np.ndarray): ids of sampled ground truths """ sampled_num_dict = {} sample_num_per_class = [] for class_name, max_sample_num in zip(self.sample_classes, self.sample_max_nums): class_label = self.cat2label[class_name] # sampled_num = int(max_sample_num - # np.sum([n == class_name for n in gt_names])) sampled_num = int(max_sample_num - np.sum([n == class_label for n in gt_labels])) sampled_num = np.round(self.rate * sampled_num).astype(np.int64) sampled_num_dict[class_name] = sampled_num sample_num_per_class.append(sampled_num) sampled = [] sampled_gt_bboxes = [] avoid_coll_boxes = gt_bboxes for class_name, sampled_num in zip(self.sample_classes, sample_num_per_class): if sampled_num > 0: sampled_cls = self.sample_class_v2(class_name, sampled_num, avoid_coll_boxes) sampled += sampled_cls if len(sampled_cls) > 0: if len(sampled_cls) == 1: sampled_gt_box = sampled_cls[0]['box3d_lidar'][ np.newaxis, ...] else: sampled_gt_box = np.stack( [s['box3d_lidar'] for s in sampled_cls], axis=0) sampled_gt_bboxes += [sampled_gt_box] avoid_coll_boxes = np.concatenate( [avoid_coll_boxes, sampled_gt_box], axis=0) ret = None if len(sampled) > 0: sampled_gt_bboxes = np.concatenate(sampled_gt_bboxes, axis=0) # center = sampled_gt_bboxes[:, 0:3] # num_sampled = len(sampled) s_points_list = [] count = 0 for info in sampled: file_path = os.path.join( self.data_root, info['path']) if self.data_root else info['path'] results = dict(pts_filename=file_path) s_points = self.points_loader(results)['points'] s_points.translate(info['box3d_lidar'][:3]) count += 1 s_points_list.append(s_points) gt_labels = np.array([self.cat2label[s['name']] for s in sampled], dtype=np.long) if ground_plane is not None: xyz = sampled_gt_bboxes[:, :3] dz = (ground_plane[:3][None, :] * xyz).sum(-1) + ground_plane[3] sampled_gt_bboxes[:, 2] -= dz for i, s_points in enumerate(s_points_list): s_points.tensor[:, 2].sub_(dz[i]) ret = { 'gt_labels_3d': gt_labels, 'gt_bboxes_3d': sampled_gt_bboxes, 'points': s_points_list[0].cat(s_points_list), 'group_ids': np.arange(gt_bboxes.shape[0], gt_bboxes.shape[0] + len(sampled)) } return ret def sample_class_v2(self, name, num, gt_bboxes): """Sampling specific categories of bounding boxes. Args: name (str): Class of objects to be sampled. num (int): Number of sampled bboxes. gt_bboxes (np.ndarray): Ground truth boxes. Returns: list[dict]: Valid samples after collision test. 
""" sampled = self.sampler_dict[name].sample(num) sampled = copy.deepcopy(sampled) num_gt = gt_bboxes.shape[0] num_sampled = len(sampled) gt_bboxes_bv = box_np_ops.center_to_corner_box2d( gt_bboxes[:, 0:2], gt_bboxes[:, 3:5], gt_bboxes[:, 6]) sp_boxes = np.stack([i['box3d_lidar'] for i in sampled], axis=0) boxes = np.concatenate([gt_bboxes, sp_boxes], axis=0).copy() sp_boxes_new = boxes[gt_bboxes.shape[0]:] sp_boxes_bv = box_np_ops.center_to_corner_box2d( sp_boxes_new[:, 0:2], sp_boxes_new[:, 3:5], sp_boxes_new[:, 6]) total_bv = np.concatenate([gt_bboxes_bv, sp_boxes_bv], axis=0) coll_mat = data_augment_utils.box_collision_test(total_bv, total_bv) diag = np.arange(total_bv.shape[0]) coll_mat[diag, diag] = False valid_samples = [] for i in range(num_gt, num_gt + num_sampled): if coll_mat[i].any(): coll_mat[i] = False coll_mat[:, i] = False else: valid_samples.append(sampled[i - num_gt]) return valid_samples ================================================ FILE: mmdet3d/datasets/pipelines/formating.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import numpy as np from mmcv.parallel import DataContainer as DC from mmdet3d.core.bbox import BaseInstance3DBoxes from mmdet3d.core.points import BasePoints from mmdet.datasets.pipelines import to_tensor from ..builder import PIPELINES @PIPELINES.register_module() class DefaultFormatBundle(object): """Default formatting bundle. It simplifies the pipeline of formatting common fields, including "img", "proposals", "gt_bboxes", "gt_labels", "gt_masks" and "gt_semantic_seg". These fields are formatted as follows. - img: (1)transpose, (2)to tensor, (3)to DataContainer (stack=True) - proposals: (1)to tensor, (2)to DataContainer - gt_bboxes: (1)to tensor, (2)to DataContainer - gt_bboxes_ignore: (1)to tensor, (2)to DataContainer - gt_labels: (1)to tensor, (2)to DataContainer - gt_masks: (1)to tensor, (2)to DataContainer (cpu_only=True) - gt_semantic_seg: (1)unsqueeze dim-0 (2)to tensor, (3)to DataContainer (stack=True) """ def __init__(self, ): return def __call__(self, results): """Call function to transform and format common fields in results. Args: results (dict): Result dict contains the data to convert. Returns: dict: The result dict contains the data that is formatted with default bundle. 
""" if 'img' in results: if isinstance(results['img'], list): # process multiple imgs in single frame imgs = [img.transpose(2, 0, 1) for img in results['img']] imgs = np.ascontiguousarray(np.stack(imgs, axis=0)) results['img'] = DC(to_tensor(imgs), stack=True) else: img = np.ascontiguousarray(results['img'].transpose(2, 0, 1)) results['img'] = DC(to_tensor(img), stack=True) for key in [ 'proposals', 'gt_bboxes', 'gt_bboxes_ignore', 'gt_labels', 'gt_labels_3d', 'attr_labels', 'pts_instance_mask', 'pts_semantic_mask', 'depths' ]: if key not in results: continue if isinstance(results[key], list): results[key] = DC([to_tensor(res) for res in results[key]]) else: results[key] = DC(to_tensor(results[key])) if 'gt_bboxes_3d' in results: if isinstance(results['gt_bboxes_3d'], BaseInstance3DBoxes): results['gt_bboxes_3d'] = DC( results['gt_bboxes_3d'], cpu_only=True) else: results['gt_bboxes_3d'] = DC( to_tensor(results['gt_bboxes_3d'])) for key in ['centers2d', 'depths2d', 'gt_labels_2d', 'gt_bboxes_2d']: if key in results: results[key] = DC(results[key], cpu_only=True, stack=False) for key in ['gt_ego_lcf_feat', 'gt_ego_fut_trajs', 'gt_ego_his_trajs', 'gt_ego_fut_cmd', 'gt_ego_fut_masks', 'vad_ego_fut_trajs']: if key in results: results[key] = DC(results[key], stack=False) for key in ['gt_agent_fut_traj', 'gt_agent_fut_traj_mask', 'gt_agent_fut_abs_traj']: if key in results: results[key] = DC(results[key], cpu_only=False, stack=False) if 'gt_masks' in results: results['gt_masks'] = DC(results['gt_masks'], cpu_only=True) if 'gt_semantic_seg' in results: results['gt_semantic_seg'] = DC( to_tensor(results['gt_semantic_seg'][None, ...]), stack=True) if 'can_bus_info' in results: results['can_bus_info'] = DC( to_tensor(results['can_bus_info'][None, ...]), stack=False) if 'gt_fut_segmentations' in results: results['gt_fut_segmentations'] = DC( to_tensor(results['gt_fut_segmentations']), stack=True) results['gt_fut_segmentations_plus'] = DC( to_tensor(results['gt_fut_segmentations_plus']), stack=True) if 'fut_boxes_in_cur_ego_list' in results: results['fut_boxes_in_cur_ego_list'] = DC( results['fut_boxes_in_cur_ego_list'], cpu_only=True, stack=False) return results def __repr__(self): return self.__class__.__name__ @PIPELINES.register_module() class Collect3D(object): """Collect data from the loader relevant to the specific task. This is usually the last stage of the data loader pipeline. Typically keys is set to some subset of "img", "proposals", "gt_bboxes", "gt_bboxes_ignore", "gt_labels", and/or "gt_masks". The "img_meta" item is always populated. The contents of the "img_meta" dictionary depends on "meta_keys". By default this includes: - 'img_shape': shape of the image input to the network as a tuple (h, w, c). Note that images may be zero padded on the bottom/right if the batch tensor is larger than this shape. 
- 'scale_factor': a float indicating the preprocessing scale - 'flip': a boolean indicating if image flip transform was used - 'filename': path to the image file - 'ori_shape': original shape of the image as a tuple (h, w, c) - 'pad_shape': image shape after padding - 'lidar2img': transform from lidar to image - 'depth2img': transform from depth to image - 'cam2img': transform from camera to image - 'pcd_horizontal_flip': a boolean indicating if point cloud is flipped horizontally - 'pcd_vertical_flip': a boolean indicating if point cloud is flipped vertically - 'box_mode_3d': 3D box mode - 'box_type_3d': 3D box type - 'img_norm_cfg': a dict of normalization information: - mean: per channel mean subtraction - std: per channel std divisor - to_rgb: bool indicating if bgr was converted to rgb - 'pcd_trans': point cloud transformations - 'sample_idx': sample index - 'pcd_scale_factor': point cloud scale factor - 'pcd_rotation': rotation applied to point cloud - 'pts_filename': path to point cloud file. Args: keys (Sequence[str]): Keys of results to be collected in ``data``. meta_keys (Sequence[str], optional): Meta keys to be converted to ``mmcv.DataContainer`` and collected in ``data[img_metas]``. Default: ('filename', 'ori_shape', 'img_shape', 'lidar2img', 'depth2img', 'cam2img', 'pad_shape', 'scale_factor', 'flip', 'pcd_horizontal_flip', 'pcd_vertical_flip', 'box_mode_3d', 'box_type_3d', 'img_norm_cfg', 'pcd_trans', 'sample_idx', 'pcd_scale_factor', 'pcd_rotation', 'pts_filename') """ def __init__( self, keys, meta_keys=('filename', 'ori_shape', 'img_shape', 'lidar2img', 'depth2img', 'cam2img', 'pad_shape', 'scale_factor', 'flip', 'pcd_horizontal_flip', 'pcd_vertical_flip', 'box_mode_3d', 'box_type_3d', 'img_norm_cfg', 'pcd_trans', 'sample_idx', 'pcd_scale_factor', 'pcd_rotation', 'pcd_rotation_angle', 'pts_filename', 'transformation_3d_flow', 'trans_mat', 'index', 'sequence_group_idx', 'curr_to_prev_lidar_rt', 'curr_to_prev_ego_rt', 'start_of_sequence', 'index', 'global_to_curr_lidar_rt', 'tta_config', 'input_size', 'prev_lidar_to_global_rt', 'sample_index', 'scene_name', 'curr', 'nuscenes_get_rt_matrix', 'aux_cam_params', 'affine_aug', 'ego_pose_inv', 'ego_pose', 'timestamp', 'has_valid_map', 'instance_inds')): self.keys = keys self.meta_keys = meta_keys def __call__(self, results): """Call function to collect keys in results. The keys in ``meta_keys`` will be converted to :obj:`mmcv.DataContainer`. Args: results (dict): Result dict contains the data to collect. Returns: dict: The result dict contains the following keys - keys in ``self.keys`` - ``img_metas`` """ data = {} img_metas = {} for key in self.meta_keys: if key in results: img_metas[key] = results[key] data['img_metas'] = DC(img_metas, cpu_only=True) for key in self.keys: data[key] = results[key] return data def __repr__(self): """str: Return a string that describes the module.""" return self.__class__.__name__ + \ f'(keys={self.keys}, meta_keys={self.meta_keys})' @PIPELINES.register_module() class DefaultFormatBundle3D(DefaultFormatBundle): """Default formatting bundle. It simplifies the pipeline of formatting common fields for voxels, including "proposals", "gt_bboxes", "gt_labels", "gt_masks" and "gt_semantic_seg". These fields are formatted as follows. 
- img: (1)transpose, (2)to tensor, (3)to DataContainer (stack=True) - proposals: (1)to tensor, (2)to DataContainer - gt_bboxes: (1)to tensor, (2)to DataContainer - gt_bboxes_ignore: (1)to tensor, (2)to DataContainer - gt_labels: (1)to tensor, (2)to DataContainer """ def __init__(self, class_names, with_gt=True, with_label=True): super(DefaultFormatBundle3D, self).__init__() self.class_names = class_names self.with_gt = with_gt self.with_label = with_label def __call__(self, results): """Call function to transform and format common fields in results. Args: results (dict): Result dict contains the data to convert. Returns: dict: The result dict contains the data that is formatted with default bundle. """ # Format 3D data if 'points' in results: assert isinstance(results['points'], BasePoints) results['points'] = DC(results['points'].tensor) for key in ['voxels', 'coors', 'voxel_centers', 'num_points']: if key not in results: continue results[key] = DC(to_tensor(results[key]), stack=False) if self.with_gt: # Clean GT bboxes in the final if 'gt_bboxes_3d_mask' in results: gt_bboxes_3d_mask = results['gt_bboxes_3d_mask'] results['gt_bboxes_3d'] = results['gt_bboxes_3d'][ gt_bboxes_3d_mask] if 'gt_names_3d' in results: results['gt_names_3d'] = results['gt_names_3d'][ gt_bboxes_3d_mask] if 'centers2d' in results: results['centers2d'] = results['centers2d'][ gt_bboxes_3d_mask] if 'depths' in results: results['depths'] = results['depths'][gt_bboxes_3d_mask] if 'gt_bboxes_mask' in results: gt_bboxes_mask = results['gt_bboxes_mask'] if 'gt_bboxes' in results: results['gt_bboxes'] = results['gt_bboxes'][gt_bboxes_mask] results['gt_names'] = results['gt_names'][gt_bboxes_mask] if self.with_label: if 'gt_names' in results and len(results['gt_names']) == 0: results['gt_labels'] = np.array([], dtype=np.int64) results['attr_labels'] = np.array([], dtype=np.int64) elif 'gt_names' in results and isinstance( results['gt_names'][0], list): # gt_labels might be a list of list in multi-view setting results['gt_labels'] = [ np.array([self.class_names.index(n) for n in res], dtype=np.int64) for res in results['gt_names'] ] elif 'gt_names' in results: results['gt_labels'] = np.array([ self.class_names.index(n) for n in results['gt_names'] ], dtype=np.int64) # we still assume one pipeline for one frame LiDAR # thus, the 3D name is list[string] if 'gt_names_3d' in results: results['gt_labels_3d'] = np.array([ self.class_names.index(n) for n in results['gt_names_3d'] ], dtype=np.int64) results = super(DefaultFormatBundle3D, self).__call__(results) return results def __repr__(self): """str: Return a string that describes the module.""" repr_str = self.__class__.__name__ repr_str += f'(class_names={self.class_names}, ' repr_str += f'with_gt={self.with_gt}, with_label={self.with_label})' return repr_str ================================================ FILE: mmdet3d/datasets/pipelines/loading.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. 
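# A minimal sketch of how the map-loading transform defined below might be
# configured in a pipeline (the values are illustrative placeholders):
#
#   dict(type='LoadVectorMap',
#        data_root='data/nuscenes',
#        point_cloud_range=[-40.0, -40.0, -1.0, 40.0, 40.0, 5.4],
#        map_fixed_ptsnum_per_line=20,
#        map_classes=['divider', 'ped_crossing', 'boundary'])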
import mmcv import numpy as np import torch from PIL import Image from pyquaternion import Quaternion import os.path as osp from mmdet3d.core.points import BasePoints, get_points_type from mmdet.datasets.pipelines import LoadAnnotations, LoadImageFromFile from ...core.bbox import LiDARInstance3DBoxes from ..builder import PIPELINES from copy import deepcopy import cv2 import os from torchvision.transforms.functional import rotate from mmdet3d.datasets.vector_map import VectorizedLocalMap, LiDARInstanceLines from nuscenes.eval.common.utils import quaternion_yaw from nuscenes.eval.common.utils import Quaternion as Quaternion_nus # from .vad_custom_nuscenes_eval import NuScenesEval_custom from nuscenes.eval.common.utils import center_distance from mmcv.parallel import DataContainer as DC import random from mmdet3d.core import LiDARInstance3DBoxes from nuscenes.utils.data_classes import Box as NuScenesBox # from projects.mmdet3d_plugin.core.bbox.structures.nuscenes_box import CustomNuscenesBox from shapely import affinity, ops from shapely.geometry import LineString, box, MultiPolygon, MultiLineString from mmdet.datasets.pipelines import to_tensor from nuscenes.map_expansion.map_api import NuScenesMap, NuScenesMapExplorer from nuscenes.eval.detection.constants import DETECTION_NAMES from mmcv.runner import get_dist_info from nuscenes.utils.data_classes import Box as NuScenesBox import pyquaternion import torch.nn as nn @PIPELINES.register_module() class LoadVectorMap(object): def __init__(self, data_root, point_cloud_range, map_fixed_ptsnum_per_line=20, map_classes=['divider', 'ped_crossing', 'boundary'], **kwargs): patch_h = point_cloud_range[4]-point_cloud_range[1] patch_w = point_cloud_range[3]-point_cloud_range[0] self.patch_size = (min(patch_h, 50), patch_w) self.vector_map = VectorizedLocalMap(data_root, patch_size=self.patch_size, map_classes=map_classes, fixed_ptsnum_per_line=map_fixed_ptsnum_per_line) def vectormap_pipeline(self, location, ego2global_translation, patch_angle, flip_dx, flip_dy): ''' `example` type: keys: 'img_metas', 'gt_bboxes_3d', 'gt_labels_3d', 'img'; all keys type is 'DataContainer'; 'img_metas' cpu_only=True, type is dict, others are false; 'gt_labels_3d' shape torch.size([num_samples]), stack=False, padding_value=0, cpu_only=False 'gt_bboxes_3d': stack=False, cpu_only=True ''' anns_results = self.vector_map.gen_vectorized_samples( location, ego2global_translation, patch_angle, flip_dx, flip_dy ) has_valid_map = True if len(anns_results['gt_vecs_label']) == 0: ## params that can generate non-empty anns location = 'boston-seaport' ego2global_translation = [1178.1282, 1140.1135, 0.0] patch_angle = 143.6049566307475 flip_dx = False flip_dy = False ## pseudo_anns_results = self.vector_map.gen_vectorized_samples( location, ego2global_translation, patch_angle, flip_dx, flip_dy ) anns_results = pseudo_anns_results has_valid_map = False ''' anns_results, type: dict 'gt_vecs_pts_loc': list[num_vecs], vec with num_points*2 coordinates 'gt_vecs_pts_num': list[num_vecs], vec with num_points 'gt_vecs_label': list[num_vecs], vec with cls index ''' gt_vecs_label = to_tensor(anns_results['gt_vecs_label']) if isinstance(anns_results['gt_vecs_pts_loc'], LiDARInstanceLines): gt_vecs_pts_loc = anns_results['gt_vecs_pts_loc'] else: gt_vecs_pts_loc = to_tensor(anns_results['gt_vecs_pts_loc']) try: gt_vecs_pts_loc = gt_vecs_pts_loc.flatten(1).to(dtype=torch.float32) except: assert False # empty tensor, will be passed in train, # but we preserve it for test gt_vecs_pts_loc = 
gt_vecs_pts_loc return dict( map_gt_labels_3d = DC(gt_vecs_label, cpu_only=False), map_gt_bboxes_3d = DC(gt_vecs_pts_loc, cpu_only=True), has_valid_map = has_valid_map, ) def __call__(self, results): ego2global_translation = list(results['ori_ego_pose'][:3,3].numpy()) # ego2global_rotation = list(Quaternion_nus(matrix=ego2global.numpy(), rtol=eps, atol=eps).q) v = np.dot( results['ori_ego_pose'][:3,:3].numpy(), np.array([1, 0, 0])) yaw = np.arctan2(v[1], v[0]) ori_patch_angle = yaw / np.pi * 180 # v = np.dot(ego2global[:3,:3].numpy(), np.array([1, 0, 0])) # yaw = np.arctan2(v[1], v[0]) # patch_angle2 = yaw / np.pi * 180 results.update( self.vectormap_pipeline(results['curr']['map_location'], ego2global_translation, ori_patch_angle-results['rotate_bda'], results['flip_dx'], results['flip_dy']) ) return results @PIPELINES.register_module() class LoadVectorMap2(object): def __init__(self, data_root, point_cloud_range, map_fixed_ptsnum_per_line=20, map_classes=['divider', 'ped_crossing', 'boundary'], **kwargs): patch_h = point_cloud_range[4]-point_cloud_range[1] patch_w = point_cloud_range[3]-point_cloud_range[0] self.point_cloud_range = torch.tensor(point_cloud_range) self.patch_size = (min(patch_h, 50), patch_w) self.vector_map = VectorizedLocalMap(data_root, patch_size=self.patch_size, map_classes=map_classes, fixed_ptsnum_per_line=map_fixed_ptsnum_per_line) def vectormap_pipeline(self, location, ego2global_translation, patch_angle, flip_dx, flip_dy): ''' `example` type: keys: 'img_metas', 'gt_bboxes_3d', 'gt_labels_3d', 'img'; all keys type is 'DataContainer'; 'img_metas' cpu_only=True, type is dict, others are false; 'gt_labels_3d' shape torch.size([num_samples]), stack=False, padding_value=0, cpu_only=False 'gt_bboxes_3d': stack=False, cpu_only=True ''' anns_results = self.vector_map.gen_vectorized_samples( location, ego2global_translation, patch_angle, flip_dx, flip_dy ) has_valid_map = True if len(anns_results['gt_vecs_label']) == 0: ## params that can generate non-empty anns location = 'boston-seaport' ego2global_translation = [1178.1282, 1140.1135, 0.0] patch_angle = 143.6049566307475 flip_dx = False flip_dy = False ## pseudo_anns_results = self.vector_map.gen_vectorized_samples( location, ego2global_translation, patch_angle, flip_dx, flip_dy ) anns_results = pseudo_anns_results has_valid_map = False ''' anns_results, type: dict 'gt_vecs_pts_loc': list[num_vecs], vec with num_points*2 coordinates 'gt_vecs_pts_num': list[num_vecs], vec with num_points 'gt_vecs_label': list[num_vecs], vec with cls index ''' gt_vecs_label = to_tensor(anns_results['gt_vecs_label']) if isinstance(anns_results['gt_vecs_pts_loc'], LiDARInstanceLines): gt_vecs_pts_loc = anns_results['gt_vecs_pts_loc'] else: gt_vecs_pts_loc = to_tensor(anns_results['gt_vecs_pts_loc']) try: gt_vecs_pts_loc = gt_vecs_pts_loc.flatten(1).to(dtype=torch.float32) except: assert False # empty tensor, will be passed in train, # but we preserve it for test gt_vecs_pts_loc = gt_vecs_pts_loc gt_pts = gt_vecs_pts_loc.shift_fixed_num_sampled_points_v2 gt_pts = (gt_pts - self.point_cloud_range[:2])/(self.point_cloud_range[3:5]-self.point_cloud_range[:2]) return dict( map_gt_labels_3d = DC(gt_vecs_label, cpu_only=False), map_gt_bboxes_3d = DC(gt_pts, cpu_only=False), has_valid_map = has_valid_map, ) def __call__(self, results): ego2global_translation = list(results['ori_ego_pose'][:3,3].numpy()) # ego2global_rotation = list(Quaternion_nus(matrix=ego2global.numpy(), rtol=eps, atol=eps).q) v = np.dot( results['ori_ego_pose'][:3,:3].numpy(), 
np.array([1, 0, 0])) yaw = np.arctan2(v[1], v[0]) ori_patch_angle = yaw / np.pi * 180 # v = np.dot(ego2global[:3,:3].numpy(), np.array([1, 0, 0])) # yaw = np.arctan2(v[1], v[0]) # patch_angle2 = yaw / np.pi * 180 results.update( self.vectormap_pipeline(results['curr']['map_location'], ego2global_translation, ori_patch_angle-results['rotate_bda'], results['flip_dx'], results['flip_dy']) ) return results @PIPELINES.register_module() class LoadGTPlaner(object): def __init__(self): pass def __call__(self, results): results['gt_ego_lcf_feat'] = to_tensor(results['curr']['gt_ego_lcf_feat']) results['gt_ego_lcf_feat'][:2] = (results['bda_mat'][:2, :2] @ results['gt_ego_lcf_feat'][:2, None]).squeeze(-1) results['gt_ego_fut_trajs'] = torch.cumsum(to_tensor(results['curr']['gt_ego_fut_trajs']), dim=0)[:6] results['gt_ego_fut_trajs'] = (results['bda_mat'][:2,:2] @ results['gt_ego_fut_trajs'][..., None]).squeeze(-1) results['gt_ego_his_trajs'] = -to_tensor(results['curr']['gt_ego_his_trajs']) results['gt_ego_his_trajs'] = (results['bda_mat'][:2,:2] @ results['gt_ego_his_trajs'][..., None]).squeeze(-1) if results['gt_ego_fut_trajs'][-1][1] >= 2: command = np.array([1, 0, 0]) # Turn Right elif results['gt_ego_fut_trajs'][-1][1] <= -2: command = np.array([0, 1, 0]) # Turn Left else: command = np.array([0, 0, 1]) # Go Straight results['gt_ego_fut_cmd'] = to_tensor(command) results['gt_ego_fut_masks'] = to_tensor(results['curr']['gt_ego_fut_masks'])[: 6] return results @PIPELINES.register_module() class LoadGTMotion(object): def __init__(self, with_ego_as_agent=False): self.with_ego_as_agent = with_ego_as_agent def __call__(self, results): agent_fut_traj_mask = torch.tensor(np.array(results['curr']['ann_infos']['fut_traj_mask']), dtype=torch.float32) agent_fut_traj = torch.tensor(np.array(results['curr']['ann_infos']['fut_traj']), dtype=torch.float32) agent_fut_traj = torch.cat([agent_fut_traj, torch.ones_like(agent_fut_traj[..., 0:2])], dim=-1) if len(agent_fut_traj)>0: agent_fut_traj = (results['ego_pose_inv'] @ agent_fut_traj.unsqueeze(-1)).squeeze(-1)[..., :2] * agent_fut_traj_mask if self.with_ego_as_agent: gt_ego_fut_trajs = torch.cumsum(to_tensor(results['curr']['gt_ego_fut_trajs']), dim=0)[: 6] gt_ego_fut_trajs = torch.cat([gt_ego_fut_trajs, torch.zeros_like(gt_ego_fut_trajs[:2])]) agent_fut_traj = torch.cat([gt_ego_fut_trajs[None], agent_fut_traj], 0) gt_fut_traj_mask = torch.ones_like(gt_ego_fut_trajs) gt_fut_traj_mask[-2:] = 0 agent_fut_traj_mask = torch.cat([gt_fut_traj_mask[None], agent_fut_traj_mask], 0) centers = results['gt_bboxes_3d'].center[..., :2] try: tmp = torch.cat([centers[:, None], agent_fut_traj], 1) except: print(centers.shape, agent_fut_traj.shape, agent_fut_traj_mask.shape, results['gt_labels_3d'].shape) agent_fut_traj = tmp[:, 1:] - tmp[:, :-1] results['gt_agent_fut_traj_mask'] = agent_fut_traj_mask results['gt_agent_fut_traj'] = agent_fut_traj return results @PIPELINES.register_module() class LoadFutBoxInfo(object): def __init__(self, add_boundary=True): self.X_BOUND = [-50.0, 50.0, 0.1] # Forward self.Y_BOUND = [-50.0, 50.0, 0.1] # Sides self.Z_BOUND = [-10.0, 10.0, 20.0] # Height dx, bx, _ = self.gen_dx_bx(self.X_BOUND, self.Y_BOUND, self.Z_BOUND) self.dx, self.bx = dx[:2], bx[:2] bev_resolution, bev_start_position, bev_dimension = self.calculate_birds_eye_view_parameters( self.X_BOUND, self.Y_BOUND, self.Z_BOUND ) self.bev_resolution = bev_resolution.numpy() self.bev_start_position = bev_start_position.numpy() self.bev_dimension = bev_dimension.numpy() ego_width, 
ego_length = 1.85, 4.084 self.W = ego_width self.H = ego_length self.category_index = { 'human':[2,3,4,5,6,7,8], 'vehicle':[14,15,16,17,18,19,20,21,22,23] } self.add_boundary = add_boundary # self.n_future = n_future # self.add_state("obj_col", default=torch.zeros(self.n_future), dist_reduce_fx="sum") # self.add_state("obj_box_col", default=torch.zeros(self.n_future), dist_reduce_fx="sum") # self.add_state("L2", default=torch.zeros(self.n_future),dist_reduce_fx="sum") # self.add_state("total", default=torch.tensor(0), dist_reduce_fx="sum") def gen_dx_bx(self, xbound, ybound, zbound): dx = torch.Tensor([row[2] for row in [xbound, ybound, zbound]]) bx = torch.Tensor([row[0] + row[2]/2.0 for row in [xbound, ybound, zbound]]) nx = torch.LongTensor([(row[1] - row[0]) / row[2] for row in [xbound, ybound, zbound]]) return dx, bx, nx def calculate_birds_eye_view_parameters(self, x_bounds, y_bounds, z_bounds): """ Parameters ---------- x_bounds: Forward direction in the ego-car. y_bounds: Sides z_bounds: Height Returns ------- bev_resolution: Bird's-eye view bev_resolution bev_start_position Bird's-eye view first element bev_dimension Bird's-eye view tensor spatial dimension """ bev_resolution = torch.tensor([row[2] for row in [x_bounds, y_bounds, z_bounds]]) bev_start_position = torch.tensor([row[0] + row[2] / 2.0 for row in [x_bounds, y_bounds, z_bounds]]) bev_dimension = torch.tensor([(row[1] - row[0]) / row[2] for row in [x_bounds, y_bounds, z_bounds]], dtype=torch.long) return bev_resolution, bev_start_position, bev_dimension def get_label( self, boxes_in_cur_ego_list, labels_in_cur_ego_list ): segmentation_np, pedestrian_np = self.get_birds_eye_view_label(boxes_in_cur_ego_list, labels_in_cur_ego_list) segmentation = torch.from_numpy(segmentation_np).long() pedestrian = torch.from_numpy(pedestrian_np).long() return segmentation, pedestrian def world2bev_vis(self, x, y): return int((x - self.bx[0].item()) / self.dx[0].item()), int((y - self.bx[1].item()) / self.dx[1].item()) def get_birds_eye_view_label(self, boxes_in_cur_ego_list, labels_in_cur_ego_list): T = 6 segmentation = np.zeros((T,self.bev_dimension[0], self.bev_dimension[1])) pedestrian = np.zeros((T,self.bev_dimension[0], self.bev_dimension[1])) for k, fut_boxes in enumerate(boxes_in_cur_ego_list): if fut_boxes is None: continue for i, corners in enumerate(fut_boxes.corners[:, [4, 7, 3, 0], :2]): # fitler vehicle vehicle_classes = ['car', 'bus', 'construction_vehicle', 'bicycle', 'motorcycle', 'truck', 'trailer'] if labels_in_cur_ego_list[k][i] not in [0, 1, 2, 3, 4, 6, 7]: continue corners = np.array([self.world2bev_vis(*corner) for corner in corners]) cv2.fillPoly(segmentation[k], [corners], 1.0) return segmentation, pedestrian def __call__(self, results): ego2global_rotation = results['nuscenes_get_rt_matrix']['ego2global_rotation'] ego2global_translation =results['nuscenes_get_rt_matrix'][ 'ego2global_translation'] trans = -np.array(ego2global_translation) rot = Quaternion(ego2global_rotation).inverse boxes_in_cur_ego_list = [] for gt_boxes_each_frame in results['fut_boxes_info']: boxes_in_cur_ego = [] if len(gt_boxes_each_frame)==0: boxes_in_cur_ego_list.append(None) continue for box in gt_boxes_each_frame: center = box[:3] wlh = box[3:6] box_yaw = box[6] box_vel = box[7:].tolist() box_vel.append(0) quat = pyquaternion.Quaternion(axis=[0, 0, 1], radians=box_yaw) nusc_box = NuScenesBox(center, wlh, quat, velocity=box_vel) nusc_box.translate(trans) nusc_box.rotate(rot) box_xyz = np.array(nusc_box.center) box_dxdydz = 
np.array(nusc_box.wlh) box_yaw = np.array([nusc_box.orientation.yaw_pitch_roll[0]]) box_velo = np.array(nusc_box.velocity[:2]) gt_box = np.concatenate([box_xyz, box_dxdydz, box_yaw, box_velo]) boxes_in_cur_ego.append(gt_box) boxes_in_cur_ego = torch.tensor(np.array(boxes_in_cur_ego)) boxes_in_cur_ego = LiDARInstance3DBoxes(boxes_in_cur_ego, box_dim=boxes_in_cur_ego.shape[-1], origin=(0.5, 0.5, 0.5)) boxes_in_cur_ego_list.append(boxes_in_cur_ego) results['fut_boxes_in_cur_ego_list'] = boxes_in_cur_ego_list segmentation, pedestrian = self.get_label(boxes_in_cur_ego_list, results['fut_labels_info']) segmentation_plus = segmentation.permute(1, 2, 0).cpu().clone().numpy() segmentation_plus *= 0 # only consider boudnary, temporal map_gt_bboxes_3d = results['map_gt_bboxes_3d'].data.fixed_num_sampled_points map_gt_bboxes_3d= map_gt_bboxes_3d[ results['map_gt_labels_3d'].data==2] map_gt_bboxes_3d = (map_gt_bboxes_3d - self.bx.cpu().numpy() ) / (self.dx.cpu().numpy()) a = segmentation_plus[:, :, :3].copy() a = np.ascontiguousarray(a, dtype=np.uint8) b = segmentation_plus[:, :, :3].copy() b = np.ascontiguousarray(a, dtype=np.uint8) for line in map_gt_bboxes_3d: line = line.clip(0, 999).numpy().astype(np.int32) for i, corner in enumerate(line[:-1]): a = cv2.line(a, tuple(line[i]), tuple(line[i+1]), color=(1, 1, 1), thickness=1) b = cv2.line(b, tuple(line[i]), tuple(line[i+1]), color=(1, 1, 1), thickness=1) segmentation_plus = torch.cat([torch.tensor(a), torch.tensor(b)], -1).permute(2, 0, 1) results['gt_fut_segmentations'] = segmentation results['gt_fut_segmentations_plus'] = segmentation_plus return results @PIPELINES.register_module() class LoadSemanticImageMask(object): def __init__(self, mask_file_path='./data/nus_sem'): self.mask_file_path = mask_file_path def __call__(self, results): masks = [] for cam in results['cam_names']: data_token = results['curr']['cams'][cam]['sample_data_token'] filename = osp.join(self.mask_file_path, data_token+'.png') img = Image.open(filename) img_augs = results['img_augs'][cam] resize, resize_dims, crop, flip, rotate = img_augs img = self.img_transform_core(img, resize_dims, crop, flip, rotate) img = np.array(img) masks.append(img) masks = np.stack(masks, 0) results['gt_img_sem_masks'] = to_tensor(masks) return results def img_transform_core(self, img, resize_dims, crop, flip, rotate): # adjust image img = img.resize(resize_dims, resample=0) img = img.crop(crop) if flip: img = img.transpose(method=Image.FLIP_LEFT_RIGHT) img = img.rotate(rotate, resample=0, expand=0) return img @PIPELINES.register_module() class LoadMultiViewImageFromFiles(object): """Load multi channel images from a list of separate channel files. Expects results['img_filename'] to be a list of filenames. Args: to_float32 (bool, optional): Whether to convert the img to float32. Defaults to False. color_type (str, optional): Color type of the file. Defaults to 'unchanged'. """ def __init__(self, to_float32=False, color_type='unchanged'): self.to_float32 = to_float32 self.color_type = color_type def __call__(self, results): """Call function to load multi-view image from files. Args: results (dict): Result dict containing multi-view image filenames. Returns: dict: The result dict containing the multi-view image data. Added keys and values are described below. - filename (str): Multi-view image filenames. - img (np.ndarray): Multi-view image arrays. - img_shape (tuple[int]): Shape of multi-view image arrays. - ori_shape (tuple[int]): Shape of original image arrays. 
- pad_shape (tuple[int]): Shape of padded image arrays. - scale_factor (float): Scale factor. - img_norm_cfg (dict): Normalization configuration of images. """ filename = results['img_filename'] # img is of shape (h, w, c, num_views) img = np.stack( [mmcv.imread(name, self.color_type) for name in filename], axis=-1) if self.to_float32: img = img.astype(np.float32) results['filename'] = filename # unravel to list, see `DefaultFormatBundle` in formatting.py # which will transpose each image separately and then stack into array results['img'] = [img[..., i] for i in range(img.shape[-1])] results['img_shape'] = img.shape results['ori_shape'] = img.shape # Set initial values for default meta_keys results['pad_shape'] = img.shape results['scale_factor'] = 1.0 num_channels = 1 if len(img.shape) < 3 else img.shape[2] results['img_norm_cfg'] = dict( mean=np.zeros(num_channels, dtype=np.float32), std=np.ones(num_channels, dtype=np.float32), to_rgb=False) return results def __repr__(self): """str: Return a string that describes the module.""" repr_str = self.__class__.__name__ repr_str += f'(to_float32={self.to_float32}, ' repr_str += f"color_type='{self.color_type}')" return repr_str @PIPELINES.register_module() class LoadImageFromFileMono3D(object): """Load an image from file in monocular 3D object detection. Compared to 2D detection, additional camera parameters need to be loaded. Args: kwargs (dict): Arguments are the same as those in :class:`LoadImageFromFile`. """ def __call__(self, results): """Call functions to load image and get image meta information. Args: results (dict): Result dict from :obj:`mmdet.CustomDataset`. Returns: dict: The dict contains loaded image and meta information. """ super().__call__(results) results['cam2img'] = results['img_info']['cam_intrinsic'] return results @PIPELINES.register_module() class LoadOccupancy(object): """Load an image from file in monocular 3D object detection. Compared to 2D detection, additional camera parameters need to be loaded. Args: kwargs (dict): Arguments are the same as those in :class:`LoadImageFromFile`. """ def __init__(self, occupancy_path='/mount/dnn_data/occupancy_2023/gts', num_classes=17, ignore_nonvisible=False, mask='mask_camera', ignore_classes=[], fix_void=True) : self.occupancy_path = occupancy_path self.num_classes = num_classes self.ignore_nonvisible = ignore_nonvisible self.mask = mask self.ignore_classes=ignore_classes self.fix_void = fix_void def __call__(self, results): """Call functions to load image and get image meta information. Args: results (dict): Result dict from :obj:`mmdet.CustomDataset`. Returns: dict: The dict contains loaded image and meta information. 
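# --- Illustrative sketch (not part of the repository): toy shapes assumed. ---
# LoadMultiViewImageFromFiles stacks the per-camera images along a new last axis and then
# unravels them back into a list, so the formatting bundle can transpose and collate each
# view independently.
import numpy as np

views = [np.zeros((900, 1600, 3), dtype=np.uint8) for _ in range(6)]  # hypothetical 6 cameras
img = np.stack(views, axis=-1)                              # (900, 1600, 3, 6)
per_view = [img[..., i] for i in range(img.shape[-1])]      # 6 arrays of shape (900, 1600, 3)
assert per_view[0].shape == (900, 1600, 3)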
""" scene_name = results['curr']['scene_name'] sample_token = results['curr']['token'] occupancy_file_path = osp.join(self.occupancy_path, scene_name, sample_token, 'labels.npz') data = np.load(occupancy_file_path) occupancy = torch.tensor(data['semantics']) visible_mask = torch.tensor(data[self.mask]) # visible_mask_lidar = data['mask_lidar'] if self.ignore_nonvisible: occupancy[~visible_mask.to(torch.bool)] = 255 # to BEVDet format occupancy = occupancy.permute(2, 0, 1) occupancy = torch.rot90(occupancy, 1, [1, 2]) occupancy = torch.flip(occupancy, [1]) occupancy = occupancy.permute(1, 2, 0) if self.fix_void: occupancy[occupancy<255] = occupancy[occupancy<255] + 1 for class_ in self.ignore_classes: occupancy[occupancy==class_] = 255 if results['rotate_bda'] != 0: occupancy = occupancy.permute(2, 0, 1) occupancy = rotate(occupancy, -results['rotate_bda'], fill=255).permute(1, 2, 0) if results['flip_dx']: occupancy = torch.flip(occupancy, [1]) if results['flip_dy']: occupancy = torch.flip(occupancy, [0]) results['gt_occupancy'] = occupancy results['visible_mask'] = visible_mask results['visible_mask_bev'] = (occupancy==255).sum(-1) return results @PIPELINES.register_module() class LoadPointsFromMultiSweeps(object): """Load points from multiple sweeps. This is usually used for nuScenes dataset to utilize previous sweeps. Args: sweeps_num (int, optional): Number of sweeps. Defaults to 10. load_dim (int, optional): Dimension number of the loaded points. Defaults to 5. use_dim (list[int], optional): Which dimension to use. Defaults to [0, 1, 2, 4]. time_dim (int, optional): Which dimension to represent the timestamps of each points. Defaults to 4. file_client_args (dict, optional): Config dict of file clients, refer to https://github.com/open-mmlab/mmcv/blob/master/mmcv/fileio/file_client.py for more details. Defaults to dict(backend='disk'). pad_empty_sweeps (bool, optional): Whether to repeat keyframe when sweeps is empty. Defaults to False. remove_close (bool, optional): Whether to remove close points. Defaults to False. test_mode (bool, optional): If `test_mode=True`, it will not randomly sample sweeps but select the nearest N frames. Defaults to False. """ def __init__(self, sweeps_num=10, load_dim=5, use_dim=[0, 1, 2, 4], time_dim=4, file_client_args=dict(backend='disk'), pad_empty_sweeps=False, remove_close=False, translate2ego=False, test_mode=False): self.load_dim = load_dim self.sweeps_num = sweeps_num self.use_dim = use_dim self.time_dim = time_dim assert time_dim < load_dim, \ f'Expect the timestamp dimension < {load_dim}, got {time_dim}' self.file_client_args = file_client_args.copy() self.file_client = None self.pad_empty_sweeps = pad_empty_sweeps self.remove_close = remove_close self.test_mode = test_mode assert max(use_dim) < load_dim, \ f'Expect all used dimensions < {load_dim}, got {use_dim}' self.translate2ego = translate2ego def _load_points(self, pts_filename): """Private function to load point clouds data. Args: pts_filename (str): Filename of point clouds data. Returns: np.ndarray: An array containing point clouds data. 
""" if self.file_client is None: self.file_client = mmcv.FileClient(**self.file_client_args) try: pts_bytes = self.file_client.get(pts_filename) points = np.frombuffer(pts_bytes, dtype=np.float32) except ConnectionError: mmcv.check_file_exist(pts_filename) if pts_filename.endswith('.npy'): points = np.load(pts_filename) else: points = np.fromfile(pts_filename, dtype=np.float32) return points def _remove_close(self, points, radius=1.0): """Removes point too close within a certain radius from origin. Args: points (np.ndarray | :obj:`BasePoints`): Sweep points. radius (float, optional): Radius below which points are removed. Defaults to 1.0. Returns: np.ndarray: Points after removing. """ if isinstance(points, np.ndarray): points_numpy = points elif isinstance(points, BasePoints): points_numpy = points.tensor.numpy() else: raise NotImplementedError x_filt = np.abs(points_numpy[:, 0]) < radius y_filt = np.abs(points_numpy[:, 1]) < radius not_close = np.logical_not(np.logical_and(x_filt, y_filt)) return points[not_close] def __call__(self, results): """Call function to load multi-sweep point clouds from files. Args: results (dict): Result dict containing multi-sweep point cloud filenames. Returns: dict: The result dict containing the multi-sweep points data. Added key and value are described below. - points (np.ndarray | :obj:`BasePoints`): Multi-sweep point cloud arrays. """ points = results['points'] points.tensor[:, self.time_dim] = 0 sweep_points_list = [points] ts = results['timestamp'] if self.pad_empty_sweeps and len(results['sweeps']) == 0: for i in range(self.sweeps_num): if self.remove_close: sweep_points_list.append(self._remove_close(points)) else: sweep_points_list.append(points) else: if len(results['sweeps']) <= self.sweeps_num: choices = np.arange(len(results['sweeps'])) elif self.test_mode: choices = np.arange(self.sweeps_num) else: choices = np.random.choice( len(results['sweeps']), self.sweeps_num, replace=False) for idx in choices: sweep = results['sweeps'][idx] points_sweep = self._load_points(sweep['data_path']) points_sweep = np.copy(points_sweep).reshape(-1, self.load_dim) if self.remove_close: points_sweep = self._remove_close(points_sweep) sweep_ts = sweep['timestamp'] / 1e6 points_sweep[:, :3] = points_sweep[:, :3] @ sweep[ 'sensor2lidar_rotation'].T points_sweep[:, :3] += sweep['sensor2lidar_translation'] points_sweep[:, self.time_dim] = ts - sweep_ts points_sweep = points.new_point(points_sweep) sweep_points_list.append(points_sweep) points = points.cat(sweep_points_list) points = points[:, self.use_dim] results['points'] = points if self.translate2ego: lidar2lidarego = np.eye(4, dtype=np.float32) lidar2lidarego[:3, :3] = Quaternion( results['curr']['lidar2ego_rotation']).rotation_matrix lidar2lidarego[:3, 3] = results['curr']['lidar2ego_translation'] lidar2lidarego = to_tensor(lidar2lidarego) results['points'].tensor[:, :3] = results['points'].tensor[:, :3].matmul(lidar2lidarego[:3, :3].T) + lidar2lidarego[:3, 3] return results def __repr__(self): """str: Return a string that describes the module.""" return f'{self.__class__.__name__}(sweeps_num={self.sweeps_num})' @PIPELINES.register_module() class PointsFromLidartoEgo(object): def __init__(self, translate2ego=True, ego_cam='CAM_FRONT'): self.ego_cam=ego_cam self.translate2ego = translate2ego def __call__(self, results): if self.translate2ego: # lidar2lidarego = np.eye(4, dtype=np.float32) # lidar2lidarego[:3, :3] = Quaternion( # results['curr']['lidar2ego_rotation']).rotation_matrix # lidar2lidarego[:3, 3] = 
results['curr']['lidar2ego_translation'] # lidar2lidarego = to_tensor(lidar2lidarego) # results['points'].tensor[:, :3] = results['points'].tensor[:, :3].matmul(lidar2lidarego[:3, :3].T) + lidar2lidarego[:3, 3] lidar2lidarego = np.eye(4, dtype=np.float32) lidar2lidarego[:3, :3] = Quaternion( results['curr']['lidar2ego_rotation']).rotation_matrix lidar2lidarego[:3, 3] = results['curr']['lidar2ego_translation'] lidarego2global = np.eye(4, dtype=np.float32) lidarego2global[:3, :3] = Quaternion( results['curr']['ego2global_rotation']).rotation_matrix lidarego2global[:3, 3] = results['curr']['ego2global_translation'] camego2global = np.eye(4, dtype=np.float32) camego2global[:3, :3] = Quaternion( results['curr']['cams'][self.ego_cam] ['ego2global_rotation']).rotation_matrix camego2global[:3, 3] = results['curr']['cams'][self.ego_cam][ 'ego2global_translation'] lidar2camego = np.linalg.inv(camego2global) @ lidarego2global @ lidar2lidarego lidar2camego = to_tensor(lidar2camego) results['points'].tensor[:, :3] = results['points'].tensor[:, :3].matmul(lidar2camego[:3, :3].T) + lidar2camego[:3, 3] return results @PIPELINES.register_module() class PointSegClassMapping(object): """Map original semantic class to valid category ids. Map valid classes as 0~len(valid_cat_ids)-1 and others as len(valid_cat_ids). Args: valid_cat_ids (tuple[int]): A tuple of valid category. max_cat_id (int, optional): The max possible cat_id in input segmentation mask. Defaults to 40. """ def __init__(self, valid_cat_ids, max_cat_id=40): assert max_cat_id >= np.max(valid_cat_ids), \ 'max_cat_id should be greater than maximum id in valid_cat_ids' self.valid_cat_ids = valid_cat_ids self.max_cat_id = int(max_cat_id) # build cat_id to class index mapping neg_cls = len(valid_cat_ids) self.cat_id2class = np.ones( self.max_cat_id + 1, dtype=np.int) * neg_cls for cls_idx, cat_id in enumerate(valid_cat_ids): self.cat_id2class[cat_id] = cls_idx def __call__(self, results): """Call function to map original semantic class to valid category ids. Args: results (dict): Result dict containing point semantic masks. Returns: dict: The result dict containing the mapped category ids. Updated key and value are described below. - pts_semantic_mask (np.ndarray): Mapped semantic masks. """ assert 'pts_semantic_mask' in results pts_semantic_mask = results['pts_semantic_mask'] converted_pts_sem_mask = self.cat_id2class[pts_semantic_mask] results['pts_semantic_mask'] = converted_pts_sem_mask return results def __repr__(self): """str: Return a string that describes the module.""" repr_str = self.__class__.__name__ repr_str += f'(valid_cat_ids={self.valid_cat_ids}, ' repr_str += f'max_cat_id={self.max_cat_id})' return repr_str @PIPELINES.register_module() class NormalizePointsColor(object): """Normalize color of points. Args: color_mean (list[float]): Mean color of the point cloud. """ def __init__(self, color_mean): self.color_mean = color_mean def __call__(self, results): """Call function to normalize color of points. Args: results (dict): Result dict containing point clouds data. Returns: dict: The result dict containing the normalized points. Updated key and value are described below. - points (:obj:`BasePoints`): Points after color normalization. 
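# --- Illustrative sketch (not part of the repository): toy category ids assumed. ---
# PointSegClassMapping builds a lookup table so that valid category ids map to 0..K-1 and
# every other id maps to the ignore index K; the per-point labels are then remapped with a
# single fancy-indexing operation, as in its __call__ above.
import numpy as np

valid_cat_ids = (3, 7, 12)                                  # hypothetical valid classes
max_cat_id = 15
cat_id2class = np.full(max_cat_id + 1, len(valid_cat_ids), dtype=np.int64)
for cls_idx, cat_id in enumerate(valid_cat_ids):
    cat_id2class[cat_id] = cls_idx

raw_labels = np.array([3, 5, 12, 7, 0])
print(cat_id2class[raw_labels])                             # [0 3 2 1 3]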
""" points = results['points'] assert points.attribute_dims is not None and \ 'color' in points.attribute_dims.keys(), \ 'Expect points have color attribute' if self.color_mean is not None: points.color = points.color - \ points.color.new_tensor(self.color_mean) points.color = points.color / 255.0 results['points'] = points return results def __repr__(self): """str: Return a string that describes the module.""" repr_str = self.__class__.__name__ repr_str += f'(color_mean={self.color_mean})' return repr_str @PIPELINES.register_module() class LoadPointsFromFile(object): """Load Points From File. Load points from file. Args: coord_type (str): The type of coordinates of points cloud. Available options includes: - 'LIDAR': Points in LiDAR coordinates. - 'DEPTH': Points in depth coordinates, usually for indoor dataset. - 'CAMERA': Points in camera coordinates. load_dim (int, optional): The dimension of the loaded points. Defaults to 6. use_dim (list[int], optional): Which dimensions of the points to use. Defaults to [0, 1, 2]. For KITTI dataset, set use_dim=4 or use_dim=[0, 1, 2, 3] to use the intensity dimension. shift_height (bool, optional): Whether to use shifted height. Defaults to False. use_color (bool, optional): Whether to use color features. Defaults to False. file_client_args (dict, optional): Config dict of file clients, refer to https://github.com/open-mmlab/mmcv/blob/master/mmcv/fileio/file_client.py for more details. Defaults to dict(backend='disk'). """ def __init__(self, coord_type, load_dim=6, use_dim=[0, 1, 2], shift_height=False, use_color=False, dtype='float32', file_client_args=dict(backend='disk'), translate2ego=True, ): self.shift_height = shift_height self.use_color = use_color if isinstance(use_dim, int): use_dim = list(range(use_dim)) assert max(use_dim) < load_dim, \ f'Expect all used dimensions < {load_dim}, got {use_dim}' assert coord_type in ['CAMERA', 'LIDAR', 'DEPTH'] self.coord_type = coord_type self.load_dim = load_dim self.use_dim = use_dim self.file_client_args = file_client_args.copy() self.file_client = None if dtype=='float32': self.dtype = np.float32 elif dtype== 'float16': self.dtype = np.float16 else: assert False self.translate2ego = translate2ego def _load_points(self, pts_filename): """Private function to load point clouds data. Args: pts_filename (str): Filename of point clouds data. Returns: np.ndarray: An array containing point clouds data. """ if self.file_client is None: self.file_client = mmcv.FileClient(**self.file_client_args) try: pts_bytes = self.file_client.get(pts_filename) points = np.frombuffer(pts_bytes, dtype=self.dtype) except ConnectionError: mmcv.check_file_exist(pts_filename) if pts_filename.endswith('.npy'): points = np.load(pts_filename) else: points = np.fromfile(pts_filename, dtype=self.dtype) return points def __call__(self, results): """Call function to load points data from file. Args: results (dict): Result dict containing point clouds data. Returns: dict: The result dict containing the point clouds data. Added key and value are described below. - points (:obj:`BasePoints`): Point clouds data. 
""" pts_filename = results['pts_filename'] points = self._load_points(pts_filename) points = points.reshape(-1, self.load_dim) points = points[:, self.use_dim] attribute_dims = None if self.shift_height: floor_height = np.percentile(points[:, 2], 0.99) height = points[:, 2] - floor_height points = np.concatenate( [points[:, :3], np.expand_dims(height, 1), points[:, 3:]], 1) attribute_dims = dict(height=3) if self.use_color: assert len(self.use_dim) >= 6 if attribute_dims is None: attribute_dims = dict() attribute_dims.update( dict(color=[ points.shape[1] - 3, points.shape[1] - 2, points.shape[1] - 1, ])) points_class = get_points_type(self.coord_type) points = points_class( points, points_dim=points.shape[-1], attribute_dims=attribute_dims) results['points'] = points if self.translate2ego: lidar2lidarego = np.eye(4, dtype=np.float32) lidar2lidarego[:3, :3] = Quaternion( results['curr']['lidar2ego_rotation']).rotation_matrix lidar2lidarego[:3, 3] = results['curr']['lidar2ego_translation'] lidar2lidarego = to_tensor(lidar2lidarego) results['points'].tensor[:, :3] = results['points'].tensor[:, :3].matmul(lidar2lidarego[:3, :3].T) + lidar2lidarego[:3, 3] return results def __repr__(self): """str: Return a string that describes the module.""" repr_str = self.__class__.__name__ + '(' repr_str += f'shift_height={self.shift_height}, ' repr_str += f'use_color={self.use_color}, ' repr_str += f'file_client_args={self.file_client_args}, ' repr_str += f'load_dim={self.load_dim}, ' repr_str += f'use_dim={self.use_dim})' return repr_str @PIPELINES.register_module() class LoadPointsFromDict(LoadPointsFromFile): """Load Points From Dict.""" def __call__(self, results): assert 'points' in results return results @PIPELINES.register_module() class LoadAnnotations3D(LoadAnnotations): """Load Annotations3D. Load instance mask and semantic mask of points and encapsulate the items into related fields. Args: with_bbox_3d (bool, optional): Whether to load 3D boxes. Defaults to True. with_label_3d (bool, optional): Whether to load 3D labels. Defaults to True. with_attr_label (bool, optional): Whether to load attribute label. Defaults to False. with_mask_3d (bool, optional): Whether to load 3D instance masks. for points. Defaults to False. with_seg_3d (bool, optional): Whether to load 3D semantic masks. for points. Defaults to False. with_bbox (bool, optional): Whether to load 2D boxes. Defaults to False. with_label (bool, optional): Whether to load 2D labels. Defaults to False. with_mask (bool, optional): Whether to load 2D instance masks. Defaults to False. with_seg (bool, optional): Whether to load 2D semantic masks. Defaults to False. with_bbox_depth (bool, optional): Whether to load 2.5D boxes. Defaults to False. poly2mask (bool, optional): Whether to convert polygon annotations to bitmasks. Defaults to True. seg_3d_dtype (dtype, optional): Dtype of 3D semantic masks. Defaults to int64 file_client_args (dict): Config dict of file clients, refer to https://github.com/open-mmlab/mmcv/blob/master/mmcv/fileio/file_client.py for more details. 
""" def __init__(self, with_bbox_3d=True, with_label_3d=True, with_attr_label=False, with_mask_3d=False, with_seg_3d=False, with_bbox=False, with_label=False, with_mask=False, with_seg=False, with_bbox_depth=False, poly2mask=True, seg_3d_dtype=np.int64, file_client_args=dict(backend='disk')): super().__init__( with_bbox, with_label, with_mask, with_seg, poly2mask, file_client_args=file_client_args) self.with_bbox_3d = with_bbox_3d self.with_bbox_depth = with_bbox_depth self.with_label_3d = with_label_3d self.with_attr_label = with_attr_label self.with_mask_3d = with_mask_3d self.with_seg_3d = with_seg_3d self.seg_3d_dtype = seg_3d_dtype def _load_bboxes_3d(self, results): """Private function to load 3D bounding box annotations. Args: results (dict): Result dict from :obj:`mmdet3d.CustomDataset`. Returns: dict: The dict containing loaded 3D bounding box annotations. """ results['gt_bboxes_3d'] = results['ann_infos'][0] results['bbox3d_fields'].append('gt_bboxes_3d') return results def _load_bboxes_depth(self, results): """Private function to load 2.5D bounding box annotations. Args: results (dict): Result dict from :obj:`mmdet3d.CustomDataset`. Returns: dict: The dict containing loaded 2.5D bounding box annotations. """ results['centers2d'] = results['ann_info']['centers2d'] results['depths'] = results['ann_info']['depths'] return results def _load_labels_3d(self, results): """Private function to load label annotations. Args: results (dict): Result dict from :obj:`mmdet3d.CustomDataset`. Returns: dict: The dict containing loaded label annotations. """ results['gt_labels_3d'] = results['ann_infos'][1] return results def _load_attr_labels(self, results): """Private function to load label annotations. Args: results (dict): Result dict from :obj:`mmdet3d.CustomDataset`. Returns: dict: The dict containing loaded label annotations. """ results['attr_labels'] = results['ann_infos']['attr_labels'] return results def _load_masks_3d(self, results): """Private function to load 3D mask annotations. Args: results (dict): Result dict from :obj:`mmdet3d.CustomDataset`. Returns: dict: The dict containing loaded 3D mask annotations. """ pts_instance_mask_path = results['ann_infos']['pts_instance_mask_path'] if self.file_client is None: self.file_client = mmcv.FileClient(**self.file_client_args) try: mask_bytes = self.file_client.get(pts_instance_mask_path) pts_instance_mask = np.frombuffer(mask_bytes, dtype=np.int64) except ConnectionError: mmcv.check_file_exist(pts_instance_mask_path) pts_instance_mask = np.fromfile( pts_instance_mask_path, dtype=np.int64) results['pts_instance_mask'] = pts_instance_mask results['pts_mask_fields'].append('pts_instance_mask') return results def _load_semantic_seg_3d(self, results): """Private function to load 3D semantic segmentation annotations. Args: results (dict): Result dict from :obj:`mmdet3d.CustomDataset`. Returns: dict: The dict containing the semantic segmentation annotations. 
""" pts_semantic_mask_path = results['ann_infos']['pts_semantic_mask_path'] if self.file_client is None: self.file_client = mmcv.FileClient(**self.file_client_args) try: mask_bytes = self.file_client.get(pts_semantic_mask_path) # add .copy() to fix read-only bug pts_semantic_mask = np.frombuffer( mask_bytes, dtype=self.seg_3d_dtype).copy() except ConnectionError: mmcv.check_file_exist(pts_semantic_mask_path) pts_semantic_mask = np.fromfile( pts_semantic_mask_path, dtype=np.int64) results['pts_semantic_mask'] = pts_semantic_mask results['pts_seg_fields'].append('pts_semantic_mask') return results def __call__(self, results): """Call function to load multiple types annotations. Args: results (dict): Result dict from :obj:`mmdet3d.CustomDataset`. Returns: dict: The dict containing loaded 3D bounding box, label, mask and semantic segmentation annotations. """ results = super().__call__(results) if self.with_bbox_3d: results = self._load_bboxes_3d(results) if results is None: return None if self.with_bbox_depth: results = self._load_bboxes_depth(results) if results is None: return None if self.with_label_3d: results = self._load_labels_3d(results) if self.with_attr_label: results = self._load_attr_labels(results) if self.with_mask_3d: results = self._load_masks_3d(results) if self.with_seg_3d: results = self._load_semantic_seg_3d(results) return results def __repr__(self): """str: Return a string that describes the module.""" indent_str = ' ' repr_str = self.__class__.__name__ + '(\n' repr_str += f'{indent_str}with_bbox_3d={self.with_bbox_3d}, ' repr_str += f'{indent_str}with_label_3d={self.with_label_3d}, ' repr_str += f'{indent_str}with_attr_label={self.with_attr_label}, ' repr_str += f'{indent_str}with_mask_3d={self.with_mask_3d}, ' repr_str += f'{indent_str}with_seg_3d={self.with_seg_3d}, ' repr_str += f'{indent_str}with_bbox={self.with_bbox}, ' repr_str += f'{indent_str}with_label={self.with_label}, ' repr_str += f'{indent_str}with_mask={self.with_mask}, ' repr_str += f'{indent_str}with_seg={self.with_seg}, ' repr_str += f'{indent_str}with_bbox_depth={self.with_bbox_depth}, ' repr_str += f'{indent_str}poly2mask={self.poly2mask})' return repr_str @PIPELINES.register_module() class PointToMultiViewDepth(object): def __init__(self, grid_config, downsample=1): self.downsample = downsample self.grid_config = grid_config def points2depthmap(self, points, height, width): height, width = height // self.downsample, width // self.downsample depth_map = torch.zeros((height, width), dtype=torch.float32) coor = torch.round(points[:, :2] / self.downsample) depth = points[:, 2] kept1 = (coor[:, 0] >= 0) & (coor[:, 0] < width) & ( coor[:, 1] >= 0) & (coor[:, 1] < height) & ( depth < self.grid_config['depth'][1]) & ( depth >= self.grid_config['depth'][0]) coor, depth = coor[kept1], depth[kept1] ranks = coor[:, 0] + coor[:, 1] * width sort = (ranks + depth / 100.).argsort() coor, depth, ranks = coor[sort], depth[sort], ranks[sort] kept2 = torch.ones(coor.shape[0], device=coor.device, dtype=torch.bool) kept2[1:] = (ranks[1:] != ranks[:-1]) coor, depth = coor[kept2], depth[kept2] coor = coor.to(torch.long) depth_map[coor[:, 1], coor[:, 0]] = depth return depth_map def __call__(self, results): points_lidar = results['points'] imgs, rots, trans, intrins = results['img_inputs'][:4] post_rots, post_trans, bda = results['img_inputs'][4:] depth_map_list = [] for cid in range(len(results['cam_names'])): cam_name = results['cam_names'][cid] # lidar2lidarego = np.eye(4, dtype=np.float32) # lidar2lidarego[:3, :3] = 
Quaternion( # results['curr']['lidar2ego_rotation']).rotation_matrix # lidar2lidarego[:3, 3] = results['curr']['lidar2ego_translation'] # lidar2lidarego = to_tensor(lidar2lidarego) lidarego2global = np.eye(4, dtype=np.float32) lidarego2global[:3, :3] = Quaternion( results['curr']['ego2global_rotation']).rotation_matrix lidarego2global[:3, 3] = results['curr']['ego2global_translation'] lidarego2global = to_tensor(lidarego2global) cam2camego = np.eye(4, dtype=np.float32) cam2camego[:3, :3] = Quaternion( results['curr']['cams'][cam_name] ['sensor2ego_rotation']).rotation_matrix cam2camego[:3, 3] = results['curr']['cams'][cam_name][ 'sensor2ego_translation'] cam2camego = to_tensor(cam2camego) camego2global = np.eye(4, dtype=np.float32) camego2global[:3, :3] = Quaternion( results['curr']['cams'][cam_name] ['ego2global_rotation']).rotation_matrix camego2global[:3, 3] = results['curr']['cams'][cam_name][ 'ego2global_translation'] camego2global = to_tensor(camego2global) cam2img = np.eye(4, dtype=np.float32) cam2img = to_tensor(cam2img) cam2img[:3, :3] = intrins[cid] lidar2cam = torch.inverse(camego2global.matmul(cam2camego)).matmul(lidarego2global) # lidarego2global.matmul(lidar2lidarego)) lidar2img = cam2img.matmul(lidar2cam) points_img = points_lidar.tensor[:, :3].matmul( lidar2img[:3, :3].T) + lidar2img[:3, 3].unsqueeze(0) points_img = torch.cat( [points_img[:, :2] / points_img[:, 2:3], points_img[:, 2:3]], 1) points_img = points_img.matmul( post_rots[cid].T) + post_trans[cid:cid + 1, :] depth_map = self.points2depthmap(points_img, imgs.shape[2], imgs.shape[3]) depth_map_list.append(depth_map) depth_map = torch.stack(depth_map_list) results['gt_depth'] = depth_map return results def mmlabNormalize(img, mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True, debug=False): from mmcv.image.photometric import imnormalize mean = np.array(mean, dtype=np.float32) std = np.array(std, dtype=np.float32) to_rgb = to_rgb if debug: print('warning, debug in mmlabNormalize') img = np.asarray(img) # not normalize for visualization else: img = imnormalize(np.array(img), mean, std, to_rgb) img = torch.tensor(img).float().permute(2, 0, 1).contiguous() return img @PIPELINES.register_module() class PrepareImageInputs(object): """Load multi channel images from a list of separate channel files. Expects results['img_filename'] to be a list of filenames. Args: to_float32 (bool): Whether to convert the img to float32. Defaults to False. color_type (str): Color type of the file. Defaults to 'unchanged'. 
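# --- Illustrative sketch (not part of the repository): random toy points. ---
# points2depthmap above rasterizes projected points into a sparse depth map; when several
# points land in the same pixel, sorting by (pixel rank + depth / 100) and keeping the
# first occurrence of each rank retains the closest depth per pixel.
import torch

H, W = 16, 24
pts = torch.rand(50, 3) * torch.tensor([W, H, 60.0])         # (u, v, depth) stand-ins
coor, depth = torch.round(pts[:, :2]), pts[:, 2]
kept = (coor[:, 0] >= 0) & (coor[:, 0] < W) & (coor[:, 1] >= 0) & (coor[:, 1] < H)
coor, depth = coor[kept], depth[kept]
ranks = coor[:, 0] + coor[:, 1] * W
order = (ranks + depth / 100.).argsort()
coor, depth, ranks = coor[order], depth[order], ranks[order]
first = torch.ones(len(coor), dtype=torch.bool)
first[1:] = ranks[1:] != ranks[:-1]                           # nearest point wins each pixel
depth_map = torch.zeros(H, W)
coor = coor[first].long()
depth_map[coor[:, 1], coor[:, 0]] = depth[first]
print(int((depth_map > 0).sum()))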
""" def __init__( self, data_config, is_train=False, sequential=False, ego_cam='CAM_FRONT', img_corruptions=None, normalize_cfg=dict( mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True, debug=False ) ): self.is_train = is_train self.data_config = data_config self.normalize_img = mmlabNormalize self.sequential = sequential self.ego_cam = ego_cam self.normalize_cfg = normalize_cfg self.img_corruptions = img_corruptions def get_rot(self, h): return torch.Tensor( np.array([ [np.cos(h), np.sin(h)], [-np.sin(h), np.cos(h)], ])) def img_transform(self, img, post_rot, post_tran, resize, resize_dims, crop, flip, rotate): # adjust image img = self.img_transform_core(img, resize_dims, crop, flip, rotate) # post-homography transformation post_rot *= resize post_tran -= torch.Tensor(crop[:2]) if flip: A = torch.Tensor([[-1, 0], [0, 1]]) b = torch.Tensor([crop[2] - crop[0], 0]) post_rot = A.matmul(post_rot) post_tran = A.matmul(post_tran) + b A = self.get_rot(rotate / 180 * np.pi) b = torch.Tensor([crop[2] - crop[0], crop[3] - crop[1]]) / 2 b = A.matmul(-b) + b post_rot = A.matmul(post_rot) post_tran = A.matmul(post_tran) + b return img, post_rot, post_tran def img_transform_core(self, img, resize_dims, crop, flip, rotate): # adjust image img = img.resize(resize_dims) img = img.crop(crop) if flip: img = img.transpose(method=Image.FLIP_LEFT_RIGHT) img = img.rotate(rotate) return img def choose_cams(self): if self.is_train and self.data_config['Ncams'] < len( self.data_config['cams']): cam_names = np.random.choice( self.data_config['cams'], self.data_config['Ncams'], replace=False) else: cam_names = self.data_config['cams'] return cam_names def sample_augmentation(self, H, W, flip=None, scale=None): fH, fW = self.data_config['input_size'] H, W = self.data_config['src_size'] if self.is_train: # resize = float(fW) / float(W) # resize += np.random.uniform(*self.data_config['resize']) resize = np.random.uniform(*self.data_config["resize"]) resize_dims = (int(W * resize), int(H * resize)) newW, newH = resize_dims crop_h = int((1 - np.random.uniform(*self.data_config['crop_h'])) * newH) - fH crop_w = int(np.random.uniform(0, max(0, newW - fW))) crop = (crop_w, crop_h, crop_w + fW, crop_h + fH) flip = self.data_config['flip'] and np.random.choice([0, 1]) rotate = np.random.uniform(*self.data_config['rot']) else: # resize = float(fW) / float(W) resize = max(fH / H, fW / W) # resize += self.data_config.get('resize_test', 0.0) if scale is not None: resize = scale resize_dims = (int(W * resize), int(H * resize)) newW, newH = resize_dims crop_h = int((1 - np.mean(self.data_config['crop_h'])) * newH) - fH crop_w = int(max(0, newW - fW) / 2) crop = (crop_w, crop_h, crop_w + fW, crop_h + fH) flip = False if flip is None else flip rotate = 0 return resize, resize_dims, crop, flip, rotate def get_sensor2ego_transformation(self, cam_info, key_info, cam_name, ego_cam=None): if ego_cam is None: ego_cam = cam_name w, x, y, z = cam_info['cams'][cam_name]['sensor2ego_rotation'] # sweep sensor to sweep ego sweepsensor2sweepego_rot = torch.Tensor( Quaternion(w, x, y, z).rotation_matrix) sweepsensor2sweepego_tran = torch.Tensor( cam_info['cams'][cam_name]['sensor2ego_translation']) sweepsensor2sweepego = sweepsensor2sweepego_rot.new_zeros((4, 4)) sweepsensor2sweepego[3, 3] = 1 sweepsensor2sweepego[:3, :3] = sweepsensor2sweepego_rot sweepsensor2sweepego[:3, -1] = sweepsensor2sweepego_tran # sweep ego to global w, x, y, z = cam_info['cams'][cam_name]['ego2global_rotation'] sweepego2global_rot = torch.Tensor( 
Quaternion(w, x, y, z).rotation_matrix) sweepego2global_tran = torch.Tensor( cam_info['cams'][cam_name]['ego2global_translation']) sweepego2global = sweepego2global_rot.new_zeros((4, 4)) sweepego2global[3, 3] = 1 sweepego2global[:3, :3] = sweepego2global_rot sweepego2global[:3, -1] = sweepego2global_tran # global sensor to cur ego w, x, y, z = key_info['cams'][ego_cam]['ego2global_rotation'] keyego2global_rot = torch.Tensor( Quaternion(w, x, y, z).rotation_matrix) keyego2global_tran = torch.Tensor( key_info['cams'][ego_cam]['ego2global_translation']) keyego2global = keyego2global_rot.new_zeros((4, 4)) keyego2global[3, 3] = 1 keyego2global[:3, :3] = keyego2global_rot keyego2global[:3, -1] = keyego2global_tran global2keyego = keyego2global.inverse() sweepsensor2keyego = \ global2keyego @ sweepego2global @ sweepsensor2sweepego # global sensor to cur ego w, x, y, z = key_info['cams'][cam_name]['ego2global_rotation'] keyego2global_rot = torch.Tensor( Quaternion(w, x, y, z).rotation_matrix) keyego2global_tran = torch.Tensor( key_info['cams'][cam_name]['ego2global_translation']) keyego2global = keyego2global_rot.new_zeros((4, 4)) keyego2global[3, 3] = 1 keyego2global[:3, :3] = keyego2global_rot keyego2global[:3, -1] = keyego2global_tran global2keyego = keyego2global.inverse() # cur ego to sensor w, x, y, z = key_info['cams'][cam_name]['sensor2ego_rotation'] keysensor2keyego_rot = torch.Tensor( Quaternion(w, x, y, z).rotation_matrix) keysensor2keyego_tran = torch.Tensor( key_info['cams'][cam_name]['sensor2ego_translation']) keysensor2keyego = keysensor2keyego_rot.new_zeros((4, 4)) keysensor2keyego[3, 3] = 1 keysensor2keyego[:3, :3] = keysensor2keyego_rot keysensor2keyego[:3, -1] = keysensor2keyego_tran keyego2keysensor = keysensor2keyego.inverse() keysensor2sweepsensor = ( keyego2keysensor @ global2keyego @ sweepego2global @ sweepsensor2sweepego).inverse() return sweepsensor2keyego, keysensor2sweepsensor def get_sensor_transforms(self, cam_info, cam_name): w, x, y, z = cam_info['cams'][cam_name]['sensor2ego_rotation'] # sweep sensor to sweep ego sensor2ego_rot = torch.Tensor( Quaternion(w, x, y, z).rotation_matrix) sensor2ego_tran = torch.Tensor( cam_info['cams'][cam_name]['sensor2ego_translation']) sensor2ego = sensor2ego_rot.new_zeros((4, 4)) sensor2ego[3, 3] = 1 sensor2ego[:3, :3] = sensor2ego_rot sensor2ego[:3, -1] = sensor2ego_tran # sweep ego to global w, x, y, z = cam_info['cams'][cam_name]['ego2global_rotation'] ego2global_rot = torch.Tensor( Quaternion(w, x, y, z).rotation_matrix) ego2global_tran = torch.Tensor( cam_info['cams'][cam_name]['ego2global_translation']) ego2global = ego2global_rot.new_zeros((4, 4)) ego2global[3, 3] = 1 ego2global[:3, :3] = ego2global_rot ego2global[:3, -1] = ego2global_tran return sensor2ego, ego2global def get_inputs(self, results, scale=None): imgs = [] rots = [] trans = [] intrins = [] post_rots = [] post_trans = [] sensor2egos = [] ego2globals = [] cam_names = self.choose_cams() results['cam_names'] = cam_names results['input_size'] = self.data_config['input_size'] canvas = [] sensor2sensors = [] results['img_augs'] = {} for cam_name in cam_names: cam_data = results['curr']['cams'][cam_name] filename = cam_data['data_path'] if self.img_corruptions in ['sun', 'noise', 'rain', 'snow', 'fog']: filename = filename.split('/') filename[2] = 'nuscenes_aug' filename[3] = f'samples_{self.img_corruptions}' filename = osp.join(*filename) img = Image.open(filename) post_rot = torch.eye(2) post_tran = torch.zeros(2) intrin = torch.Tensor(cam_data['cam_intrinsic']) 
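# --- Illustrative sketch (not part of the repository): made-up poses. ---
# get_sensor2ego_transformation packs each quaternion + translation into a 4x4 homogeneous
# matrix and expresses the sweep camera in the key-frame ego frame as
# keyego2global^-1 @ sweepego2global @ sweepsensor2sweepego, the chain composed above.
import torch
from pyquaternion import Quaternion

def to_mat(wxyz, t):
    m = torch.eye(4)
    m[:3, :3] = torch.tensor(Quaternion(*wxyz).rotation_matrix, dtype=torch.float32)
    m[:3, 3] = torch.tensor(t)
    return m

sweepsensor2sweepego = to_mat((0.5, 0.5, -0.5, 0.5), (1.7, 0.0, 1.5))    # made-up calibration
sweepego2global = to_mat((1.0, 0.0, 0.0, 0.0), (600.0, 1650.0, 0.0))
keyego2global = to_mat((0.996, 0.0, 0.0, 0.087), (598.0, 1648.0, 0.0))

sweepsensor2keyego = keyego2global.inverse() @ sweepego2global @ sweepsensor2sweepego
print(sweepsensor2keyego.shape)                                           # torch.Size([4, 4])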
sensor2keyego, sensor2sensor = \ self.get_sensor2ego_transformation(results['curr'], results['curr'], cam_name, self.ego_cam) rot = sensor2keyego[:3, :3] tran = sensor2keyego[:3, 3] sensor2ego, ego2global = \ self.get_sensor_transforms(results['curr'], cam_name) # image view augmentation (resize, crop, horizontal flip, rotate) if results.get('tta_config', None) is not None: flip = results['tta_config']['tta_flip'] else: flip = None img_augs = self.sample_augmentation( H=img.height, W=img.width, flip=flip, scale=scale) resize, resize_dims, crop, flip, rotate = img_augs results['img_augs'][cam_name] = img_augs img, post_rot2, post_tran2 = \ self.img_transform(img, post_rot, post_tran, resize=resize, resize_dims=resize_dims, crop=crop, flip=flip, rotate=rotate) # for convenience, make augmentation matrices 3x3 post_tran = torch.zeros(3) post_rot = torch.eye(3) post_tran[:2] = post_tran2 post_rot[:2, :2] = post_rot2 canvas.append(np.array(img)) if self.img_corruptions == 'drop': imgs.append(self.normalize_img(img, **self.normalize_cfg)* 0) else: imgs.append(self.normalize_img(img, **self.normalize_cfg)) if self.sequential: assert 'adjacent' in results for adj_info in results['adjacent']: filename_adj = adj_info['cams'][cam_name]['data_path'] img_adjacent = Image.open(filename_adj) img_adjacent = self.img_transform_core( img_adjacent, resize_dims=resize_dims, crop=crop, flip=flip, rotate=rotate) imgs.append(self.normalize_img(img_adjacent, **self.normalize_cfg)) intrins.append(intrin) rots.append(rot) trans.append(tran) post_rots.append(post_rot) post_trans.append(post_tran) sensor2sensors.append(sensor2sensor) sensor2egos.append(sensor2ego) ego2globals.append(ego2global) if self.sequential: for adj_info in results['adjacent']: post_trans.extend(post_trans[:len(cam_names)]) post_rots.extend(post_rots[:len(cam_names)]) intrins.extend(intrins[:len(cam_names)]) # align trans_adj = [] rots_adj = [] sensor2sensors_adj = [] for cam_name in cam_names: adjsensor2keyego, sensor2sensor = \ self.get_sensor2ego_transformation(adj_info, results['curr'], cam_name, self.ego_cam) rot = adjsensor2keyego[:3, :3] tran = adjsensor2keyego[:3, 3] rots_adj.append(rot) trans_adj.append(tran) sensor2sensors_adj.append(sensor2sensor) for cam_name in cam_names: sensor2ego, ego2global = \ self.get_sensor_transforms(adj_info, cam_name) sensor2egos.append(sensor2ego) ego2globals.append(ego2global) rots.extend(rots_adj) trans.extend(trans_adj) sensor2sensors.extend(sensor2sensors_adj) imgs = torch.stack(imgs) sensor2egos = torch.stack(sensor2egos) ego2globals = torch.stack(ego2globals) rots = torch.stack(rots) trans = torch.stack(trans) intrins = torch.stack(intrins) post_rots = torch.stack(post_rots) post_trans = torch.stack(post_trans) sensor2sensors = torch.stack(sensor2sensors) results['canvas'] = canvas results['sensor2sensors'] = sensor2sensors return (imgs, rots, trans, intrins, post_rots, post_trans), (sensor2egos, ego2globals) def __call__(self, results): results['img_inputs'], results['aux_cam_params'] = self.get_inputs(results) return results @PIPELINES.register_module() class LoadAnnotationsBEVDepth(object): def __init__(self, bda_aug_conf, classes, with_2d_bbox=False, with_ego_as_agent=False, is_train=True): self.bda_aug_conf = bda_aug_conf self.is_train = is_train self.classes = classes self.with_2d_bbox = with_2d_bbox self.min_size = 2.0 self.with_ego_as_agent = with_ego_as_agent def sample_bda_augmentation(self, tta_config=None): """Generate bda augmentation values based on bda_config.""" if self.is_train: 
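# --- Illustrative sketch (not part of the repository): toy numbers; only the resize and
# crop terms are shown. --- PrepareImageInputs keeps the image augmentation invertible by
# folding it into a 2x2 matrix plus offset and then embedding both in 3x3 form (as just
# above), so downstream view transformers can map augmented pixels back to the raw image.
import torch

resize = 0.44
crop = (32, 176, 832, 626)                                    # hypothetical (x1, y1, x2, y2)
post_rot = torch.eye(2) * resize
post_tran = -torch.tensor(crop[:2], dtype=torch.float32)

post_rot3, post_tran3 = torch.eye(3), torch.zeros(3)
post_rot3[:2, :2] = post_rot
post_tran3[:2] = post_tran

raw_pix = torch.tensor([800.0, 450.0])
aug_pix = post_rot @ raw_pix + post_tran                      # pixel location after resize + crop
recovered = post_rot.inverse() @ (aug_pix - post_tran)
print(torch.allclose(recovered, raw_pix))                     # True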
rotate_bda = np.random.uniform(*self.bda_aug_conf['rot_lim']) scale_bda = np.random.uniform(*self.bda_aug_conf['scale_lim']) flip_dx = np.random.uniform() < self.bda_aug_conf['flip_dx_ratio'] flip_dy = np.random.uniform() < self.bda_aug_conf['flip_dy_ratio'] translation_std = self.bda_aug_conf.get('tran_lim', [0.0, 0.0, 0.0]) tran_bda = np.random.normal(scale=translation_std, size=3).T else: rotate_bda = 0 scale_bda = 1.0 if tta_config is not None: flip_dx = tta_config['flip_dx'] flip_dy = tta_config['flip_dy'] else: flip_dx = False flip_dy = False tran_bda = np.zeros((1, 3), dtype=np.float32) return rotate_bda, scale_bda, flip_dx, flip_dy, tran_bda def bev_transform(self, gt_boxes, rotate_angle, scale_ratio, flip_dx, flip_dy): rotate_angle = torch.tensor(rotate_angle / 180 * np.pi) rot_sin = torch.sin(rotate_angle) rot_cos = torch.cos(rotate_angle) rot_mat = torch.Tensor([[rot_cos, -rot_sin, 0], [rot_sin, rot_cos, 0], [0, 0, 1]]) scale_mat = torch.Tensor([[scale_ratio, 0, 0], [0, scale_ratio, 0], [0, 0, scale_ratio]]) flip_mat = torch.Tensor([[1, 0, 0], [0, 1, 0], [0, 0, 1]]) if flip_dx: flip_mat = flip_mat @ torch.Tensor([[-1, 0, 0], [0, 1, 0], [0, 0, 1]]) if flip_dy: flip_mat = flip_mat @ torch.Tensor([[1, 0, 0], [0, -1, 0], [0, 0, 1]]) rot_mat = flip_mat @ (scale_mat @ rot_mat) if gt_boxes.shape[0] > 0: gt_boxes[:, :3] = ( rot_mat @ gt_boxes[:, :3].unsqueeze(-1)).squeeze(-1) gt_boxes[:, 3:6] *= scale_ratio gt_boxes[:, 6] += rotate_angle if flip_dx: gt_boxes[:, 6] = 2 * torch.asin(torch.tensor(1.0)) - gt_boxes[:, 6] if flip_dy: gt_boxes[:, 6] = -gt_boxes[:, 6] gt_boxes[:, 7:] = ( rot_mat[:2, :2] @ gt_boxes[:, 7:].unsqueeze(-1)).squeeze(-1) return gt_boxes, rot_mat def _bboxes_transform(self, bboxes, centers2d, gt_labels, depths, resize, crop, flip, fH, fW): assert len(bboxes) == len(centers2d) == len(gt_labels) == len(depths) bboxes = bboxes * resize bboxes[:, 0] = bboxes[:, 0] - crop[0] bboxes[:, 1] = bboxes[:, 1] - crop[1] bboxes[:, 2] = bboxes[:, 2] - crop[0] bboxes[:, 3] = bboxes[:, 3] - crop[1] bboxes[:, 0] = np.clip(bboxes[:, 0], 0, fW) bboxes[:, 2] = np.clip(bboxes[:, 2], 0, fW) bboxes[:, 1] = np.clip(bboxes[:, 1], 0, fH) bboxes[:, 3] = np.clip(bboxes[:, 3], 0, fH) keep = ((bboxes[:, 2] - bboxes[:, 0]) >= self.min_size) & ((bboxes[:, 3] - bboxes[:, 1]) >= self.min_size) if flip: x0 = bboxes[:, 0].copy() x1 = bboxes[:, 2].copy() bboxes[:, 2] = fW - x0 bboxes[:, 0] = fW - x1 bboxes = bboxes[keep] centers2d = centers2d * resize centers2d[:, 0] = centers2d[:, 0] - crop[0] centers2d[:, 1] = centers2d[:, 1] - crop[1] centers2d[:, 0] = np.clip(centers2d[:, 0], 0, fW) centers2d[:, 1] = np.clip(centers2d[:, 1], 0, fH) if flip: centers2d[:, 0] = fW - centers2d[:, 0] centers2d = centers2d[keep] gt_labels = gt_labels[keep] depths = depths[keep] return bboxes, centers2d, gt_labels, depths def _filter_invisible(self, bboxes, centers2d, gt_labels, depths, fH, fW ): # filter invisible 2d bboxes assert len(bboxes) == len(centers2d) == len(gt_labels) == len(depths) indices_maps = np.zeros((fH,fW)) tmp_bboxes = np.zeros_like(bboxes) tmp_bboxes[:, :2] = np.ceil(bboxes[:, :2]) tmp_bboxes[:, 2:] = np.floor(bboxes[:, 2:]) tmp_bboxes = tmp_bboxes.astype(np.int64) sort_idx = np.argsort(-depths, axis=0, kind='stable') tmp_bboxes = tmp_bboxes[sort_idx] bboxes = bboxes[sort_idx] depths = depths[sort_idx] centers2d = centers2d[sort_idx] gt_labels = gt_labels[sort_idx] for i in range(bboxes.shape[0]): u1, v1, u2, v2 = tmp_bboxes[i] indices_maps[v1:v2, u1:u2] = i indices_res = 
np.unique(indices_maps).astype(np.int64) bboxes = bboxes[indices_res] depths = depths[indices_res] centers2d = centers2d[indices_res] gt_labels = gt_labels[indices_res] return bboxes, centers2d, gt_labels, depths def __call__(self, results): gt_boxes, gt_labels = results['ann_infos']['gt_boxes_3d'], results['ann_infos']['gt_labels_3d'] if self.with_ego_as_agent: ego_xyz = np.array([0, 0, 0]) ego_wlh = np.array([4.084, 1.85, 1.8]) ego_yaw = np.array([0]) ego_vel = results['curr']['gt_ego_lcf_feat'][:2] ego_box = np.concatenate([ego_xyz, ego_wlh, ego_yaw, ego_vel]) gt_boxes = [ego_box] + gt_boxes gt_labels = [0] + gt_labels if 'instance_inds' in results.keys(): results['instance_inds'] = np.concatenate([[1e7], results['instance_inds']]) if self.with_2d_bbox: # gt_boxes_2d, gt_labels_2d = results['ann_infos']['gt_boxes_2d'], results['ann_infos']['gt_labels_2d'] # gt_centers2d, gt_depth2d = results['ann_infos']['centers2d'], results['ann_infos']['depths'] new_gt_bboxes = [] new_centers2d = [] new_gt_labels = [] new_depths = [] fH, fW = results['input_size'] for cam in results['cam_names']: camera_types_2d = [ 'CAM_FRONT', 'CAM_FRONT_RIGHT', 'CAM_FRONT_LEFT', 'CAM_BACK', 'CAM_BACK_LEFT', 'CAM_BACK_RIGHT', ] i = camera_types_2d.index(cam) resize, resize_dims, crop, flip, rotate = results['img_augs'][cam] gt_bboxes_2d = results['ann_infos']['gt_boxes_2d'][i] centers2d = results['ann_infos']['centers2d'][i] gt_labels_2d = results['ann_infos']['gt_labels_2d'][i] depths = results['ann_infos']['depths'][i] if len(gt_bboxes_2d) != 0: gt_bboxes_2d, centers2d, gt_labels_2d, depths = self._bboxes_transform( gt_bboxes_2d, centers2d, gt_labels_2d, depths, resize=resize, crop=crop, flip=flip, fH=fH, fW=fW, ) if len(gt_bboxes_2d) != 0: gt_bboxes_2d, centers2d, gt_labels_2d, depths = self._filter_invisible(gt_bboxes_2d, centers2d, gt_labels_2d, depths, fH, fW) new_gt_bboxes.append(to_tensor(gt_bboxes_2d)) new_centers2d.append(to_tensor(centers2d)) new_gt_labels.append(to_tensor(gt_labels_2d)) new_depths.append(to_tensor(depths)) results['gt_bboxes_2d'] = new_gt_bboxes results['centers2d'] = new_centers2d results['gt_labels_2d'] = new_gt_labels results['depths2d'] = new_depths gt_boxes, gt_labels = torch.Tensor(np.array(gt_boxes)), torch.tensor(np.array(gt_labels)) tta_confg = results.get('tta_config', None) rotate_bda, scale_bda, flip_dx, flip_dy, tran_bda = self.sample_bda_augmentation(tta_confg) bda_mat = torch.zeros(4, 4) bda_mat[3, 3] = 1 gt_boxes, bda_rot = self.bev_transform(gt_boxes, rotate_bda, scale_bda, flip_dx, flip_dy) if 'points' in results: points = results['points'].tensor points_aug = (bda_rot @ points[:, :3].unsqueeze(-1)).squeeze(-1) points[:, :3] = points_aug + tran_bda points = results['points'].new_point(points) results['points'] = points bda_mat[:3, :3] = bda_rot if len(gt_boxes) == 0: gt_boxes = torch.zeros(0, 9) results['gt_bboxes_3d'] = \ LiDARInstance3DBoxes(gt_boxes, box_dim=gt_boxes.shape[-1], origin=(0.5, 0.5, 0.5)) results['gt_labels_3d'] = gt_labels imgs, rots, trans, intrins = results['img_inputs'][:4] post_rots, post_trans = results['img_inputs'][4:] results['img_inputs'] = (imgs, rots, trans, intrins, post_rots, post_trans, bda_rot) results['flip_dx'] = flip_dx results['flip_dy'] = flip_dy results['rotate_bda'] = rotate_bda results['scale_bda'] = scale_bda results['bda_mat'] = bda_mat if 'ego_pose' in results: results['ori_ego_pose'] = results['ego_pose'].clone() results['ego_pose'] = results['ego_pose'] @ torch.inverse(bda_mat) results['ego_pose_inv'] = bda_mat @ 
results['ego_pose_inv'] return results ================================================ FILE: mmdet3d/datasets/pipelines/test_time_aug.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import warnings from copy import deepcopy import mmcv from ..builder import PIPELINES from .compose import Compose from mmcv.runner import get_dist_info @PIPELINES.register_module() class MultiScaleFlipAug: """Test-time augmentation with multiple scales and flipping. An example configuration is as followed: .. code-block:: img_scale=[(1333, 400), (1333, 800)], flip=True, transforms=[ dict(type='Resize', keep_ratio=True), dict(type='RandomFlip'), dict(type='Normalize', **img_norm_cfg), dict(type='Pad', size_divisor=32), dict(type='ImageToTensor', keys=['img']), dict(type='Collect', keys=['img']), ] After MultiScaleFLipAug with above configuration, the results are wrapped into lists of the same length as followed: .. code-block:: dict( img=[...], img_shape=[...], scale=[(1333, 400), (1333, 400), (1333, 800), (1333, 800)] flip=[False, True, False, True] ... ) Args: transforms (list[dict]): Transforms to apply in each augmentation. img_scale (tuple | list[tuple] | None): Images scales for resizing. scale_factor (float | list[float] | None): Scale factors for resizing. flip (bool): Whether apply flip augmentation. Default: False. flip_direction (str | list[str]): Flip augmentation directions, options are "horizontal", "vertical" and "diagonal". If flip_direction is a list, multiple flip augmentations will be applied. It has no effect when flip == False. Default: "horizontal". """ def __init__(self, transforms, img_scale=None, scale_factor=None, flip=False, flip_direction='horizontal'): self.transforms = Compose(transforms) assert (img_scale is None) ^ (scale_factor is None), ( 'Must have but only one variable can be set') if img_scale is not None: self.img_scale = img_scale if isinstance(img_scale, list) else [img_scale] self.scale_key = 'scale' assert mmcv.is_list_of(self.img_scale, tuple) else: self.img_scale = scale_factor if isinstance( scale_factor, list) else [scale_factor] self.scale_key = 'scale_factor' self.flip = flip self.flip_direction = flip_direction if isinstance( flip_direction, list) else [flip_direction] assert mmcv.is_list_of(self.flip_direction, str) if not self.flip and self.flip_direction != ['horizontal']: warnings.warn( 'flip_direction has no effect when flip is set to False') if (self.flip and not any([t['type'] == 'RandomFlip' for t in transforms])): warnings.warn( 'flip has no effect when RandomFlip is not in transforms') def __call__(self, results): """Call function to apply test time augment transforms on results. Args: results (dict): Result dict contains the data to transform. Returns: dict[str: list]: The augmented data, where each value is wrapped into a list. 
""" aug_data = [] flip_args = [(False, None)] if self.flip: flip_args += [(True, direction) for direction in self.flip_direction] for scale in self.img_scale: for flip, direction in flip_args: _results = results.copy() _results[self.scale_key] = scale _results['flip'] = flip _results['flip_direction'] = direction data = self.transforms(_results) aug_data.append(data) # list of dict to dict of list aug_data_dict = {key: [] for key in aug_data[0]} for data in aug_data: for key, val in data.items(): aug_data_dict[key].append(val) return aug_data_dict def __repr__(self): repr_str = self.__class__.__name__ repr_str += f'(transforms={self.transforms}, ' repr_str += f'img_scale={self.img_scale}, flip={self.flip}, ' repr_str += f'flip_direction={self.flip_direction})' return repr_str @PIPELINES.register_module() class MultiScaleFlipAug3D(object): """Test-time augmentation with multiple scales and flipping. Args: transforms (list[dict]): Transforms to apply in each augmentation. img_scale (tuple | list[tuple]: Images scales for resizing. pts_scale_ratio (float | list[float]): Points scale ratios for resizing. flip (bool, optional): Whether apply flip augmentation. Defaults to False. flip_direction (str | list[str], optional): Flip augmentation directions for images, options are "horizontal" and "vertical". If flip_direction is list, multiple flip augmentations will be applied. It has no effect when ``flip == False``. Defaults to "horizontal". pcd_horizontal_flip (bool, optional): Whether apply horizontal flip augmentation to point cloud. Defaults to True. Note that it works only when 'flip' is turned on. pcd_vertical_flip (bool, optional): Whether apply vertical flip augmentation to point cloud. Defaults to True. Note that it works only when 'flip' is turned on. """ def __init__(self, transforms, img_scale, pts_scale_ratio, flip=False, flip_direction='horizontal', pcd_horizontal_flip=False, pcd_vertical_flip=False): self.transforms = Compose(transforms) self.img_scale = img_scale if isinstance(img_scale, list) else [img_scale] self.pts_scale_ratio = pts_scale_ratio \ if isinstance(pts_scale_ratio, list) else [float(pts_scale_ratio)] assert mmcv.is_list_of(self.img_scale, tuple) assert mmcv.is_list_of(self.pts_scale_ratio, float) self.flip = flip self.pcd_horizontal_flip = pcd_horizontal_flip self.pcd_vertical_flip = pcd_vertical_flip self.flip_direction = flip_direction if isinstance( flip_direction, list) else [flip_direction] assert mmcv.is_list_of(self.flip_direction, str) if not self.flip and self.flip_direction != ['horizontal']: warnings.warn( 'flip_direction has no effect when flip is set to False') if (self.flip and not any([(t['type'] == 'RandomFlip3D' or t['type'] == 'RandomFlip') for t in transforms])): warnings.warn( 'flip has no effect when RandomFlip is not in transforms') def __call__(self, results): """Call function to augment common fields in results. Args: results (dict): Result dict contains the data to augment. Returns: dict: The result dict contains the data that is augmented with different scales and flips. 
""" aug_data = [] # modified from `flip_aug = [False, True] if self.flip else [False]` # to reduce unnecessary scenes when using double flip augmentation # during test time flip_aug = [True] if self.flip else [False] pcd_horizontal_flip_aug = [False, True] \ if self.flip and self.pcd_horizontal_flip else [False] pcd_vertical_flip_aug = [False, True] \ if self.flip and self.pcd_vertical_flip else [False] for scale in self.img_scale: for pts_scale_ratio in self.pts_scale_ratio: for flip in flip_aug: for pcd_horizontal_flip in pcd_horizontal_flip_aug: for pcd_vertical_flip in pcd_vertical_flip_aug: for direction in self.flip_direction: # results.copy will cause bug # since it is shallow copy _results = deepcopy(results) _results['scale'] = scale _results['flip'] = flip _results['pcd_scale_factor'] = \ pts_scale_ratio _results['flip_direction'] = direction _results['pcd_horizontal_flip'] = \ pcd_horizontal_flip _results['pcd_vertical_flip'] = \ pcd_vertical_flip data = self.transforms(_results) aug_data.append(data) # list of dict to dict of list aug_data_dict = {key: [] for key in aug_data[0]} for data in aug_data: for key, val in data.items(): aug_data_dict[key].append(val) return aug_data_dict def __repr__(self): """str: Return a string that describes the module.""" repr_str = self.__class__.__name__ repr_str += f'(transforms={self.transforms}, ' repr_str += f'img_scale={self.img_scale}, flip={self.flip}, ' repr_str += f'pts_scale_ratio={self.pts_scale_ratio}, ' repr_str += f'flip_direction={self.flip_direction})' return repr_str @PIPELINES.register_module() class CustomMultiScaleFlipAug3D(object): """Test-time augmentation with multiple scales and flipping. Args: transforms (list[dict]): Transforms to apply in each augmentation. img_scale (tuple | list[tuple]: Images scales for resizing. pts_scale_ratio (float | list[float]): Points scale ratios for resizing. flip (bool, optional): Whether apply flip augmentation. Defaults to False. flip_direction (str | list[str], optional): Flip augmentation directions for images, options are "horizontal" and "vertical". If flip_direction is list, multiple flip augmentations will be applied. It has no effect when ``flip == False``. Defaults to "horizontal". pcd_horizontal_flip (bool, optional): Whether apply horizontal flip augmentation to point cloud. Defaults to True. Note that it works only when 'flip' is turned on. pcd_vertical_flip (bool, optional): Whether apply vertical flip augmentation to point cloud. Defaults to True. Note that it works only when 'flip' is turned on. """ def __init__(self, transforms, tta_dx=False, tta_dy=False, tta=False): self.transforms = Compose(transforms) self.tta = tta self.tta_dx = tta_dx self.tta_dy = tta_dy def __call__(self, results): """Call function to augment common fields in results. Args: results (dict): Result dict contains the data to augment. Returns: dict: The result dict contains the data that is augmented with different scales and flips. 
""" aug_data = [] # modified from `flip_aug = [False, True] if self.flip else [False]` # to reduce unnecessary scenes when using double flip augmentation # during test time flip_aug = [False, True] if self.tta else [False] flip_dx_aug = [False, True] \ if self.tta_dx else [False] flip_dy_aug = [False, True] \ if self.tta_dy else [False] for flip in flip_aug: for flip_dx in flip_dx_aug: for flip_dy in flip_dy_aug: # for direction in self.flip_direction: # results.copy will cause bug # since it is shallow copy tta_config = dict( tta_flip = flip, flip_dx = flip_dx, flip_dy = flip_dy, ) results['tta_config'] = tta_config _results = deepcopy(results) data = self.transforms(_results) aug_data.append(data) # list of dict to dict of list aug_data_dict = {key: [] for key in aug_data[0]} for data in aug_data: for key, val in data.items(): aug_data_dict[key].append(val) return aug_data_dict def __repr__(self): """str: Return a string that describes the module.""" repr_str = self.__class__.__name__ return repr_str @PIPELINES.register_module() class CustomDistMultiScaleFlipAug3D(object): """Test-time augmentation with multiple scales and flipping. Args: transforms (list[dict]): Transforms to apply in each augmentation. img_scale (tuple | list[tuple]: Images scales for resizing. pts_scale_ratio (float | list[float]): Points scale ratios for resizing. flip (bool, optional): Whether apply flip augmentation. Defaults to False. flip_direction (str | list[str], optional): Flip augmentation directions for images, options are "horizontal" and "vertical". If flip_direction is list, multiple flip augmentations will be applied. It has no effect when ``flip == False``. Defaults to "horizontal". pcd_horizontal_flip (bool, optional): Whether apply horizontal flip augmentation to point cloud. Defaults to True. Note that it works only when 'flip' is turned on. pcd_vertical_flip (bool, optional): Whether apply vertical flip augmentation to point cloud. Defaults to True. Note that it works only when 'flip' is turned on. """ def __init__(self, transforms, tta=False): self.transforms = Compose(transforms) self.tta = tta def __call__(self, results): """Call function to augment common fields in results. Args: results (dict): Result dict contains the data to augment. Returns: dict: The result dict contains the data that is augmented with different scales and flips. 
""" _rank, _world_size = get_dist_info() if self.tta: assert _world_size == 8 aug_data = [] # modified from `flip_aug = [False, True] if self.flip else [False]` # to reduce unnecessary scenes when using double flip augmentation # during test time if self.tta: flip_aug = [_rank & 0b100>0] flip_dx_aug = [_rank & 0b010 >0] flip_dy_aug = [_rank & 0b001 >0] else: flip_aug, flip_dx_aug, flip_dy_aug = [False], [False], [False] for flip in flip_aug: for flip_dx in flip_dx_aug: for flip_dy in flip_dy_aug: # for direction in self.flip_direction: # results.copy will cause bug # since it is shallow copy tta_config = dict( tta_flip = flip, flip_dx = flip_dx, flip_dy = flip_dy, dist_tta = self.tta, ) results['tta_config'] = tta_config _results = deepcopy(results) data = self.transforms(_results) aug_data.append(data) # list of dict to dict of list aug_data_dict = {key: [] for key in aug_data[0]} for data in aug_data: for key, val in data.items(): aug_data_dict[key].append(val) return aug_data_dict def __repr__(self): """str: Return a string that describes the module.""" repr_str = self.__class__.__name__ return repr_str ================================================ FILE: mmdet3d/datasets/pipelines/transforms_3d.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import random import warnings import cv2 import numpy as np from mmcv import is_tuple_of from mmcv.utils import build_from_cfg from mmdet3d.core import VoxelGenerator from mmdet3d.core.bbox import (CameraInstance3DBoxes, DepthInstance3DBoxes, LiDARInstance3DBoxes, box_np_ops) from mmdet3d.datasets.pipelines.compose import Compose from mmdet.datasets.pipelines import RandomCrop, RandomFlip, Rotate from ..builder import OBJECTSAMPLERS, PIPELINES from .data_augment_utils import noise_per_object_v3_ import mmcv from copy import deepcopy from pyquaternion import Quaternion import torch from PIL import Image import time import matplotlib.pyplot as plt import numpy as np from mmdet3d.models.fbbev.utils.draw_bbox import show_multi_modality_result @PIPELINES.register_module() class VisualInputsAndGT(object): """ show images and gt. 
""" def __init__(self, max=20): self.max = max self.i = 0 def _draw_point_cloud_(self, point_cloud, img_size): max_range = np.array([50 , 50 , 5.]) min_range = np.array([-50 , -50 , -3.]) point_cloud[:, :3] = (point_cloud[:, :3] - min_range) / (max_range - min_range) max_intensity = np.max(point_cloud[:, 3]) min_intensity = np.min(point_cloud[:, 3]) point_cloud[:, 3] = point_cloud[:, 3 ]/255 point_cloud[:, 3] = (point_cloud[:, 3])# .astype(np.uint8) img = np.zeros((img_size[0], img_size[1], 3), np.uint8) jet = plt.get_cmap('jet') for i in range(point_cloud.shape[0]): color = jet(point_cloud[i, 3]) color = (int(color[0]*255), int(color[1]*255), int(color[2]*255)) x = int(point_cloud[i, 0] * img_size[1]) y = int(point_cloud[i, 1] * img_size[0]) try: cv2.circle(img, (x, y), 1, color, -1) except: pass return img def world2bev_vis(self, x, y): return int((x + 51.2) * 15), int((y + 51.2) * 15) def world2bev_vis2(self, x, y): return int((x) * 1536), int((y) * 1536) def __visual__(self, results): import bbox_visualizer as bbv _, _, h, w = results['img_inputs'][0].shape imgs = results['img_inputs'][0].reshape(1, 6, 3, h, w) imgs = imgs[0, :] for i in range(6): # tmp = bbv.draw_rectangle(imgs[i].permute(1, 2, 0).cpu().numpy(), results['gt_bboxes_2d'][i][0], bbox_color=[255, 0,0]) tmp = bbv.draw_multiple_rectangles(imgs[i].permute(1, 2, 0).cpu().numpy(), results['gt_bboxes_2d'][i].numpy().astype(np.int), bbox_color=[255, 0,0]) mmcv.imwrite(tmp[:,:,::-1], f'tmp_{i}.png') for i in range(6): # print(results['bbox3d_fields']) # print(results['gt_bboxes_3d'].tensor) # print(results['lidar2img'][i]) show_multi_modality_result( imgs[i].permute(1, 2, 0).cpu().numpy(), results['gt_bboxes_3d'], None, None, '.', f'aug_{i}.png', camera_params=results['img_inputs'][1:] + (i,), box_mode='lidar', show=True, scores=None, ) bev_img = np.ones([1536, 1536, 3], dtype=np.float32) * 255 point_cloud = results['points'].tensor[:, :4].cpu().numpy().copy() img_size = (1536, 1536) bev_img = self._draw_point_cloud_(point_cloud, img_size) bev_img = bev_img.astype(np.float32) mmcv.imwrite(bev_img, f'aug_bev_{results["index"]}_lidar.png') for i, corners in enumerate(results['gt_bboxes_3d'].corners[:, [4, 7, 3, 0], :2]): corners = np.array([self.world2bev_vis(*corner) for corner in corners]) # _img = np.zeros([1536, 1536, 3], dtype=np.float32) bev_img = cv2.circle(bev_img, corners[0], 5, (61, 102, 255)) bev_img = cv2.fillPoly(bev_img, [corners], (61, 102, 255)) # bev_img = cv2.addWeighted(bev_img, 1, _img, 0.5, 0) # cv2.putText(bev_img, '%.1f, %.1f, %.1f' % ( # results['gt_bboxes_3d'].tensor[i][0], # results['gt_bboxes_3d'].tensor[i][1], # results['gt_bboxes_3d'].tensor[i][6]), corners[1], cv2.FONT_HERSHEY_COMPLEX, 1.0, (0, 0, 255), 2) bev_img = cv2.circle(bev_img, self.world2bev_vis(0, 0), 5, (0, 255, 0), thickness=-1) if 'map_gt_bboxes_3d' in results: if type(results['map_gt_bboxes_3d'].data) == torch.Tensor: lines = results['map_gt_bboxes_3d'].data[:, 0, :, :] world2bev_vis = self.world2bev_vis2 else: lines = results['map_gt_bboxes_3d'].data.fixed_num_sampled_points world2bev_vis = self.world2bev_vis for k, line in enumerate(lines): label = results['map_gt_labels_3d'].data[k] line = line.cpu().numpy() corners = np.array([world2bev_vis(*corner) for corner in line]) corners = [each for each in corners if ((each>=0).all() & (each<1536).all())] colors = [(255, 255, 0), (255, 0, 0), (0, 255, 0)] for i, corner in enumerate(corners[:-1]): bev_img = cv2.circle(bev_img, corners[i], 2, (61, 102, 255)) bev_img = cv2.line(bev_img, 
corners[i], corners[i+1], color=colors[label], thickness=1) # mmcv.imwrite(img, 'point bev.png') if 'gt_ego_fut_trajs' in results: gt_ego_fut_trajs = results['gt_ego_fut_trajs'] # self._render_traj(results['gt_ego_fut_trajs'], 2) points = np.array([self.world2bev_vis(*point.numpy()) for point in gt_ego_fut_trajs]) for point in points: bev_img = cv2.circle(bev_img, point, 1, (0, 255, 0)) if 'gt_agent_fut_traj' in results: gt_agent_fut_traj = results['gt_agent_fut_traj'] gt_agent_fut_traj_mask = results['gt_agent_fut_traj_mask'] centers = results['gt_bboxes_3d'].center[..., :2] tmp = torch.cat([centers[:, None], gt_agent_fut_traj], 1) trajs = torch.cumsum(tmp, 1)[:, 1:] for k, traj in enumerate(trajs): traj = traj.cpu().numpy() corners = np.array([self.world2bev_vis(*corner) for corner in traj]) center = np.array(self.world2bev_vis(*centers[k])) corners = [each for each in corners if ((each>=0).all() & (each<1536).all())] colors = [(255, 255, 0), (255, 0, 0), (0, 255, 0)] for i, corner in enumerate(corners[:-1]): if gt_agent_fut_traj_mask[k, i+1].sum()<2 or gt_agent_fut_traj_mask[k, i].sum()<2: continue if i == 0: bev_img = cv2.line(bev_img, center, corners[i], color=(123, 22, 187), thickness=1) bev_img = cv2.circle(bev_img, corners[i], 2, (61, 102, 32)) bev_img = cv2.line(bev_img, corners[i], corners[i+1], color=(123, 22, 187), thickness=1) if 'fut_boxes_in_cur_ego_list' in results: for k, fut_boxes in enumerate(results['fut_boxes_in_cur_ego_list']): fut_bev_img = np.ones([1536, 1536, 3], dtype=np.float32) * 255 fut_bev_img = fut_bev_img.astype(np.float32) try: for i, corners in enumerate(fut_boxes.corners[:, [4, 7, 3, 0], :2]): corners = np.array([self.world2bev_vis(*corner) for corner in corners]) fut_bev_img = cv2.circle(fut_bev_img, corners[0], 5, (61, 102, 255)) fut_bev_img = cv2.fillPoly(fut_bev_img, [corners], (61, 102, 255)) except: pass fut_bev_img = cv2.circle(fut_bev_img, self.world2bev_vis(0, 0), 5, (0, 255, 0), thickness=-1) mmcv.imwrite(fut_bev_img, f'aug_bev_{results["index"]}_fut_{k}.png',) mmcv.imwrite(bev_img, f'bev_{results["index"]}.png',) print('saved', f'bev_{results["index"]}.png') from IPython import embed embed() exit() def _render_traj(self, future_traj, points_per_step=10): total_steps = (len(future_traj)-1) * points_per_step + 1 total_xy = torch.zeros((total_steps, 2), device=future_traj.device) for i in range(total_steps-1): unit_vec = future_traj[i//points_per_step + 1] - future_traj[i//points_per_step] total_xy[i] = (i/points_per_step - i//points_per_step) * \ unit_vec + future_traj[i//points_per_step] total_xy[-1] = future_traj[-1] return total_xy def __call__(self, results): self.__visual__(results) return results @PIPELINES.register_module() class GridMask: def __init__( self, use_h=True, use_w=True, rotate=1, offset=False, ratio=0.5, mode=1, ): self.use_h = use_h self.use_w = use_w self.rotate = rotate self.offset = offset self.ratio = ratio self.mode = mode self.epoch = None def __call__(self, results): imgs = results["img_inputs"][0] h = imgs[0].shape[1] w = imgs[0].shape[2] self.d1 = 2 self.d2 = min(h, w) hh = int(1.5 * h) ww = int(1.5 * w) d = np.random.randint(self.d1, self.d2) if self.ratio == 1: self.l = np.random.randint(1, d) else: self.l = min(max(int(d * self.ratio + 0.5), 1), d - 1) mask = np.ones((hh, ww), np.float32) st_h = np.random.randint(d) st_w = np.random.randint(d) if self.use_h: for i in range(hh // d): s = d * i + st_h t = min(s + self.l, hh) mask[s:t, :] *= 0 if self.use_w: for i in range(ww // d): s = d * i + st_w t = min(s + 
self.l, ww) mask[:, s:t] *= 0 r = np.random.randint(self.rotate) mask = Image.fromarray(np.uint8(mask)) mask = mask.rotate(r) mask = np.asarray(mask) mask = mask[ (hh - h) // 2 : (hh - h) // 2 + h, (ww - w) // 2 : (ww - w) // 2 + w ] mask = mask.astype(np.float32) mask = mask[None, :, :] if self.mode == 1: mask = 1 - mask # mask = mask.expand_as(imgs[0]) if self.offset: offset = torch.from_numpy(2 * (np.random.rand(h, w) - 0.5)).float() offset = (1 - mask) * offset imgs = imgs * mask[None] + offset # imgs = [x * mask + offset for x in imgs] else: imgs = imgs * mask[None] # imgs = torch.tensor[x * mask for x in imgs] results["img_inputs"] = (imgs,) + results["img_inputs"][1:] # results.update(img=imgs) return results @PIPELINES.register_module() class AugPoints(object): def __call__(self, results): if results['rotate_bda'] != 0: results['points'].rotate(results['rotate_bda']/180 * np.pi) if results['scale_bda'] != 1: results['points'].scale(results['scale_bda']) if results['flip_dx']: results['points'].flip('vertical') if results['flip_dy']: results['points'].flip('horizontal') return results @PIPELINES.register_module() class ToEgo(object): def __init__(self, ego_cam='CAM_FRONT',): self.ego_cam=ego_cam def __call__(self, results): lidar2lidarego = np.eye(4, dtype=np.float32) lidar2lidarego[:3, :3] = Quaternion( results['curr']['lidar2ego_rotation']).rotation_matrix lidar2lidarego[:3, 3] = results['curr']['lidar2ego_translation'] lidarego2global = np.eye(4, dtype=np.float32) lidarego2global[:3, :3] = Quaternion( results['curr']['ego2global_rotation']).rotation_matrix lidarego2global[:3, 3] = results['curr']['ego2global_translation'] camego2global = np.eye(4, dtype=np.float32) camego2global[:3, :3] = Quaternion( results['curr']['cams'][self.ego_cam] ['ego2global_rotation']).rotation_matrix camego2global[:3, 3] = results['curr']['cams'][self.ego_cam][ 'ego2global_translation'] lidar2camego = np.linalg.inv(camego2global) @ lidarego2global @ lidar2lidarego points = results['points'].tensor.numpy() points_ego = lidar2camego[:3,:3].reshape(1, 3, 3) @ \ points[:, :3].reshape(-1, 3, 1) + \ lidar2camego[:3, 3].reshape(1, 3, 1) points[:, :3] = points_ego.squeeze(-1) points = results['points'].new_point(points) results['points'] = points return results @PIPELINES.register_module() class PadMultiViewImage(object): """Pad the multi-view image. There are two padding modes: (1) pad to a fixed size and (2) pad to the minimum size that is divisible by some number. Added keys are "pad_shape", "pad_fixed_size", "pad_size_divisor", Args: size (tuple, optional): Fixed padding size. size_divisor (int, optional): The divisor of padded size. pad_val (float, optional): Padding value, 0 by default. 
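    Example:
        Illustrative entries (the values are assumptions): pad every view to
        the next multiple of 32, or pad all views of a sample to the size of
        the largest one via ``size='same2max'``::

            dict(type='PadMultiViewImage', size_divisor=32)
            dict(type='PadMultiViewImage', size='same2max', size_divisor=32)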
""" def __init__(self, size=None, size_divisor=None, pad_val=0): self.size = size self.size_divisor = size_divisor self.pad_val = pad_val # only one of size and size_divisor should be valid # assert size is not None or size_divisor is not None # assert size is None or size_divisor is None def _pad_img(self, results): """Pad images according to ``self.size``.""" if self.size == 'same2max': max_shape = (max([img.shape[0] for img in results['img']]), max([img.shape[1] for img in results['img']])) divisor = self.size_divisor pad_h = int(np.ceil(max_shape[0] / divisor)) * divisor pad_w = int(np.ceil(max_shape[1] / divisor)) * divisor padded_img = [mmcv.impad( img, shape=(pad_h, pad_w), pad_val=self.pad_val) for img in results['img']] elif self.size is not None: padded_img = [mmcv.impad( img, shape=self.size, pad_val=self.pad_val) for img in results['img']] elif self.size_divisor is not None: padded_img = [mmcv.impad_to_multiple( img, self.size_divisor, pad_val=self.pad_val) for img in results['img']] results['ori_shape'] = [img.shape for img in results['img']] results['img'] = padded_img results['img_shape'] = [img.shape for img in padded_img] results['pad_shape'] = [img.shape for img in padded_img] results['pad_fixed_size'] = self.size results['pad_size_divisor'] = self.size_divisor def __call__(self, results): """Call function to pad images, masks, semantic segmentation maps. Args: results (dict): Result dict from loading pipeline. Returns: dict: Updated result dict. """ self._pad_img(results) return results def __repr__(self): repr_str = self.__class__.__name__ repr_str += f'(size={self.size}, ' repr_str += f'size_divisor={self.size_divisor}, ' repr_str += f'pad_val={self.pad_val})' return repr_str @PIPELINES.register_module() class RandomDropPointsColor(object): r"""Randomly set the color of points to all zeros. Once this transform is executed, all the points' color will be dropped. Refer to `PAConv `_ for more details. Args: drop_ratio (float, optional): The probability of dropping point colors. Defaults to 0.2. """ def __init__(self, drop_ratio=0.2): assert isinstance(drop_ratio, (int, float)) and 0 <= drop_ratio <= 1, \ f'invalid drop_ratio value {drop_ratio}' self.drop_ratio = drop_ratio def __call__(self, input_dict): """Call function to drop point colors. Args: input_dict (dict): Result dict from loading pipeline. Returns: dict: Results after color dropping, 'points' key is updated in the result dict. """ points = input_dict['points'] assert points.attribute_dims is not None and \ 'color' in points.attribute_dims, \ 'Expect points have color attribute' # this if-expression is a bit strange # `RandomDropPointsColor` is used in training 3D segmentor PAConv # we discovered in our experiments that, using # `if np.random.rand() > 1.0 - self.drop_ratio` consistently leads to # better results than using `if np.random.rand() < self.drop_ratio` # so we keep this hack in our codebase if np.random.rand() > 1.0 - self.drop_ratio: points.color = points.color * 0.0 return input_dict def __repr__(self): """str: Return a string that describes the module.""" repr_str = self.__class__.__name__ repr_str += f'(drop_ratio={self.drop_ratio})' return repr_str @PIPELINES.register_module() class RandomFlip3D(RandomFlip): """Flip the points & bbox. If the input dict contains the key "flip", then the flag will be used, otherwise it will be randomly decided by a ratio specified in the init method. Args: sync_2d (bool, optional): Whether to apply flip according to the 2D images. 
If True, it will apply the same flip as that to 2D images. If False, it will decide whether to flip randomly and independently to that of 2D images. Defaults to True. flip_ratio_bev_horizontal (float, optional): The flipping probability in horizontal direction. Defaults to 0.0. flip_ratio_bev_vertical (float, optional): The flipping probability in vertical direction. Defaults to 0.0. """ def __init__(self, sync_2d=True, flip_ratio_bev_horizontal=0.0, flip_ratio_bev_vertical=0.0, **kwargs): super(RandomFlip3D, self).__init__( flip_ratio=flip_ratio_bev_horizontal, **kwargs) self.sync_2d = sync_2d self.flip_ratio_bev_vertical = flip_ratio_bev_vertical if flip_ratio_bev_horizontal is not None: assert isinstance( flip_ratio_bev_horizontal, (int, float)) and 0 <= flip_ratio_bev_horizontal <= 1 if flip_ratio_bev_vertical is not None: assert isinstance( flip_ratio_bev_vertical, (int, float)) and 0 <= flip_ratio_bev_vertical <= 1 def random_flip_data_3d(self, input_dict, direction='horizontal'): """Flip 3D data randomly. Args: input_dict (dict): Result dict from loading pipeline. direction (str, optional): Flip direction. Default: 'horizontal'. Returns: dict: Flipped results, 'points', 'bbox3d_fields' keys are updated in the result dict. """ assert direction in ['horizontal', 'vertical'] # for semantic segmentation task, only points will be flipped. if 'bbox3d_fields' not in input_dict: input_dict['points'].flip(direction) return if len(input_dict['bbox3d_fields']) == 0: # test mode input_dict['bbox3d_fields'].append('empty_box3d') input_dict['empty_box3d'] = input_dict['box_type_3d']( np.array([], dtype=np.float32)) assert len(input_dict['bbox3d_fields']) == 1 for key in input_dict['bbox3d_fields']: if 'points' in input_dict: input_dict['points'] = input_dict[key].flip( direction, points=input_dict['points']) else: input_dict[key].flip(direction) if 'centers2d' in input_dict: assert self.sync_2d is True and direction == 'horizontal', \ 'Only support sync_2d=True and horizontal flip with images' w = input_dict['ori_shape'][1] input_dict['centers2d'][..., 0] = \ w - input_dict['centers2d'][..., 0] # need to modify the horizontal position of camera center # along u-axis in the image (flip like centers2d) # ['cam2img'][0][2] = c_u # see more details and examples at # https://github.com/open-mmlab/mmdetection3d/pull/744 input_dict['cam2img'][0][2] = w - input_dict['cam2img'][0][2] def __call__(self, input_dict): """Call function to flip points, values in the ``bbox3d_fields`` and also flip 2D image and its annotations. Args: input_dict (dict): Result dict from loading pipeline. Returns: dict: Flipped results, 'flip', 'flip_direction', 'pcd_horizontal_flip' and 'pcd_vertical_flip' keys are added into result dict. 
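        Example:
            Typical training-pipeline entry; the probabilities are
            illustrative assumptions::

                dict(
                    type='RandomFlip3D',
                    sync_2d=False,
                    flip_ratio_bev_horizontal=0.5,
                    flip_ratio_bev_vertical=0.5)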
""" # flip 2D image and its annotations super(RandomFlip3D, self).__call__(input_dict) if self.sync_2d: input_dict['pcd_horizontal_flip'] = input_dict['flip'] input_dict['pcd_vertical_flip'] = False else: if 'pcd_horizontal_flip' not in input_dict: flip_horizontal = True if np.random.rand( ) < self.flip_ratio else False input_dict['pcd_horizontal_flip'] = flip_horizontal if 'pcd_vertical_flip' not in input_dict: flip_vertical = True if np.random.rand( ) < self.flip_ratio_bev_vertical else False input_dict['pcd_vertical_flip'] = flip_vertical if 'transformation_3d_flow' not in input_dict: input_dict['transformation_3d_flow'] = [] if input_dict['pcd_horizontal_flip']: self.random_flip_data_3d(input_dict, 'horizontal') input_dict['transformation_3d_flow'].extend(['HF']) if input_dict['pcd_vertical_flip']: self.random_flip_data_3d(input_dict, 'vertical') input_dict['transformation_3d_flow'].extend(['VF']) return input_dict def __repr__(self): """str: Return a string that describes the module.""" repr_str = self.__class__.__name__ repr_str += f'(sync_2d={self.sync_2d},' repr_str += f' flip_ratio_bev_vertical={self.flip_ratio_bev_vertical})' return repr_str @PIPELINES.register_module() class MultiViewWrapper(object): """Wrap transformation from single-view into multi-view. The wrapper processes the images from multi-view one by one. For each image, it constructs a pseudo dict according to the keys specified by the 'process_fields' parameter. After the transformation is finished, desired information can be collected by specifying the keys in the 'collected_keys' parameter. Multi-view images share the same transformation parameters but do not share the same magnitude when a random transformation is conducted. Args: transforms (list[dict]): A list of dict specifying the transformations for the monocular situation. process_fields (dict): Desired keys that the transformations should be conducted on. Default to dict(img_fields=['img']). collected_keys (list[str]): Collect information in transformation like rotate angles, crop roi, and flip state. """ def __init__(self, transforms, process_fields=dict(img_fields=['img']), collected_keys=[]): self.transform = Compose(transforms) self.collected_keys = collected_keys self.process_fields = process_fields def __call__(self, input_dict): for key in self.collected_keys: input_dict[key] = [] for img_id in range(len(input_dict['img'])): process_dict = self.process_fields.copy() for field in self.process_fields: for key in self.process_fields[field]: process_dict[key] = input_dict[key][img_id] process_dict = self.transform(process_dict) for field in self.process_fields: for key in self.process_fields[field]: input_dict[key][img_id] = process_dict[key] for key in self.collected_keys: input_dict[key].append(process_dict[key]) return input_dict @PIPELINES.register_module() class RangeLimitedRandomCrop(RandomCrop): """Randomly crop image-view objects under a limitation of range. Args: relative_x_offset_range (tuple[float]): Relative range of random crop in x direction. (x_min, x_max) in [0, 1.0]. Default to (0.0, 1.0). relative_y_offset_range (tuple[float]): Relative range of random crop in y direction. (y_min, y_max) in [0, 1.0]. Default to (0.0, 1.0). 
""" def __init__(self, relative_x_offset_range=(0.0, 1.0), relative_y_offset_range=(0.0, 1.0), **kwargs): super(RangeLimitedRandomCrop, self).__init__(**kwargs) for range in [relative_x_offset_range, relative_y_offset_range]: assert 0 <= range[0] <= range[1] <= 1 self.relative_x_offset_range = relative_x_offset_range self.relative_y_offset_range = relative_y_offset_range def _crop_data(self, results, crop_size, allow_negative_crop): """Function to randomly crop images. Modified from RandomCrop in mmdet==2.25.0 Args: results (dict): Result dict from loading pipeline. crop_size (tuple): Expected absolute size after cropping, (h, w). Returns: dict: Randomly cropped results, 'img_shape' key in result dict is updated according to crop size. """ assert crop_size[0] > 0 and crop_size[1] > 0 for key in results.get('img_fields', ['img']): img = results[key] margin_h = max(img.shape[0] - crop_size[0], 0) margin_w = max(img.shape[1] - crop_size[1], 0) offset_range_h = (margin_h * self.relative_y_offset_range[0], margin_h * self.relative_y_offset_range[1] + 1) offset_h = np.random.randint(*offset_range_h) offset_range_w = (margin_w * self.relative_x_offset_range[0], margin_w * self.relative_x_offset_range[1] + 1) offset_w = np.random.randint(*offset_range_w) crop_y1, crop_y2 = offset_h, offset_h + crop_size[0] crop_x1, crop_x2 = offset_w, offset_w + crop_size[1] # crop the image img = img[crop_y1:crop_y2, crop_x1:crop_x2, ...] img_shape = img.shape results[key] = img results['crop'] = (crop_x1, crop_y1, crop_x2, crop_y2) results['img_shape'] = img_shape # crop bboxes accordingly and clip to the image boundary for key in results.get('bbox_fields', []): # e.g. gt_bboxes and gt_bboxes_ignore bbox_offset = np.array([offset_w, offset_h, offset_w, offset_h], dtype=np.float32) bboxes = results[key] - bbox_offset if self.bbox_clip_border: bboxes[:, 0::2] = np.clip(bboxes[:, 0::2], 0, img_shape[1]) bboxes[:, 1::2] = np.clip(bboxes[:, 1::2], 0, img_shape[0]) valid_inds = (bboxes[:, 2] > bboxes[:, 0]) & ( bboxes[:, 3] > bboxes[:, 1]) # If the crop does not contain any gt-bbox area and # allow_negative_crop is False, skip this image. if (key == 'gt_bboxes' and not valid_inds.any() and not allow_negative_crop): return None results[key] = bboxes[valid_inds, :] # label fields. e.g. gt_labels and gt_labels_ignore label_key = self.bbox2label.get(key) if label_key in results: results[label_key] = results[label_key][valid_inds] # mask fields, e.g. gt_masks and gt_masks_ignore mask_key = self.bbox2mask.get(key) if mask_key in results: results[mask_key] = results[mask_key][ valid_inds.nonzero()[0]].crop( np.asarray([crop_x1, crop_y1, crop_x2, crop_y2])) if self.recompute_bbox: results[key] = results[mask_key].get_bboxes() # crop semantic seg for key in results.get('seg_fields', []): results[key] = results[key][crop_y1:crop_y2, crop_x1:crop_x2] return results @PIPELINES.register_module() class RandomRotate(Rotate): """Randomly rotate images. The ratation angle is selected uniformly within the interval specified by the 'range' parameter. Args: range (tuple[float]): Define the range of random rotation. (angle_min, angle_max) in angle. """ def __init__(self, range, **kwargs): super(RandomRotate, self).__init__(**kwargs) self.range = range def __call__(self, results): self.angle = np.random.uniform(self.range[0], self.range[1]) super(RandomRotate, self).__call__(results) results['rotate'] = self.angle return results @PIPELINES.register_module() class RandomJitterPoints(object): """Randomly jitter point coordinates. 
Different from the global translation in ``GlobalRotScaleTrans``, here we apply different noises to each point in a scene. Args: jitter_std (list[float]): The standard deviation of jittering noise. This applies random noise to all points in a 3D scene, which is sampled from a gaussian distribution whose standard deviation is set by ``jitter_std``. Defaults to [0.01, 0.01, 0.01] clip_range (list[float]): Clip the randomly generated jitter noise into this range. If None is given, don't perform clipping. Defaults to [-0.05, 0.05] Note: This transform should only be used in point cloud segmentation tasks because we don't transform ground-truth bboxes accordingly. For similar transform in detection task, please refer to `ObjectNoise`. """ def __init__(self, jitter_std=[0.01, 0.01, 0.01], clip_range=[-0.05, 0.05]): seq_types = (list, tuple, np.ndarray) if not isinstance(jitter_std, seq_types): assert isinstance(jitter_std, (int, float)), \ f'unsupported jitter_std type {type(jitter_std)}' jitter_std = [jitter_std, jitter_std, jitter_std] self.jitter_std = jitter_std if clip_range is not None: if not isinstance(clip_range, seq_types): assert isinstance(clip_range, (int, float)), \ f'unsupported clip_range type {type(clip_range)}' clip_range = [-clip_range, clip_range] self.clip_range = clip_range def __call__(self, input_dict): """Call function to jitter all the points in the scene. Args: input_dict (dict): Result dict from loading pipeline. Returns: dict: Results after adding noise to each point, 'points' key is updated in the result dict. """ points = input_dict['points'] jitter_std = np.array(self.jitter_std, dtype=np.float32) jitter_noise = \ np.random.randn(points.shape[0], 3) * jitter_std[None, :] if self.clip_range is not None: jitter_noise = np.clip(jitter_noise, self.clip_range[0], self.clip_range[1]) points.translate(jitter_noise) return input_dict def __repr__(self): """str: Return a string that describes the module.""" repr_str = self.__class__.__name__ repr_str += f'(jitter_std={self.jitter_std},' repr_str += f' clip_range={self.clip_range})' return repr_str @PIPELINES.register_module() class ObjectSample(object): """Sample GT objects to the data. Args: db_sampler (dict): Config dict of the database sampler. sample_2d (bool): Whether to also paste 2D image patch to the images This should be true when applying multi-modality cut-and-paste. Defaults to False. use_ground_plane (bool): Whether to use gound plane to adjust the 3D labels. """ def __init__(self, db_sampler, sample_2d=False, use_ground_plane=False): self.sampler_cfg = db_sampler self.sample_2d = sample_2d if 'type' not in db_sampler.keys(): db_sampler['type'] = 'DataBaseSampler' self.db_sampler = build_from_cfg(db_sampler, OBJECTSAMPLERS) self.use_ground_plane = use_ground_plane @staticmethod def remove_points_in_boxes(points, boxes): """Remove the points in the sampled bounding boxes. Args: points (:obj:`BasePoints`): Input point cloud array. boxes (np.ndarray): Sampled ground truth boxes. Returns: np.ndarray: Points with those in the boxes removed. """ masks = box_np_ops.points_in_rbbox(points.coord.numpy(), boxes) points = points[np.logical_not(masks.any(-1))] return points def __call__(self, input_dict): """Call function to sample ground truth objects to the data. Args: input_dict (dict): Result dict from loading pipeline. Returns: dict: Results after object sampling augmentation, 'points', 'gt_bboxes_3d', 'gt_labels_3d' keys are updated in the result dict. 
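        Example:
            Abridged GT-sampling entry; the paths, classes and counts are
            placeholder assumptions::

                db_sampler = dict(
                    data_root='data/kitti/',
                    info_path='data/kitti/kitti_dbinfos_train.pkl',
                    rate=1.0,
                    prepare=dict(filter_by_min_points=dict(Car=5)),
                    classes=['Car'],
                    sample_groups=dict(Car=15))
                dict(type='ObjectSample', db_sampler=db_sampler)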
""" gt_bboxes_3d = input_dict['gt_bboxes_3d'] gt_labels_3d = input_dict['gt_labels_3d'] if self.use_ground_plane and 'plane' in input_dict['ann_info']: ground_plane = input_dict['ann_info']['plane'] input_dict['plane'] = ground_plane else: ground_plane = None # change to float for blending operation points = input_dict['points'] if self.sample_2d: img = input_dict['img'] gt_bboxes_2d = input_dict['gt_bboxes'] # Assume for now 3D & 2D bboxes are the same sampled_dict = self.db_sampler.sample_all( gt_bboxes_3d.tensor.numpy(), gt_labels_3d, gt_bboxes_2d=gt_bboxes_2d, img=img) else: sampled_dict = self.db_sampler.sample_all( gt_bboxes_3d.tensor.numpy(), gt_labels_3d, img=None, ground_plane=ground_plane) if sampled_dict is not None: sampled_gt_bboxes_3d = sampled_dict['gt_bboxes_3d'] sampled_points = sampled_dict['points'] sampled_gt_labels = sampled_dict['gt_labels_3d'] gt_labels_3d = np.concatenate([gt_labels_3d, sampled_gt_labels], axis=0) gt_bboxes_3d = gt_bboxes_3d.new_box( np.concatenate( [gt_bboxes_3d.tensor.numpy(), sampled_gt_bboxes_3d])) points = self.remove_points_in_boxes(points, sampled_gt_bboxes_3d) # check the points dimension points = points.cat([sampled_points, points]) if self.sample_2d: sampled_gt_bboxes_2d = sampled_dict['gt_bboxes_2d'] gt_bboxes_2d = np.concatenate( [gt_bboxes_2d, sampled_gt_bboxes_2d]).astype(np.float32) input_dict['gt_bboxes'] = gt_bboxes_2d input_dict['img'] = sampled_dict['img'] input_dict['gt_bboxes_3d'] = gt_bboxes_3d input_dict['gt_labels_3d'] = gt_labels_3d.astype(np.int64) input_dict['points'] = points return input_dict def __repr__(self): """str: Return a string that describes the module.""" repr_str = self.__class__.__name__ repr_str += f' sample_2d={self.sample_2d},' repr_str += f' data_root={self.sampler_cfg.data_root},' repr_str += f' info_path={self.sampler_cfg.info_path},' repr_str += f' rate={self.sampler_cfg.rate},' repr_str += f' prepare={self.sampler_cfg.prepare},' repr_str += f' classes={self.sampler_cfg.classes},' repr_str += f' sample_groups={self.sampler_cfg.sample_groups}' return repr_str @PIPELINES.register_module() class ObjectNoise(object): """Apply noise to each GT objects in the scene. Args: translation_std (list[float], optional): Standard deviation of the distribution where translation noise are sampled from. Defaults to [0.25, 0.25, 0.25]. global_rot_range (list[float], optional): Global rotation to the scene. Defaults to [0.0, 0.0]. rot_range (list[float], optional): Object rotation range. Defaults to [-0.15707963267, 0.15707963267]. num_try (int, optional): Number of times to try if the noise applied is invalid. Defaults to 100. """ def __init__(self, translation_std=[0.25, 0.25, 0.25], global_rot_range=[0.0, 0.0], rot_range=[-0.15707963267, 0.15707963267], num_try=100): self.translation_std = translation_std self.global_rot_range = global_rot_range self.rot_range = rot_range self.num_try = num_try def __call__(self, input_dict): """Call function to apply noise to each ground truth in the scene. Args: input_dict (dict): Result dict from loading pipeline. Returns: dict: Results after adding noise to each object, 'points', 'gt_bboxes_3d' keys are updated in the result dict. 
""" gt_bboxes_3d = input_dict['gt_bboxes_3d'] points = input_dict['points'] # TODO: this is inplace operation numpy_box = gt_bboxes_3d.tensor.numpy() numpy_points = points.tensor.numpy() noise_per_object_v3_( numpy_box, numpy_points, rotation_perturb=self.rot_range, center_noise_std=self.translation_std, global_random_rot_range=self.global_rot_range, num_try=self.num_try) input_dict['gt_bboxes_3d'] = gt_bboxes_3d.new_box(numpy_box) input_dict['points'] = points.new_point(numpy_points) return input_dict def __repr__(self): """str: Return a string that describes the module.""" repr_str = self.__class__.__name__ repr_str += f'(num_try={self.num_try},' repr_str += f' translation_std={self.translation_std},' repr_str += f' global_rot_range={self.global_rot_range},' repr_str += f' rot_range={self.rot_range})' return repr_str @PIPELINES.register_module() class GlobalAlignment(object): """Apply global alignment to 3D scene points by rotation and translation. Args: rotation_axis (int): Rotation axis for points and bboxes rotation. Note: We do not record the applied rotation and translation as in GlobalRotScaleTrans. Because usually, we do not need to reverse the alignment step. For example, ScanNet 3D detection task uses aligned ground-truth bounding boxes for evaluation. """ def __init__(self, rotation_axis): self.rotation_axis = rotation_axis def _trans_points(self, input_dict, trans_factor): """Private function to translate points. Args: input_dict (dict): Result dict from loading pipeline. trans_factor (np.ndarray): Translation vector to be applied. Returns: dict: Results after translation, 'points' is updated in the dict. """ input_dict['points'].translate(trans_factor) def _rot_points(self, input_dict, rot_mat): """Private function to rotate bounding boxes and points. Args: input_dict (dict): Result dict from loading pipeline. rot_mat (np.ndarray): Rotation matrix to be applied. Returns: dict: Results after rotation, 'points' is updated in the dict. """ # input should be rot_mat_T so I transpose it here input_dict['points'].rotate(rot_mat.T) def _check_rot_mat(self, rot_mat): """Check if rotation matrix is valid for self.rotation_axis. Args: rot_mat (np.ndarray): Rotation matrix to be applied. """ is_valid = np.allclose(np.linalg.det(rot_mat), 1.0) valid_array = np.zeros(3) valid_array[self.rotation_axis] = 1.0 is_valid &= (rot_mat[self.rotation_axis, :] == valid_array).all() is_valid &= (rot_mat[:, self.rotation_axis] == valid_array).all() assert is_valid, f'invalid rotation matrix {rot_mat}' def __call__(self, input_dict): """Call function to shuffle points. Args: input_dict (dict): Result dict from loading pipeline. Returns: dict: Results after global alignment, 'points' and keys in input_dict['bbox3d_fields'] are updated in the result dict. """ assert 'axis_align_matrix' in input_dict['ann_info'].keys(), \ 'axis_align_matrix is not provided in GlobalAlignment' axis_align_matrix = input_dict['ann_info']['axis_align_matrix'] assert axis_align_matrix.shape == (4, 4), \ f'invalid shape {axis_align_matrix.shape} for axis_align_matrix' rot_mat = axis_align_matrix[:3, :3] trans_vec = axis_align_matrix[:3, -1] self._check_rot_mat(rot_mat) self._rot_points(input_dict, rot_mat) self._trans_points(input_dict, trans_vec) return input_dict def __repr__(self): repr_str = self.__class__.__name__ repr_str += f'(rotation_axis={self.rotation_axis})' return repr_str @PIPELINES.register_module() class GlobalRotScaleTrans(object): """Apply global rotation, scaling and translation to a 3D scene. 
Args: rot_range (list[float], optional): Range of rotation angle. Defaults to [-0.78539816, 0.78539816] (close to [-pi/4, pi/4]). scale_ratio_range (list[float], optional): Range of scale ratio. Defaults to [0.95, 1.05]. translation_std (list[float], optional): The standard deviation of translation noise applied to a scene, which is sampled from a gaussian distribution whose standard deviation is set by ``translation_std``. Defaults to [0, 0, 0] shift_height (bool, optional): Whether to shift height. (the fourth dimension of indoor points) when scaling. Defaults to False. """ def __init__(self, rot_range=[-0.78539816, 0.78539816], scale_ratio_range=[0.95, 1.05], translation_std=[0, 0, 0], shift_height=False): seq_types = (list, tuple, np.ndarray) if not isinstance(rot_range, seq_types): assert isinstance(rot_range, (int, float)), \ f'unsupported rot_range type {type(rot_range)}' rot_range = [-rot_range, rot_range] self.rot_range = rot_range assert isinstance(scale_ratio_range, seq_types), \ f'unsupported scale_ratio_range type {type(scale_ratio_range)}' self.scale_ratio_range = scale_ratio_range if not isinstance(translation_std, seq_types): assert isinstance(translation_std, (int, float)), \ f'unsupported translation_std type {type(translation_std)}' translation_std = [ translation_std, translation_std, translation_std ] assert all([std >= 0 for std in translation_std]), \ 'translation_std should be positive' self.translation_std = translation_std self.shift_height = shift_height def _trans_bbox_points(self, input_dict): """Private function to translate bounding boxes and points. Args: input_dict (dict): Result dict from loading pipeline. Returns: dict: Results after translation, 'points', 'pcd_trans' and keys in input_dict['bbox3d_fields'] are updated in the result dict. """ translation_std = np.array(self.translation_std, dtype=np.float32) trans_factor = np.random.normal(scale=translation_std, size=3).T input_dict['points'].translate(trans_factor) input_dict['pcd_trans'] = trans_factor for key in input_dict['bbox3d_fields']: input_dict[key].translate(trans_factor) def _rot_bbox_points(self, input_dict): """Private function to rotate bounding boxes and points. Args: input_dict (dict): Result dict from loading pipeline. Returns: dict: Results after rotation, 'points', 'pcd_rotation' and keys in input_dict['bbox3d_fields'] are updated in the result dict. """ rotation = self.rot_range noise_rotation = np.random.uniform(rotation[0], rotation[1]) # if no bbox in input_dict, only rotate points if len(input_dict['bbox3d_fields']) == 0: rot_mat_T = input_dict['points'].rotate(noise_rotation) input_dict['pcd_rotation'] = rot_mat_T input_dict['pcd_rotation_angle'] = noise_rotation return # rotate points with bboxes for key in input_dict['bbox3d_fields']: if len(input_dict[key].tensor) != 0: points, rot_mat_T = input_dict[key].rotate( noise_rotation, input_dict['points']) input_dict['points'] = points input_dict['pcd_rotation'] = rot_mat_T input_dict['pcd_rotation_angle'] = noise_rotation def _scale_bbox_points(self, input_dict): """Private function to scale bounding boxes and points. Args: input_dict (dict): Result dict from loading pipeline. Returns: dict: Results after scaling, 'points'and keys in input_dict['bbox3d_fields'] are updated in the result dict. 
""" scale = input_dict['pcd_scale_factor'] points = input_dict['points'] points.scale(scale) if self.shift_height: assert 'height' in points.attribute_dims.keys(), \ 'setting shift_height=True but points have no height attribute' points.tensor[:, points.attribute_dims['height']] *= scale input_dict['points'] = points for key in input_dict['bbox3d_fields']: input_dict[key].scale(scale) def _random_scale(self, input_dict): """Private function to randomly set the scale factor. Args: input_dict (dict): Result dict from loading pipeline. Returns: dict: Results after scaling, 'pcd_scale_factor' are updated in the result dict. """ scale_factor = np.random.uniform(self.scale_ratio_range[0], self.scale_ratio_range[1]) input_dict['pcd_scale_factor'] = scale_factor def __call__(self, input_dict): """Private function to rotate, scale and translate bounding boxes and points. Args: input_dict (dict): Result dict from loading pipeline. Returns: dict: Results after scaling, 'points', 'pcd_rotation', 'pcd_scale_factor', 'pcd_trans' and keys in input_dict['bbox3d_fields'] are updated in the result dict. """ if 'transformation_3d_flow' not in input_dict: input_dict['transformation_3d_flow'] = [] self._rot_bbox_points(input_dict) if 'pcd_scale_factor' not in input_dict: self._random_scale(input_dict) self._scale_bbox_points(input_dict) self._trans_bbox_points(input_dict) input_dict['transformation_3d_flow'].extend(['R', 'S', 'T']) return input_dict def __repr__(self): """str: Return a string that describes the module.""" repr_str = self.__class__.__name__ repr_str += f'(rot_range={self.rot_range},' repr_str += f' scale_ratio_range={self.scale_ratio_range},' repr_str += f' translation_std={self.translation_std},' repr_str += f' shift_height={self.shift_height})' return repr_str @PIPELINES.register_module() class RotScaleTransPoints(object): """Apply global rotation, scaling and translation to a 3D scene. Args: rot_range (list[float], optional): Range of rotation angle. Defaults to [-0.78539816, 0.78539816] (close to [-pi/4, pi/4]). scale_ratio_range (list[float], optional): Range of scale ratio. Defaults to [0.95, 1.05]. translation_std (list[float], optional): The standard deviation of translation noise applied to a scene, which is sampled from a gaussian distribution whose standard deviation is set by ``translation_std``. Defaults to [0, 0, 0] shift_height (bool, optional): Whether to shift height. (the fourth dimension of indoor points) when scaling. Defaults to False. """ def _trans_bbox_points(self, input_dict): """Private function to translate bounding boxes and points. Args: input_dict (dict): Result dict from loading pipeline. Returns: dict: Results after translation, 'points', 'pcd_trans' and keys in input_dict['bbox3d_fields'] are updated in the result dict. """ translation_std = np.array(self.translation_std, dtype=np.float32) trans_factor = np.random.normal(scale=translation_std, size=3).T input_dict['points'].translate(trans_factor) input_dict['pcd_trans'] = trans_factor for key in input_dict['bbox3d_fields']: input_dict[key].translate(trans_factor) def _rot_bbox_points(self, input_dict): """Private function to rotate bounding boxes and points. Args: input_dict (dict): Result dict from loading pipeline. Returns: dict: Results after rotation, 'points', 'pcd_rotation' and keys in input_dict['bbox3d_fields'] are updated in the result dict. 
""" rotation = self.rot_range noise_rotation = np.random.uniform(rotation[0], rotation[1]) # if no bbox in input_dict, only rotate points if len(input_dict['bbox3d_fields']) == 0: rot_mat_T = input_dict['points'].rotate(noise_rotation) input_dict['pcd_rotation'] = rot_mat_T input_dict['pcd_rotation_angle'] = noise_rotation return # rotate points with bboxes for key in input_dict['bbox3d_fields']: if len(input_dict[key].tensor) != 0: points, rot_mat_T = input_dict[key].rotate( noise_rotation, input_dict['points']) input_dict['points'] = points input_dict['pcd_rotation'] = rot_mat_T input_dict['pcd_rotation_angle'] = noise_rotation def _scale_bbox_points(self, input_dict): """Private function to scale bounding boxes and points. Args: input_dict (dict): Result dict from loading pipeline. Returns: dict: Results after scaling, 'points'and keys in input_dict['bbox3d_fields'] are updated in the result dict. """ scale = input_dict['pcd_scale_factor'] points = input_dict['points'] points.scale(scale) if self.shift_height: assert 'height' in points.attribute_dims.keys(), \ 'setting shift_height=True but points have no height attribute' points.tensor[:, points.attribute_dims['height']] *= scale input_dict['points'] = points for key in input_dict['bbox3d_fields']: input_dict[key].scale(scale) def _random_scale(self, input_dict): """Private function to randomly set the scale factor. Args: input_dict (dict): Result dict from loading pipeline. Returns: dict: Results after scaling, 'pcd_scale_factor' are updated in the result dict. """ scale_factor = np.random.uniform(self.scale_ratio_range[0], self.scale_ratio_range[1]) input_dict['pcd_scale_factor'] = scale_factor def __call__(self, input_dict): """Private function to rotate, scale and translate bounding boxes and points. Args: input_dict (dict): Result dict from loading pipeline. Returns: dict: Results after scaling, 'points', 'pcd_rotation', 'pcd_scale_factor', 'pcd_trans' and keys in input_dict['bbox3d_fields'] are updated in the result dict. """ self._rot_bbox_points(input_dict) self._trans_bbox_points(input_dict) input_dict['transformation_3d_flow'].extend(['R', 'S', 'T']) return input_dict def __repr__(self): """str: Return a string that describes the module.""" repr_str = self.__class__.__name__ repr_str += f'(rot_range={self.rot_range},' repr_str += f' scale_ratio_range={self.scale_ratio_range},' repr_str += f' translation_std={self.translation_std},' repr_str += f' shift_height={self.shift_height})' return repr_str @PIPELINES.register_module() class PointShuffle(object): """Shuffle input points.""" def __call__(self, input_dict): """Call function to shuffle points. Args: input_dict (dict): Result dict from loading pipeline. Returns: dict: Results after filtering, 'points', 'pts_instance_mask' and 'pts_semantic_mask' keys are updated in the result dict. """ idx = input_dict['points'].shuffle() idx = idx.numpy() pts_instance_mask = input_dict.get('pts_instance_mask', None) pts_semantic_mask = input_dict.get('pts_semantic_mask', None) if pts_instance_mask is not None: input_dict['pts_instance_mask'] = pts_instance_mask[idx] if pts_semantic_mask is not None: input_dict['pts_semantic_mask'] = pts_semantic_mask[idx] return input_dict def __repr__(self): return self.__class__.__name__ @PIPELINES.register_module() class ObjectRangeFilter(object): """Filter objects by the range. Args: point_cloud_range (list[float]): Point cloud range. 
""" def __init__(self, point_cloud_range): self.pcd_range = np.array(point_cloud_range, dtype=np.float32) def __call__(self, input_dict): """Call function to filter objects by the range. Args: input_dict (dict): Result dict from loading pipeline. Returns: dict: Results after filtering, 'gt_bboxes_3d', 'gt_labels_3d' keys are updated in the result dict. """ # Check points instance type and initialise bev_range if isinstance(input_dict['gt_bboxes_3d'], (LiDARInstance3DBoxes, DepthInstance3DBoxes)): bev_range = self.pcd_range[[0, 1, 3, 4]] elif isinstance(input_dict['gt_bboxes_3d'], CameraInstance3DBoxes): bev_range = self.pcd_range[[0, 2, 3, 5]] gt_bboxes_3d = input_dict['gt_bboxes_3d'] gt_labels_3d = input_dict['gt_labels_3d'] mask = gt_bboxes_3d.in_range_bev(bev_range) gt_bboxes_3d = gt_bboxes_3d[mask] # mask is a torch tensor but gt_labels_3d is still numpy array # using mask to index gt_labels_3d will cause bug when # len(gt_labels_3d) == 1, where mask=1 will be interpreted # as gt_labels_3d[1] and cause out of index error gt_labels_3d = gt_labels_3d[mask.numpy().astype(np.bool)] if 'instance_inds' in input_dict.keys(): input_dict['instance_inds'] = input_dict['instance_inds'][mask.numpy().astype(np.bool)] if 'gt_agent_fut_traj' in input_dict.keys(): input_dict['gt_agent_fut_traj'] = input_dict['gt_agent_fut_traj'][mask.numpy().astype(np.bool)] input_dict['gt_agent_fut_traj_mask'] = input_dict['gt_agent_fut_traj_mask'][mask.numpy().astype(np.bool)] # limit rad to [-pi, pi] gt_bboxes_3d.limit_yaw(offset=0.5, period=2 * np.pi) input_dict['gt_bboxes_3d'] = gt_bboxes_3d input_dict['gt_labels_3d'] = gt_labels_3d return input_dict def __repr__(self): """str: Return a string that describes the module.""" repr_str = self.__class__.__name__ repr_str += f'(point_cloud_range={self.pcd_range.tolist()})' return repr_str @PIPELINES.register_module() class PointsRangeFilter(object): """Filter points by the range. Args: point_cloud_range (list[float]): Point cloud range. """ def __init__(self, point_cloud_range): self.pcd_range = np.array(point_cloud_range, dtype=np.float32) def __call__(self, input_dict): """Call function to filter points by the range. Args: input_dict (dict): Result dict from loading pipeline. Returns: dict: Results after filtering, 'points', 'pts_instance_mask' and 'pts_semantic_mask' keys are updated in the result dict. """ points = input_dict['points'] points_mask = points.in_range_3d(self.pcd_range) clean_points = points[points_mask] input_dict['points'] = clean_points points_mask = points_mask.numpy() pts_instance_mask = input_dict.get('pts_instance_mask', None) pts_semantic_mask = input_dict.get('pts_semantic_mask', None) if pts_instance_mask is not None: input_dict['pts_instance_mask'] = pts_instance_mask[points_mask] if pts_semantic_mask is not None: input_dict['pts_semantic_mask'] = pts_semantic_mask[points_mask] return input_dict def __repr__(self): """str: Return a string that describes the module.""" repr_str = self.__class__.__name__ repr_str += f'(point_cloud_range={self.pcd_range.tolist()})' return repr_str @PIPELINES.register_module() class ObjectNameFilter(object): """Filter GT objects by their names. Args: classes (list[str]): List of class names to be kept for training. """ def __init__(self, classes): self.classes = classes self.labels = list(range(len(self.classes))) def __call__(self, input_dict): """Call function to filter objects by their names. Args: input_dict (dict): Result dict from loading pipeline. 
Returns: dict: Results after filtering, 'gt_bboxes_3d', 'gt_labels_3d' keys are updated in the result dict. """ gt_labels_3d = input_dict['gt_labels_3d'] gt_bboxes_mask = np.array([n in self.labels for n in gt_labels_3d], dtype=np.bool_) input_dict['gt_bboxes_3d'] = input_dict['gt_bboxes_3d'][gt_bboxes_mask] input_dict['gt_labels_3d'] = input_dict['gt_labels_3d'][gt_bboxes_mask] if 'instance_inds' in input_dict.keys(): input_dict['instance_inds'] = input_dict['instance_inds'][gt_bboxes_mask] if 'gt_agent_fut_traj' in input_dict.keys(): input_dict['gt_agent_fut_traj'] = input_dict['gt_agent_fut_traj'][gt_bboxes_mask] input_dict['gt_agent_fut_traj_mask'] = input_dict['gt_agent_fut_traj_mask'][gt_bboxes_mask] return input_dict def __repr__(self): """str: Return a string that describes the module.""" repr_str = self.__class__.__name__ repr_str += f'(classes={self.classes})' return repr_str @PIPELINES.register_module() class PointSample(object): """Point sample. Sampling data to a certain number. Args: num_points (int): Number of points to be sampled. sample_range (float, optional): The range where to sample points. If not None, the points with depth larger than `sample_range` are prior to be sampled. Defaults to None. replace (bool, optional): Whether the sampling is with or without replacement. Defaults to False. """ def __init__(self, num_points, sample_range=None, replace=False): self.num_points = num_points self.sample_range = sample_range self.replace = replace def _points_random_sampling(self, points, num_samples, sample_range=None, replace=False, return_choices=False): """Points random sampling. Sample points to a certain number. Args: points (np.ndarray | :obj:`BasePoints`): 3D Points. num_samples (int): Number of samples to be sampled. sample_range (float, optional): Indicating the range where the points will be sampled. Defaults to None. replace (bool, optional): Sampling with or without replacement. Defaults to None. return_choices (bool, optional): Whether return choice. Defaults to False. Returns: tuple[np.ndarray] | np.ndarray: - points (np.ndarray | :obj:`BasePoints`): 3D Points. - choices (np.ndarray, optional): The generated random samples. """ if not replace: replace = (points.shape[0] < num_samples) point_range = range(len(points)) if sample_range is not None and not replace: # Only sampling the near points when len(points) >= num_samples dist = np.linalg.norm(points.tensor, axis=1) far_inds = np.where(dist >= sample_range)[0] near_inds = np.where(dist < sample_range)[0] # in case there are too many far points if len(far_inds) > num_samples: far_inds = np.random.choice( far_inds, num_samples, replace=False) point_range = near_inds num_samples -= len(far_inds) choices = np.random.choice(point_range, num_samples, replace=replace) if sample_range is not None and not replace: choices = np.concatenate((far_inds, choices)) # Shuffle points after sampling np.random.shuffle(choices) if return_choices: return points[choices], choices else: return points[choices] def __call__(self, results): """Call function to sample points to in indoor scenes. Args: input_dict (dict): Result dict from loading pipeline. Returns: dict: Results after sampling, 'points', 'pts_instance_mask' and 'pts_semantic_mask' keys are updated in the result dict. 
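        Example:
            Indoor-style subsampling entry; the point budget and range are
            illustrative assumptions::

                dict(type='PointSample', num_points=40000, sample_range=40.0)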
""" points = results['points'] points, choices = self._points_random_sampling( points, self.num_points, self.sample_range, self.replace, return_choices=True) results['points'] = points pts_instance_mask = results.get('pts_instance_mask', None) pts_semantic_mask = results.get('pts_semantic_mask', None) if pts_instance_mask is not None: pts_instance_mask = pts_instance_mask[choices] results['pts_instance_mask'] = pts_instance_mask if pts_semantic_mask is not None: pts_semantic_mask = pts_semantic_mask[choices] results['pts_semantic_mask'] = pts_semantic_mask return results def __repr__(self): """str: Return a string that describes the module.""" repr_str = self.__class__.__name__ repr_str += f'(num_points={self.num_points},' repr_str += f' sample_range={self.sample_range},' repr_str += f' replace={self.replace})' return repr_str @PIPELINES.register_module() class IndoorPointSample(PointSample): """Indoor point sample. Sampling data to a certain number. NOTE: IndoorPointSample is deprecated in favor of PointSample Args: num_points (int): Number of points to be sampled. """ def __init__(self, *args, **kwargs): warnings.warn( 'IndoorPointSample is deprecated in favor of PointSample') super(IndoorPointSample, self).__init__(*args, **kwargs) @PIPELINES.register_module() class IndoorPatchPointSample(object): r"""Indoor point sample within a patch. Modified from `PointNet++ `_. Sampling data to a certain number for semantic segmentation. Args: num_points (int): Number of points to be sampled. block_size (float, optional): Size of a block to sample points from. Defaults to 1.5. sample_rate (float, optional): Stride used in sliding patch generation. This parameter is unused in `IndoorPatchPointSample` and thus has been deprecated. We plan to remove it in the future. Defaults to None. ignore_index (int, optional): Label index that won't be used for the segmentation task. This is set in PointSegClassMapping as neg_cls. If not None, will be used as a patch selection criterion. Defaults to None. use_normalized_coord (bool, optional): Whether to use normalized xyz as additional features. Defaults to False. num_try (int, optional): Number of times to try if the patch selected is invalid. Defaults to 10. enlarge_size (float, optional): Enlarge the sampled patch to [-block_size / 2 - enlarge_size, block_size / 2 + enlarge_size] as an augmentation. If None, set it as 0. Defaults to 0.2. min_unique_num (int, optional): Minimum number of unique points the sampled patch should contain. If None, use PointNet++'s method to judge uniqueness. Defaults to None. eps (float, optional): A value added to patch boundary to guarantee points coverage. Defaults to 1e-2. Note: This transform should only be used in the training process of point cloud segmentation tasks. For the sliding patch generation and inference process in testing, please refer to the `slide_inference` function of `EncoderDecoder3D` class. """ def __init__(self, num_points, block_size=1.5, sample_rate=None, ignore_index=None, use_normalized_coord=False, num_try=10, enlarge_size=0.2, min_unique_num=None, eps=1e-2): self.num_points = num_points self.block_size = block_size self.ignore_index = ignore_index self.use_normalized_coord = use_normalized_coord self.num_try = num_try self.enlarge_size = enlarge_size if enlarge_size is not None else 0.0 self.min_unique_num = min_unique_num self.eps = eps if sample_rate is not None: warnings.warn( "'sample_rate' has been deprecated and will be removed in " 'the future. 
Please remove them from your code.') def _input_generation(self, coords, patch_center, coord_max, attributes, attribute_dims, point_type): """Generating model input. Generate input by subtracting patch center and adding additional features. Currently support colors and normalized xyz as features. Args: coords (np.ndarray): Sampled 3D Points. patch_center (np.ndarray): Center coordinate of the selected patch. coord_max (np.ndarray): Max coordinate of all 3D Points. attributes (np.ndarray): features of input points. attribute_dims (dict): Dictionary to indicate the meaning of extra dimension. point_type (type): class of input points inherited from BasePoints. Returns: :obj:`BasePoints`: The generated input data. """ # subtract patch center, the z dimension is not centered centered_coords = coords.copy() centered_coords[:, 0] -= patch_center[0] centered_coords[:, 1] -= patch_center[1] if self.use_normalized_coord: normalized_coord = coords / coord_max attributes = np.concatenate([attributes, normalized_coord], axis=1) if attribute_dims is None: attribute_dims = dict() attribute_dims.update( dict(normalized_coord=[ attributes.shape[1], attributes.shape[1] + 1, attributes.shape[1] + 2 ])) points = np.concatenate([centered_coords, attributes], axis=1) points = point_type( points, points_dim=points.shape[1], attribute_dims=attribute_dims) return points def _patch_points_sampling(self, points, sem_mask): """Patch points sampling. First sample a valid patch. Then sample points within that patch to a certain number. Args: points (:obj:`BasePoints`): 3D Points. sem_mask (np.ndarray): semantic segmentation mask for input points. Returns: tuple[:obj:`BasePoints`, np.ndarray] | :obj:`BasePoints`: - points (:obj:`BasePoints`): 3D Points. - choices (np.ndarray): The generated random samples. """ coords = points.coord.numpy() attributes = points.tensor[:, 3:].numpy() attribute_dims = points.attribute_dims point_type = type(points) coord_max = np.amax(coords, axis=0) coord_min = np.amin(coords, axis=0) for _ in range(self.num_try): # random sample a point as patch center cur_center = coords[np.random.choice(coords.shape[0])] # boundary of a patch, which would be enlarged by # `self.enlarge_size` as an augmentation cur_max = cur_center + np.array( [self.block_size / 2.0, self.block_size / 2.0, 0.0]) cur_min = cur_center - np.array( [self.block_size / 2.0, self.block_size / 2.0, 0.0]) cur_max[2] = coord_max[2] cur_min[2] = coord_min[2] cur_choice = np.sum( (coords >= (cur_min - self.enlarge_size)) * (coords <= (cur_max + self.enlarge_size)), axis=1) == 3 if not cur_choice.any(): # no points in this patch continue cur_coords = coords[cur_choice, :] cur_sem_mask = sem_mask[cur_choice] point_idxs = np.where(cur_choice)[0] mask = np.sum( (cur_coords >= (cur_min - self.eps)) * (cur_coords <= (cur_max + self.eps)), axis=1) == 3 # two criteria for patch sampling, adopted from PointNet++ # 1. selected patch should contain enough unique points if self.min_unique_num is None: # use PointNet++'s method as default # [31, 31, 62] are just some big values used to transform # coords from 3d array to 1d and then check their uniqueness # this is used in all the ScanNet code following PointNet++ vidx = np.ceil( (cur_coords[mask, :] - cur_min) / (cur_max - cur_min) * np.array([31.0, 31.0, 62.0])) vidx = np.unique(vidx[:, 0] * 31.0 * 62.0 + vidx[:, 1] * 62.0 + vidx[:, 2]) flag1 = len(vidx) / 31.0 / 31.0 / 62.0 >= 0.02 else: # if `min_unique_num` is provided, directly compare with it flag1 = mask.sum() >= self.min_unique_num # 2. 
selected patch should contain enough annotated points if self.ignore_index is None: flag2 = True else: flag2 = np.sum(cur_sem_mask != self.ignore_index) / \ len(cur_sem_mask) >= 0.7 if flag1 and flag2: break # sample idx to `self.num_points` if point_idxs.size >= self.num_points: # no duplicate in sub-sampling choices = np.random.choice( point_idxs, self.num_points, replace=False) else: # do not use random choice here to avoid some points not counted dup = np.random.choice(point_idxs.size, self.num_points - point_idxs.size) idx_dup = np.concatenate( [np.arange(point_idxs.size), np.array(dup)], 0) choices = point_idxs[idx_dup] # construct model input points = self._input_generation(coords[choices], cur_center, coord_max, attributes[choices], attribute_dims, point_type) return points, choices def __call__(self, results): """Call function to sample points to in indoor scenes. Args: input_dict (dict): Result dict from loading pipeline. Returns: dict: Results after sampling, 'points', 'pts_instance_mask' and 'pts_semantic_mask' keys are updated in the result dict. """ points = results['points'] assert 'pts_semantic_mask' in results.keys(), \ 'semantic mask should be provided in training and evaluation' pts_semantic_mask = results['pts_semantic_mask'] points, choices = self._patch_points_sampling(points, pts_semantic_mask) results['points'] = points results['pts_semantic_mask'] = pts_semantic_mask[choices] pts_instance_mask = results.get('pts_instance_mask', None) if pts_instance_mask is not None: results['pts_instance_mask'] = pts_instance_mask[choices] return results def __repr__(self): """str: Return a string that describes the module.""" repr_str = self.__class__.__name__ repr_str += f'(num_points={self.num_points},' repr_str += f' block_size={self.block_size},' repr_str += f' ignore_index={self.ignore_index},' repr_str += f' use_normalized_coord={self.use_normalized_coord},' repr_str += f' num_try={self.num_try},' repr_str += f' enlarge_size={self.enlarge_size},' repr_str += f' min_unique_num={self.min_unique_num},' repr_str += f' eps={self.eps})' return repr_str @PIPELINES.register_module() class BackgroundPointsFilter(object): """Filter background points near the bounding box. Args: bbox_enlarge_range (tuple[float], float): Bbox enlarge range. """ def __init__(self, bbox_enlarge_range): assert (is_tuple_of(bbox_enlarge_range, float) and len(bbox_enlarge_range) == 3) \ or isinstance(bbox_enlarge_range, float), \ f'Invalid arguments bbox_enlarge_range {bbox_enlarge_range}' if isinstance(bbox_enlarge_range, float): bbox_enlarge_range = [bbox_enlarge_range] * 3 self.bbox_enlarge_range = np.array( bbox_enlarge_range, dtype=np.float32)[np.newaxis, :] def __call__(self, input_dict): """Call function to filter points by the range. Args: input_dict (dict): Result dict from loading pipeline. Returns: dict: Results after filtering, 'points', 'pts_instance_mask' and 'pts_semantic_mask' keys are updated in the result dict. 
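        Example (an illustrative sketch, not part of the original code; it
        assumes random LiDAR points and a single hand-written ground-truth
        box):
            >>> import numpy as np
            >>> from mmdet3d.core.bbox import LiDARInstance3DBoxes
            >>> from mmdet3d.core.points import LiDARPoints
            >>> input_dict = dict(
            ...     points=LiDARPoints(np.random.rand(100, 4) * 10,
            ...                        points_dim=4),
            ...     gt_bboxes_3d=LiDARInstance3DBoxes(
            ...         np.array([[5.0, 5.0, 0.0, 2.0, 2.0, 2.0, 0.0]])))
            >>> bg_filter = BackgroundPointsFilter(
            ...     bbox_enlarge_range=(0.5, 0.5, 0.5))
            >>> input_dict = bg_filter(input_dict)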
""" points = input_dict['points'] gt_bboxes_3d = input_dict['gt_bboxes_3d'] # avoid groundtruth being modified gt_bboxes_3d_np = gt_bboxes_3d.tensor.clone().numpy() gt_bboxes_3d_np[:, :3] = gt_bboxes_3d.gravity_center.clone().numpy() enlarged_gt_bboxes_3d = gt_bboxes_3d_np.copy() enlarged_gt_bboxes_3d[:, 3:6] += self.bbox_enlarge_range points_numpy = points.tensor.clone().numpy() foreground_masks = box_np_ops.points_in_rbbox( points_numpy, gt_bboxes_3d_np, origin=(0.5, 0.5, 0.5)) enlarge_foreground_masks = box_np_ops.points_in_rbbox( points_numpy, enlarged_gt_bboxes_3d, origin=(0.5, 0.5, 0.5)) foreground_masks = foreground_masks.max(1) enlarge_foreground_masks = enlarge_foreground_masks.max(1) valid_masks = ~np.logical_and(~foreground_masks, enlarge_foreground_masks) input_dict['points'] = points[valid_masks] pts_instance_mask = input_dict.get('pts_instance_mask', None) if pts_instance_mask is not None: input_dict['pts_instance_mask'] = pts_instance_mask[valid_masks] pts_semantic_mask = input_dict.get('pts_semantic_mask', None) if pts_semantic_mask is not None: input_dict['pts_semantic_mask'] = pts_semantic_mask[valid_masks] return input_dict def __repr__(self): """str: Return a string that describes the module.""" repr_str = self.__class__.__name__ repr_str += f'(bbox_enlarge_range={self.bbox_enlarge_range.tolist()})' return repr_str @PIPELINES.register_module() class VoxelBasedPointSampler(object): """Voxel based point sampler. Apply voxel sampling to multiple sweep points. Args: cur_sweep_cfg (dict): Config for sampling current points. prev_sweep_cfg (dict): Config for sampling previous points. time_dim (int): Index that indicate the time dimension for input points. """ def __init__(self, cur_sweep_cfg, prev_sweep_cfg=None, time_dim=3): self.cur_voxel_generator = VoxelGenerator(**cur_sweep_cfg) self.cur_voxel_num = self.cur_voxel_generator._max_voxels self.time_dim = time_dim if prev_sweep_cfg is not None: assert prev_sweep_cfg['max_num_points'] == \ cur_sweep_cfg['max_num_points'] self.prev_voxel_generator = VoxelGenerator(**prev_sweep_cfg) self.prev_voxel_num = self.prev_voxel_generator._max_voxels else: self.prev_voxel_generator = None self.prev_voxel_num = 0 def _sample_points(self, points, sampler, point_dim): """Sample points for each points subset. Args: points (np.ndarray): Points subset to be sampled. sampler (VoxelGenerator): Voxel based sampler for each points subset. point_dim (int): The dimension of each points Returns: np.ndarray: Sampled points. """ voxels, coors, num_points_per_voxel = sampler.generate(points) if voxels.shape[0] < sampler._max_voxels: padding_points = np.zeros([ sampler._max_voxels - voxels.shape[0], sampler._max_num_points, point_dim ], dtype=points.dtype) padding_points[:] = voxels[0] sample_points = np.concatenate([voxels, padding_points], axis=0) else: sample_points = voxels return sample_points def __call__(self, results): """Call function to sample points from multiple sweeps. Args: input_dict (dict): Result dict from loading pipeline. Returns: dict: Results after sampling, 'points', 'pts_instance_mask' and 'pts_semantic_mask' keys are updated in the result dict. 
""" points = results['points'] original_dim = points.shape[1] # TODO: process instance and semantic mask while _max_num_points # is larger than 1 # Extend points with seg and mask fields map_fields2dim = [] start_dim = original_dim points_numpy = points.tensor.numpy() extra_channel = [points_numpy] for idx, key in enumerate(results['pts_mask_fields']): map_fields2dim.append((key, idx + start_dim)) extra_channel.append(results[key][..., None]) start_dim += len(results['pts_mask_fields']) for idx, key in enumerate(results['pts_seg_fields']): map_fields2dim.append((key, idx + start_dim)) extra_channel.append(results[key][..., None]) points_numpy = np.concatenate(extra_channel, axis=-1) # Split points into two part, current sweep points and # previous sweeps points. # TODO: support different sampling methods for next sweeps points # and previous sweeps points. cur_points_flag = (points_numpy[:, self.time_dim] == 0) cur_sweep_points = points_numpy[cur_points_flag] prev_sweeps_points = points_numpy[~cur_points_flag] if prev_sweeps_points.shape[0] == 0: prev_sweeps_points = cur_sweep_points # Shuffle points before sampling np.random.shuffle(cur_sweep_points) np.random.shuffle(prev_sweeps_points) cur_sweep_points = self._sample_points(cur_sweep_points, self.cur_voxel_generator, points_numpy.shape[1]) if self.prev_voxel_generator is not None: prev_sweeps_points = self._sample_points(prev_sweeps_points, self.prev_voxel_generator, points_numpy.shape[1]) points_numpy = np.concatenate( [cur_sweep_points, prev_sweeps_points], 0) else: points_numpy = cur_sweep_points if self.cur_voxel_generator._max_num_points == 1: points_numpy = points_numpy.squeeze(1) results['points'] = points.new_point(points_numpy[..., :original_dim]) # Restore the corresponding seg and mask fields for key, dim_index in map_fields2dim: results[key] = points_numpy[..., dim_index] return results def __repr__(self): """str: Return a string that describes the module.""" def _auto_indent(repr_str, indent): repr_str = repr_str.split('\n') repr_str = [' ' * indent + t + '\n' for t in repr_str] repr_str = ''.join(repr_str)[:-1] return repr_str repr_str = self.__class__.__name__ indent = 4 repr_str += '(\n' repr_str += ' ' * indent + f'num_cur_sweep={self.cur_voxel_num},\n' repr_str += ' ' * indent + f'num_prev_sweep={self.prev_voxel_num},\n' repr_str += ' ' * indent + f'time_dim={self.time_dim},\n' repr_str += ' ' * indent + 'cur_voxel_generator=\n' repr_str += f'{_auto_indent(repr(self.cur_voxel_generator), 8)},\n' repr_str += ' ' * indent + 'prev_voxel_generator=\n' repr_str += f'{_auto_indent(repr(self.prev_voxel_generator), 8)})' return repr_str @PIPELINES.register_module() class AffineResize(object): """Get the affine transform matrices to the target size. Different from :class:`RandomAffine` in MMDetection, this class can calculate the affine transform matrices while resizing the input image to a fixed size. The affine transform matrices include: 1) matrix transforming original image to the network input image size. 2) matrix transforming original image to the network output feature map size. Args: img_scale (tuple): Images scales for resizing. down_ratio (int): The down ratio of feature map. Actually the arg should be >= 1. bbox_clip_border (bool, optional): Whether clip the objects outside the border of the image. Defaults to True. 
""" def __init__(self, img_scale, down_ratio, bbox_clip_border=True): self.img_scale = img_scale self.down_ratio = down_ratio self.bbox_clip_border = bbox_clip_border def __call__(self, results): """Call function to do affine transform to input image and labels. Args: results (dict): Result dict from loading pipeline. Returns: dict: Results after affine resize, 'affine_aug', 'trans_mat' keys are added in the result dict. """ # The results have gone through RandomShiftScale before AffineResize if 'center' not in results: img = results['img'] height, width = img.shape[:2] center = np.array([width / 2, height / 2], dtype=np.float32) size = np.array([width, height], dtype=np.float32) results['affine_aug'] = False else: # The results did not go through RandomShiftScale before # AffineResize img = results['img'] center = results['center'] size = results['size'] trans_affine = self._get_transform_matrix(center, size, self.img_scale) img = cv2.warpAffine(img, trans_affine[:2, :], self.img_scale) if isinstance(self.down_ratio, tuple): trans_mat = [ self._get_transform_matrix( center, size, (self.img_scale[0] // ratio, self.img_scale[1] // ratio)) for ratio in self.down_ratio ] # (3, 3) else: trans_mat = self._get_transform_matrix( center, size, (self.img_scale[0] // self.down_ratio, self.img_scale[1] // self.down_ratio)) results['img'] = img results['img_shape'] = img.shape results['pad_shape'] = img.shape results['trans_mat'] = trans_mat self._affine_bboxes(results, trans_affine) if 'centers2d' in results: centers2d = self._affine_transform(results['centers2d'], trans_affine) valid_index = (centers2d[:, 0] > 0) & (centers2d[:, 0] < self.img_scale[0]) & (centers2d[:, 1] > 0) & ( centers2d[:, 1] < self.img_scale[1]) results['centers2d'] = centers2d[valid_index] for key in results.get('bbox_fields', []): if key in ['gt_bboxes']: results[key] = results[key][valid_index] if 'gt_labels' in results: results['gt_labels'] = results['gt_labels'][ valid_index] if 'gt_masks' in results: raise NotImplementedError( 'AffineResize only supports bbox.') for key in results.get('bbox3d_fields', []): if key in ['gt_bboxes_3d']: results[key].tensor = results[key].tensor[valid_index] if 'gt_labels_3d' in results: results['gt_labels_3d'] = results['gt_labels_3d'][ valid_index] results['depths'] = results['depths'][valid_index] return results def _affine_bboxes(self, results, matrix): """Affine transform bboxes to input image. Args: results (dict): Result dict from loading pipeline. matrix (np.ndarray): Matrix transforming original image to the network input image size. shape: (3, 3) """ for key in results.get('bbox_fields', []): bboxes = results[key] bboxes[:, :2] = self._affine_transform(bboxes[:, :2], matrix) bboxes[:, 2:] = self._affine_transform(bboxes[:, 2:], matrix) if self.bbox_clip_border: bboxes[:, [0, 2]] = bboxes[:, [0, 2]].clip(0, self.img_scale[0] - 1) bboxes[:, [1, 3]] = bboxes[:, [1, 3]].clip(0, self.img_scale[1] - 1) results[key] = bboxes def _affine_transform(self, points, matrix): """Affine transform bbox points to input image. Args: points (np.ndarray): Points to be transformed. shape: (N, 2) matrix (np.ndarray): Affine transform matrix. shape: (3, 3) Returns: np.ndarray: Transformed points. """ num_points = points.shape[0] hom_points_2d = np.concatenate((points, np.ones((num_points, 1))), axis=1) hom_points_2d = hom_points_2d.T affined_points = np.matmul(matrix, hom_points_2d).T return affined_points[:, :2] def _get_transform_matrix(self, center, scale, output_scale): """Get affine transform matrix. 
Args: center (tuple): Center of current image. scale (tuple): Scale of current image. output_scale (tuple[float]): The transform target image scales. Returns: np.ndarray: Affine transform matrix. """ # TODO: further add rot and shift here. src_w = scale[0] dst_w = output_scale[0] dst_h = output_scale[1] src_dir = np.array([0, src_w * -0.5]) dst_dir = np.array([0, dst_w * -0.5]) src = np.zeros((3, 2), dtype=np.float32) dst = np.zeros((3, 2), dtype=np.float32) src[0, :] = center src[1, :] = center + src_dir dst[0, :] = np.array([dst_w * 0.5, dst_h * 0.5]) dst[1, :] = np.array([dst_w * 0.5, dst_h * 0.5]) + dst_dir src[2, :] = self._get_ref_point(src[0, :], src[1, :]) dst[2, :] = self._get_ref_point(dst[0, :], dst[1, :]) get_matrix = cv2.getAffineTransform(src, dst) matrix = np.concatenate((get_matrix, [[0., 0., 1.]])) return matrix.astype(np.float32) def _get_ref_point(self, ref_point1, ref_point2): """Get reference point to calculate affine transform matrix. While using opencv to calculate the affine matrix, we need at least three corresponding points separately on original image and target image. Here we use two points to get the the third reference point. """ d = ref_point1 - ref_point2 ref_point3 = ref_point2 + np.array([-d[1], d[0]]) return ref_point3 def __repr__(self): repr_str = self.__class__.__name__ repr_str += f'(img_scale={self.img_scale}, ' repr_str += f'down_ratio={self.down_ratio}) ' return repr_str @PIPELINES.register_module() class RandomShiftScale(object): """Random shift scale. Different from the normal shift and scale function, it doesn't directly shift or scale image. It can record the shift and scale infos into loading pipelines. It's designed to be used with AffineResize together. Args: shift_scale (tuple[float]): Shift and scale range. aug_prob (float): The shifting and scaling probability. """ def __init__(self, shift_scale, aug_prob): self.shift_scale = shift_scale self.aug_prob = aug_prob def __call__(self, results): """Call function to record random shift and scale infos. Args: results (dict): Result dict from loading pipeline. Returns: dict: Results after random shift and scale, 'center', 'size' and 'affine_aug' keys are added in the result dict. """ img = results['img'] height, width = img.shape[:2] center = np.array([width / 2, height / 2], dtype=np.float32) size = np.array([width, height], dtype=np.float32) if random.random() < self.aug_prob: shift, scale = self.shift_scale[0], self.shift_scale[1] shift_ranges = np.arange(-shift, shift + 0.1, 0.1) center[0] += size[0] * random.choice(shift_ranges) center[1] += size[1] * random.choice(shift_ranges) scale_ranges = np.arange(1 - scale, 1 + scale + 0.1, 0.1) size *= random.choice(scale_ranges) results['affine_aug'] = True else: results['affine_aug'] = False results['center'] = center results['size'] = size return results def __repr__(self): repr_str = self.__class__.__name__ repr_str += f'(shift_scale={self.shift_scale}, ' repr_str += f'aug_prob={self.aug_prob}) ' return repr_str ================================================ FILE: mmdet3d/datasets/s3dis_dataset.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. 
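# Usage sketch (added for illustration only; not part of the original file).
# The paths below are hypothetical placeholders showing how the detection
# dataset defined in this module is typically referenced from a config,
# together with the point-sampling transform defined in
# mmdet3d/datasets/pipelines/transforms_3d.py:
#
#   train_dataset = dict(
#       type='S3DISDataset',
#       data_root='./data/s3dis/',
#       ann_file='./data/s3dis/s3dis_infos_Area_1.pkl',
#       pipeline=[
#           dict(type='LoadPointsFromFile', coord_type='DEPTH',
#                shift_height=False, load_dim=6, use_dim=[0, 1, 2, 3, 4, 5]),
#           dict(type='PointSample', num_points=40000),
#           dict(type='DefaultFormatBundle3D',
#                class_names=('table', 'chair', 'sofa', 'bookcase', 'board')),
#           dict(type='Collect3D', keys=['points'])],
#       filter_empty_gt=True)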
from os import path as osp

import numpy as np

from mmdet3d.core import show_seg_result
from mmdet3d.core.bbox import DepthInstance3DBoxes
from mmseg.datasets import DATASETS as SEG_DATASETS
from .builder import DATASETS
from .custom_3d import Custom3DDataset
from .custom_3d_seg import Custom3DSegDataset
from .pipelines import Compose


@DATASETS.register_module()
class S3DISDataset(Custom3DDataset):
    r"""S3DIS Dataset for Detection Task.

    This class is the inner dataset for S3DIS. Since S3DIS has 6 areas, we
    often train on 5 of them and test on the remaining one. The area used for
    testing is Area_5, as suggested in `GSDN `_. To concatenate the 5 training
    areas, `mmdet.datasets.dataset_wrappers.ConcatDataset` should be used.

    Args:
        data_root (str): Path of dataset root.
        ann_file (str): Path of annotation file.
        pipeline (list[dict], optional): Pipeline used for data processing.
            Defaults to None.
        classes (tuple[str], optional): Classes used in the dataset.
            Defaults to None.
        modality (dict, optional): Modality to specify the sensor data used
            as input. Defaults to None.
        box_type_3d (str, optional): Type of 3D box of this dataset.
            Based on the `box_type_3d`, the dataset will encapsulate the box
            to its original format then convert it to `box_type_3d`.
            Defaults to 'Depth' in this dataset. Available options include

            - 'LiDAR': Box in LiDAR coordinates.
            - 'Depth': Box in depth coordinates, usually for indoor dataset.
            - 'Camera': Box in camera coordinates.
        filter_empty_gt (bool, optional): Whether to filter empty GT.
            Defaults to True.
        test_mode (bool, optional): Whether the dataset is in test mode.
            Defaults to False.
    """
    CLASSES = ('table', 'chair', 'sofa', 'bookcase', 'board')

    def __init__(self,
                 data_root,
                 ann_file,
                 pipeline=None,
                 classes=None,
                 modality=None,
                 box_type_3d='Depth',
                 filter_empty_gt=True,
                 test_mode=False,
                 **kwargs):
        super().__init__(
            data_root=data_root,
            ann_file=ann_file,
            pipeline=pipeline,
            classes=classes,
            modality=modality,
            box_type_3d=box_type_3d,
            filter_empty_gt=filter_empty_gt,
            test_mode=test_mode,
            **kwargs)

    def get_ann_info(self, index):
        """Get annotation info according to the given index.

        Args:
            index (int): Index of the annotation data to get.

        Returns:
            dict: annotation information consists of the following keys:

                - gt_bboxes_3d (:obj:`DepthInstance3DBoxes`):
                    3D ground truth bboxes
                - gt_labels_3d (np.ndarray): Labels of ground truths.
                - pts_instance_mask_path (str): Path of instance masks.
                - pts_semantic_mask_path (str): Path of semantic masks.
        """
        # Use index to get the annos, thus the evalhook could also use this
        # api
        info = self.data_infos[index]
        if info['annos']['gt_num'] != 0:
            gt_bboxes_3d = info['annos']['gt_boxes_upright_depth'].astype(
                np.float32)  # k, 6
            gt_labels_3d = info['annos']['class'].astype(np.int64)
        else:
            gt_bboxes_3d = np.zeros((0, 6), dtype=np.float32)
            gt_labels_3d = np.zeros((0, ), dtype=np.int64)

        # to target box structure
        gt_bboxes_3d = DepthInstance3DBoxes(
            gt_bboxes_3d,
            box_dim=gt_bboxes_3d.shape[-1],
            with_yaw=False,
            origin=(0.5, 0.5, 0.5)).convert_to(self.box_mode_3d)

        pts_instance_mask_path = osp.join(self.data_root,
                                          info['pts_instance_mask_path'])
        pts_semantic_mask_path = osp.join(self.data_root,
                                          info['pts_semantic_mask_path'])

        anns_results = dict(
            gt_bboxes_3d=gt_bboxes_3d,
            gt_labels_3d=gt_labels_3d,
            pts_instance_mask_path=pts_instance_mask_path,
            pts_semantic_mask_path=pts_semantic_mask_path)
        return anns_results

    def get_data_info(self, index):
        """Get data info according to the given index.

        Args:
            index (int): Index of the sample data to get.
Returns: dict: Data information that will be passed to the data preprocessing pipelines. It includes the following keys: - pts_filename (str): Filename of point clouds. - file_name (str): Filename of point clouds. - ann_info (dict): Annotation info. """ info = self.data_infos[index] pts_filename = osp.join(self.data_root, info['pts_path']) input_dict = dict(pts_filename=pts_filename) if not self.test_mode: annos = self.get_ann_info(index) input_dict['ann_info'] = annos if self.filter_empty_gt and ~(annos['gt_labels_3d'] != -1).any(): return None return input_dict def _build_default_pipeline(self): """Build the default pipeline for this dataset.""" pipeline = [ dict( type='LoadPointsFromFile', coord_type='DEPTH', shift_height=False, load_dim=6, use_dim=[0, 1, 2, 3, 4, 5]), dict( type='DefaultFormatBundle3D', class_names=self.CLASSES, with_label=False), dict(type='Collect3D', keys=['points']) ] return Compose(pipeline) class _S3DISSegDataset(Custom3DSegDataset): r"""S3DIS Dataset for Semantic Segmentation Task. This class is the inner dataset for S3DIS. Since S3DIS has 6 areas, we often train on 5 of them and test on the remaining one. However, there is not a fixed train-test split of S3DIS. People often test on Area_5 as suggested by `SEGCloud `_. But many papers also report the average results of 6-fold cross validation over the 6 areas (e.g. `DGCNN `_). Therefore, we use an inner dataset for one area, and further use a dataset wrapper to concat all the provided data in different areas. Args: data_root (str): Path of dataset root. ann_file (str): Path of annotation file. pipeline (list[dict], optional): Pipeline used for data processing. Defaults to None. classes (tuple[str], optional): Classes used in the dataset. Defaults to None. palette (list[list[int]], optional): The palette of segmentation map. Defaults to None. modality (dict, optional): Modality to specify the sensor data used as input. Defaults to None. test_mode (bool, optional): Whether the dataset is in test mode. Defaults to False. ignore_index (int, optional): The label index to be ignored, e.g. unannotated points. If None is given, set to len(self.CLASSES). Defaults to None. scene_idxs (np.ndarray | str, optional): Precomputed index to load data. For scenes with many points, we may sample it several times. Defaults to None. """ CLASSES = ('ceiling', 'floor', 'wall', 'beam', 'column', 'window', 'door', 'table', 'chair', 'sofa', 'bookcase', 'board', 'clutter') VALID_CLASS_IDS = tuple(range(13)) ALL_CLASS_IDS = tuple(range(14)) # possibly with 'stair' class PALETTE = [[0, 255, 0], [0, 0, 255], [0, 255, 255], [255, 255, 0], [255, 0, 255], [100, 100, 255], [200, 200, 100], [170, 120, 200], [255, 0, 0], [200, 100, 100], [10, 200, 100], [200, 200, 200], [50, 50, 50]] def __init__(self, data_root, ann_file, pipeline=None, classes=None, palette=None, modality=None, test_mode=False, ignore_index=None, scene_idxs=None, **kwargs): super().__init__( data_root=data_root, ann_file=ann_file, pipeline=pipeline, classes=classes, palette=palette, modality=modality, test_mode=test_mode, ignore_index=ignore_index, scene_idxs=scene_idxs, **kwargs) def get_ann_info(self, index): """Get annotation info according to the given index. Args: index (int): Index of the annotation data to get. Returns: dict: annotation information consists of the following keys: - pts_semantic_mask_path (str): Path of semantic masks. 
""" # Use index to get the annos, thus the evalhook could also use this api info = self.data_infos[index] pts_semantic_mask_path = osp.join(self.data_root, info['pts_semantic_mask_path']) anns_results = dict(pts_semantic_mask_path=pts_semantic_mask_path) return anns_results def _build_default_pipeline(self): """Build the default pipeline for this dataset.""" pipeline = [ dict( type='LoadPointsFromFile', coord_type='DEPTH', shift_height=False, use_color=True, load_dim=6, use_dim=[0, 1, 2, 3, 4, 5]), dict( type='LoadAnnotations3D', with_bbox_3d=False, with_label_3d=False, with_mask_3d=False, with_seg_3d=True), dict( type='PointSegClassMapping', valid_cat_ids=self.VALID_CLASS_IDS, max_cat_id=np.max(self.ALL_CLASS_IDS)), dict( type='DefaultFormatBundle3D', with_label=False, class_names=self.CLASSES), dict(type='Collect3D', keys=['points', 'pts_semantic_mask']) ] return Compose(pipeline) def show(self, results, out_dir, show=True, pipeline=None): """Results visualization. Args: results (list[dict]): List of bounding boxes results. out_dir (str): Output directory of visualization result. show (bool): Visualize the results online. pipeline (list[dict], optional): raw data loading for showing. Default: None. """ assert out_dir is not None, 'Expect out_dir, got none.' pipeline = self._get_pipeline(pipeline) for i, result in enumerate(results): data_info = self.data_infos[i] pts_path = data_info['pts_path'] file_name = osp.split(pts_path)[-1].split('.')[0] points, gt_sem_mask = self._extract_data( i, pipeline, ['points', 'pts_semantic_mask'], load_annos=True) points = points.numpy() pred_sem_mask = result['semantic_mask'].numpy() show_seg_result(points, gt_sem_mask, pred_sem_mask, out_dir, file_name, np.array(self.PALETTE), self.ignore_index, show) def get_scene_idxs(self, scene_idxs): """Compute scene_idxs for data sampling. We sample more times for scenes with more points. """ # when testing, we load one whole scene every time if not self.test_mode and scene_idxs is None: raise NotImplementedError( 'please provide re-sampled scene indexes for training') return super().get_scene_idxs(scene_idxs) @DATASETS.register_module() @SEG_DATASETS.register_module() class S3DISSegDataset(_S3DISSegDataset): r"""S3DIS Dataset for Semantic Segmentation Task. This class serves as the API for experiments on the S3DIS Dataset. It wraps the provided datasets of different areas. We don't use `mmdet.datasets.dataset_wrappers.ConcatDataset` because we need to concat the `scene_idxs` of different areas. Please refer to the `google form `_ for data downloading. Args: data_root (str): Path of dataset root. ann_files (list[str]): Path of several annotation files. pipeline (list[dict], optional): Pipeline used for data processing. Defaults to None. classes (tuple[str], optional): Classes used in the dataset. Defaults to None. palette (list[list[int]], optional): The palette of segmentation map. Defaults to None. modality (dict, optional): Modality to specify the sensor data used as input. Defaults to None. test_mode (bool, optional): Whether the dataset is in test mode. Defaults to False. ignore_index (int, optional): The label index to be ignored, e.g. unannotated points. If None is given, set to len(self.CLASSES). Defaults to None. scene_idxs (list[np.ndarray] | list[str], optional): Precomputed index to load data. For scenes with many points, we may sample it several times. Defaults to None. 
""" def __init__(self, data_root, ann_files, pipeline=None, classes=None, palette=None, modality=None, test_mode=False, ignore_index=None, scene_idxs=None, **kwargs): # make sure that ann_files and scene_idxs have same length ann_files = self._check_ann_files(ann_files) scene_idxs = self._check_scene_idxs(scene_idxs, len(ann_files)) # initialize some attributes as datasets[0] super().__init__( data_root=data_root, ann_file=ann_files[0], pipeline=pipeline, classes=classes, palette=palette, modality=modality, test_mode=test_mode, ignore_index=ignore_index, scene_idxs=scene_idxs[0], **kwargs) datasets = [ _S3DISSegDataset( data_root=data_root, ann_file=ann_files[i], pipeline=pipeline, classes=classes, palette=palette, modality=modality, test_mode=test_mode, ignore_index=ignore_index, scene_idxs=scene_idxs[i], **kwargs) for i in range(len(ann_files)) ] # data_infos and scene_idxs need to be concat self.concat_data_infos([dst.data_infos for dst in datasets]) self.concat_scene_idxs([dst.scene_idxs for dst in datasets]) # set group flag for the sampler if not self.test_mode: self._set_group_flag() def concat_data_infos(self, data_infos): """Concat data_infos from several datasets to form self.data_infos. Args: data_infos (list[list[dict]]) """ self.data_infos = [ info for one_data_infos in data_infos for info in one_data_infos ] def concat_scene_idxs(self, scene_idxs): """Concat scene_idxs from several datasets to form self.scene_idxs. Needs to manually add offset to scene_idxs[1, 2, ...]. Args: scene_idxs (list[np.ndarray]) """ self.scene_idxs = np.array([], dtype=np.int32) offset = 0 for one_scene_idxs in scene_idxs: self.scene_idxs = np.concatenate( [self.scene_idxs, one_scene_idxs + offset]).astype(np.int32) offset = np.unique(self.scene_idxs).max() + 1 @staticmethod def _duplicate_to_list(x, num): """Repeat x `num` times to form a list.""" return [x for _ in range(num)] def _check_ann_files(self, ann_file): """Make ann_files as list/tuple.""" # ann_file could be str if not isinstance(ann_file, (list, tuple)): ann_file = self._duplicate_to_list(ann_file, 1) return ann_file def _check_scene_idxs(self, scene_idx, num): """Make scene_idxs as list/tuple.""" if scene_idx is None: return self._duplicate_to_list(scene_idx, num) # scene_idx could be str, np.ndarray, list or tuple if isinstance(scene_idx, str): # str return self._duplicate_to_list(scene_idx, num) if isinstance(scene_idx[0], str): # list of str return scene_idx if isinstance(scene_idx[0], (list, tuple, np.ndarray)): # list of idx return scene_idx # single idx return self._duplicate_to_list(scene_idx, num) ================================================ FILE: mmdet3d/datasets/samplers/__init__.py ================================================ from .infinite_group_each_sample_in_batch_sampler import InfiniteGroupEachSampleInBatchSampler, InfiniteGroupEachSampleInBatchSamplerEval, TTADistributedSampler from .d_sampler import CustomDistributedSampler ================================================ FILE: mmdet3d/datasets/samplers/d_sampler.py ================================================ import math import torch from torch.utils.data import DistributedSampler as _DistributedSampler from torch.utils.data.sampler import Sampler class CustomDistributedSampler(_DistributedSampler): def __init__(self, dataset=None, num_replicas=None, rank=None, shuffle=False, seed=0): super().__init__( dataset, num_replicas=num_replicas, rank=rank, shuffle=shuffle) # for the compatibility from PyTorch 1.3+ self.seed = seed if seed is not None else 0 def 
__iter__(self): # deterministically shuffle based on epoch if self.shuffle: assert False else: indices = torch.arange(len(self.dataset)).tolist() # add extra samples to make it evenly divisible # in case that indices is shorter than half of total_size indices = (indices * math.ceil(self.total_size / len(indices)))[:self.total_size] assert len(indices) == self.total_size # subsample per_replicas = self.total_size//self.num_replicas # indices = indices[self.rank:self.total_size:self.num_replicas] indices = indices[self.rank*per_replicas:(self.rank+1)*per_replicas] assert len(indices) == self.num_samples return iter(indices) ================================================ FILE: mmdet3d/datasets/samplers/infinite_group_each_sample_in_batch_sampler.py ================================================ import itertools import copy import numpy as np import torch import torch.distributed as dist from mmcv.runner import get_dist_info from torch.utils.data.sampler import Sampler # https://github.com/open-mmlab/mmdetection/blob/3b72b12fe9b14de906d1363982b9fba05e7d47c1/mmdet/core/utils/dist_utils.py#L157 def sync_random_seed(seed=None, device='cuda'): """Make sure different ranks share the same seed. All workers must call this function, otherwise it will deadlock. This method is generally used in `DistributedSampler`, because the seed should be identical across all processes in the distributed group. In distributed sampling, different ranks should sample non-overlapped data in the dataset. Therefore, this function is used to make sure that each rank shuffles the data indices in the same order based on the same seed. Then different ranks could use different indices to select non-overlapped data from the same data list. Args: seed (int, Optional): The seed. Default to None. device (str): The device where the seed will be put on. Default to 'cuda'. Returns: int: Seed to be used. """ if seed is None: seed = np.random.randint(2**31) assert isinstance(seed, int) rank, world_size = get_dist_info() if world_size == 1: return seed if rank == 0: random_num = torch.tensor(seed, dtype=torch.int32, device=device) else: random_num = torch.tensor(0, dtype=torch.int32, device=device) dist.broadcast(random_num, src=0) return random_num.item() class InfiniteGroupEachSampleInBatchSampler(Sampler): """ Pardon this horrendous name. Basically, we want every sample to be from its own group. If batch size is 4 and # of GPUs is 8, each sample of these 32 should be operating on its own group. Shuffling is only done for group order, not done within groups. """ def __init__(self, dataset, batch_size=1, world_size=None, rank=None, seed=0): _rank, _world_size = get_dist_info() if world_size is None: world_size = _world_size if rank is None: rank = _rank self.dataset = dataset self.batch_size = batch_size self.world_size = world_size self.rank = rank self.seed = sync_random_seed(seed) self.size = len(self.dataset) assert hasattr(self.dataset, 'flag') self.flag = self.dataset.flag self.group_sizes = np.bincount(self.flag) self.groups_num = len(self.group_sizes) self.global_batch_size = batch_size * world_size assert self.groups_num >= self.global_batch_size # Now, for efficiency, make a dict group_idx: List[dataset sample_idxs] self.group_idx_to_sample_idxs = { group_idx: np.where(self.flag == group_idx)[0].tolist() for group_idx in range(self.groups_num)} # Get a generator per sample idx. 
Considering samples over all # GPUs, each sample position has its own generator self.group_indices_per_global_sample_idx = [ self._group_indices_per_global_sample_idx(self.rank * self.batch_size + local_sample_idx) for local_sample_idx in range(self.batch_size)] # Keep track of a buffer of dataset sample idxs for each local sample idx self.buffer_per_local_sample = [[] for _ in range(self.batch_size)] def _infinite_group_indices(self): g = torch.Generator() g.manual_seed(self.seed) while True: yield from torch.randperm(self.groups_num, generator=g).tolist() def _group_indices_per_global_sample_idx(self, global_sample_idx): yield from itertools.islice(self._infinite_group_indices(), global_sample_idx, None, self.global_batch_size) def __iter__(self): while True: curr_batch = [] for local_sample_idx in range(self.batch_size): if len(self.buffer_per_local_sample[local_sample_idx]) == 0: # Finished current group, refill with next group new_group_idx = next(self.group_indices_per_global_sample_idx[local_sample_idx]) self.buffer_per_local_sample[local_sample_idx] = \ copy.deepcopy( self.group_idx_to_sample_idxs[new_group_idx]) curr_batch.append(self.buffer_per_local_sample[local_sample_idx].pop(0)) yield curr_batch def __len__(self): """Length of base dataset.""" return self.size def set_epoch(self, epoch): self.epoch = epoch class InfiniteGroupEachSampleInBatchSamplerEval(Sampler): """ Pardon this horrendous name. Basically, we want every sample to be from its own group. If batch size is 4 and # of GPUs is 8, each sample of these 32 should be operating on its own group. Shuffling is only done for group order, not done within groups. """ def __init__(self, dataset, batch_size=1, world_size=None, rank=None, seed=0): _rank, _world_size = get_dist_info() if world_size is None: world_size = _world_size if rank is None: rank = _rank self.dataset = dataset self.batch_size = batch_size self.world_size = world_size self.rank = rank self.seed = sync_random_seed(seed) self.size = len(self.dataset) assert hasattr(self.dataset, 'flag') self.flag = self.dataset.flag self.group_sizes = np.bincount(self.flag) self.groups_num = len(self.group_sizes) self.global_batch_size = batch_size * world_size assert self.groups_num >= self.global_batch_size # Now, for efficiency, make a dict group_idx: List[dataset sample_idxs] self.group_idx_to_sample_idxs = { group_idx: np.where(self.flag == group_idx)[0].tolist() for group_idx in range(self.groups_num)} # Get a generator per sample idx. 
Considering samples over all # GPUs, each sample position has its own generator self.group_indices_per_global_sample_idx = [ self._group_indices_per_global_sample_idx(self.rank * self.batch_size + local_sample_idx) for local_sample_idx in range(self.batch_size)] # Keep track of a buffer of dataset sample idxs for each local sample idx self.buffer_per_local_sample = [[] for _ in range(self.batch_size)] def _infinite_group_indices(self): g = torch.Generator() g.manual_seed(self.seed) while True: yield from torch.randperm(self.groups_num, generator=g).tolist() def _group_indices_per_global_sample_idx(self, global_sample_idx): yield from itertools.islice(self._infinite_group_indices(), global_sample_idx, None, self.global_batch_size) def __iter__(self): t = (len(self.flag)+self.world_size*16 + 1)//self.world_size for i in range(t): if i == 0: self.buffer_per_local_sample = [[] for _ in range(self.batch_size)] curr_batch = [] for local_sample_idx in range(self.batch_size): if len(self.buffer_per_local_sample[local_sample_idx]) == 0: # Finished current group, refill with next group new_group_idx = next(self.group_indices_per_global_sample_idx[local_sample_idx]) self.buffer_per_local_sample[local_sample_idx] = \ copy.deepcopy( self.group_idx_to_sample_idxs[new_group_idx]) curr_batch.append(self.buffer_per_local_sample[local_sample_idx].pop(0)) yield curr_batch def __len__(self): """Length of base dataset.""" return self.size def set_epoch(self, epoch): self.epoch = epoch class TTADistributedSampler(Sampler): def __init__(self, dataset, batch_size=1, world_size=None, rank=None, seed=0): _rank, _world_size = get_dist_info() if world_size is None: world_size = _world_size if rank is None: rank = _rank self.dataset = dataset assert batch_size == 1 self.batch_size = batch_size self.world_size = world_size self.rank = rank self.seed = sync_random_seed(seed) self.size = len(self.dataset) def __iter__(self): indices = torch.arange(len(self.dataset)).tolist() for i in indices: yield [i] def __len__(self): """Length of base dataset.""" return self.size * 8 ================================================ FILE: mmdet3d/datasets/scannet_dataset.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import tempfile import warnings from os import path as osp import numpy as np from mmdet3d.core import instance_seg_eval, show_result, show_seg_result from mmdet3d.core.bbox import DepthInstance3DBoxes from mmseg.datasets import DATASETS as SEG_DATASETS from .builder import DATASETS from .custom_3d import Custom3DDataset from .custom_3d_seg import Custom3DSegDataset from .pipelines import Compose @DATASETS.register_module() class ScanNetDataset(Custom3DDataset): r"""ScanNet Dataset for Detection Task. This class serves as the API for experiments on the ScanNet Dataset. Please refer to the `github repo `_ for data downloading. Args: data_root (str): Path of dataset root. ann_file (str): Path of annotation file. pipeline (list[dict], optional): Pipeline used for data processing. Defaults to None. classes (tuple[str], optional): Classes used in the dataset. Defaults to None. modality (dict, optional): Modality to specify the sensor data used as input. Defaults to None. box_type_3d (str, optional): Type of 3D box of this dataset. Based on the `box_type_3d`, the dataset will encapsulate the box to its original format then converted them to `box_type_3d`. Defaults to 'Depth' in this dataset. Available options includes - 'LiDAR': Box in LiDAR coordinates. 
- 'Depth': Box in depth coordinates, usually for indoor dataset. - 'Camera': Box in camera coordinates. filter_empty_gt (bool, optional): Whether to filter empty GT. Defaults to True. test_mode (bool, optional): Whether the dataset is in test mode. Defaults to False. """ CLASSES = ('cabinet', 'bed', 'chair', 'sofa', 'table', 'door', 'window', 'bookshelf', 'picture', 'counter', 'desk', 'curtain', 'refrigerator', 'showercurtrain', 'toilet', 'sink', 'bathtub', 'garbagebin') def __init__(self, data_root, ann_file, pipeline=None, classes=None, modality=dict(use_camera=False, use_depth=True), box_type_3d='Depth', filter_empty_gt=True, test_mode=False, **kwargs): super().__init__( data_root=data_root, ann_file=ann_file, pipeline=pipeline, classes=classes, modality=modality, box_type_3d=box_type_3d, filter_empty_gt=filter_empty_gt, test_mode=test_mode, **kwargs) assert 'use_camera' in self.modality and \ 'use_depth' in self.modality assert self.modality['use_camera'] or self.modality['use_depth'] def get_data_info(self, index): """Get data info according to the given index. Args: index (int): Index of the sample data to get. Returns: dict: Data information that will be passed to the data preprocessing pipelines. It includes the following keys: - sample_idx (str): Sample index. - pts_filename (str): Filename of point clouds. - file_name (str): Filename of point clouds. - img_prefix (str, optional): Prefix of image files. - img_info (dict, optional): Image info. - ann_info (dict): Annotation info. """ info = self.data_infos[index] sample_idx = info['point_cloud']['lidar_idx'] pts_filename = osp.join(self.data_root, info['pts_path']) input_dict = dict(sample_idx=sample_idx) if self.modality['use_depth']: input_dict['pts_filename'] = pts_filename input_dict['file_name'] = pts_filename if self.modality['use_camera']: img_info = [] for img_path in info['img_paths']: img_info.append( dict(filename=osp.join(self.data_root, img_path))) intrinsic = info['intrinsics'] axis_align_matrix = self._get_axis_align_matrix(info) depth2img = [] for extrinsic in info['extrinsics']: depth2img.append( intrinsic @ np.linalg.inv(axis_align_matrix @ extrinsic)) input_dict['img_prefix'] = None input_dict['img_info'] = img_info input_dict['depth2img'] = depth2img if not self.test_mode: annos = self.get_ann_info(index) input_dict['ann_info'] = annos if self.filter_empty_gt and ~(annos['gt_labels_3d'] != -1).any(): return None return input_dict def get_ann_info(self, index): """Get annotation info according to the given index. Args: index (int): Index of the annotation data to get. Returns: dict: annotation information consists of the following keys: - gt_bboxes_3d (:obj:`DepthInstance3DBoxes`): 3D ground truth bboxes - gt_labels_3d (np.ndarray): Labels of ground truths. - pts_instance_mask_path (str): Path of instance masks. - pts_semantic_mask_path (str): Path of semantic masks. - axis_align_matrix (np.ndarray): Transformation matrix for global scene alignment. 
""" # Use index to get the annos, thus the evalhook could also use this api info = self.data_infos[index] if info['annos']['gt_num'] != 0: gt_bboxes_3d = info['annos']['gt_boxes_upright_depth'].astype( np.float32) # k, 6 gt_labels_3d = info['annos']['class'].astype(np.int64) else: gt_bboxes_3d = np.zeros((0, 6), dtype=np.float32) gt_labels_3d = np.zeros((0, ), dtype=np.int64) # to target box structure gt_bboxes_3d = DepthInstance3DBoxes( gt_bboxes_3d, box_dim=gt_bboxes_3d.shape[-1], with_yaw=False, origin=(0.5, 0.5, 0.5)).convert_to(self.box_mode_3d) pts_instance_mask_path = osp.join(self.data_root, info['pts_instance_mask_path']) pts_semantic_mask_path = osp.join(self.data_root, info['pts_semantic_mask_path']) axis_align_matrix = self._get_axis_align_matrix(info) anns_results = dict( gt_bboxes_3d=gt_bboxes_3d, gt_labels_3d=gt_labels_3d, pts_instance_mask_path=pts_instance_mask_path, pts_semantic_mask_path=pts_semantic_mask_path, axis_align_matrix=axis_align_matrix) return anns_results def prepare_test_data(self, index): """Prepare data for testing. We should take axis_align_matrix from self.data_infos since we need to align point clouds. Args: index (int): Index for accessing the target data. Returns: dict: Testing data dict of the corresponding index. """ input_dict = self.get_data_info(index) # take the axis_align_matrix from data_infos input_dict['ann_info'] = dict( axis_align_matrix=self._get_axis_align_matrix( self.data_infos[index])) self.pre_pipeline(input_dict) example = self.pipeline(input_dict) return example @staticmethod def _get_axis_align_matrix(info): """Get axis_align_matrix from info. If not exist, return identity mat. Args: info (dict): one data info term. Returns: np.ndarray: 4x4 transformation matrix. """ if 'axis_align_matrix' in info['annos'].keys(): return info['annos']['axis_align_matrix'].astype(np.float32) else: warnings.warn( 'axis_align_matrix is not found in ScanNet data info, please ' 'use new pre-process scripts to re-generate ScanNet data') return np.eye(4).astype(np.float32) def _build_default_pipeline(self): """Build the default pipeline for this dataset.""" pipeline = [ dict( type='LoadPointsFromFile', coord_type='DEPTH', shift_height=False, load_dim=6, use_dim=[0, 1, 2]), dict(type='GlobalAlignment', rotation_axis=2), dict( type='DefaultFormatBundle3D', class_names=self.CLASSES, with_label=False), dict(type='Collect3D', keys=['points']) ] return Compose(pipeline) def show(self, results, out_dir, show=True, pipeline=None): """Results visualization. Args: results (list[dict]): List of bounding boxes results. out_dir (str): Output directory of visualization result. show (bool): Visualize the results online. pipeline (list[dict], optional): raw data loading for showing. Default: None. """ assert out_dir is not None, 'Expect out_dir, got none.' pipeline = self._get_pipeline(pipeline) for i, result in enumerate(results): data_info = self.data_infos[i] pts_path = data_info['pts_path'] file_name = osp.split(pts_path)[-1].split('.')[0] points = self._extract_data(i, pipeline, 'points').numpy() gt_bboxes = self.get_ann_info(i)['gt_bboxes_3d'].tensor.numpy() pred_bboxes = result['boxes_3d'].tensor.numpy() show_result(points, gt_bboxes, pred_bboxes, out_dir, file_name, show) @DATASETS.register_module() @SEG_DATASETS.register_module() class ScanNetSegDataset(Custom3DSegDataset): r"""ScanNet Dataset for Semantic Segmentation Task. This class serves as the API for experiments on the ScanNet Dataset. Please refer to the `github repo `_ for data downloading. 
Args: data_root (str): Path of dataset root. ann_file (str): Path of annotation file. pipeline (list[dict], optional): Pipeline used for data processing. Defaults to None. classes (tuple[str], optional): Classes used in the dataset. Defaults to None. palette (list[list[int]], optional): The palette of segmentation map. Defaults to None. modality (dict, optional): Modality to specify the sensor data used as input. Defaults to None. test_mode (bool, optional): Whether the dataset is in test mode. Defaults to False. ignore_index (int, optional): The label index to be ignored, e.g. unannotated points. If None is given, set to len(self.CLASSES). Defaults to None. scene_idxs (np.ndarray | str, optional): Precomputed index to load data. For scenes with many points, we may sample it several times. Defaults to None. """ CLASSES = ('wall', 'floor', 'cabinet', 'bed', 'chair', 'sofa', 'table', 'door', 'window', 'bookshelf', 'picture', 'counter', 'desk', 'curtain', 'refrigerator', 'showercurtrain', 'toilet', 'sink', 'bathtub', 'otherfurniture') VALID_CLASS_IDS = (1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 16, 24, 28, 33, 34, 36, 39) ALL_CLASS_IDS = tuple(range(41)) PALETTE = [ [174, 199, 232], [152, 223, 138], [31, 119, 180], [255, 187, 120], [188, 189, 34], [140, 86, 75], [255, 152, 150], [214, 39, 40], [197, 176, 213], [148, 103, 189], [196, 156, 148], [23, 190, 207], [247, 182, 210], [219, 219, 141], [255, 127, 14], [158, 218, 229], [44, 160, 44], [112, 128, 144], [227, 119, 194], [82, 84, 163], ] def __init__(self, data_root, ann_file, pipeline=None, classes=None, palette=None, modality=None, test_mode=False, ignore_index=None, scene_idxs=None, **kwargs): super().__init__( data_root=data_root, ann_file=ann_file, pipeline=pipeline, classes=classes, palette=palette, modality=modality, test_mode=test_mode, ignore_index=ignore_index, scene_idxs=scene_idxs, **kwargs) def get_ann_info(self, index): """Get annotation info according to the given index. Args: index (int): Index of the annotation data to get. Returns: dict: annotation information consists of the following keys: - pts_semantic_mask_path (str): Path of semantic masks. """ # Use index to get the annos, thus the evalhook could also use this api info = self.data_infos[index] pts_semantic_mask_path = osp.join(self.data_root, info['pts_semantic_mask_path']) anns_results = dict(pts_semantic_mask_path=pts_semantic_mask_path) return anns_results def _build_default_pipeline(self): """Build the default pipeline for this dataset.""" pipeline = [ dict( type='LoadPointsFromFile', coord_type='DEPTH', shift_height=False, use_color=True, load_dim=6, use_dim=[0, 1, 2, 3, 4, 5]), dict( type='LoadAnnotations3D', with_bbox_3d=False, with_label_3d=False, with_mask_3d=False, with_seg_3d=True), dict( type='PointSegClassMapping', valid_cat_ids=self.VALID_CLASS_IDS, max_cat_id=np.max(self.ALL_CLASS_IDS)), dict( type='DefaultFormatBundle3D', with_label=False, class_names=self.CLASSES), dict(type='Collect3D', keys=['points', 'pts_semantic_mask']) ] return Compose(pipeline) def show(self, results, out_dir, show=True, pipeline=None): """Results visualization. Args: results (list[dict]): List of bounding boxes results. out_dir (str): Output directory of visualization result. show (bool): Visualize the results online. pipeline (list[dict], optional): raw data loading for showing. Default: None. """ assert out_dir is not None, 'Expect out_dir, got none.' 
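        # Fall back to the default loading pipeline when none is given, then
        # re-load the raw points and ground-truth semantic masks per scene so
        # the predicted masks can be rendered next to the annotations.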
pipeline = self._get_pipeline(pipeline) for i, result in enumerate(results): data_info = self.data_infos[i] pts_path = data_info['pts_path'] file_name = osp.split(pts_path)[-1].split('.')[0] points, gt_sem_mask = self._extract_data( i, pipeline, ['points', 'pts_semantic_mask'], load_annos=True) points = points.numpy() pred_sem_mask = result['semantic_mask'].numpy() show_seg_result(points, gt_sem_mask, pred_sem_mask, out_dir, file_name, np.array(self.PALETTE), self.ignore_index, show) def get_scene_idxs(self, scene_idxs): """Compute scene_idxs for data sampling. We sample more times for scenes with more points. """ # when testing, we load one whole scene every time if not self.test_mode and scene_idxs is None: raise NotImplementedError( 'please provide re-sampled scene indexes for training') return super().get_scene_idxs(scene_idxs) def format_results(self, results, txtfile_prefix=None): r"""Format the results to txt file. Refer to `ScanNet documentation `_. Args: outputs (list[dict]): Testing results of the dataset. txtfile_prefix (str): The prefix of saved files. It includes the file path and the prefix of filename, e.g., "a/b/prefix". If not specified, a temp file will be created. Default: None. Returns: tuple: (outputs, tmp_dir), outputs is the detection results, tmp_dir is the temporal directory created for saving submission files when ``submission_prefix`` is not specified. """ import mmcv if txtfile_prefix is None: tmp_dir = tempfile.TemporaryDirectory() txtfile_prefix = osp.join(tmp_dir.name, 'results') else: tmp_dir = None mmcv.mkdir_or_exist(txtfile_prefix) # need to map network output to original label idx pred2label = np.zeros(len(self.VALID_CLASS_IDS)).astype(np.int) for original_label, output_idx in self.label_map.items(): if output_idx != self.ignore_index: pred2label[output_idx] = original_label outputs = [] for i, result in enumerate(results): info = self.data_infos[i] sample_idx = info['point_cloud']['lidar_idx'] pred_sem_mask = result['semantic_mask'].numpy().astype(np.int) pred_label = pred2label[pred_sem_mask] curr_file = f'{txtfile_prefix}/{sample_idx}.txt' np.savetxt(curr_file, pred_label, fmt='%d') outputs.append(dict(seg_mask=pred_label)) return outputs, tmp_dir @DATASETS.register_module() @SEG_DATASETS.register_module() class ScanNetInstanceSegDataset(Custom3DSegDataset): CLASSES = ('cabinet', 'bed', 'chair', 'sofa', 'table', 'door', 'window', 'bookshelf', 'picture', 'counter', 'desk', 'curtain', 'refrigerator', 'showercurtrain', 'toilet', 'sink', 'bathtub', 'garbagebin') VALID_CLASS_IDS = (3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 16, 24, 28, 33, 34, 36, 39) ALL_CLASS_IDS = tuple(range(41)) def get_ann_info(self, index): """Get annotation info according to the given index. Args: index (int): Index of the annotation data to get. Returns: dict: annotation information consists of the following keys: - pts_semantic_mask_path (str): Path of semantic masks. - pts_instance_mask_path (str): Path of instance masks. """ # Use index to get the annos, thus the evalhook could also use this api info = self.data_infos[index] pts_instance_mask_path = osp.join(self.data_root, info['pts_instance_mask_path']) pts_semantic_mask_path = osp.join(self.data_root, info['pts_semantic_mask_path']) anns_results = dict( pts_instance_mask_path=pts_instance_mask_path, pts_semantic_mask_path=pts_semantic_mask_path) return anns_results def get_classes_and_palette(self, classes=None, palette=None): """Get class names of current dataset. Palette is simply ignored for instance segmentation. 
Args: classes (Sequence[str] | str | None): If classes is None, use default CLASSES defined by builtin dataset. If classes is a string, take it as a file name. The file contains the name of classes where each line contains one class name. If classes is a tuple or list, override the CLASSES defined by the dataset. Defaults to None. palette (Sequence[Sequence[int]]] | np.ndarray | None): The palette of segmentation map. If None is given, random palette will be generated. Defaults to None. """ if classes is not None: return classes, None return self.CLASSES, None def _build_default_pipeline(self): """Build the default pipeline for this dataset.""" pipeline = [ dict( type='LoadPointsFromFile', coord_type='DEPTH', shift_height=False, use_color=True, load_dim=6, use_dim=[0, 1, 2, 3, 4, 5]), dict( type='LoadAnnotations3D', with_bbox_3d=False, with_label_3d=False, with_mask_3d=True, with_seg_3d=True), dict( type='PointSegClassMapping', valid_cat_ids=self.VALID_CLASS_IDS, max_cat_id=40), dict( type='DefaultFormatBundle3D', with_label=False, class_names=self.CLASSES), dict( type='Collect3D', keys=['points', 'pts_semantic_mask', 'pts_instance_mask']) ] return Compose(pipeline) def evaluate(self, results, metric=None, options=None, logger=None, show=False, out_dir=None, pipeline=None): """Evaluation in instance segmentation protocol. Args: results (list[dict]): List of results. metric (str | list[str]): Metrics to be evaluated. options (dict, optional): options for instance_seg_eval. logger (logging.Logger | None | str): Logger used for printing related information during evaluation. Defaults to None. show (bool, optional): Whether to visualize. Defaults to False. out_dir (str, optional): Path to save the visualization results. Defaults to None. pipeline (list[dict], optional): raw data loading for showing. Default: None. Returns: dict: Evaluation results. """ assert isinstance( results, list), f'Expect results to be list, got {type(results)}.' assert len(results) > 0, 'Expect length of results > 0.' assert len(results) == len(self.data_infos) assert isinstance( results[0], dict ), f'Expect elements in results to be dict, got {type(results[0])}.' load_pipeline = self._get_pipeline(pipeline) pred_instance_masks = [result['instance_mask'] for result in results] pred_instance_labels = [result['instance_label'] for result in results] pred_instance_scores = [result['instance_score'] for result in results] gt_semantic_masks, gt_instance_masks = zip(*[ self._extract_data( index=i, pipeline=load_pipeline, key=['pts_semantic_mask', 'pts_instance_mask'], load_annos=True) for i in range(len(self.data_infos)) ]) ret_dict = instance_seg_eval( gt_semantic_masks, gt_instance_masks, pred_instance_masks, pred_instance_labels, pred_instance_scores, valid_class_ids=self.VALID_CLASS_IDS, class_labels=self.CLASSES, options=options, logger=logger) if show: raise NotImplementedError('show is not implemented for now') return ret_dict ================================================ FILE: mmdet3d/datasets/semantickitti_dataset.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. from os import path as osp from .builder import DATASETS from .custom_3d import Custom3DDataset @DATASETS.register_module() class SemanticKITTIDataset(Custom3DDataset): r"""SemanticKITTI Dataset. This class serves as the API for experiments on the SemanticKITTI Dataset Please refer to `_ for data downloading Args: data_root (str): Path of dataset root. ann_file (str): Path of annotation file. 
pipeline (list[dict], optional): Pipeline used for data processing. Defaults to None. classes (tuple[str], optional): Classes used in the dataset. Defaults to None. modality (dict, optional): Modality to specify the sensor data used as input. Defaults to None. box_type_3d (str, optional): NO 3D box for this dataset. You can choose any type Based on the `box_type_3d`, the dataset will encapsulate the box to its original format then converted them to `box_type_3d`. Defaults to 'LiDAR' in this dataset. Available options includes - 'LiDAR': Box in LiDAR coordinates. - 'Depth': Box in depth coordinates, usually for indoor dataset. - 'Camera': Box in camera coordinates. filter_empty_gt (bool, optional): Whether to filter empty GT. Defaults to True. test_mode (bool, optional): Whether the dataset is in test mode. Defaults to False. """ CLASSES = ('unlabeled', 'car', 'bicycle', 'motorcycle', 'truck', 'bus', 'person', 'bicyclist', 'motorcyclist', 'road', 'parking', 'sidewalk', 'other-ground', 'building', 'fence', 'vegetation', 'trunck', 'terrian', 'pole', 'traffic-sign') def __init__(self, data_root, ann_file, pipeline=None, classes=None, modality=None, box_type_3d='Lidar', filter_empty_gt=False, test_mode=False): super().__init__( data_root=data_root, ann_file=ann_file, pipeline=pipeline, classes=classes, modality=modality, box_type_3d=box_type_3d, filter_empty_gt=filter_empty_gt, test_mode=test_mode) def get_data_info(self, index): """Get data info according to the given index. Args: index (int): Index of the sample data to get. Returns: dict: Data information that will be passed to the data preprocessing pipelines. It includes the following keys: - sample_idx (str): Sample index. - pts_filename (str): Filename of point clouds. - file_name (str): Filename of point clouds. - ann_info (dict): Annotation info. """ info = self.data_infos[index] sample_idx = info['point_cloud']['lidar_idx'] pts_filename = osp.join(self.data_root, info['pts_path']) input_dict = dict( pts_filename=pts_filename, sample_idx=sample_idx, file_name=pts_filename) if not self.test_mode: annos = self.get_ann_info(index) input_dict['ann_info'] = annos if self.filter_empty_gt and ~(annos['gt_labels_3d'] != -1).any(): return None return input_dict def get_ann_info(self, index): """Get annotation info according to the given index. Args: index (int): Index of the annotation data to get. Returns: dict: annotation information consists of the following keys: - pts_semantic_mask_path (str): Path of semantic masks. """ # Use index to get the annos, thus the evalhook could also use this api info = self.data_infos[index] pts_semantic_mask_path = osp.join(self.data_root, info['pts_semantic_mask_path']) anns_results = dict(pts_semantic_mask_path=pts_semantic_mask_path) return anns_results ================================================ FILE: mmdet3d/datasets/sunrgbd_dataset.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. from collections import OrderedDict from os import path as osp import numpy as np from mmdet3d.core import show_multi_modality_result, show_result from mmdet3d.core.bbox import DepthInstance3DBoxes from mmdet.core import eval_map from .builder import DATASETS from .custom_3d import Custom3DDataset from .pipelines import Compose @DATASETS.register_module() class SUNRGBDDataset(Custom3DDataset): r"""SUNRGBD Dataset. This class serves as the API for experiments on the SUNRGBD Dataset. See the `download page `_ for data downloading. Args: data_root (str): Path of dataset root. 
ann_file (str): Path of annotation file. pipeline (list[dict], optional): Pipeline used for data processing. Defaults to None. classes (tuple[str], optional): Classes used in the dataset. Defaults to None. modality (dict, optional): Modality to specify the sensor data used as input. Defaults to None. box_type_3d (str, optional): Type of 3D box of this dataset. Based on the `box_type_3d`, the dataset will encapsulate the box to its original format then converted them to `box_type_3d`. Defaults to 'Depth' in this dataset. Available options includes - 'LiDAR': Box in LiDAR coordinates. - 'Depth': Box in depth coordinates, usually for indoor dataset. - 'Camera': Box in camera coordinates. filter_empty_gt (bool, optional): Whether to filter empty GT. Defaults to True. test_mode (bool, optional): Whether the dataset is in test mode. Defaults to False. """ CLASSES = ('bed', 'table', 'sofa', 'chair', 'toilet', 'desk', 'dresser', 'night_stand', 'bookshelf', 'bathtub') def __init__(self, data_root, ann_file, pipeline=None, classes=None, modality=dict(use_camera=True, use_lidar=True), box_type_3d='Depth', filter_empty_gt=True, test_mode=False, **kwargs): super().__init__( data_root=data_root, ann_file=ann_file, pipeline=pipeline, classes=classes, modality=modality, box_type_3d=box_type_3d, filter_empty_gt=filter_empty_gt, test_mode=test_mode, **kwargs) assert 'use_camera' in self.modality and \ 'use_lidar' in self.modality assert self.modality['use_camera'] or self.modality['use_lidar'] def get_data_info(self, index): """Get data info according to the given index. Args: index (int): Index of the sample data to get. Returns: dict: Data information that will be passed to the data preprocessing pipelines. It includes the following keys: - sample_idx (str): Sample index. - pts_filename (str, optional): Filename of point clouds. - file_name (str, optional): Filename of point clouds. - img_prefix (str, optional): Prefix of image files. - img_info (dict, optional): Image info. - calib (dict, optional): Camera calibration info. - ann_info (dict): Annotation info. """ info = self.data_infos[index] sample_idx = info['point_cloud']['lidar_idx'] assert info['point_cloud']['lidar_idx'] == info['image']['image_idx'] input_dict = dict(sample_idx=sample_idx) if self.modality['use_lidar']: pts_filename = osp.join(self.data_root, info['pts_path']) input_dict['pts_filename'] = pts_filename input_dict['file_name'] = pts_filename if self.modality['use_camera']: img_filename = osp.join( osp.join(self.data_root, 'sunrgbd_trainval'), info['image']['image_path']) input_dict['img_prefix'] = None input_dict['img_info'] = dict(filename=img_filename) calib = info['calib'] rt_mat = calib['Rt'] # follow Coord3DMode.convert_point rt_mat = np.array([[1, 0, 0], [0, 0, -1], [0, 1, 0] ]) @ rt_mat.transpose(1, 0) depth2img = calib['K'] @ rt_mat input_dict['depth2img'] = depth2img if not self.test_mode: annos = self.get_ann_info(index) input_dict['ann_info'] = annos if self.filter_empty_gt and len(annos['gt_bboxes_3d']) == 0: return None return input_dict def get_ann_info(self, index): """Get annotation info according to the given index. Args: index (int): Index of the annotation data to get. Returns: dict: annotation information consists of the following keys: - gt_bboxes_3d (:obj:`DepthInstance3DBoxes`): 3D ground truth bboxes - gt_labels_3d (np.ndarray): Labels of ground truths. - pts_instance_mask_path (str): Path of instance masks. - pts_semantic_mask_path (str): Path of semantic masks. 
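        Example (illustrative addition, not part of the original docstring;
        with the default 'Depth' box type the returned boxes are
        :obj:`DepthInstance3DBoxes`):

            >>> ann = dataset.get_ann_info(0)
            >>> ann['gt_bboxes_3d']   # DepthInstance3DBoxes
            >>> ann['gt_labels_3d']   # int64 class indices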
""" # Use index to get the annos, thus the evalhook could also use this api info = self.data_infos[index] if info['annos']['gt_num'] != 0: gt_bboxes_3d = info['annos']['gt_boxes_upright_depth'].astype( np.float32) # k, 6 gt_labels_3d = info['annos']['class'].astype(np.int64) else: gt_bboxes_3d = np.zeros((0, 7), dtype=np.float32) gt_labels_3d = np.zeros((0, ), dtype=np.int64) # to target box structure gt_bboxes_3d = DepthInstance3DBoxes( gt_bboxes_3d, origin=(0.5, 0.5, 0.5)).convert_to(self.box_mode_3d) anns_results = dict( gt_bboxes_3d=gt_bboxes_3d, gt_labels_3d=gt_labels_3d) if self.modality['use_camera']: if info['annos']['gt_num'] != 0: gt_bboxes_2d = info['annos']['bbox'].astype(np.float32) else: gt_bboxes_2d = np.zeros((0, 4), dtype=np.float32) anns_results['bboxes'] = gt_bboxes_2d anns_results['labels'] = gt_labels_3d return anns_results def _build_default_pipeline(self): """Build the default pipeline for this dataset.""" pipeline = [ dict( type='LoadPointsFromFile', coord_type='DEPTH', shift_height=False, load_dim=6, use_dim=[0, 1, 2]), dict( type='DefaultFormatBundle3D', class_names=self.CLASSES, with_label=False), dict(type='Collect3D', keys=['points']) ] if self.modality['use_camera']: pipeline.insert(0, dict(type='LoadImageFromFile')) return Compose(pipeline) def show(self, results, out_dir, show=True, pipeline=None): """Results visualization. Args: results (list[dict]): List of bounding boxes results. out_dir (str): Output directory of visualization result. show (bool): Visualize the results online. pipeline (list[dict], optional): raw data loading for showing. Default: None. """ assert out_dir is not None, 'Expect out_dir, got none.' pipeline = self._get_pipeline(pipeline) for i, result in enumerate(results): data_info = self.data_infos[i] pts_path = data_info['pts_path'] file_name = osp.split(pts_path)[-1].split('.')[0] points, img_metas, img = self._extract_data( i, pipeline, ['points', 'img_metas', 'img']) # scale colors to [0, 255] points = points.numpy() points[:, 3:] *= 255 gt_bboxes = self.get_ann_info(i)['gt_bboxes_3d'].tensor.numpy() pred_bboxes = result['boxes_3d'].tensor.numpy() show_result(points, gt_bboxes.copy(), pred_bboxes.copy(), out_dir, file_name, show) # multi-modality visualization if self.modality['use_camera']: img = img.numpy() # need to transpose channel to first dim img = img.transpose(1, 2, 0) pred_bboxes = DepthInstance3DBoxes( pred_bboxes, origin=(0.5, 0.5, 0)) gt_bboxes = DepthInstance3DBoxes( gt_bboxes, origin=(0.5, 0.5, 0)) show_multi_modality_result( img, gt_bboxes, pred_bboxes, None, out_dir, file_name, box_mode='depth', img_metas=img_metas, show=show) def evaluate(self, results, metric=None, iou_thr=(0.25, 0.5), iou_thr_2d=(0.5, ), logger=None, show=False, out_dir=None, pipeline=None): """Evaluate. Evaluation in indoor protocol. Args: results (list[dict]): List of results. metric (str | list[str], optional): Metrics to be evaluated. Default: None. iou_thr (list[float], optional): AP IoU thresholds for 3D evaluation. Default: (0.25, 0.5). iou_thr_2d (list[float], optional): AP IoU thresholds for 2D evaluation. Default: (0.5, ). show (bool, optional): Whether to visualize. Default: False. out_dir (str, optional): Path to save the visualization results. Default: None. pipeline (list[dict], optional): raw data loading for showing. Default: None. Returns: dict: Evaluation results. 
""" # evaluate 3D detection performance if isinstance(results[0], dict): return super().evaluate(results, metric, iou_thr, logger, show, out_dir, pipeline) # evaluate 2D detection performance else: eval_results = OrderedDict() annotations = [self.get_ann_info(i) for i in range(len(self))] iou_thr_2d = (iou_thr_2d) if isinstance(iou_thr_2d, float) else iou_thr_2d for iou_thr_2d_single in iou_thr_2d: mean_ap, _ = eval_map( results, annotations, scale_ranges=None, iou_thr=iou_thr_2d_single, dataset=self.CLASSES, logger=logger) eval_results['mAP_' + str(iou_thr_2d_single)] = mean_ap return eval_results ================================================ FILE: mmdet3d/datasets/utils.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import mmcv # yapf: disable from mmdet3d.datasets.pipelines import (Collect3D, DefaultFormatBundle3D, LoadAnnotations3D, LoadImageFromFileMono3D, LoadMultiViewImageFromFiles, LoadPointsFromFile, LoadPointsFromMultiSweeps, MultiScaleFlipAug3D, PointSegClassMapping) from mmdet.datasets.pipelines import LoadImageFromFile, MultiScaleFlipAug # yapf: enable from .builder import PIPELINES def is_loading_function(transform): """Judge whether a transform function is a loading function. Note: `MultiScaleFlipAug3D` is a wrapper for multiple pipeline functions, so we need to search if its inner transforms contain any loading function. Args: transform (dict | :obj:`Pipeline`): A transform config or a function. Returns: bool: Whether it is a loading function. None means can't judge. When transform is `MultiScaleFlipAug3D`, we return None. """ # TODO: use more elegant way to distinguish loading modules loading_functions = (LoadImageFromFile, LoadPointsFromFile, LoadAnnotations3D, LoadMultiViewImageFromFiles, LoadPointsFromMultiSweeps, DefaultFormatBundle3D, Collect3D, LoadImageFromFileMono3D, PointSegClassMapping) if isinstance(transform, dict): obj_cls = PIPELINES.get(transform['type']) if obj_cls is None: return False if obj_cls in loading_functions: return True if obj_cls in (MultiScaleFlipAug3D, MultiScaleFlipAug): return None elif callable(transform): if isinstance(transform, loading_functions): return True if isinstance(transform, (MultiScaleFlipAug3D, MultiScaleFlipAug)): return None return False def get_loading_pipeline(pipeline): """Only keep loading image, points and annotations related configuration. Args: pipeline (list[dict] | list[:obj:`Pipeline`]): Data pipeline configs or list of pipeline functions. Returns: list[dict] | list[:obj:`Pipeline`]): The new pipeline list with only keep loading image, points and annotations related configuration. Examples: >>> pipelines = [ ... dict(type='LoadPointsFromFile', ... coord_type='LIDAR', load_dim=4, use_dim=4), ... dict(type='LoadImageFromFile'), ... dict(type='LoadAnnotations3D', ... with_bbox=True, with_label_3d=True), ... dict(type='Resize', ... img_scale=[(640, 192), (2560, 768)], keep_ratio=True), ... dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5), ... dict(type='PointsRangeFilter', ... point_cloud_range=point_cloud_range), ... dict(type='ObjectRangeFilter', ... point_cloud_range=point_cloud_range), ... dict(type='PointShuffle'), ... dict(type='Normalize', **img_norm_cfg), ... dict(type='Pad', size_divisor=32), ... dict(type='DefaultFormatBundle3D', class_names=class_names), ... dict(type='Collect3D', ... keys=['points', 'img', 'gt_bboxes_3d', 'gt_labels_3d']) ... ] >>> expected_pipelines = [ ... dict(type='LoadPointsFromFile', ... 
coord_type='LIDAR', load_dim=4, use_dim=4), ... dict(type='LoadImageFromFile'), ... dict(type='LoadAnnotations3D', ... with_bbox=True, with_label_3d=True), ... dict(type='DefaultFormatBundle3D', class_names=class_names), ... dict(type='Collect3D', ... keys=['points', 'img', 'gt_bboxes_3d', 'gt_labels_3d']) ... ] >>> assert expected_pipelines == \ ... get_loading_pipeline(pipelines) """ loading_pipeline = [] for transform in pipeline: is_loading = is_loading_function(transform) if is_loading is None: # MultiScaleFlipAug3D # extract its inner pipeline if isinstance(transform, dict): inner_pipeline = transform.get('transforms', []) else: inner_pipeline = transform.transforms.transforms loading_pipeline.extend(get_loading_pipeline(inner_pipeline)) elif is_loading: loading_pipeline.append(transform) assert len(loading_pipeline) > 0, \ 'The data pipeline in your config file must include ' \ 'loading step.' return loading_pipeline def extract_result_dict(results, key): """Extract and return the data corresponding to key in result dict. ``results`` is a dict output from `pipeline(input_dict)`, which is the loaded data from ``Dataset`` class. The data terms inside may be wrapped in list, tuple and DataContainer, so this function essentially extracts data from these wrappers. Args: results (dict): Data loaded using pipeline. key (str): Key of the desired data. Returns: np.ndarray | torch.Tensor: Data term. """ if key not in results.keys(): return None # results[key] may be data or list[data] or tuple[data] # data may be wrapped inside DataContainer data = results[key] if isinstance(data, (list, tuple)): data = data[0] if isinstance(data, mmcv.parallel.DataContainer): data = data._data return data import numpy as np from pyquaternion import Quaternion def nuscenes_get_rt_matrix( src_sample, dest_sample, src_mod, dest_mod): """ CAM_FRONT_XYD indicates going from 2d image coords + depth Note that image coords need to multiplied with said depths first to bring it into 2d hom coords. CAM_FRONT indicates going from camera coordinates xyz Method is: whatever the input is, transform to global first. 
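    Example (illustrative addition; ``info_src`` and ``info_dst`` stand for
    two nuScenes sample info dicts carrying the lidar2ego / ego2global
    records accessed below):

        >>> T = nuscenes_get_rt_matrix(info_src, info_dst, 'lidar', 'ego')
        >>> # T is 4x4: homogeneous points in the source sample's lidar
        >>> # frame map into the destination sample's ego frame, i.e.
        >>> # p_dst_ego = T @ [x, y, z, 1]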
""" possible_mods = ['CAM_FRONT_XYD', 'CAM_FRONT_RIGHT_XYD', 'CAM_FRONT_LEFT_XYD', 'CAM_BACK_XYD', 'CAM_BACK_LEFT_XYD', 'CAM_BACK_RIGHT_XYD', 'CAM_FRONT', 'CAM_FRONT_RIGHT', 'CAM_FRONT_LEFT', 'CAM_BACK', 'CAM_BACK_LEFT', 'CAM_BACK_RIGHT', 'lidar', 'ego', 'global'] assert src_mod in possible_mods and dest_mod in possible_mods src_lidar_to_ego = np.eye(4, 4) src_lidar_to_ego[:3, :3] = Quaternion(src_sample['lidar2ego_rotation']).rotation_matrix src_lidar_to_ego[:3, 3] = np.array(src_sample['lidar2ego_translation']) src_ego_to_global = np.eye(4, 4) src_ego_to_global[:3, :3] = Quaternion(src_sample['ego2global_rotation']).rotation_matrix src_ego_to_global[:3, 3] = np.array(src_sample['ego2global_translation']) dest_lidar_to_ego = np.eye(4, 4) dest_lidar_to_ego[:3, :3] = Quaternion(dest_sample['lidar2ego_rotation']).rotation_matrix dest_lidar_to_ego[:3, 3] = np.array(dest_sample['lidar2ego_translation']) dest_ego_to_global = np.eye(4, 4) dest_ego_to_global[:3, :3] = Quaternion(dest_sample['ego2global_rotation']).rotation_matrix dest_ego_to_global[:3, 3] = np.array(dest_sample['ego2global_translation']) src_mod_to_global = None dest_global_to_mod = None if src_mod == "global": src_mod_to_global = np.eye(4, 4) elif src_mod == "ego": src_mod_to_global = src_ego_to_global elif src_mod == "lidar": src_mod_to_global = src_ego_to_global @ src_lidar_to_ego elif "CAM" in src_mod: src_sample_cam = src_sample['cams'][src_mod.replace("_XYD", "")] src_cam_to_lidar = np.eye(4, 4) src_cam_to_lidar[:3, :3] = src_sample_cam['sensor2lidar_rotation'] src_cam_to_lidar[:3, 3] = src_sample_cam['sensor2lidar_translation'] src_cam_intrinsics = np.eye(4, 4) src_cam_intrinsics[:3, :3] = src_sample_cam['cam_intrinsic'] if "XYD" not in src_mod: src_mod_to_global = (src_ego_to_global @ src_lidar_to_ego @ src_cam_to_lidar) else: src_mod_to_global = (src_ego_to_global @ src_lidar_to_ego @ src_cam_to_lidar @ np.linalg.inv(src_cam_intrinsics)) if dest_mod == "global": dest_global_to_mod = np.eye(4, 4) elif dest_mod == "ego": dest_global_to_mod = np.linalg.inv(dest_ego_to_global) elif dest_mod == "lidar": dest_global_to_mod = np.linalg.inv(dest_ego_to_global @ dest_lidar_to_ego) elif "CAM" in dest_mod: dest_sample_cam = dest_sample['cams'][dest_mod.replace("_XYD", "")] dest_cam_to_lidar = np.eye(4, 4) dest_cam_to_lidar[:3, :3] = dest_sample_cam['sensor2lidar_rotation'] dest_cam_to_lidar[:3, 3] = dest_sample_cam['sensor2lidar_translation'] dest_cam_intrinsics = np.eye(4, 4) dest_cam_intrinsics[:3, :3] = dest_sample_cam['cam_intrinsic'] if "XYD" not in dest_mod: dest_global_to_mod = np.linalg.inv(dest_ego_to_global @ dest_lidar_to_ego @ dest_cam_to_lidar) else: dest_global_to_mod = np.linalg.inv(dest_ego_to_global @ dest_lidar_to_ego @ dest_cam_to_lidar @ np.linalg.inv(dest_cam_intrinsics)) return dest_global_to_mod @ src_mod_to_global ================================================ FILE: mmdet3d/datasets/vector_map.py ================================================ import os import json import copy import tempfile from typing import Dict, List import numpy as np import pyquaternion import mmcv from os import path as osp from mmdet.datasets import DATASETS import torch import numpy as np from nuscenes.eval.common.utils import quaternion_yaw, Quaternion # from .vad_custom_nuscenes_eval import NuScenesEval_custom from nuscenes.eval.common.utils import center_distance # from projects.mmdet3d_plugin.models.utils.visual import save_tensor from mmcv.parallel import DataContainer as DC import random from nuscenes.utils.data_classes 
import Box as NuScenesBox # from projects.mmdet3d_plugin.core.bbox.structures.nuscenes_box import CustomNuscenesBox from shapely import affinity, ops from shapely.geometry import LineString, box, MultiPolygon, MultiLineString from mmdet.datasets.pipelines import to_tensor from nuscenes.map_expansion.map_api import NuScenesMap, NuScenesMapExplorer from nuscenes.eval.detection.constants import DETECTION_NAMES class LiDARInstanceLines(object): """Line instance in LIDAR coordinates """ def __init__(self, instance_line_list, sample_dist=1, num_samples=250, padding=False, fixed_num=-1, padding_value=-10000, patch_size=None): assert isinstance(instance_line_list, list) assert patch_size is not None if len(instance_line_list) != 0: assert isinstance(instance_line_list[0], LineString) self.patch_size = patch_size self.max_x = self.patch_size[1] / 2 self.max_y = self.patch_size[0] / 2 self.sample_dist = sample_dist self.num_samples = num_samples self.padding = padding self.fixed_num = fixed_num self.padding_value = padding_value self.instance_list = instance_line_list @property def start_end_points(self): """ return torch.Tensor([N,4]), in xstart, ystart, xend, yend form """ assert len(self.instance_list) != 0 instance_se_points_list = [] for instance in self.instance_list: se_points = [] se_points.extend(instance.coords[0]) se_points.extend(instance.coords[-1]) instance_se_points_list.append(se_points) instance_se_points_array = np.array(instance_se_points_list) instance_se_points_tensor = to_tensor(instance_se_points_array) instance_se_points_tensor = instance_se_points_tensor.to( dtype=torch.float32) instance_se_points_tensor[:,0] = torch.clamp(instance_se_points_tensor[:,0], min=-self.max_x,max=self.max_x) instance_se_points_tensor[:,1] = torch.clamp(instance_se_points_tensor[:,1], min=-self.max_y,max=self.max_y) instance_se_points_tensor[:,2] = torch.clamp(instance_se_points_tensor[:,2], min=-self.max_x,max=self.max_x) instance_se_points_tensor[:,3] = torch.clamp(instance_se_points_tensor[:,3], min=-self.max_y,max=self.max_y) return instance_se_points_tensor @property def bbox(self): """ return torch.Tensor([N,4]), in xmin, ymin, xmax, ymax form """ assert len(self.instance_list) != 0 instance_bbox_list = [] for instance in self.instance_list: # bounds is bbox: [xmin, ymin, xmax, ymax] instance_bbox_list.append(instance.bounds) instance_bbox_array = np.array(instance_bbox_list) instance_bbox_tensor = to_tensor(instance_bbox_array) instance_bbox_tensor = instance_bbox_tensor.to( dtype=torch.float32) instance_bbox_tensor[:,0] = torch.clamp(instance_bbox_tensor[:,0], min=-self.max_x,max=self.max_x) instance_bbox_tensor[:,1] = torch.clamp(instance_bbox_tensor[:,1], min=-self.max_y,max=self.max_y) instance_bbox_tensor[:,2] = torch.clamp(instance_bbox_tensor[:,2], min=-self.max_x,max=self.max_x) instance_bbox_tensor[:,3] = torch.clamp(instance_bbox_tensor[:,3], min=-self.max_y,max=self.max_y) return instance_bbox_tensor @property def fixed_num_sampled_points(self): """ return torch.Tensor([N,fixed_num,2]), in xmin, ymin, xmax, ymax form N means the num of instances """ assert len(self.instance_list) != 0 instance_points_list = [] for instance in self.instance_list: distances = np.linspace(0, instance.length, self.fixed_num) sampled_points = np.array([list(instance.interpolate(distance).coords) for distance in distances]).reshape(-1, 2) instance_points_list.append(sampled_points) instance_points_array = np.array(instance_points_list) instance_points_tensor = to_tensor(instance_points_array) 
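# --- Illustrative sketch (added for clarity; not part of the original file).
# The surrounding ``fixed_num_sampled_points`` property resamples each
# shapely LineString to ``fixed_num`` evenly spaced points and then clamps
# them to the patch extent.  A minimal standalone version of the resampling
# step, assuming only numpy and shapely:
import numpy as np
from shapely.geometry import LineString

line = LineString([(0.0, 0.0), (10.0, 0.0)])
fixed_num = 5
distances = np.linspace(0, line.length, fixed_num)
pts = np.array([list(line.interpolate(d).coords)
                for d in distances]).reshape(-1, 2)
# pts -> [[0., 0.], [2.5, 0.], [5., 0.], [7.5, 0.], [10., 0.]]
# ---------------------------------------------------------------------------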
instance_points_tensor = instance_points_tensor.to( dtype=torch.float32) instance_points_tensor[:,:,0] = torch.clamp(instance_points_tensor[:,:,0], min=-self.max_x,max=self.max_x) instance_points_tensor[:,:,1] = torch.clamp(instance_points_tensor[:,:,1], min=-self.max_y,max=self.max_y) return instance_points_tensor @property def fixed_num_sampled_points_ambiguity(self): """ return torch.Tensor([N,fixed_num,2]), in xmin, ymin, xmax, ymax form N means the num of instances """ assert len(self.instance_list) != 0 instance_points_list = [] for instance in self.instance_list: distances = np.linspace(0, instance.length, self.fixed_num) sampled_points = np.array([list(instance.interpolate(distance).coords) for distance in distances]).reshape(-1, 2) instance_points_list.append(sampled_points) instance_points_array = np.array(instance_points_list) instance_points_tensor = to_tensor(instance_points_array) instance_points_tensor = instance_points_tensor.to( dtype=torch.float32) instance_points_tensor[:,:,0] = torch.clamp(instance_points_tensor[:,:,0], min=-self.max_x,max=self.max_x) instance_points_tensor[:,:,1] = torch.clamp(instance_points_tensor[:,:,1], min=-self.max_y,max=self.max_y) instance_points_tensor = instance_points_tensor.unsqueeze(1) return instance_points_tensor @property def fixed_num_sampled_points_torch(self): """ return torch.Tensor([N,fixed_num,2]), in xmin, ymin, xmax, ymax form N means the num of instances """ assert len(self.instance_list) != 0 instance_points_list = [] for instance in self.instance_list: # distances = np.linspace(0, instance.length, self.fixed_num) # sampled_points = np.array([list(instance.interpolate(distance).coords) for distance in distances]).reshape(-1, 2) poly_pts = to_tensor(np.array(list(instance.coords))) poly_pts = poly_pts.unsqueeze(0).permute(0,2,1) sampled_pts = torch.nn.functional.interpolate(poly_pts,size=(self.fixed_num),mode='linear',align_corners=True) sampled_pts = sampled_pts.permute(0,2,1).squeeze(0) instance_points_list.append(sampled_pts) # instance_points_array = np.array(instance_points_list) # instance_points_tensor = to_tensor(instance_points_array) instance_points_tensor = torch.stack(instance_points_list,dim=0) instance_points_tensor = instance_points_tensor.to( dtype=torch.float32) instance_points_tensor[:,:,0] = torch.clamp(instance_points_tensor[:,:,0], min=-self.max_x,max=self.max_x) instance_points_tensor[:,:,1] = torch.clamp(instance_points_tensor[:,:,1], min=-self.max_y,max=self.max_y) return instance_points_tensor @property def shift_fixed_num_sampled_points(self): """ return [instances_num, num_shifts, fixed_num, 2] """ fixed_num_sampled_points = self.fixed_num_sampled_points instances_list = [] is_poly = False # is_line = False # import pdb;pdb.set_trace() for fixed_num_pts in fixed_num_sampled_points: # [fixed_num, 2] is_poly = fixed_num_pts[0].equal(fixed_num_pts[-1]) fixed_num = fixed_num_pts.shape[0] shift_pts_list = [] if is_poly: # import pdb;pdb.set_trace() for shift_right_i in range(fixed_num): shift_pts_list.append(fixed_num_pts.roll(shift_right_i,0)) else: shift_pts_list.append(fixed_num_pts) shift_pts_list.append(fixed_num_pts.flip(0)) shift_pts = torch.stack(shift_pts_list,dim=0) shift_pts[:,:,0] = torch.clamp(shift_pts[:,:,0], min=-self.max_x,max=self.max_x) shift_pts[:,:,1] = torch.clamp(shift_pts[:,:,1], min=-self.max_y,max=self.max_y) if not is_poly: padding = torch.full([fixed_num-shift_pts.shape[0],fixed_num,2], self.padding_value) shift_pts = torch.cat([shift_pts,padding],dim=0) # padding = 
np.zeros((self.num_samples - len(sampled_points), 2)) # sampled_points = np.concatenate([sampled_points, padding], axis=0) instances_list.append(shift_pts) instances_tensor = torch.stack(instances_list, dim=0) instances_tensor = instances_tensor.to( dtype=torch.float32) return instances_tensor @property def shift_fixed_num_sampled_points_v1(self): """ return [instances_num, num_shifts, fixed_num, 2] """ fixed_num_sampled_points = self.fixed_num_sampled_points instances_list = [] is_poly = False # is_line = False # import pdb;pdb.set_trace() for fixed_num_pts in fixed_num_sampled_points: # [fixed_num, 2] is_poly = fixed_num_pts[0].equal(fixed_num_pts[-1]) pts_num = fixed_num_pts.shape[0] shift_num = pts_num - 1 if is_poly: pts_to_shift = fixed_num_pts[:-1,:] shift_pts_list = [] if is_poly: for shift_right_i in range(shift_num): shift_pts_list.append(pts_to_shift.roll(shift_right_i,0)) else: shift_pts_list.append(fixed_num_pts) shift_pts_list.append(fixed_num_pts.flip(0)) shift_pts = torch.stack(shift_pts_list,dim=0) if is_poly: _, _, num_coords = shift_pts.shape tmp_shift_pts = shift_pts.new_zeros((shift_num, pts_num, num_coords)) tmp_shift_pts[:,:-1,:] = shift_pts tmp_shift_pts[:,-1,:] = shift_pts[:,0,:] shift_pts = tmp_shift_pts shift_pts[:,:,0] = torch.clamp(shift_pts[:,:,0], min=-self.max_x,max=self.max_x) shift_pts[:,:,1] = torch.clamp(shift_pts[:,:,1], min=-self.max_y,max=self.max_y) if not is_poly: padding = torch.full([shift_num-shift_pts.shape[0],pts_num,2], self.padding_value) shift_pts = torch.cat([shift_pts,padding],dim=0) # padding = np.zeros((self.num_samples - len(sampled_points), 2)) # sampled_points = np.concatenate([sampled_points, padding], axis=0) instances_list.append(shift_pts) instances_tensor = torch.stack(instances_list, dim=0) instances_tensor = instances_tensor.to( dtype=torch.float32) return instances_tensor @property def shift_fixed_num_sampled_points_v2(self): """ return [instances_num, num_shifts, fixed_num, 2] """ assert len(self.instance_list) != 0 instances_list = [] for instance in self.instance_list: distances = np.linspace(0, instance.length, self.fixed_num) poly_pts = np.array(list(instance.coords)) start_pts = poly_pts[0] end_pts = poly_pts[-1] is_poly = np.equal(start_pts, end_pts) is_poly = is_poly.all() shift_pts_list = [] pts_num, coords_num = poly_pts.shape shift_num = pts_num - 1 final_shift_num = self.fixed_num - 1 if is_poly: pts_to_shift = poly_pts[:-1,:] for shift_right_i in range(shift_num): shift_pts = np.roll(pts_to_shift,shift_right_i,axis=0) pts_to_concat = shift_pts[0] pts_to_concat = np.expand_dims(pts_to_concat,axis=0) shift_pts = np.concatenate((shift_pts,pts_to_concat),axis=0) shift_instance = LineString(shift_pts) shift_sampled_points = np.array([list(shift_instance.interpolate(distance).coords) for distance in distances]).reshape(-1, 2) shift_pts_list.append(shift_sampled_points) # import pdb;pdb.set_trace() else: sampled_points = np.array([list(instance.interpolate(distance).coords) for distance in distances]).reshape(-1, 2) flip_sampled_points = np.flip(sampled_points, axis=0) shift_pts_list.append(sampled_points) shift_pts_list.append(flip_sampled_points) multi_shifts_pts = np.stack(shift_pts_list,axis=0) shifts_num,_,_ = multi_shifts_pts.shape if shifts_num > final_shift_num: index = np.random.choice(multi_shifts_pts.shape[0], final_shift_num, replace=False) multi_shifts_pts = multi_shifts_pts[index] multi_shifts_pts_tensor = to_tensor(multi_shifts_pts) multi_shifts_pts_tensor = multi_shifts_pts_tensor.to( dtype=torch.float32) 
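# --- Illustrative sketch (added for clarity; not part of the original file).
# For a closed polyline the start vertex is ambiguous, so the loop above
# enumerates every cyclic shift of the de-duplicated vertices and re-closes
# each shifted copy before resampling it.  A numpy-only sketch of that idea:
import numpy as np

square = np.array([[0., 0.], [1., 0.], [1., 1.], [0., 1.], [0., 0.]])
ring = square[:-1]                                  # drop duplicated endpoint
shifts = [np.roll(ring, k, axis=0) for k in range(len(ring))]
closed = [np.concatenate([s, s[:1]], axis=0) for s in shifts]
# every entry of ``closed`` traces the same square from a different corner
# ---------------------------------------------------------------------------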
multi_shifts_pts_tensor[:,:,0] = torch.clamp(multi_shifts_pts_tensor[:,:,0], min=-self.max_x,max=self.max_x) multi_shifts_pts_tensor[:,:,1] = torch.clamp(multi_shifts_pts_tensor[:,:,1], min=-self.max_y,max=self.max_y) # if not is_poly: if multi_shifts_pts_tensor.shape[0] < final_shift_num: padding = torch.full([final_shift_num-multi_shifts_pts_tensor.shape[0],self.fixed_num,2], self.padding_value) multi_shifts_pts_tensor = torch.cat([multi_shifts_pts_tensor,padding],dim=0) instances_list.append(multi_shifts_pts_tensor) instances_tensor = torch.stack(instances_list, dim=0) instances_tensor = instances_tensor.to( dtype=torch.float32) return instances_tensor @property def shift_fixed_num_sampled_points_v3(self): """ return [instances_num, num_shifts, fixed_num, 2] """ assert len(self.instance_list) != 0 instances_list = [] for instance in self.instance_list: distances = np.linspace(0, instance.length, self.fixed_num) poly_pts = np.array(list(instance.coords)) start_pts = poly_pts[0] end_pts = poly_pts[-1] is_poly = np.equal(start_pts, end_pts) is_poly = is_poly.all() shift_pts_list = [] pts_num, coords_num = poly_pts.shape shift_num = pts_num - 1 final_shift_num = self.fixed_num - 1 if is_poly: pts_to_shift = poly_pts[:-1,:] for shift_right_i in range(shift_num): shift_pts = np.roll(pts_to_shift,shift_right_i,axis=0) pts_to_concat = shift_pts[0] pts_to_concat = np.expand_dims(pts_to_concat,axis=0) shift_pts = np.concatenate((shift_pts,pts_to_concat),axis=0) shift_instance = LineString(shift_pts) shift_sampled_points = np.array([list(shift_instance.interpolate(distance).coords) for distance in distances]).reshape(-1, 2) shift_pts_list.append(shift_sampled_points) flip_pts_to_shift = np.flip(pts_to_shift, axis=0) for shift_right_i in range(shift_num): shift_pts = np.roll(flip_pts_to_shift,shift_right_i,axis=0) pts_to_concat = shift_pts[0] pts_to_concat = np.expand_dims(pts_to_concat,axis=0) shift_pts = np.concatenate((shift_pts,pts_to_concat),axis=0) shift_instance = LineString(shift_pts) shift_sampled_points = np.array([list(shift_instance.interpolate(distance).coords) for distance in distances]).reshape(-1, 2) shift_pts_list.append(shift_sampled_points) # import pdb;pdb.set_trace() else: sampled_points = np.array([list(instance.interpolate(distance).coords) for distance in distances]).reshape(-1, 2) flip_sampled_points = np.flip(sampled_points, axis=0) shift_pts_list.append(sampled_points) shift_pts_list.append(flip_sampled_points) multi_shifts_pts = np.stack(shift_pts_list,axis=0) shifts_num,_,_ = multi_shifts_pts.shape # import pdb;pdb.set_trace() if shifts_num > 2*final_shift_num: index = np.random.choice(shift_num, final_shift_num, replace=False) flip0_shifts_pts = multi_shifts_pts[index] flip1_shifts_pts = multi_shifts_pts[index+shift_num] multi_shifts_pts = np.concatenate((flip0_shifts_pts,flip1_shifts_pts),axis=0) multi_shifts_pts_tensor = to_tensor(multi_shifts_pts) multi_shifts_pts_tensor = multi_shifts_pts_tensor.to( dtype=torch.float32) multi_shifts_pts_tensor[:,:,0] = torch.clamp(multi_shifts_pts_tensor[:,:,0], min=-self.max_x,max=self.max_x) multi_shifts_pts_tensor[:,:,1] = torch.clamp(multi_shifts_pts_tensor[:,:,1], min=-self.max_y,max=self.max_y) # if not is_poly: if multi_shifts_pts_tensor.shape[0] < 2*final_shift_num: padding = torch.full([final_shift_num*2-multi_shifts_pts_tensor.shape[0],self.fixed_num,2], self.padding_value) multi_shifts_pts_tensor = torch.cat([multi_shifts_pts_tensor,padding],dim=0) instances_list.append(multi_shifts_pts_tensor) instances_tensor = 
torch.stack(instances_list, dim=0) instances_tensor = instances_tensor.to( dtype=torch.float32) return instances_tensor @property def shift_fixed_num_sampled_points_v4(self): """ return [instances_num, num_shifts, fixed_num, 2] """ fixed_num_sampled_points = self.fixed_num_sampled_points instances_list = [] is_poly = False # is_line = False # import pdb;pdb.set_trace() for fixed_num_pts in fixed_num_sampled_points: # [fixed_num, 2] is_poly = fixed_num_pts[0].equal(fixed_num_pts[-1]) pts_num = fixed_num_pts.shape[0] shift_num = pts_num - 1 shift_pts_list = [] if is_poly: pts_to_shift = fixed_num_pts[:-1,:] for shift_right_i in range(shift_num): shift_pts_list.append(pts_to_shift.roll(shift_right_i,0)) flip_pts_to_shift = pts_to_shift.flip(0) for shift_right_i in range(shift_num): shift_pts_list.append(flip_pts_to_shift.roll(shift_right_i,0)) else: shift_pts_list.append(fixed_num_pts) shift_pts_list.append(fixed_num_pts.flip(0)) shift_pts = torch.stack(shift_pts_list,dim=0) if is_poly: _, _, num_coords = shift_pts.shape tmp_shift_pts = shift_pts.new_zeros((shift_num*2, pts_num, num_coords)) tmp_shift_pts[:,:-1,:] = shift_pts tmp_shift_pts[:,-1,:] = shift_pts[:,0,:] shift_pts = tmp_shift_pts shift_pts[:,:,0] = torch.clamp(shift_pts[:,:,0], min=-self.max_x,max=self.max_x) shift_pts[:,:,1] = torch.clamp(shift_pts[:,:,1], min=-self.max_y,max=self.max_y) if not is_poly: padding = torch.full([shift_num*2-shift_pts.shape[0],pts_num,2], self.padding_value) shift_pts = torch.cat([shift_pts,padding],dim=0) # padding = np.zeros((self.num_samples - len(sampled_points), 2)) # sampled_points = np.concatenate([sampled_points, padding], axis=0) instances_list.append(shift_pts) instances_tensor = torch.stack(instances_list, dim=0) instances_tensor = instances_tensor.to( dtype=torch.float32) return instances_tensor @property def shift_fixed_num_sampled_points_torch(self): """ return [instances_num, num_shifts, fixed_num, 2] """ fixed_num_sampled_points = self.fixed_num_sampled_points_torch instances_list = [] is_poly = False # is_line = False # import pdb;pdb.set_trace() for fixed_num_pts in fixed_num_sampled_points: # [fixed_num, 2] is_poly = fixed_num_pts[0].equal(fixed_num_pts[-1]) fixed_num = fixed_num_pts.shape[0] shift_pts_list = [] if is_poly: # import pdb;pdb.set_trace() for shift_right_i in range(fixed_num): shift_pts_list.append(fixed_num_pts.roll(shift_right_i,0)) else: shift_pts_list.append(fixed_num_pts) shift_pts_list.append(fixed_num_pts.flip(0)) shift_pts = torch.stack(shift_pts_list,dim=0) shift_pts[:,:,0] = torch.clamp(shift_pts[:,:,0], min=-self.max_x,max=self.max_x) shift_pts[:,:,1] = torch.clamp(shift_pts[:,:,1], min=-self.max_y,max=self.max_y) if not is_poly: padding = torch.full([fixed_num-shift_pts.shape[0],fixed_num,2], self.padding_value) shift_pts = torch.cat([shift_pts,padding],dim=0) # padding = np.zeros((self.num_samples - len(sampled_points), 2)) # sampled_points = np.concatenate([sampled_points, padding], axis=0) instances_list.append(shift_pts) instances_tensor = torch.stack(instances_list, dim=0) instances_tensor = instances_tensor.to( dtype=torch.float32) return instances_tensor # @property # def polyline_points(self): # """ # return [[x0,y0],[x1,y1],...] 
# """ # assert len(self.instance_list) != 0 # for instance in self.instance_list: class VectorizedLocalMap(object): CLASS2LABEL = { 'road_divider': 0, 'lane_divider': 0, 'ped_crossing': 1, 'contours': 2, 'others': -1 } def __init__(self, dataroot, patch_size, map_classes=['divider','ped_crossing','boundary'], line_classes=['road_divider', 'lane_divider'], ped_crossing_classes=['ped_crossing'], contour_classes=['road_segment', 'lane'], sample_dist=1, num_samples=250, padding=False, fixed_ptsnum_per_line=-1, padding_value=-10000,): ''' Args: fixed_ptsnum_per_line = -1 : no fixed num ''' super().__init__() self.data_root = dataroot self.MAPS = ['boston-seaport', 'singapore-hollandvillage', 'singapore-onenorth', 'singapore-queenstown'] self.vec_classes = map_classes self.line_classes = line_classes self.ped_crossing_classes = ped_crossing_classes self.polygon_classes = contour_classes self.nusc_maps = {} self.map_explorer = {} for loc in self.MAPS: self.nusc_maps[loc] = NuScenesMap(dataroot=self.data_root, map_name=loc) self.map_explorer[loc] = NuScenesMapExplorer(self.nusc_maps[loc]) self.patch_size = patch_size self.sample_dist = sample_dist self.num_samples = num_samples self.padding = padding self.fixed_num = fixed_ptsnum_per_line self.padding_value = padding_value def gen_vectorized_samples(self, location, lidar2global_translation, patch_angle, flip_dx, flip_dy): ''' use lidar2global to get gt map layers ''' map_pose = lidar2global_translation[:2] # rotation = Quaternion(lidar2global_rotation) patch_box = (map_pose[0], map_pose[1], self.patch_size[0], self.patch_size[1]) # patch_angle = quaternion_yaw(rotation) / np.pi * 180 # import pdb;pdb.set_trace() vectors = [] for vec_class in self.vec_classes: if vec_class == 'divider': line_geom = self.get_map_geom(patch_box, patch_angle, self.line_classes, location, flip_dx, flip_dy) line_instances_dict = self.line_geoms_to_instances(line_geom) for line_type, instances in line_instances_dict.items(): for instance in instances: vectors.append((instance, self.CLASS2LABEL.get(line_type, -1))) elif vec_class == 'ped_crossing': ped_geom = self.get_map_geom(patch_box, patch_angle, self.ped_crossing_classes, location, flip_dx, flip_dy) # ped_vector_list = self.ped_geoms_to_vectors(ped_geom) ped_instance_list = self.ped_poly_geoms_to_instances(ped_geom) # import pdb;pdb.set_trace() for instance in ped_instance_list: vectors.append((instance, self.CLASS2LABEL.get('ped_crossing', -1))) elif vec_class == 'boundary': polygon_geom = self.get_map_geom(patch_box, patch_angle, self.polygon_classes, location, flip_dx, flip_dy) # import pdb;pdb.set_trace() poly_bound_list = self.poly_geoms_to_instances(polygon_geom) # import pdb;pdb.set_trace() for contour in poly_bound_list: vectors.append((contour, self.CLASS2LABEL.get('contours', -1))) else: raise ValueError(f'WRONG vec_class: {vec_class}') # filter out -1 filtered_vectors = [] gt_pts_loc_3d = [] gt_pts_num_3d = [] gt_labels = [] gt_instance = [] for instance, type in vectors: if type != -1: gt_instance.append(instance) gt_labels.append(type) gt_instance = LiDARInstanceLines(gt_instance,self.sample_dist, self.num_samples, self.padding, self.fixed_num,self.padding_value, patch_size=self.patch_size) anns_results = dict( gt_vecs_pts_loc=gt_instance, gt_vecs_label=gt_labels, ) # import pdb;pdb.set_trace() return anns_results def get_map_geom(self, patch_box, patch_angle, layer_names, location, flip_dx, flip_dy): map_geom = [] for layer_name in layer_names: if layer_name in self.line_classes: # import 
pdb;pdb.set_trace() geoms = self.get_divider_line(patch_box, patch_angle, layer_name, location, flip_dx, flip_dy) # import pdb;pdb.set_trace() # geoms = self.map_explorer[location]._get_layer_line(patch_box, patch_angle, layer_name) map_geom.append((layer_name, geoms)) elif layer_name in self.polygon_classes: geoms = self.get_contour_line(patch_box, patch_angle, layer_name, location, flip_dx, flip_dy) # geoms = self.map_explorer[location]._get_layer_polygon(patch_box, patch_angle, layer_name) map_geom.append((layer_name, geoms)) elif layer_name in self.ped_crossing_classes: geoms = self.get_ped_crossing_line(patch_box, patch_angle, location, flip_dx, flip_dy) # geoms = self.map_explorer[location]._get_layer_polygon(patch_box, patch_angle, layer_name) map_geom.append((layer_name, geoms)) return map_geom def _one_type_line_geom_to_vectors(self, line_geom): line_vectors = [] for line in line_geom: if not line.is_empty: if line.geom_type == 'MultiLineString': for single_line in line.geoms: line_vectors.append(self.sample_pts_from_line(single_line)) elif line.geom_type == 'LineString': line_vectors.append(self.sample_pts_from_line(line)) else: raise NotImplementedError return line_vectors def _one_type_line_geom_to_instances(self, line_geom): line_instances = [] for line in line_geom: if not line.is_empty: if line.geom_type == 'MultiLineString': for single_line in line.geoms: line_instances.append(single_line) elif line.geom_type == 'LineString': line_instances.append(line) else: raise NotImplementedError return line_instances def poly_geoms_to_vectors(self, polygon_geom): roads = polygon_geom[0][1] lanes = polygon_geom[1][1] union_roads = ops.unary_union(roads) union_lanes = ops.unary_union(lanes) union_segments = ops.unary_union([union_roads, union_lanes]) max_x = self.patch_size[1] / 2 max_y = self.patch_size[0] / 2 local_patch = box(-max_x + 0.2, -max_y + 0.2, max_x - 0.2, max_y - 0.2) exteriors = [] interiors = [] if union_segments.geom_type != 'MultiPolygon': union_segments = MultiPolygon([union_segments]) for poly in union_segments.geoms: exteriors.append(poly.exterior) for inter in poly.interiors: interiors.append(inter) results = [] for ext in exteriors: if ext.is_ccw: ext.coords = list(ext.coords)[::-1] lines = ext.intersection(local_patch) if isinstance(lines, MultiLineString): lines = ops.linemerge(lines) results.append(lines) for inter in interiors: if not inter.is_ccw: inter.coords = list(inter.coords)[::-1] lines = inter.intersection(local_patch) if isinstance(lines, MultiLineString): lines = ops.linemerge(lines) results.append(lines) return self._one_type_line_geom_to_vectors(results) def ped_poly_geoms_to_instances(self, ped_geom): # import pdb;pdb.set_trace() ped = ped_geom[0][1] union_segments = ops.unary_union(ped) max_x = self.patch_size[1] / 2 max_y = self.patch_size[0] / 2 # local_patch = box(-max_x + 0.2, -max_y + 0.2, max_x - 0.2, max_y - 0.2) local_patch = box(-max_x - 0.2, -max_y - 0.2, max_x + 0.2, max_y + 0.2) exteriors = [] interiors = [] if union_segments.geom_type != 'MultiPolygon': union_segments = MultiPolygon([union_segments]) for poly in union_segments.geoms: exteriors.append(poly.exterior) for inter in poly.interiors: interiors.append(inter) results = [] for ext in exteriors: if ext.is_ccw: ext.coords = list(ext.coords)[::-1] lines = ext.intersection(local_patch) if isinstance(lines, MultiLineString): lines = ops.linemerge(lines) results.append(lines) for inter in interiors: if not inter.is_ccw: inter.coords = list(inter.coords)[::-1] lines = 
inter.intersection(local_patch) if isinstance(lines, MultiLineString): lines = ops.linemerge(lines) results.append(lines) return self._one_type_line_geom_to_instances(results) def poly_geoms_to_instances(self, polygon_geom): roads = polygon_geom[0][1] lanes = polygon_geom[1][1] union_roads = ops.unary_union(roads) union_lanes = ops.unary_union(lanes) union_segments = ops.unary_union([union_roads, union_lanes]) max_x = self.patch_size[1] / 2 max_y = self.patch_size[0] / 2 local_patch = box(-max_x + 0.2, -max_y + 0.2, max_x - 0.2, max_y - 0.2) exteriors = [] interiors = [] if union_segments.geom_type != 'MultiPolygon': union_segments = MultiPolygon([union_segments]) for poly in union_segments.geoms: exteriors.append(poly.exterior) for inter in poly.interiors: interiors.append(inter) results = [] for ext in exteriors: if ext.is_ccw: ext.coords = list(ext.coords)[::-1] lines = ext.intersection(local_patch) if isinstance(lines, MultiLineString): lines = ops.linemerge(lines) results.append(lines) for inter in interiors: if not inter.is_ccw: inter.coords = list(inter.coords)[::-1] lines = inter.intersection(local_patch) if isinstance(lines, MultiLineString): lines = ops.linemerge(lines) results.append(lines) return self._one_type_line_geom_to_instances(results) def line_geoms_to_vectors(self, line_geom): line_vectors_dict = dict() for line_type, a_type_of_lines in line_geom: one_type_vectors = self._one_type_line_geom_to_vectors(a_type_of_lines) line_vectors_dict[line_type] = one_type_vectors return line_vectors_dict def line_geoms_to_instances(self, line_geom): line_instances_dict = dict() for line_type, a_type_of_lines in line_geom: one_type_instances = self._one_type_line_geom_to_instances(a_type_of_lines) line_instances_dict[line_type] = one_type_instances return line_instances_dict def ped_geoms_to_vectors(self, ped_geom): ped_geom = ped_geom[0][1] union_ped = ops.unary_union(ped_geom) if union_ped.geom_type != 'MultiPolygon': union_ped = MultiPolygon([union_ped]) max_x = self.patch_size[1] / 2 max_y = self.patch_size[0] / 2 local_patch = box(-max_x + 0.2, -max_y + 0.2, max_x - 0.2, max_y - 0.2) results = [] for ped_poly in union_ped: # rect = ped_poly.minimum_rotated_rectangle ext = ped_poly.exterior if not ext.is_ccw: ext.coords = list(ext.coords)[::-1] lines = ext.intersection(local_patch) results.append(lines) return self._one_type_line_geom_to_vectors(results) def get_contour_line(self,patch_box,patch_angle,layer_name,location, flip_dx, flip_dy): if layer_name not in self.map_explorer[location].map_api.non_geometric_polygon_layers: raise ValueError('{} is not a polygonal layer'.format(layer_name)) patch_x = patch_box[0] patch_y = patch_box[1] patch = self.map_explorer[location].get_patch_coord(patch_box, patch_angle) records = getattr(self.map_explorer[location].map_api, layer_name) polygon_list = [] if layer_name == 'drivable_area': for record in records: polygons = [self.map_explorer[location].map_api.extract_polygon(polygon_token) for polygon_token in record['polygon_tokens']] for polygon in polygons: new_polygon = polygon.intersection(patch) if not new_polygon.is_empty: new_polygon = affinity.rotate(new_polygon, -patch_angle, origin=(patch_x, patch_y), use_radians=False) a = 1. if not flip_dx else -1. e = 1. if not flip_dy else -1. 
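# --- Illustrative sketch (added for clarity; not part of the original file).
# ``shapely.affinity.affine_transform`` takes [a, b, d, e, xoff, yoff] and
# maps (x, y) -> (a*x + b*y + xoff, d*x + e*y + yoff); the two calls below
# therefore translate the patch centre to the origin and then apply the
# optional axis flip.  A standalone check of that behaviour:
from shapely.affinity import affine_transform
from shapely.geometry import Point

p = Point(3.0, 4.0)
p = affine_transform(p, [1, 0, 0, 1, -1.0, -2.0])   # translate -> (2, 2)
p = affine_transform(p, [-1, 0, 0, 1, 0, 0])        # flip x    -> (-2, 2)
assert (p.x, p.y) == (-2.0, 2.0)
# ---------------------------------------------------------------------------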
new_polygon = affinity.affine_transform(new_polygon, [1, 0.0, 0.0, 1, -patch_x, -patch_y]) new_polygon = affinity.affine_transform(new_polygon, [a, 0.0, 0.0, e, 0, 0]) if new_polygon.geom_type == 'Polygon': new_polygon = MultiPolygon([new_polygon]) polygon_list.append(new_polygon) else: for record in records: polygon = self.map_explorer[location].map_api.extract_polygon(record['polygon_token']) if polygon.is_valid: new_polygon = polygon.intersection(patch) if not new_polygon.is_empty: new_polygon = affinity.rotate(new_polygon, -patch_angle, origin=(patch_x, patch_y), use_radians=False) a = 1. if not flip_dx else -1. e = 1. if not flip_dy else -1. new_polygon = affinity.affine_transform(new_polygon, [1, 0.0, 0.0, 1, -patch_x, -patch_y]) new_polygon = affinity.affine_transform(new_polygon, [a, 0.0, 0.0, e, 0, 0]) if new_polygon.geom_type == 'Polygon': new_polygon = MultiPolygon([new_polygon]) polygon_list.append(new_polygon) return polygon_list def get_divider_line(self,patch_box,patch_angle,layer_name,location, flip_dx, flip_dy): if layer_name not in self.map_explorer[location].map_api.non_geometric_line_layers: raise ValueError("{} is not a line layer".format(layer_name)) if layer_name == 'traffic_light': return None patch_x = patch_box[0] patch_y = patch_box[1] patch = self.map_explorer[location].get_patch_coord(patch_box, patch_angle) line_list = [] records = getattr(self.map_explorer[location].map_api, layer_name) for record in records: line = self.map_explorer[location].map_api.extract_line(record['line_token']) if line.is_empty: # Skip lines without nodes. continue new_line = line.intersection(patch) if not new_line.is_empty: new_line = affinity.rotate(new_line, -patch_angle, origin=(patch_x, patch_y), use_radians=False) a = 1. if not flip_dx else -1. e = 1. if not flip_dy else -1. new_line = affinity.affine_transform(new_line, [1, 0.0, 0.0, 1, -patch_x, -patch_y]) new_line = affinity.affine_transform(new_line, [a, 0.0, 0.0, e, 0, 0]) # [a, b, d, e, xoff, yoff] # which represents the augmented matrix:: # [x'] / a b xoff \ [x] # [y'] = | d e yoff | [y] # [1 ] \ 0 0 1 / [1] # or the equations for the transformed coordinates:: # x' = a * x + b * y + xoff # y' = d * x + e * y + yoff line_list.append(new_line) return line_list def get_ped_crossing_line(self, patch_box, patch_angle, location, flip_dx, flip_dy): patch_x = patch_box[0] patch_y = patch_box[1] patch = self.map_explorer[location].get_patch_coord(patch_box, patch_angle) polygon_list = [] records = getattr(self.map_explorer[location].map_api, 'ped_crossing') # records = getattr(self.nusc_maps[location], 'ped_crossing') for record in records: polygon = self.map_explorer[location].map_api.extract_polygon(record['polygon_token']) if polygon.is_valid: new_polygon = polygon.intersection(patch) if not new_polygon.is_empty: new_polygon = affinity.rotate(new_polygon, -patch_angle, origin=(patch_x, patch_y), use_radians=False) a = 1. if not flip_dx else -1. e = 1. if not flip_dy else -1. 
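# --- Illustrative sketch (added for clarity; not part of the original file).
# The ``affinity.rotate(..., -patch_angle, origin=(patch_x, patch_y),
# use_radians=False)`` call above undoes the ego heading so that the map
# patch becomes axis aligned before it is translated and optionally flipped
# below.  A standalone check of the rotation convention (degrees, CCW
# positive):
from shapely.affinity import rotate
from shapely.geometry import Point

q = rotate(Point(1.0, 0.0), -90, origin=(0, 0), use_radians=False)
assert round(q.x, 6) == 0.0 and round(q.y, 6) == -1.0
# ---------------------------------------------------------------------------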
new_polygon = affinity.affine_transform(new_polygon, [1, 0.0, 0.0, 1, -patch_x, -patch_y]) new_polygon = affinity.affine_transform(new_polygon, [a, 0.0, 0.0, e, 0, 0]) if new_polygon.geom_type == 'Polygon': new_polygon = MultiPolygon([new_polygon]) polygon_list.append(new_polygon) return polygon_list def sample_pts_from_line(self, line): if self.fixed_num < 0: distances = np.arange(0, line.length, self.sample_dist) sampled_points = np.array([list(line.interpolate(distance).coords) for distance in distances]).reshape(-1, 2) else: # fixed number of points, so distance is line.length / self.fixed_num distances = np.linspace(0, line.length, self.fixed_num) sampled_points = np.array([list(line.interpolate(distance).coords) for distance in distances]).reshape(-1, 2) # tmpdistances = np.linspace(0, line.length, 2) # tmpsampled_points = np.array([list(line.interpolate(tmpdistance).coords) for tmpdistance in tmpdistances]).reshape(-1, 2) # import pdb;pdb.set_trace() # if self.normalize: # sampled_points = sampled_points / np.array([self.patch_size[1], self.patch_size[0]]) num_valid = len(sampled_points) if not self.padding or self.fixed_num > 0: # fixed num sample can return now! return sampled_points, num_valid # fixed distance sampling need padding! num_valid = len(sampled_points) if self.fixed_num < 0: if num_valid < self.num_samples: padding = np.zeros((self.num_samples - len(sampled_points), 2)) sampled_points = np.concatenate([sampled_points, padding], axis=0) else: sampled_points = sampled_points[:self.num_samples, :] num_valid = self.num_samples # if self.normalize: # sampled_points = sampled_points / np.array([self.patch_size[1], self.patch_size[0]]) # num_valid = len(sampled_points) return sampled_points, num_valid ================================================ FILE: mmdet3d/datasets/waymo_dataset.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import os import tempfile from os import path as osp import mmcv import numpy as np import torch from mmcv.utils import print_log from ..core.bbox import Box3DMode, points_cam2img from .builder import DATASETS from .kitti_dataset import KittiDataset @DATASETS.register_module() class WaymoDataset(KittiDataset): """Waymo Dataset. This class serves as the API for experiments on the Waymo Dataset. Please refer to ``_for data downloading. It is recommended to symlink the dataset root to $MMDETECTION3D/data and organize them as the doc shows. Args: data_root (str): Path of dataset root. ann_file (str): Path of annotation file. split (str): Split of input data. pts_prefix (str, optional): Prefix of points files. Defaults to 'velodyne'. pipeline (list[dict], optional): Pipeline used for data processing. Defaults to None. classes (tuple[str], optional): Classes used in the dataset. Defaults to None. modality (dict, optional): Modality to specify the sensor data used as input. Defaults to None. box_type_3d (str, optional): Type of 3D box of this dataset. Based on the `box_type_3d`, the dataset will encapsulate the box to its original format then converted them to `box_type_3d`. Defaults to 'LiDAR' in this dataset. Available options includes - 'LiDAR': box in LiDAR coordinates - 'Depth': box in depth coordinates, usually for indoor dataset - 'Camera': box in camera coordinates filter_empty_gt (bool, optional): Whether to filter empty GT. Defaults to True. test_mode (bool, optional): Whether the dataset is in test mode. Defaults to False. 
pcd_limit_range (list(float), optional): The range of point cloud used to filter invalid predicted boxes. Default: [-85, -85, -5, 85, 85, 5]. """ CLASSES = ('Car', 'Cyclist', 'Pedestrian') def __init__(self, data_root, ann_file, split, pts_prefix='velodyne', pipeline=None, classes=None, modality=None, box_type_3d='LiDAR', filter_empty_gt=True, test_mode=False, load_interval=1, pcd_limit_range=[-85, -85, -5, 85, 85, 5], **kwargs): super().__init__( data_root=data_root, ann_file=ann_file, split=split, pts_prefix=pts_prefix, pipeline=pipeline, classes=classes, modality=modality, box_type_3d=box_type_3d, filter_empty_gt=filter_empty_gt, test_mode=test_mode, pcd_limit_range=pcd_limit_range, **kwargs) # to load a subset, just set the load_interval in the dataset config self.data_infos = self.data_infos[::load_interval] if hasattr(self, 'flag'): self.flag = self.flag[::load_interval] def _get_pts_filename(self, idx): pts_filename = osp.join(self.root_split, self.pts_prefix, f'{idx:07d}.bin') return pts_filename def get_data_info(self, index): """Get data info according to the given index. Args: index (int): Index of the sample data to get. Returns: dict: Standard input_dict consists of the data information. - sample_idx (str): sample index - pts_filename (str): filename of point clouds - img_prefix (str): prefix of image files - img_info (dict): image info - lidar2img (list[np.ndarray], optional): transformations from lidar to different cameras - ann_info (dict): annotation info """ info = self.data_infos[index] sample_idx = info['image']['image_idx'] img_filename = os.path.join(self.data_root, info['image']['image_path']) # TODO: consider use torch.Tensor only rect = info['calib']['R0_rect'].astype(np.float32) Trv2c = info['calib']['Tr_velo_to_cam'].astype(np.float32) P0 = info['calib']['P0'].astype(np.float32) lidar2img = P0 @ rect @ Trv2c pts_filename = self._get_pts_filename(sample_idx) input_dict = dict( sample_idx=sample_idx, pts_filename=pts_filename, img_prefix=None, img_info=dict(filename=img_filename), lidar2img=lidar2img) if not self.test_mode: annos = self.get_ann_info(index) input_dict['ann_info'] = annos return input_dict def format_results(self, outputs, pklfile_prefix=None, submission_prefix=None, data_format='waymo'): """Format the results to pkl file. Args: outputs (list[dict]): Testing results of the dataset. pklfile_prefix (str): The prefix of pkl files. It includes the file path and the prefix of filename, e.g., "a/b/prefix". If not specified, a temp file will be created. Default: None. submission_prefix (str): The prefix of submitted files. It includes the file path and the prefix of filename, e.g., "a/b/prefix". If not specified, a temp file will be created. Default: None. data_format (str, optional): Output data format. Default: 'waymo'. Another supported choice is 'kitti'. Returns: tuple: (result_files, tmp_dir), result_files is a dict containing the json filepaths, tmp_dir is the temporal directory created for saving json files when jsonfile_prefix is not specified. 
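            Example (illustrative addition; the prefix path is a
            placeholder):

                >>> result_files, tmp_dir = dataset.format_results(
                ...     outputs, pklfile_prefix='./work_dir/waymo_results',
                ...     data_format='waymo')
                >>> # a './work_dir/waymo_results.bin' file is written for
                >>> # the official Waymo evaluation tool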
""" if pklfile_prefix is None: tmp_dir = tempfile.TemporaryDirectory() pklfile_prefix = osp.join(tmp_dir.name, 'results') else: tmp_dir = None assert ('waymo' in data_format or 'kitti' in data_format), \ f'invalid data_format {data_format}' if (not isinstance(outputs[0], dict)) or 'img_bbox' in outputs[0]: raise TypeError('Not supported type for reformat results.') elif 'pts_bbox' in outputs[0]: result_files = dict() for name in outputs[0]: results_ = [out[name] for out in outputs] pklfile_prefix_ = pklfile_prefix + name if submission_prefix is not None: submission_prefix_ = f'{submission_prefix}_{name}' else: submission_prefix_ = None result_files_ = self.bbox2result_kitti(results_, self.CLASSES, pklfile_prefix_, submission_prefix_) result_files[name] = result_files_ else: result_files = self.bbox2result_kitti(outputs, self.CLASSES, pklfile_prefix, submission_prefix) if 'waymo' in data_format: from ..core.evaluation.waymo_utils.prediction_kitti_to_waymo import \ KITTI2Waymo # noqa waymo_root = osp.join( self.data_root.split('kitti_format')[0], 'waymo_format') if self.split == 'training': waymo_tfrecords_dir = osp.join(waymo_root, 'validation') prefix = '1' elif self.split == 'testing': waymo_tfrecords_dir = osp.join(waymo_root, 'testing') prefix = '2' else: raise ValueError('Not supported split value.') save_tmp_dir = tempfile.TemporaryDirectory() waymo_results_save_dir = save_tmp_dir.name waymo_results_final_path = f'{pklfile_prefix}.bin' if 'pts_bbox' in result_files: converter = KITTI2Waymo(result_files['pts_bbox'], waymo_tfrecords_dir, waymo_results_save_dir, waymo_results_final_path, prefix) else: converter = KITTI2Waymo(result_files, waymo_tfrecords_dir, waymo_results_save_dir, waymo_results_final_path, prefix) converter.convert() save_tmp_dir.cleanup() return result_files, tmp_dir def evaluate(self, results, metric='waymo', logger=None, pklfile_prefix=None, submission_prefix=None, show=False, out_dir=None, pipeline=None): """Evaluation in KITTI protocol. Args: results (list[dict]): Testing results of the dataset. metric (str | list[str], optional): Metrics to be evaluated. Default: 'waymo'. Another supported metric is 'kitti'. logger (logging.Logger | str, optional): Logger used for printing related information during evaluation. Default: None. pklfile_prefix (str, optional): The prefix of pkl files including the file path and the prefix of filename, e.g., "a/b/prefix". If not specified, a temp file will be created. Default: None. submission_prefix (str, optional): The prefix of submission data. If not specified, the submission data will not be generated. show (bool, optional): Whether to visualize. Default: False. out_dir (str, optional): Path to save the visualization results. Default: None. pipeline (list[dict], optional): raw data loading for showing. Default: None. 
Returns: dict[str: float]: results of each evaluation metric """ assert ('waymo' in metric or 'kitti' in metric), \ f'invalid metric {metric}' if 'kitti' in metric: result_files, tmp_dir = self.format_results( results, pklfile_prefix, submission_prefix, data_format='kitti') from mmdet3d.core.evaluation import kitti_eval gt_annos = [info['annos'] for info in self.data_infos] if isinstance(result_files, dict): ap_dict = dict() for name, result_files_ in result_files.items(): eval_types = ['bev', '3d'] ap_result_str, ap_dict_ = kitti_eval( gt_annos, result_files_, self.CLASSES, eval_types=eval_types) for ap_type, ap in ap_dict_.items(): ap_dict[f'{name}/{ap_type}'] = float( '{:.4f}'.format(ap)) print_log( f'Results of {name}:\n' + ap_result_str, logger=logger) else: ap_result_str, ap_dict = kitti_eval( gt_annos, result_files, self.CLASSES, eval_types=['bev', '3d']) print_log('\n' + ap_result_str, logger=logger) if 'waymo' in metric: waymo_root = osp.join( self.data_root.split('kitti_format')[0], 'waymo_format') if pklfile_prefix is None: eval_tmp_dir = tempfile.TemporaryDirectory() pklfile_prefix = osp.join(eval_tmp_dir.name, 'results') else: eval_tmp_dir = None result_files, tmp_dir = self.format_results( results, pklfile_prefix, submission_prefix, data_format='waymo') import subprocess ret_bytes = subprocess.check_output( 'mmdet3d/core/evaluation/waymo_utils/' + f'compute_detection_metrics_main {pklfile_prefix}.bin ' + f'{waymo_root}/gt.bin', shell=True) ret_texts = ret_bytes.decode('utf-8') print_log(ret_texts) # parse the text to get ap_dict ap_dict = { 'Vehicle/L1 mAP': 0, 'Vehicle/L1 mAPH': 0, 'Vehicle/L2 mAP': 0, 'Vehicle/L2 mAPH': 0, 'Pedestrian/L1 mAP': 0, 'Pedestrian/L1 mAPH': 0, 'Pedestrian/L2 mAP': 0, 'Pedestrian/L2 mAPH': 0, 'Sign/L1 mAP': 0, 'Sign/L1 mAPH': 0, 'Sign/L2 mAP': 0, 'Sign/L2 mAPH': 0, 'Cyclist/L1 mAP': 0, 'Cyclist/L1 mAPH': 0, 'Cyclist/L2 mAP': 0, 'Cyclist/L2 mAPH': 0, 'Overall/L1 mAP': 0, 'Overall/L1 mAPH': 0, 'Overall/L2 mAP': 0, 'Overall/L2 mAPH': 0 } mAP_splits = ret_texts.split('mAP ') mAPH_splits = ret_texts.split('mAPH ') for idx, key in enumerate(ap_dict.keys()): split_idx = int(idx / 2) + 1 if idx % 2 == 0: # mAP ap_dict[key] = float(mAP_splits[split_idx].split(']')[0]) else: # mAPH ap_dict[key] = float(mAPH_splits[split_idx].split(']')[0]) ap_dict['Overall/L1 mAP'] = \ (ap_dict['Vehicle/L1 mAP'] + ap_dict['Pedestrian/L1 mAP'] + ap_dict['Cyclist/L1 mAP']) / 3 ap_dict['Overall/L1 mAPH'] = \ (ap_dict['Vehicle/L1 mAPH'] + ap_dict['Pedestrian/L1 mAPH'] + ap_dict['Cyclist/L1 mAPH']) / 3 ap_dict['Overall/L2 mAP'] = \ (ap_dict['Vehicle/L2 mAP'] + ap_dict['Pedestrian/L2 mAP'] + ap_dict['Cyclist/L2 mAP']) / 3 ap_dict['Overall/L2 mAPH'] = \ (ap_dict['Vehicle/L2 mAPH'] + ap_dict['Pedestrian/L2 mAPH'] + ap_dict['Cyclist/L2 mAPH']) / 3 if eval_tmp_dir is not None: eval_tmp_dir.cleanup() if tmp_dir is not None: tmp_dir.cleanup() if show or out_dir: self.show(results, out_dir, show=show, pipeline=pipeline) return ap_dict def bbox2result_kitti(self, net_outputs, class_names, pklfile_prefix=None, submission_prefix=None): """Convert results to kitti format for evaluation and test submission. Args: net_outputs (List[np.ndarray]): list of array storing the bbox and score class_nanes (List[String]): A list of class names pklfile_prefix (str): The prefix of pkl file. submission_prefix (str): The prefix of submission file. 
Returns: List[dict]: A list of dict have the kitti 3d format """ assert len(net_outputs) == len(self.data_infos), \ 'invalid list length of network outputs' if submission_prefix is not None: mmcv.mkdir_or_exist(submission_prefix) det_annos = [] print('\nConverting prediction to KITTI format') for idx, pred_dicts in enumerate( mmcv.track_iter_progress(net_outputs)): annos = [] info = self.data_infos[idx] sample_idx = info['image']['image_idx'] image_shape = info['image']['image_shape'][:2] box_dict = self.convert_valid_bboxes(pred_dicts, info) if len(box_dict['bbox']) > 0: box_2d_preds = box_dict['bbox'] box_preds = box_dict['box3d_camera'] scores = box_dict['scores'] box_preds_lidar = box_dict['box3d_lidar'] label_preds = box_dict['label_preds'] anno = { 'name': [], 'truncated': [], 'occluded': [], 'alpha': [], 'bbox': [], 'dimensions': [], 'location': [], 'rotation_y': [], 'score': [] } for box, box_lidar, bbox, score, label in zip( box_preds, box_preds_lidar, box_2d_preds, scores, label_preds): bbox[2:] = np.minimum(bbox[2:], image_shape[::-1]) bbox[:2] = np.maximum(bbox[:2], [0, 0]) anno['name'].append(class_names[int(label)]) anno['truncated'].append(0.0) anno['occluded'].append(0) anno['alpha'].append( -np.arctan2(-box_lidar[1], box_lidar[0]) + box[6]) anno['bbox'].append(bbox) anno['dimensions'].append(box[3:6]) anno['location'].append(box[:3]) anno['rotation_y'].append(box[6]) anno['score'].append(score) anno = {k: np.stack(v) for k, v in anno.items()} annos.append(anno) if submission_prefix is not None: curr_file = f'{submission_prefix}/{sample_idx:07d}.txt' with open(curr_file, 'w') as f: bbox = anno['bbox'] loc = anno['location'] dims = anno['dimensions'] # lhw -> hwl for idx in range(len(bbox)): print( '{} -1 -1 {:.4f} {:.4f} {:.4f} {:.4f} ' '{:.4f} {:.4f} {:.4f} ' '{:.4f} {:.4f} {:.4f} {:.4f} {:.4f} {:.4f}'. format(anno['name'][idx], anno['alpha'][idx], bbox[idx][0], bbox[idx][1], bbox[idx][2], bbox[idx][3], dims[idx][1], dims[idx][2], dims[idx][0], loc[idx][0], loc[idx][1], loc[idx][2], anno['rotation_y'][idx], anno['score'][idx]), file=f) else: annos.append({ 'name': np.array([]), 'truncated': np.array([]), 'occluded': np.array([]), 'alpha': np.array([]), 'bbox': np.zeros([0, 4]), 'dimensions': np.zeros([0, 3]), 'location': np.zeros([0, 3]), 'rotation_y': np.array([]), 'score': np.array([]), }) annos[-1]['sample_idx'] = np.array( [sample_idx] * len(annos[-1]['score']), dtype=np.int64) det_annos += annos if pklfile_prefix is not None: if not pklfile_prefix.endswith(('.pkl', '.pickle')): out = f'{pklfile_prefix}.pkl' mmcv.dump(det_annos, out) print(f'Result is saved to {out}.') return det_annos def convert_valid_bboxes(self, box_dict, info): """Convert the boxes into valid format. Args: box_dict (dict): Bounding boxes to be converted. - boxes_3d (:obj:``LiDARInstance3DBoxes``): 3D bounding boxes. - scores_3d (np.ndarray): Scores of predicted boxes. - labels_3d (np.ndarray): Class labels of predicted boxes. info (dict): Dataset information dictionary. Returns: dict: Valid boxes after conversion. - bbox (np.ndarray): 2D bounding boxes (in camera 0). - box3d_camera (np.ndarray): 3D boxes in camera coordinates. - box3d_lidar (np.ndarray): 3D boxes in lidar coordinates. - scores (np.ndarray): Scores of predicted boxes. - label_preds (np.ndarray): Class labels of predicted boxes. - sample_idx (np.ndarray): Sample index. 
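        A minimal sketch of the returned keys (illustrative; ``box_dict`` is a
        single network output dict as described above):

        Example:
            >>> valid = self.convert_valid_bboxes(box_dict, self.data_infos[0])
            >>> sorted(valid.keys())
            ['bbox', 'box3d_camera', 'box3d_lidar', 'label_preds', 'sample_idx', 'scores']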
""" # TODO: refactor this function box_preds = box_dict['boxes_3d'] scores = box_dict['scores_3d'] labels = box_dict['labels_3d'] sample_idx = info['image']['image_idx'] box_preds.limit_yaw(offset=0.5, period=np.pi * 2) if len(box_preds) == 0: return dict( bbox=np.zeros([0, 4]), box3d_camera=np.zeros([0, 7]), box3d_lidar=np.zeros([0, 7]), scores=np.zeros([0]), label_preds=np.zeros([0, 4]), sample_idx=sample_idx) rect = info['calib']['R0_rect'].astype(np.float32) Trv2c = info['calib']['Tr_velo_to_cam'].astype(np.float32) P0 = info['calib']['P0'].astype(np.float32) P0 = box_preds.tensor.new_tensor(P0) box_preds_camera = box_preds.convert_to(Box3DMode.CAM, rect @ Trv2c) box_corners = box_preds_camera.corners box_corners_in_image = points_cam2img(box_corners, P0) # box_corners_in_image: [N, 8, 2] minxy = torch.min(box_corners_in_image, dim=1)[0] maxxy = torch.max(box_corners_in_image, dim=1)[0] box_2d_preds = torch.cat([minxy, maxxy], dim=1) # Post-processing # check box_preds limit_range = box_preds.tensor.new_tensor(self.pcd_limit_range) valid_pcd_inds = ((box_preds.center > limit_range[:3]) & (box_preds.center < limit_range[3:])) valid_inds = valid_pcd_inds.all(-1) if valid_inds.sum() > 0: return dict( bbox=box_2d_preds[valid_inds, :].numpy(), box3d_camera=box_preds_camera[valid_inds].tensor.numpy(), box3d_lidar=box_preds[valid_inds].tensor.numpy(), scores=scores[valid_inds].numpy(), label_preds=labels[valid_inds].numpy(), sample_idx=sample_idx, ) else: return dict( bbox=np.zeros([0, 4]), box3d_camera=np.zeros([0, 7]), box3d_lidar=np.zeros([0, 7]), scores=np.zeros([0]), label_preds=np.zeros([0, 4]), sample_idx=sample_idx, ) ================================================ FILE: mmdet3d/models/__init__.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. from .backbones import * # noqa: F401,F403 from .builder import (BACKBONES, DETECTORS, FUSION_LAYERS, HEADS, LOSSES, MIDDLE_ENCODERS, NECKS, ROI_EXTRACTORS, SEGMENTORS, SHARED_HEADS, VOXEL_ENCODERS, build_backbone, build_detector, build_fusion_layer, build_head, build_loss, build_middle_encoder, build_model, build_neck, build_roi_extractor, build_shared_head, build_voxel_encoder) from .decode_heads import * # noqa: F401,F403 from .dense_heads import * # noqa: F401,F403 from .detectors import * # noqa: F401,F403 from .fusion_layers import * # noqa: F401,F403 from .losses import * # noqa: F401,F403 from .middle_encoders import * # noqa: F401,F403 from .model_utils import * # noqa: F401,F403 from .necks import * # noqa: F401,F403 from .roi_heads import * # noqa: F401,F403 from .segmentors import * # noqa: F401,F403 from .voxel_encoders import * # noqa: F401,F403 from .fbbev import * __all__ = [ 'BACKBONES', 'NECKS', 'ROI_EXTRACTORS', 'SHARED_HEADS', 'HEADS', 'LOSSES', 'DETECTORS', 'SEGMENTORS', 'VOXEL_ENCODERS', 'MIDDLE_ENCODERS', 'FUSION_LAYERS', 'build_backbone', 'build_neck', 'build_roi_extractor', 'build_shared_head', 'build_head', 'build_loss', 'build_detector', 'build_fusion_layer', 'build_model', 'build_middle_encoder', 'build_voxel_encoder' ] ================================================ FILE: mmdet3d/models/backbones/__init__.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. 
from mmdet.models.backbones import SSDVGG, HRNet, ResNet, ResNetV1d, ResNeXt from .dgcnn import DGCNNBackbone from .dla import DLANet from .mink_resnet import MinkResNet from .multi_backbone import MultiBackbone from .nostem_regnet import NoStemRegNet from .pointnet2_sa_msg import PointNet2SAMSG from .pointnet2_sa_ssg import PointNet2SASSG from .resnet import CustomResNet from .second import SECOND from .convnext import ConvNeXt from .vovnet import VoVNetCP from .swin import SwinTransformer __all__ = [ 'ResNet', 'ResNetV1d', 'ResNeXt', 'SSDVGG', 'HRNet', 'NoStemRegNet', 'SECOND', 'DGCNNBackbone', 'PointNet2SASSG', 'PointNet2SAMSG', 'MultiBackbone', 'DLANet', 'MinkResNet', 'CustomResNet' ] ================================================ FILE: mmdet3d/models/backbones/base_pointnet.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import warnings from abc import ABCMeta from mmcv.runner import BaseModule class BasePointNet(BaseModule, metaclass=ABCMeta): """Base class for PointNet.""" def __init__(self, init_cfg=None, pretrained=None): super(BasePointNet, self).__init__(init_cfg) self.fp16_enabled = False assert not (init_cfg and pretrained), \ 'init_cfg and pretrained cannot be setting at the same time' if isinstance(pretrained, str): warnings.warn('DeprecationWarning: pretrained is a deprecated, ' 'please use "init_cfg" instead') self.init_cfg = dict(type='Pretrained', checkpoint=pretrained) @staticmethod def _split_point_feats(points): """Split coordinates and features of input points. Args: points (torch.Tensor): Point coordinates with features, with shape (B, N, 3 + input_feature_dim). Returns: torch.Tensor: Coordinates of input points. torch.Tensor: Features of input points. """ xyz = points[..., 0:3].contiguous() if points.size(-1) > 3: features = points[..., 3:].transpose(1, 2).contiguous() else: features = None return xyz, features ================================================ FILE: mmdet3d/models/backbones/convnext.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. from functools import partial from itertools import chain from typing import Sequence import torch import torch.nn as nn import torch.nn.functional as F import torch.utils.checkpoint as cp from mmcv.cnn.bricks import (NORM_LAYERS, DropPath, build_activation_layer, build_norm_layer) from mmcv.runner import BaseModule from mmcv.runner.base_module import ModuleList, Sequential from mmdet.models.builder import BACKBONES # Copyright (c) OpenMMLab. All rights reserved. from abc import ABCMeta, abstractmethod from mmcv.runner import BaseModule class BaseBackbone(BaseModule, metaclass=ABCMeta): """Base backbone. This class defines the basic functions of a backbone. Any backbone that inherits this class should at least define its own `forward` function. """ def __init__(self, init_cfg=None): super(BaseBackbone, self).__init__(init_cfg) @abstractmethod def forward(self, x): """Forward computation. Args: x (tensor | tuple[tensor]): x could be a Torch.tensor or a tuple of Torch.tensor, containing input data for forward computation. """ pass def train(self, mode=True): """Set module status before forward computation. Args: mode (bool): Whether it is train_mode or test_mode """ super(BaseBackbone, self).train(mode) @NORM_LAYERS.register_module('LN2d') class LayerNorm2d(nn.LayerNorm): """LayerNorm on channels for 2d images. Args: num_channels (int): The number of channels of the input tensor. 
eps (float): a value added to the denominator for numerical stability. Defaults to 1e-5. elementwise_affine (bool): a boolean value that when set to ``True``, this module has learnable per-element affine parameters initialized to ones (for weights) and zeros (for biases). Defaults to True. """ def __init__(self, num_channels: int, **kwargs) -> None: super().__init__(num_channels, **kwargs) self.num_channels = self.normalized_shape[0] def forward(self, x): assert x.dim() == 4, 'LayerNorm2d only supports inputs with shape ' \ f'(N, C, H, W), but got tensor with shape {x.shape}' return F.layer_norm( x.permute(0, 2, 3, 1).contiguous(), self.normalized_shape, self.weight, self.bias, self.eps).permute(0, 3, 1, 2).contiguous() class ConvNeXtBlock(BaseModule): """ConvNeXt Block. Args: in_channels (int): The number of input channels. norm_cfg (dict): The config dict for norm layers. Defaults to ``dict(type='LN2d', eps=1e-6)``. act_cfg (dict): The config dict for activation between pointwise convolution. Defaults to ``dict(type='GELU')``. mlp_ratio (float): The expansion ratio in both pointwise convolution. Defaults to 4. linear_pw_conv (bool): Whether to use linear layer to do pointwise convolution. More details can be found in the note. Defaults to True. drop_path_rate (float): Stochastic depth rate. Defaults to 0. layer_scale_init_value (float): Init value for Layer Scale. Defaults to 1e-6. Note: There are two equivalent implementations: 1. DwConv -> LayerNorm -> 1x1 Conv -> GELU -> 1x1 Conv; all outputs are in (N, C, H, W). 2. DwConv -> LayerNorm -> Permute to (N, H, W, C) -> Linear -> GELU -> Linear; Permute back As default, we use the second to align with the official repository. And it may be slightly faster. """ def __init__(self, in_channels, norm_cfg=dict(type='LN2d', eps=1e-6), act_cfg=dict(type='GELU'), mlp_ratio=4., linear_pw_conv=True, drop_path_rate=0., layer_scale_init_value=1e-6, with_cp=False): super().__init__() self.with_cp = with_cp self.depthwise_conv = nn.Conv2d( in_channels, in_channels, kernel_size=7, padding=3, groups=in_channels) self.linear_pw_conv = linear_pw_conv self.norm = build_norm_layer(norm_cfg, in_channels)[1] mid_channels = int(mlp_ratio * in_channels) if self.linear_pw_conv: # Use linear layer to do pointwise conv. pw_conv = nn.Linear else: pw_conv = partial(nn.Conv2d, kernel_size=1) self.pointwise_conv1 = pw_conv(in_channels, mid_channels) self.act = build_activation_layer(act_cfg) self.pointwise_conv2 = pw_conv(mid_channels, in_channels) self.gamma = nn.Parameter( layer_scale_init_value * torch.ones((in_channels)), requires_grad=True) if layer_scale_init_value > 0 else None self.drop_path = DropPath( drop_path_rate) if drop_path_rate > 0. else nn.Identity() def forward(self, x): def _inner_forward(x): shortcut = x x = self.depthwise_conv(x) x = self.norm(x) if self.linear_pw_conv: x = x.permute(0, 2, 3, 1) # (N, C, H, W) -> (N, H, W, C) x = self.pointwise_conv1(x) x = self.act(x) x = self.pointwise_conv2(x) if self.linear_pw_conv: x = x.permute(0, 3, 1, 2) # permute back if self.gamma is not None: x = x.mul(self.gamma.view(1, -1, 1, 1)) x = shortcut + self.drop_path(x) return x if self.with_cp and x.requires_grad: x = cp.checkpoint(_inner_forward, x) else: x = _inner_forward(x) return x @BACKBONES.register_module() class ConvNeXt(BaseBackbone): """ConvNeXt. A PyTorch implementation of : `A ConvNet for the 2020s `_ Modified from the `official repo `_ and `timm `_. Args: arch (str | dict): The model's architecture. 
If string, it should be one of architecture in ``ConvNeXt.arch_settings``. And if dict, it should include the following two keys: - depths (list[int]): Number of blocks at each stage. - channels (list[int]): The number of channels at each stage. Defaults to 'tiny'. in_channels (int): Number of input image channels. Defaults to 3. stem_patch_size (int): The size of one patch in the stem layer. Defaults to 4. norm_cfg (dict): The config dict for norm layers. Defaults to ``dict(type='LN2d', eps=1e-6)``. act_cfg (dict): The config dict for activation between pointwise convolution. Defaults to ``dict(type='GELU')``. linear_pw_conv (bool): Whether to use linear layer to do pointwise convolution. Defaults to True. drop_path_rate (float): Stochastic depth rate. Defaults to 0. layer_scale_init_value (float): Init value for Layer Scale. Defaults to 1e-6. out_indices (Sequence | int): Output from which stages. Defaults to -1, means the last stage. frozen_stages (int): Stages to be frozen (all param fixed). Defaults to 0, which means not freezing any parameters. gap_before_final_norm (bool): Whether to globally average the feature map before the final norm layer. In the official repo, it's only used in classification task. Defaults to True. with_cp (bool): Use checkpoint or not. Using checkpoint will save some memory while slowing down the training speed. Defaults to False. init_cfg (dict, optional): Initialization config dict """ # noqa: E501 arch_settings = { 'tiny': { 'depths': [3, 3, 9, 3], 'channels': [96, 192, 384, 768] }, 'small': { 'depths': [3, 3, 27, 3], 'channels': [96, 192, 384, 768] }, 'base': { 'depths': [3, 3, 27, 3], 'channels': [128, 256, 512, 1024] }, 'large': { 'depths': [3, 3, 27, 3], 'channels': [192, 384, 768, 1536] }, 'xlarge': { 'depths': [3, 3, 27, 3], 'channels': [256, 512, 1024, 2048] }, } def __init__(self, arch='tiny', in_channels=3, stem_patch_size=4, norm_cfg=dict(type='LN2d', eps=1e-6), act_cfg=dict(type='GELU'), linear_pw_conv=True, drop_path_rate=0., layer_scale_init_value=1e-6, out_indices=-1, frozen_stages=0, gap_before_final_norm=True, with_cp=False, init_cfg=None): super().__init__(init_cfg=init_cfg) if isinstance(arch, str): assert arch in self.arch_settings, \ f'Unavailable arch, please choose from ' \ f'({set(self.arch_settings)}) or pass a dict.' arch = self.arch_settings[arch] elif isinstance(arch, dict): assert 'depths' in arch and 'channels' in arch, \ f'The arch dict must have "depths" and "channels", ' \ f'but got {list(arch.keys())}.' self.depths = arch['depths'] self.channels = arch['channels'] assert (isinstance(self.depths, Sequence) and isinstance(self.channels, Sequence) and len(self.depths) == len(self.channels)), \ f'The "depths" ({self.depths}) and "channels" ({self.channels}) ' \ 'should be both sequence with the same length.' self.num_stages = len(self.depths) if isinstance(out_indices, int): out_indices = [out_indices] assert isinstance(out_indices, Sequence), \ f'"out_indices" must by a sequence or int, ' \ f'get {type(out_indices)} instead.' for i, index in enumerate(out_indices): if index < 0: out_indices[i] = 4 + index assert out_indices[i] >= 0, f'Invalid out_indices {index}' self.out_indices = out_indices self.frozen_stages = frozen_stages self.gap_before_final_norm = gap_before_final_norm # stochastic depth decay rule dpr = [ x.item() for x in torch.linspace(0, drop_path_rate, sum(self.depths)) ] block_idx = 0 # 4 downsample layers between stages, including the stem layer. 
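        # The stem reduces resolution by `stem_patch_size` (4 by default); each of
        # the remaining downsample layers is LayerNorm2d + a 2x2 stride-2 conv, so
        # stage i outputs features at stride stem_patch_size * 2**i w.r.t. the input.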
self.downsample_layers = ModuleList() stem = nn.Sequential( nn.Conv2d( in_channels, self.channels[0], kernel_size=stem_patch_size, stride=stem_patch_size), build_norm_layer(norm_cfg, self.channels[0])[1], ) self.downsample_layers.append(stem) # 4 feature resolution stages, each consisting of multiple residual # blocks self.stages = nn.ModuleList() for i in range(self.num_stages): depth = self.depths[i] channels = self.channels[i] if i >= 1: downsample_layer = nn.Sequential( LayerNorm2d(self.channels[i - 1]), nn.Conv2d( self.channels[i - 1], channels, kernel_size=2, stride=2), ) self.downsample_layers.append(downsample_layer) stage = Sequential(*[ ConvNeXtBlock( in_channels=channels, drop_path_rate=dpr[block_idx + j], norm_cfg=norm_cfg, act_cfg=act_cfg, linear_pw_conv=linear_pw_conv, layer_scale_init_value=layer_scale_init_value, with_cp=with_cp) for j in range(depth) ]) block_idx += depth self.stages.append(stage) if i in self.out_indices: norm_layer = build_norm_layer(norm_cfg, channels)[1] self.add_module(f'norm{i}', norm_layer) self._freeze_stages() def forward(self, x): outs = [] for i, stage in enumerate(self.stages): x = self.downsample_layers[i](x) x = stage(x) # x = cp.checkpoint(stage, x) if i in self.out_indices: norm_layer = getattr(self, f'norm{i}') if self.gap_before_final_norm: gap = x.mean([-2, -1], keepdim=True) outs.append(norm_layer(gap).flatten(1)) else: # The output of LayerNorm2d may be discontiguous, which # may cause some problem in the downstream tasks outs.append(norm_layer(x).contiguous()) return tuple(outs) def _freeze_stages(self): for i in range(self.frozen_stages): downsample_layer = self.downsample_layers[i] stage = self.stages[i] downsample_layer.eval() stage.eval() for param in chain(downsample_layer.parameters(), stage.parameters()): param.requires_grad = False def train(self, mode=True): super(ConvNeXt, self).train(mode) self._freeze_stages() ================================================ FILE: mmdet3d/models/backbones/dgcnn.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. from mmcv.runner import BaseModule, auto_fp16 from torch import nn as nn from mmdet3d.ops import DGCNNFAModule, DGCNNGFModule from ..builder import BACKBONES @BACKBONES.register_module() class DGCNNBackbone(BaseModule): """Backbone network for DGCNN. Args: in_channels (int): Input channels of point cloud. num_samples (tuple[int], optional): The number of samples for knn or ball query in each graph feature (GF) module. Defaults to (20, 20, 20). knn_modes (tuple[str], optional): Mode of KNN of each knn module. Defaults to ('D-KNN', 'F-KNN', 'F-KNN'). radius (tuple[float], optional): Sampling radii of each GF module. Defaults to (None, None, None). gf_channels (tuple[tuple[int]], optional): Out channels of each mlp in GF module. Defaults to ((64, 64), (64, 64), (64, )). fa_channels (tuple[int], optional): Out channels of each mlp in FA module. Defaults to (1024, ). act_cfg (dict, optional): Config of activation layer. Defaults to dict(type='ReLU'). init_cfg (dict, optional): Initialization config. Defaults to None. 
""" def __init__(self, in_channels, num_samples=(20, 20, 20), knn_modes=('D-KNN', 'F-KNN', 'F-KNN'), radius=(None, None, None), gf_channels=((64, 64), (64, 64), (64, )), fa_channels=(1024, ), act_cfg=dict(type='ReLU'), init_cfg=None): super().__init__(init_cfg=init_cfg) self.num_gf = len(gf_channels) assert len(num_samples) == len(knn_modes) == len(radius) == len( gf_channels), 'Num_samples, knn_modes, radius and gf_channels \ should have the same length.' self.GF_modules = nn.ModuleList() gf_in_channel = in_channels * 2 skip_channel_list = [gf_in_channel] # input channel list for gf_index in range(self.num_gf): cur_gf_mlps = list(gf_channels[gf_index]) cur_gf_mlps = [gf_in_channel] + cur_gf_mlps gf_out_channel = cur_gf_mlps[-1] self.GF_modules.append( DGCNNGFModule( mlp_channels=cur_gf_mlps, num_sample=num_samples[gf_index], knn_mode=knn_modes[gf_index], radius=radius[gf_index], act_cfg=act_cfg)) skip_channel_list.append(gf_out_channel) gf_in_channel = gf_out_channel * 2 fa_in_channel = sum(skip_channel_list[1:]) cur_fa_mlps = list(fa_channels) cur_fa_mlps = [fa_in_channel] + cur_fa_mlps self.FA_module = DGCNNFAModule( mlp_channels=cur_fa_mlps, act_cfg=act_cfg) @auto_fp16(apply_to=('points', )) def forward(self, points): """Forward pass. Args: points (torch.Tensor): point coordinates with features, with shape (B, N, in_channels). Returns: dict[str, list[torch.Tensor]]: Outputs after graph feature (GF) and feature aggregation (FA) modules. - gf_points (list[torch.Tensor]): Outputs after each GF module. - fa_points (torch.Tensor): Outputs after FA module. """ gf_points = [points] for i in range(self.num_gf): cur_points = self.GF_modules[i](gf_points[i]) gf_points.append(cur_points) fa_points = self.FA_module(gf_points) out = dict(gf_points=gf_points, fa_points=fa_points) return out ================================================ FILE: mmdet3d/models/backbones/dla.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import warnings import torch from mmcv.cnn import build_conv_layer, build_norm_layer from mmcv.runner import BaseModule from torch import nn from ..builder import BACKBONES def dla_build_norm_layer(cfg, num_features): """Build normalization layer specially designed for DLANet. Args: cfg (dict): The norm layer config, which should contain: - type (str): Layer type. - layer args: Args needed to instantiate a norm layer. - requires_grad (bool, optional): Whether stop gradient updates. num_features (int): Number of input channels. Returns: Function: Build normalization layer in mmcv. """ cfg_ = cfg.copy() if cfg_['type'] == 'GN': if num_features % 32 == 0: return build_norm_layer(cfg_, num_features) else: assert 'num_groups' in cfg_ cfg_['num_groups'] = cfg_['num_groups'] // 2 return build_norm_layer(cfg_, num_features) else: return build_norm_layer(cfg_, num_features) class BasicBlock(BaseModule): """BasicBlock in DLANet. Args: in_channels (int): Input feature channel. out_channels (int): Output feature channel. norm_cfg (dict): Dictionary to construct and config norm layer. conv_cfg (dict): Dictionary to construct and config conv layer. stride (int, optional): Conv stride. Default: 1. dilation (int, optional): Conv dilation. Default: 1. init_cfg (dict, optional): Initialization config. Default: None. 
""" def __init__(self, in_channels, out_channels, norm_cfg, conv_cfg, stride=1, dilation=1, init_cfg=None): super(BasicBlock, self).__init__(init_cfg) self.conv1 = build_conv_layer( conv_cfg, in_channels, out_channels, 3, stride=stride, padding=dilation, dilation=dilation, bias=False) self.norm1 = dla_build_norm_layer(norm_cfg, out_channels)[1] self.relu = nn.ReLU(inplace=True) self.conv2 = build_conv_layer( conv_cfg, out_channels, out_channels, 3, stride=1, padding=dilation, dilation=dilation, bias=False) self.norm2 = dla_build_norm_layer(norm_cfg, out_channels)[1] self.stride = stride def forward(self, x, identity=None): """Forward function.""" if identity is None: identity = x out = self.conv1(x) out = self.norm1(out) out = self.relu(out) out = self.conv2(out) out = self.norm2(out) out += identity out = self.relu(out) return out class Root(BaseModule): """Root in DLANet. Args: in_channels (int): Input feature channel. out_channels (int): Output feature channel. norm_cfg (dict): Dictionary to construct and config norm layer. conv_cfg (dict): Dictionary to construct and config conv layer. kernel_size (int): Size of convolution kernel. add_identity (bool): Whether to add identity in root. init_cfg (dict, optional): Initialization config. Default: None. """ def __init__(self, in_channels, out_channels, norm_cfg, conv_cfg, kernel_size, add_identity, init_cfg=None): super(Root, self).__init__(init_cfg) self.conv = build_conv_layer( conv_cfg, in_channels, out_channels, 1, stride=1, padding=(kernel_size - 1) // 2, bias=False) self.norm = dla_build_norm_layer(norm_cfg, out_channels)[1] self.relu = nn.ReLU(inplace=True) self.add_identity = add_identity def forward(self, feat_list): """Forward function. Args: feat_list (list[torch.Tensor]): Output features from multiple layers. """ children = feat_list x = self.conv(torch.cat(feat_list, 1)) x = self.norm(x) if self.add_identity: x += children[0] x = self.relu(x) return x class Tree(BaseModule): """Tree in DLANet. Args: levels (int): The level of the tree. block (nn.Module): The block module in tree. in_channels: Input feature channel. out_channels: Output feature channel. norm_cfg (dict): Dictionary to construct and config norm layer. conv_cfg (dict): Dictionary to construct and config conv layer. stride (int, optional): Convolution stride. Default: 1. level_root (bool, optional): whether belongs to the root layer. root_dim (int, optional): Root input feature channel. root_kernel_size (int, optional): Size of root convolution kernel. Default: 1. dilation (int, optional): Conv dilation. Default: 1. add_identity (bool, optional): Whether to add identity in root. Default: False. init_cfg (dict, optional): Initialization config. Default: None. 
""" def __init__(self, levels, block, in_channels, out_channels, norm_cfg, conv_cfg, stride=1, level_root=False, root_dim=None, root_kernel_size=1, dilation=1, add_identity=False, init_cfg=None): super(Tree, self).__init__(init_cfg) if root_dim is None: root_dim = 2 * out_channels if level_root: root_dim += in_channels if levels == 1: self.root = Root(root_dim, out_channels, norm_cfg, conv_cfg, root_kernel_size, add_identity) self.tree1 = block( in_channels, out_channels, norm_cfg, conv_cfg, stride, dilation=dilation) self.tree2 = block( out_channels, out_channels, norm_cfg, conv_cfg, 1, dilation=dilation) else: self.tree1 = Tree( levels - 1, block, in_channels, out_channels, norm_cfg, conv_cfg, stride, root_dim=None, root_kernel_size=root_kernel_size, dilation=dilation, add_identity=add_identity) self.tree2 = Tree( levels - 1, block, out_channels, out_channels, norm_cfg, conv_cfg, root_dim=root_dim + out_channels, root_kernel_size=root_kernel_size, dilation=dilation, add_identity=add_identity) self.level_root = level_root self.root_dim = root_dim self.downsample = None self.project = None self.levels = levels if stride > 1: self.downsample = nn.MaxPool2d(stride, stride=stride) if in_channels != out_channels: self.project = nn.Sequential( build_conv_layer( conv_cfg, in_channels, out_channels, 1, stride=1, bias=False), dla_build_norm_layer(norm_cfg, out_channels)[1]) def forward(self, x, identity=None, children=None): children = [] if children is None else children bottom = self.downsample(x) if self.downsample else x identity = self.project(bottom) if self.project else bottom if self.level_root: children.append(bottom) x1 = self.tree1(x, identity) if self.levels == 1: x2 = self.tree2(x1) feat_list = [x2, x1] + children x = self.root(feat_list) else: children.append(x1) x = self.tree2(x1, children=children) return x @BACKBONES.register_module() class DLANet(BaseModule): r"""`DLA backbone `_. Args: depth (int): Depth of DLA. Default: 34. in_channels (int, optional): Number of input image channels. Default: 3. norm_cfg (dict, optional): Dictionary to construct and config norm layer. Default: None. conv_cfg (dict, optional): Dictionary to construct and config conv layer. Default: None. layer_with_level_root (list[bool], optional): Whether to apply level_root in each DLA layer, this is only used for tree levels. Default: (False, True, True, True). with_identity_root (bool, optional): Whether to add identity in root layer. Default: False. pretrained (str, optional): model pretrained path. Default: None. init_cfg (dict or list[dict], optional): Initialization config dict. 
Default: None """ arch_settings = { 34: (BasicBlock, (1, 1, 1, 2, 2, 1), (16, 32, 64, 128, 256, 512)), } def __init__(self, depth, in_channels=3, out_indices=(0, 1, 2, 3, 4, 5), frozen_stages=-1, norm_cfg=None, conv_cfg=None, layer_with_level_root=(False, True, True, True), with_identity_root=False, pretrained=None, init_cfg=None): super(DLANet, self).__init__(init_cfg) if depth not in self.arch_settings: raise KeyError(f'invalida depth {depth} for DLA') assert not (init_cfg and pretrained), \ 'init_cfg and pretrained cannot be setting at the same time' if isinstance(pretrained, str): warnings.warn('DeprecationWarning: pretrained is a deprecated, ' 'please use "init_cfg" instead') self.init_cfg = dict(type='Pretrained', checkpoint=pretrained) elif pretrained is None: if init_cfg is None: self.init_cfg = [ dict(type='Kaiming', layer='Conv2d'), dict( type='Constant', val=1, layer=['_BatchNorm', 'GroupNorm']) ] block, levels, channels = self.arch_settings[depth] self.channels = channels self.num_levels = len(levels) self.frozen_stages = frozen_stages self.out_indices = out_indices assert max(out_indices) < self.num_levels self.base_layer = nn.Sequential( build_conv_layer( conv_cfg, in_channels, channels[0], 7, stride=1, padding=3, bias=False), dla_build_norm_layer(norm_cfg, channels[0])[1], nn.ReLU(inplace=True)) # DLANet first uses two conv layers then uses several # Tree layers for i in range(2): level_layer = self._make_conv_level( channels[0], channels[i], levels[i], norm_cfg, conv_cfg, stride=i + 1) layer_name = f'level{i}' self.add_module(layer_name, level_layer) for i in range(2, self.num_levels): dla_layer = Tree( levels[i], block, channels[i - 1], channels[i], norm_cfg, conv_cfg, 2, level_root=layer_with_level_root[i - 2], add_identity=with_identity_root) layer_name = f'level{i}' self.add_module(layer_name, dla_layer) self._freeze_stages() def _make_conv_level(self, in_channels, out_channels, num_convs, norm_cfg, conv_cfg, stride=1, dilation=1): """Conv modules. Args: in_channels (int): Input feature channel. out_channels (int): Output feature channel. num_convs (int): Number of Conv module. norm_cfg (dict): Dictionary to construct and config norm layer. conv_cfg (dict): Dictionary to construct and config conv layer. stride (int, optional): Conv stride. Default: 1. dilation (int, optional): Conv dilation. Default: 1. """ modules = [] for i in range(num_convs): modules.extend([ build_conv_layer( conv_cfg, in_channels, out_channels, 3, stride=stride if i == 0 else 1, padding=dilation, bias=False, dilation=dilation), dla_build_norm_layer(norm_cfg, out_channels)[1], nn.ReLU(inplace=True) ]) in_channels = out_channels return nn.Sequential(*modules) def _freeze_stages(self): if self.frozen_stages >= 0: self.base_layer.eval() for param in self.base_layer.parameters(): param.requires_grad = False for i in range(2): m = getattr(self, f'level{i}') m.eval() for param in m.parameters(): param.requires_grad = False for i in range(1, self.frozen_stages + 1): m = getattr(self, f'level{i+1}') m.eval() for param in m.parameters(): param.requires_grad = False def forward(self, x): outs = [] x = self.base_layer(x) for i in range(self.num_levels): x = getattr(self, 'level{}'.format(i))(x) if i in self.out_indices: outs.append(x) return tuple(outs) ================================================ FILE: mmdet3d/models/backbones/load.py ================================================ # Copyright (c) Open-MMLab. All rights reserved. 
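# Usage sketch for save_checkpoint below (illustrative; the path and meta values
# are assumptions):
#
#   save_checkpoint(model, 'work_dirs/bev_planner/epoch_1.pth',
#                   optimizer=optimizer, meta=dict(epoch=1, iter=0))
#
# The resulting checkpoint dict holds 'meta', 'state_dict' and, if an optimizer
# is passed, 'optimizer'; the commented-out apex line in the body would
# additionally store AMP state.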
import os.path as osp import time from tempfile import TemporaryDirectory import torch from torch.optim import Optimizer import mmcv from mmcv.parallel import is_module_wrapper from mmcv.runner.checkpoint import weights_to_cpu, get_state_dict try: import apex except: print('apex is not installed') def save_checkpoint(model, filename, optimizer=None, meta=None): """Save checkpoint to file. The checkpoint will have 4 fields: ``meta``, ``state_dict`` and ``optimizer``, ``amp``. By default ``meta`` will contain version and time info. Args: model (Module): Module whose params are to be saved. filename (str): Checkpoint filename. optimizer (:obj:`Optimizer`, optional): Optimizer to be saved. meta (dict, optional): Metadata to be saved in checkpoint. """ if meta is None: meta = {} elif not isinstance(meta, dict): raise TypeError(f'meta must be a dict or None, but got {type(meta)}') meta.update(mmcv_version=mmcv.__version__, time=time.asctime()) if is_module_wrapper(model): model = model.module if hasattr(model, 'CLASSES') and model.CLASSES is not None: # save class name to the meta meta.update(CLASSES=model.CLASSES) checkpoint = { 'meta': meta, 'state_dict': weights_to_cpu(get_state_dict(model)) } # save optimizer state dict in the checkpoint if isinstance(optimizer, Optimizer): checkpoint['optimizer'] = optimizer.state_dict() elif isinstance(optimizer, dict): checkpoint['optimizer'] = {} for name, optim in optimizer.items(): checkpoint['optimizer'][name] = optim.state_dict() # save amp state dict in the checkpoint # checkpoint['amp'] = apex.amp.state_dict() if filename.startswith('pavi://'): try: from pavi import modelcloud from pavi.exception import NodeNotFoundError except ImportError: raise ImportError( 'Please install pavi to load checkpoint from modelcloud.') model_path = filename[7:] root = modelcloud.Folder() model_dir, model_name = osp.split(model_path) try: model = modelcloud.get(model_dir) except NodeNotFoundError: model = root.create_training_model(model_dir) with TemporaryDirectory() as tmp_dir: checkpoint_file = osp.join(tmp_dir, model_name) with open(checkpoint_file, 'wb') as f: torch.save(checkpoint, f) f.flush() model.create_file(checkpoint_file, name=model_name) else: mmcv.mkdir_or_exist(osp.dirname(filename)) # immediately flush buffer with open(filename, 'wb') as f: torch.save(checkpoint, f) f.flush() ================================================ FILE: mmdet3d/models/backbones/mink_resnet.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. # Follow https://github.com/NVIDIA/MinkowskiEngine/blob/master/examples/resnet.py # noqa # and mmcv.cnn.ResNet try: import MinkowskiEngine as ME from MinkowskiEngine.modules.resnet_block import BasicBlock, Bottleneck except ImportError: # Please follow getting_started.md to install MinkowskiEngine. # blocks are used in the static part of MinkResNet BasicBlock, Bottleneck = None, None import torch.nn as nn from mmdet3d.models.builder import BACKBONES @BACKBONES.register_module() class MinkResNet(nn.Module): r"""Minkowski ResNet backbone. See `4D Spatio-Temporal ConvNets `_ for more details. Args: depth (int): Depth of resnet, from {18, 34, 50, 101, 152}. in_channels (ont): Number of input channels, 3 for RGB. num_stages (int, optional): Resnet stages. Default: 4. pool (bool, optional): Add max pooling after first conv if True. Default: True. 
""" arch_settings = { 18: (BasicBlock, (2, 2, 2, 2)), 34: (BasicBlock, (3, 4, 6, 3)), 50: (Bottleneck, (3, 4, 6, 3)), 101: (Bottleneck, (3, 4, 23, 3)), 152: (Bottleneck, (3, 8, 36, 3)) } def __init__(self, depth, in_channels, num_stages=4, pool=True): super(MinkResNet, self).__init__() if depth not in self.arch_settings: raise KeyError(f'invalid depth {depth} for resnet') assert 4 >= num_stages >= 1 block, stage_blocks = self.arch_settings[depth] stage_blocks = stage_blocks[:num_stages] self.num_stages = num_stages self.pool = pool self.inplanes = 64 self.conv1 = ME.MinkowskiConvolution( in_channels, self.inplanes, kernel_size=3, stride=2, dimension=3) # May be BatchNorm is better, but we follow original implementation. self.norm1 = ME.MinkowskiInstanceNorm(self.inplanes) self.relu = ME.MinkowskiReLU(inplace=True) if self.pool: self.maxpool = ME.MinkowskiMaxPooling( kernel_size=2, stride=2, dimension=3) for i, num_blocks in enumerate(stage_blocks): setattr( self, f'layer{i + 1}', self._make_layer(block, 64 * 2**i, stage_blocks[i], stride=2)) def init_weights(self): for m in self.modules(): if isinstance(m, ME.MinkowskiConvolution): ME.utils.kaiming_normal_( m.kernel, mode='fan_out', nonlinearity='relu') if isinstance(m, ME.MinkowskiBatchNorm): nn.init.constant_(m.bn.weight, 1) nn.init.constant_(m.bn.bias, 0) def _make_layer(self, block, planes, blocks, stride): downsample = None if stride != 1 or self.inplanes != planes * block.expansion: downsample = nn.Sequential( ME.MinkowskiConvolution( self.inplanes, planes * block.expansion, kernel_size=1, stride=stride, dimension=3), ME.MinkowskiBatchNorm(planes * block.expansion)) layers = [] layers.append( block( self.inplanes, planes, stride=stride, downsample=downsample, dimension=3)) self.inplanes = planes * block.expansion for i in range(1, blocks): layers.append(block(self.inplanes, planes, stride=1, dimension=3)) return nn.Sequential(*layers) def forward(self, x): """Forward pass of ResNet. Args: x (ME.SparseTensor): Input sparse tensor. Returns: list[ME.SparseTensor]: Output sparse tensors. """ x = self.conv1(x) x = self.norm1(x) x = self.relu(x) if self.pool: x = self.maxpool(x) outs = [] for i in range(self.num_stages): x = getattr(self, f'layer{i + 1}')(x) outs.append(x) return outs ================================================ FILE: mmdet3d/models/backbones/multi_backbone.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import copy import warnings import torch from mmcv.cnn import ConvModule from mmcv.runner import BaseModule, auto_fp16 from torch import nn as nn from ..builder import BACKBONES, build_backbone @BACKBONES.register_module() class MultiBackbone(BaseModule): """MultiBackbone with different configs. Args: num_streams (int): The number of backbones. backbones (list or dict): A list of backbone configs. aggregation_mlp_channels (list[int]): Specify the mlp layers for feature aggregation. conv_cfg (dict): Config dict of convolutional layers. norm_cfg (dict): Config dict of normalization layers. act_cfg (dict): Config dict of activation layers. suffixes (list): A list of suffixes to rename the return dict for each backbone. 
""" def __init__(self, num_streams, backbones, aggregation_mlp_channels=None, conv_cfg=dict(type='Conv1d'), norm_cfg=dict(type='BN1d', eps=1e-5, momentum=0.01), act_cfg=dict(type='ReLU'), suffixes=('net0', 'net1'), init_cfg=None, pretrained=None, **kwargs): super().__init__(init_cfg=init_cfg) assert isinstance(backbones, dict) or isinstance(backbones, list) if isinstance(backbones, dict): backbones_list = [] for ind in range(num_streams): backbones_list.append(copy.deepcopy(backbones)) backbones = backbones_list assert len(backbones) == num_streams assert len(suffixes) == num_streams self.backbone_list = nn.ModuleList() # Rename the ret_dict with different suffixs. self.suffixes = suffixes out_channels = 0 for backbone_cfg in backbones: out_channels += backbone_cfg['fp_channels'][-1][-1] self.backbone_list.append(build_backbone(backbone_cfg)) # Feature aggregation layers if aggregation_mlp_channels is None: aggregation_mlp_channels = [ out_channels, out_channels // 2, out_channels // len(self.backbone_list) ] else: aggregation_mlp_channels.insert(0, out_channels) self.aggregation_layers = nn.Sequential() for i in range(len(aggregation_mlp_channels) - 1): self.aggregation_layers.add_module( f'layer{i}', ConvModule( aggregation_mlp_channels[i], aggregation_mlp_channels[i + 1], 1, padding=0, conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=act_cfg, bias=True, inplace=True)) assert not (init_cfg and pretrained), \ 'init_cfg and pretrained cannot be setting at the same time' if isinstance(pretrained, str): warnings.warn('DeprecationWarning: pretrained is a deprecated, ' 'please use "init_cfg" instead') self.init_cfg = dict(type='Pretrained', checkpoint=pretrained) @auto_fp16() def forward(self, points): """Forward pass. Args: points (torch.Tensor): point coordinates with features, with shape (B, N, 3 + input_feature_dim). Returns: dict[str, list[torch.Tensor]]: Outputs from multiple backbones. - fp_xyz[suffix] (list[torch.Tensor]): The coordinates of each fp features. - fp_features[suffix] (list[torch.Tensor]): The features from each Feature Propagate Layers. - fp_indices[suffix] (list[torch.Tensor]): Indices of the input points. - hd_feature (torch.Tensor): The aggregation feature from multiple backbones. """ ret = {} fp_features = [] for ind in range(len(self.backbone_list)): cur_ret = self.backbone_list[ind](points) cur_suffix = self.suffixes[ind] fp_features.append(cur_ret['fp_features'][-1]) if cur_suffix != '': for k in cur_ret.keys(): cur_ret[k + '_' + cur_suffix] = cur_ret.pop(k) ret.update(cur_ret) # Combine the features here hd_feature = torch.cat(fp_features, dim=1) hd_feature = self.aggregation_layers(hd_feature) ret['hd_feature'] = hd_feature return ret ================================================ FILE: mmdet3d/models/backbones/nostem_regnet.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. from mmdet.models.backbones import RegNet from ..builder import BACKBONES @BACKBONES.register_module() class NoStemRegNet(RegNet): """RegNet backbone without Stem for 3D detection. More details can be found in `paper `_ . Args: arch (dict): The parameter of RegNets. - w0 (int): Initial width. - wa (float): Slope of width. - wm (float): Quantization parameter to quantize the width. - depth (int): Depth of the backbone. - group_w (int): Width of group. - bot_mul (float): Bottleneck ratio, i.e. expansion of bottleneck. strides (Sequence[int]): Strides of the first block of each stage. base_channels (int): Base channels after stem layer. 
in_channels (int): Number of input image channels. Normally 3. dilations (Sequence[int]): Dilation of each stage. out_indices (Sequence[int]): Output from which stages. style (str): `pytorch` or `caffe`. If set to "pytorch", the stride-two layer is the 3x3 conv layer, otherwise the stride-two layer is the first 1x1 conv layer. frozen_stages (int): Stages to be frozen (all param fixed). -1 means not freezing any parameters. norm_cfg (dict): Dictionary to construct and config norm layer. norm_eval (bool): Whether to set norm layers to eval mode, namely, freeze running stats (mean and var). Note: Effect on Batch Norm and its variants only. with_cp (bool): Use checkpoint or not. Using checkpoint will save some memory while slowing down the training speed. zero_init_residual (bool): Whether to use zero init for last norm layer in resblocks to let them behave as identity. Example: >>> from mmdet3d.models import NoStemRegNet >>> import torch >>> self = NoStemRegNet( arch=dict( w0=88, wa=26.31, wm=2.25, group_w=48, depth=25, bot_mul=1.0)) >>> self.eval() >>> inputs = torch.rand(1, 64, 16, 16) >>> level_outputs = self.forward(inputs) >>> for level_out in level_outputs: ... print(tuple(level_out.shape)) (1, 96, 8, 8) (1, 192, 4, 4) (1, 432, 2, 2) (1, 1008, 1, 1) """ def __init__(self, arch, init_cfg=None, **kwargs): super(NoStemRegNet, self).__init__(arch, init_cfg=init_cfg, **kwargs) def _make_stem_layer(self, in_channels, base_channels): """Override the original function that do not initialize a stem layer since 3D detector's voxel encoder works like a stem layer.""" return def forward(self, x): """Forward function of backbone. Args: x (torch.Tensor): Features in shape (N, C, H, W). Returns: tuple[torch.Tensor]: Multi-scale features. """ outs = [] for i, layer_name in enumerate(self.res_layers): res_layer = getattr(self, layer_name) x = res_layer(x) if i in self.out_indices: outs.append(x) return tuple(outs) ================================================ FILE: mmdet3d/models/backbones/pointnet2_sa_msg.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import torch from mmcv.cnn import ConvModule from mmcv.runner import auto_fp16 from torch import nn as nn from mmdet3d.ops import build_sa_module from ..builder import BACKBONES from .base_pointnet import BasePointNet @BACKBONES.register_module() class PointNet2SAMSG(BasePointNet): """PointNet2 with Multi-scale grouping. Args: in_channels (int): Input channels of point cloud. num_points (tuple[int]): The number of points which each SA module samples. radii (tuple[float]): Sampling radii of each SA module. num_samples (tuple[int]): The number of samples for ball query in each SA module. sa_channels (tuple[tuple[int]]): Out channels of each mlp in SA module. aggregation_channels (tuple[int]): Out channels of aggregation multi-scale grouping features. fps_mods (tuple[int]): Mod of FPS for each SA module. fps_sample_range_lists (tuple[tuple[int]]): The number of sampling points which each SA module samples. dilated_group (tuple[bool]): Whether to use dilated ball query for out_indices (Sequence[int]): Output from which stages. norm_cfg (dict): Config of normalization layer. sa_cfg (dict): Config of set abstraction module, which may contain the following keys and values: - pool_mod (str): Pool method ('max' or 'avg') for SA modules. - use_xyz (bool): Whether to use xyz as a part of features. - normalize_xyz (bool): Whether to normalize xyz with radii in each SA module. 
""" def __init__(self, in_channels, num_points=(2048, 1024, 512, 256), radii=((0.2, 0.4, 0.8), (0.4, 0.8, 1.6), (1.6, 3.2, 4.8)), num_samples=((32, 32, 64), (32, 32, 64), (32, 32, 32)), sa_channels=(((16, 16, 32), (16, 16, 32), (32, 32, 64)), ((64, 64, 128), (64, 64, 128), (64, 96, 128)), ((128, 128, 256), (128, 192, 256), (128, 256, 256))), aggregation_channels=(64, 128, 256), fps_mods=(('D-FPS'), ('FS'), ('F-FPS', 'D-FPS')), fps_sample_range_lists=((-1), (-1), (512, -1)), dilated_group=(True, True, True), out_indices=(2, ), norm_cfg=dict(type='BN2d'), sa_cfg=dict( type='PointSAModuleMSG', pool_mod='max', use_xyz=True, normalize_xyz=False), init_cfg=None): super().__init__(init_cfg=init_cfg) self.num_sa = len(sa_channels) self.out_indices = out_indices assert max(out_indices) < self.num_sa assert len(num_points) == len(radii) == len(num_samples) == len( sa_channels) if aggregation_channels is not None: assert len(sa_channels) == len(aggregation_channels) else: aggregation_channels = [None] * len(sa_channels) self.SA_modules = nn.ModuleList() self.aggregation_mlps = nn.ModuleList() sa_in_channel = in_channels - 3 # number of channels without xyz skip_channel_list = [sa_in_channel] for sa_index in range(self.num_sa): cur_sa_mlps = list(sa_channels[sa_index]) sa_out_channel = 0 for radius_index in range(len(radii[sa_index])): cur_sa_mlps[radius_index] = [sa_in_channel] + list( cur_sa_mlps[radius_index]) sa_out_channel += cur_sa_mlps[radius_index][-1] if isinstance(fps_mods[sa_index], tuple): cur_fps_mod = list(fps_mods[sa_index]) else: cur_fps_mod = list([fps_mods[sa_index]]) if isinstance(fps_sample_range_lists[sa_index], tuple): cur_fps_sample_range_list = list( fps_sample_range_lists[sa_index]) else: cur_fps_sample_range_list = list( [fps_sample_range_lists[sa_index]]) self.SA_modules.append( build_sa_module( num_point=num_points[sa_index], radii=radii[sa_index], sample_nums=num_samples[sa_index], mlp_channels=cur_sa_mlps, fps_mod=cur_fps_mod, fps_sample_range_list=cur_fps_sample_range_list, dilated_group=dilated_group[sa_index], norm_cfg=norm_cfg, cfg=sa_cfg, bias=True)) skip_channel_list.append(sa_out_channel) cur_aggregation_channel = aggregation_channels[sa_index] if cur_aggregation_channel is None: self.aggregation_mlps.append(None) sa_in_channel = sa_out_channel else: self.aggregation_mlps.append( ConvModule( sa_out_channel, cur_aggregation_channel, conv_cfg=dict(type='Conv1d'), norm_cfg=dict(type='BN1d'), kernel_size=1, bias=True)) sa_in_channel = cur_aggregation_channel @auto_fp16(apply_to=('points', )) def forward(self, points): """Forward pass. Args: points (torch.Tensor): point coordinates with features, with shape (B, N, 3 + input_feature_dim). Returns: dict[str, torch.Tensor]: Outputs of the last SA module. - sa_xyz (torch.Tensor): The coordinates of sa features. - sa_features (torch.Tensor): The features from the last Set Aggregation Layers. - sa_indices (torch.Tensor): Indices of the input points. 
""" xyz, features = self._split_point_feats(points) batch, num_points = xyz.shape[:2] indices = xyz.new_tensor(range(num_points)).unsqueeze(0).repeat( batch, 1).long() sa_xyz = [xyz] sa_features = [features] sa_indices = [indices] out_sa_xyz = [xyz] out_sa_features = [features] out_sa_indices = [indices] for i in range(self.num_sa): cur_xyz, cur_features, cur_indices = self.SA_modules[i]( sa_xyz[i], sa_features[i]) if self.aggregation_mlps[i] is not None: cur_features = self.aggregation_mlps[i](cur_features) sa_xyz.append(cur_xyz) sa_features.append(cur_features) sa_indices.append( torch.gather(sa_indices[-1], 1, cur_indices.long())) if i in self.out_indices: out_sa_xyz.append(sa_xyz[-1]) out_sa_features.append(sa_features[-1]) out_sa_indices.append(sa_indices[-1]) return dict( sa_xyz=out_sa_xyz, sa_features=out_sa_features, sa_indices=out_sa_indices) ================================================ FILE: mmdet3d/models/backbones/pointnet2_sa_ssg.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import torch from mmcv.runner import auto_fp16 from torch import nn as nn from mmdet3d.ops import PointFPModule, build_sa_module from ..builder import BACKBONES from .base_pointnet import BasePointNet @BACKBONES.register_module() class PointNet2SASSG(BasePointNet): """PointNet2 with Single-scale grouping. Args: in_channels (int): Input channels of point cloud. num_points (tuple[int]): The number of points which each SA module samples. radius (tuple[float]): Sampling radii of each SA module. num_samples (tuple[int]): The number of samples for ball query in each SA module. sa_channels (tuple[tuple[int]]): Out channels of each mlp in SA module. fp_channels (tuple[tuple[int]]): Out channels of each mlp in FP module. norm_cfg (dict): Config of normalization layer. sa_cfg (dict): Config of set abstraction module, which may contain the following keys and values: - pool_mod (str): Pool method ('max' or 'avg') for SA modules. - use_xyz (bool): Whether to use xyz as a part of features. - normalize_xyz (bool): Whether to normalize xyz with radii in each SA module. 
""" def __init__(self, in_channels, num_points=(2048, 1024, 512, 256), radius=(0.2, 0.4, 0.8, 1.2), num_samples=(64, 32, 16, 16), sa_channels=((64, 64, 128), (128, 128, 256), (128, 128, 256), (128, 128, 256)), fp_channels=((256, 256), (256, 256)), norm_cfg=dict(type='BN2d'), sa_cfg=dict( type='PointSAModule', pool_mod='max', use_xyz=True, normalize_xyz=True), init_cfg=None): super().__init__(init_cfg=init_cfg) self.num_sa = len(sa_channels) self.num_fp = len(fp_channels) assert len(num_points) == len(radius) == len(num_samples) == len( sa_channels) assert len(sa_channels) >= len(fp_channels) self.SA_modules = nn.ModuleList() sa_in_channel = in_channels - 3 # number of channels without xyz skip_channel_list = [sa_in_channel] for sa_index in range(self.num_sa): cur_sa_mlps = list(sa_channels[sa_index]) cur_sa_mlps = [sa_in_channel] + cur_sa_mlps sa_out_channel = cur_sa_mlps[-1] self.SA_modules.append( build_sa_module( num_point=num_points[sa_index], radius=radius[sa_index], num_sample=num_samples[sa_index], mlp_channels=cur_sa_mlps, norm_cfg=norm_cfg, cfg=sa_cfg)) skip_channel_list.append(sa_out_channel) sa_in_channel = sa_out_channel self.FP_modules = nn.ModuleList() fp_source_channel = skip_channel_list.pop() fp_target_channel = skip_channel_list.pop() for fp_index in range(len(fp_channels)): cur_fp_mlps = list(fp_channels[fp_index]) cur_fp_mlps = [fp_source_channel + fp_target_channel] + cur_fp_mlps self.FP_modules.append(PointFPModule(mlp_channels=cur_fp_mlps)) if fp_index != len(fp_channels) - 1: fp_source_channel = cur_fp_mlps[-1] fp_target_channel = skip_channel_list.pop() @auto_fp16(apply_to=('points', )) def forward(self, points): """Forward pass. Args: points (torch.Tensor): point coordinates with features, with shape (B, N, 3 + input_feature_dim). Returns: dict[str, list[torch.Tensor]]: Outputs after SA and FP modules. - fp_xyz (list[torch.Tensor]): The coordinates of each fp features. - fp_features (list[torch.Tensor]): The features from each Feature Propagate Layers. - fp_indices (list[torch.Tensor]): Indices of the input points. """ xyz, features = self._split_point_feats(points) batch, num_points = xyz.shape[:2] indices = xyz.new_tensor(range(num_points)).unsqueeze(0).repeat( batch, 1).long() sa_xyz = [xyz] sa_features = [features] sa_indices = [indices] for i in range(self.num_sa): cur_xyz, cur_features, cur_indices = self.SA_modules[i]( sa_xyz[i], sa_features[i]) sa_xyz.append(cur_xyz) sa_features.append(cur_features) sa_indices.append( torch.gather(sa_indices[-1], 1, cur_indices.long())) fp_xyz = [sa_xyz[-1]] fp_features = [sa_features[-1]] fp_indices = [sa_indices[-1]] for i in range(self.num_fp): fp_features.append(self.FP_modules[i]( sa_xyz[self.num_sa - i - 1], sa_xyz[self.num_sa - i], sa_features[self.num_sa - i - 1], fp_features[-1])) fp_xyz.append(sa_xyz[self.num_sa - i - 1]) fp_indices.append(sa_indices[self.num_sa - i - 1]) ret = dict( fp_xyz=fp_xyz, fp_features=fp_features, fp_indices=fp_indices, sa_xyz=sa_xyz, sa_features=sa_features, sa_indices=sa_indices) return ret ================================================ FILE: mmdet3d/models/backbones/resnet.py ================================================ # Copyright (c) Phigent Robotics. All rights reserved. 
import torch.utils.checkpoint as checkpoint from torch import nn from mmdet.models import BACKBONES from mmdet.models.backbones.resnet import BasicBlock, Bottleneck @BACKBONES.register_module() class CustomResNet(nn.Module): def __init__( self, numC_input, num_layer=[2, 2, 2], num_channels=None, stride=[2, 2, 2], backbone_output_ids=None, norm_cfg=dict(type='BN'), with_cp=False, block_type='Basic', ): super(CustomResNet, self).__init__() # build backbone assert len(num_layer) == len(stride) num_channels = [numC_input*2**(i+1) for i in range(len(num_layer))] \ if num_channels is None else num_channels self.backbone_output_ids = range(len(num_layer)) \ if backbone_output_ids is None else backbone_output_ids layers = [] if block_type == 'BottleNeck': curr_numC = numC_input for i in range(len(num_layer)): layer = [ Bottleneck( curr_numC, num_channels[i] // 4, stride=stride[i], downsample=nn.Conv2d(curr_numC, num_channels[i], 3, stride[i], 1), norm_cfg=norm_cfg) ] curr_numC = num_channels[i] layer.extend([ Bottleneck(curr_numC, curr_numC // 4, norm_cfg=norm_cfg) for _ in range(num_layer[i] - 1) ]) layers.append(nn.Sequential(*layer)) elif block_type == 'Basic': curr_numC = numC_input for i in range(len(num_layer)): layer = [ BasicBlock( curr_numC, num_channels[i], stride=stride[i], downsample=nn.Conv2d(curr_numC, num_channels[i], 3, stride[i], 1), norm_cfg=norm_cfg) ] curr_numC = num_channels[i] layer.extend([ BasicBlock(curr_numC, curr_numC, norm_cfg=norm_cfg) for _ in range(num_layer[i] - 1) ]) layers.append(nn.Sequential(*layer)) else: assert False self.layers = nn.Sequential(*layers) self.with_cp = with_cp def forward(self, x): feats = [] x_tmp = x for lid, layer in enumerate(self.layers): if self.with_cp: x_tmp = checkpoint.checkpoint(layer, x_tmp) else: x_tmp = layer(x_tmp) if lid in self.backbone_output_ids: feats.append(x_tmp) return feats ================================================ FILE: mmdet3d/models/backbones/second.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import warnings from mmcv.cnn import build_conv_layer, build_norm_layer from mmcv.runner import BaseModule from torch import nn as nn from ..builder import BACKBONES @BACKBONES.register_module() class SECOND(BaseModule): """Backbone network for SECOND/PointPillars/PartA2/MVXNet. Args: in_channels (int): Input channels. out_channels (list[int]): Output channels for multi-scale feature maps. layer_nums (list[int]): Number of layers in each stage. layer_strides (list[int]): Strides of each stage. norm_cfg (dict): Config dict of normalization layers. conv_cfg (dict): Config dict of convolutional layers. """ def __init__(self, in_channels=128, out_channels=[128, 128, 256], layer_nums=[3, 5, 5], layer_strides=[2, 2, 2], norm_cfg=dict(type='BN', eps=1e-3, momentum=0.01), conv_cfg=dict(type='Conv2d', bias=False), init_cfg=None, pretrained=None): super(SECOND, self).__init__(init_cfg=init_cfg) assert len(layer_strides) == len(layer_nums) assert len(out_channels) == len(layer_nums) in_filters = [in_channels, *out_channels[:-1]] # note that when stride > 1, conv2d with same padding isn't # equal to pad-conv2d. we should use pad-conv2d. 
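# Each stage below therefore begins with an explicitly padded, strided 3x3
# conv for downsampling, followed by `layer_num` additional 3x3 convs,
# each wrapped with BN and ReLU.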
blocks = [] for i, layer_num in enumerate(layer_nums): block = [ build_conv_layer( conv_cfg, in_filters[i], out_channels[i], 3, stride=layer_strides[i], padding=1), build_norm_layer(norm_cfg, out_channels[i])[1], nn.ReLU(inplace=True), ] for j in range(layer_num): block.append( build_conv_layer( conv_cfg, out_channels[i], out_channels[i], 3, padding=1)) block.append(build_norm_layer(norm_cfg, out_channels[i])[1]) block.append(nn.ReLU(inplace=True)) block = nn.Sequential(*block) blocks.append(block) self.blocks = nn.ModuleList(blocks) assert not (init_cfg and pretrained), \ 'init_cfg and pretrained cannot be setting at the same time' if isinstance(pretrained, str): warnings.warn('DeprecationWarning: pretrained is a deprecated, ' 'please use "init_cfg" instead') self.init_cfg = dict(type='Pretrained', checkpoint=pretrained) else: self.init_cfg = dict(type='Kaiming', layer='Conv2d') def forward(self, x): """Forward function. Args: x (torch.Tensor): Input with shape (N, C, H, W). Returns: tuple[torch.Tensor]: Multi-scale features. """ outs = [] for i in range(len(self.blocks)): x = self.blocks[i](x) outs.append(x) return tuple(outs) ================================================ FILE: mmdet3d/models/backbones/swin.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import warnings from copy import deepcopy import torch import torch.nn as nn import torch.nn.functional as F from mmcv.cnn import build_norm_layer, trunc_normal_init, build_conv_layer from mmcv.cnn.bricks.transformer import FFN, build_dropout from mmcv.cnn.utils.weight_init import constant_init from mmcv.runner import _load_checkpoint from mmcv.runner.base_module import BaseModule, ModuleList from torch.nn.modules.linear import Linear from torch.nn.modules.normalization import LayerNorm import torch.utils.checkpoint as checkpoint from mmseg.ops import resize from ...utils import get_root_logger from ..builder import BACKBONES from mmcv.cnn.bricks.registry import ATTENTION from torch.nn.modules.utils import _pair as to_2tuple from collections import OrderedDict def swin_convert(ckpt): new_ckpt = OrderedDict() def correct_unfold_reduction_order(x): out_channel, in_channel = x.shape x = x.reshape(out_channel, 4, in_channel // 4) x = x[:, [0, 2, 1, 3], :].transpose(1, 2).reshape(out_channel, in_channel) return x def correct_unfold_norm_order(x): in_channel = x.shape[0] x = x.reshape(4, in_channel // 4) x = x[[0, 2, 1, 3], :].transpose(0, 1).reshape(in_channel) return x for k, v in ckpt.items(): if k.startswith('head'): continue elif k.startswith('layers'): new_v = v if 'attn.' in k: new_k = k.replace('attn.', 'attn.w_msa.') elif 'mlp.' in k: if 'mlp.fc1.' in k: new_k = k.replace('mlp.fc1.', 'ffn.layers.0.0.') elif 'mlp.fc2.' in k: new_k = k.replace('mlp.fc2.', 'ffn.layers.1.') else: new_k = k.replace('mlp.', 'ffn.') elif 'downsample' in k: new_k = k if 'reduction.' in k: new_v = correct_unfold_reduction_order(v) elif 'norm.' in k: new_v = correct_unfold_norm_order(v) else: new_k = k new_k = new_k.replace('layers', 'stages', 1) elif k.startswith('patch_embed'): new_v = v if 'proj' in k: new_k = k.replace('proj', 'projection') else: new_k = k else: new_v = v new_k = k new_ckpt[new_k] = new_v return new_ckpt # Modified from Pytorch-Image-Models class PatchEmbed(BaseModule): """Image to Patch Embedding V2. We use a conv layer to implement PatchEmbed. Args: in_channels (int): The num of input channels. Default: 3 embed_dims (int): The dimensions of embedding. 
Default: 768 conv_type (dict, optional): The config dict for conv layers type selection. Default: None. kernel_size (int): The kernel_size of embedding conv. Default: 16. stride (int): The slide stride of embedding conv. Default: None (Default to be equal with kernel_size). padding (int): The padding length of embedding conv. Default: 0. dilation (int): The dilation rate of embedding conv. Default: 1. pad_to_patch_size (bool, optional): Whether to pad feature map shape to multiple patch size. Default: True. norm_cfg (dict, optional): Config dict for normalization layer. init_cfg (`mmcv.ConfigDict`, optional): The Config for initialization. Default: None. """ def __init__(self, in_channels=3, embed_dims=768, conv_type=None, kernel_size=16, stride=16, padding=0, dilation=1, pad_to_patch_size=True, norm_cfg=None, init_cfg=None): super(PatchEmbed, self).__init__() self.embed_dims = embed_dims self.init_cfg = init_cfg if stride is None: stride = kernel_size self.pad_to_patch_size = pad_to_patch_size # The default setting of patch size is equal to kernel size. patch_size = kernel_size if isinstance(patch_size, int): patch_size = to_2tuple(patch_size) elif isinstance(patch_size, tuple): if len(patch_size) == 1: patch_size = to_2tuple(patch_size[0]) assert len(patch_size) == 2, \ f'The size of patch should have length 1 or 2, ' \ f'but got {len(patch_size)}' self.patch_size = patch_size # Use conv layer to embed conv_type = conv_type or 'Conv2d' self.projection = build_conv_layer( dict(type=conv_type), in_channels=in_channels, out_channels=embed_dims, kernel_size=kernel_size, stride=stride, padding=padding, dilation=dilation) if norm_cfg is not None: self.norm = build_norm_layer(norm_cfg, embed_dims)[1] else: self.norm = None def forward(self, x): H, W = x.shape[2], x.shape[3] # TODO: Process overlapping op if self.pad_to_patch_size: # Modify H, W to multiple of patch size. if H % self.patch_size[0] != 0: x = F.pad( x, (0, 0, 0, self.patch_size[0] - H % self.patch_size[0])) if W % self.patch_size[1] != 0: x = F.pad( x, (0, self.patch_size[1] - W % self.patch_size[1], 0, 0)) x = self.projection(x) self.DH, self.DW = x.shape[2], x.shape[3] x = x.flatten(2).transpose(1, 2) if self.norm is not None: x = self.norm(x) return x class PatchMerging(BaseModule): """Merge patch feature map. This layer use nn.Unfold to group feature map by kernel_size, and use norm and linear layer to embed grouped feature map. Args: in_channels (int): The num of input channels. out_channels (int): The num of output channels. stride (int | tuple): the stride of the sliding length in the unfold layer. Defaults: 2. (Default to be equal with kernel_size). bias (bool, optional): Whether to add bias in linear layer or not. Defaults: False. norm_cfg (dict, optional): Config dict for normalization layer. Defaults: dict(type='LN'). init_cfg (dict, optional): The extra config for initialization. Defaults: None. 
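Note:
    ``forward`` takes ``x`` of shape (B, H*W, C) together with ``hw_shape = (H, W)`` and, with the default ``stride=2``, returns a tensor of shape (B, ceil(H/2)*ceil(W/2), out_channels) plus the new ``(H, W)`` tuple.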
""" def __init__(self, in_channels, out_channels, stride=2, bias=False, norm_cfg=dict(type='LN'), init_cfg=None): super().__init__(init_cfg) self.in_channels = in_channels self.out_channels = out_channels self.stride = stride self.sampler = nn.Unfold( kernel_size=stride, dilation=1, padding=0, stride=stride) sample_dim = stride**2 * in_channels if norm_cfg is not None: self.norm = build_norm_layer(norm_cfg, sample_dim)[1] else: self.norm = None self.reduction = nn.Linear(sample_dim, out_channels, bias=bias) def forward(self, x, hw_shape): """ x: x.shape -> [B, H*W, C] hw_shape: (H, W) """ B, L, C = x.shape H, W = hw_shape assert L == H * W, 'input feature has wrong size' x = x.view(B, H, W, C).permute([0, 3, 1, 2]) # B, C, H, W # stride is fixed to be equal to kernel_size. if (H % self.stride != 0) or (W % self.stride != 0): x = F.pad(x, (0, W % self.stride, 0, H % self.stride)) # Use nn.Unfold to merge patch. About 25% faster than original method, # but need to modify pretrained model for compatibility x = self.sampler(x) # B, 4*C, H/2*W/2 x = x.transpose(1, 2) # B, H/2*W/2, 4*C x = self.norm(x) if self.norm else x x = self.reduction(x) down_hw_shape = (H + 1) // 2, (W + 1) // 2 return x, down_hw_shape @ATTENTION.register_module() class WindowMSA(BaseModule): """Window based multi-head self-attention (W-MSA) module with relative position bias. Args: embed_dims (int): Number of input channels. window_size (tuple[int]): The height and width of the window. num_heads (int): Number of attention heads. qkv_bias (bool, optional): If True, add a learnable bias to q, k, v. Default: True. qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set. Default: None. attn_drop_rate (float, optional): Dropout ratio of attention weight. Default: 0.0 proj_drop_rate (float, optional): Dropout ratio of output. Default: 0.0 init_cfg (dict | None, optional): The Config for initialization. Default: None. """ def __init__(self, embed_dims, num_heads, window_size, qkv_bias=True, qk_scale=None, attn_drop_rate=0., proj_drop_rate=0., init_cfg=None): super().__init__() self.embed_dims = embed_dims self.window_size = window_size # Wh, Ww self.num_heads = num_heads head_embed_dims = embed_dims // num_heads self.scale = qk_scale or head_embed_dims**-0.5 self.init_cfg = init_cfg # define a parameter table of relative position bias self.relative_position_bias_table = nn.Parameter( torch.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1), num_heads)) # 2*Wh-1 * 2*Ww-1, nH # About 2x faster than original impl Wh, Ww = self.window_size rel_index_coords = self.double_step_seq(2 * Ww - 1, Wh, 1, Ww) rel_position_index = rel_index_coords + rel_index_coords.T rel_position_index = rel_position_index.flip(1).contiguous() self.register_buffer('relative_position_index', rel_position_index) self.qkv = nn.Linear(embed_dims, embed_dims * 3, bias=qkv_bias) self.attn_drop = nn.Dropout(attn_drop_rate) self.proj = nn.Linear(embed_dims, embed_dims) self.proj_drop = nn.Dropout(proj_drop_rate) self.softmax = nn.Softmax(dim=-1) def init_weights(self): trunc_normal_init(self.relative_position_bias_table, std=0.02) def forward(self, x, mask=None): """ Args: x (tensor): input features with shape of (num_windows*B, N, C) mask (tensor | None, Optional): mask with shape of (num_windows, Wh*Ww, Wh*Ww), value should be between (-inf, 0]. 
""" B, N, C = x.shape qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4) q, k, v = qkv[0], qkv[1], qkv[ 2] # make torchscript happy (cannot use tensor as tuple) q = q * self.scale attn = (q @ k.transpose(-2, -1)) relative_position_bias = self.relative_position_bias_table[ self.relative_position_index.view(-1)].view( self.window_size[0] * self.window_size[1], self.window_size[0] * self.window_size[1], -1) # Wh*Ww,Wh*Ww,nH relative_position_bias = relative_position_bias.permute( 2, 0, 1).contiguous() # nH, Wh*Ww, Wh*Ww attn = attn + relative_position_bias.unsqueeze(0) if mask is not None: nW = mask.shape[0] attn = attn.view(B // nW, nW, self.num_heads, N, N) + mask.unsqueeze(1).unsqueeze(0) attn = attn.view(-1, self.num_heads, N, N) attn = self.softmax(attn) else: attn = self.softmax(attn) attn = self.attn_drop(attn) x = (attn @ v).transpose(1, 2).reshape(B, N, C) x = self.proj(x) x = self.proj_drop(x) return x @staticmethod def double_step_seq(step1, len1, step2, len2): seq1 = torch.arange(0, step1 * len1, step1) seq2 = torch.arange(0, step2 * len2, step2) return (seq1[:, None] + seq2[None, :]).reshape(1, -1) @ATTENTION.register_module() class ShiftWindowMSA(BaseModule): """Shift Window Multihead Self-Attention Module. Args: embed_dims (int): Number of input channels. num_heads (int): Number of attention heads. window_size (int): The height and width of the window. shift_size (int, optional): The shift step of each window towards right-bottom. If zero, act as regular window-msa. Defaults to 0. qkv_bias (bool, optional): If True, add a learnable bias to q, k, v. Default: True qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set. Defaults: None. attn_drop_rate (float, optional): Dropout ratio of attention weight. Defaults: 0. proj_drop_rate (float, optional): Dropout ratio of output. Defaults: 0. dropout_layer (dict, optional): The dropout_layer used before output. Defaults: dict(type='DropPath', drop_prob=0.). init_cfg (dict, optional): The extra config for initialization. Default: None. 
""" def __init__(self, embed_dims, num_heads, window_size, shift_size=0, qkv_bias=True, qk_scale=None, attn_drop_rate=0, proj_drop_rate=0, dropout_layer=dict(type='DropPath', drop_prob=0.), init_cfg=None): super().__init__(init_cfg) self.window_size = window_size self.shift_size = shift_size assert 0 <= self.shift_size < self.window_size self.w_msa = WindowMSA( embed_dims=embed_dims, num_heads=num_heads, window_size=to_2tuple(window_size), qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop_rate=attn_drop_rate, proj_drop_rate=proj_drop_rate, init_cfg=None) self.drop = build_dropout(dropout_layer) def forward(self, query, hw_shape): B, L, C = query.shape H, W = hw_shape assert L == H * W, 'input feature has wrong size' query = query.view(B, H, W, C) # pad feature maps to multiples of window size pad_r = (self.window_size - W % self.window_size) % self.window_size pad_b = (self.window_size - H % self.window_size) % self.window_size query = F.pad(query, (0, 0, 0, pad_r, 0, pad_b)) H_pad, W_pad = query.shape[1], query.shape[2] # cyclic shift if self.shift_size > 0: shifted_query = torch.roll( query, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2)) # calculate attention mask for SW-MSA img_mask = torch.zeros((1, H_pad, W_pad, 1), device=query.device) # 1 H W 1 h_slices = (slice(0, -self.window_size), slice(-self.window_size, -self.shift_size), slice(-self.shift_size, None)) w_slices = (slice(0, -self.window_size), slice(-self.window_size, -self.shift_size), slice(-self.shift_size, None)) cnt = 0 for h in h_slices: for w in w_slices: img_mask[:, h, w, :] = cnt cnt += 1 # nW, window_size, window_size, 1 mask_windows = self.window_partition(img_mask) mask_windows = mask_windows.view( -1, self.window_size * self.window_size) attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2) attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill( attn_mask == 0, float(0.0)) else: shifted_query = query attn_mask = None # nW*B, window_size, window_size, C query_windows = self.window_partition(shifted_query) # nW*B, window_size*window_size, C query_windows = query_windows.view(-1, self.window_size**2, C) # W-MSA/SW-MSA (nW*B, window_size*window_size, C) attn_windows = self.w_msa(query_windows, mask=attn_mask) # merge windows attn_windows = attn_windows.view(-1, self.window_size, self.window_size, C) # B H' W' C shifted_x = self.window_reverse(attn_windows, H_pad, W_pad) # reverse cyclic shift if self.shift_size > 0: x = torch.roll( shifted_x, shifts=(self.shift_size, self.shift_size), dims=(1, 2)) else: x = shifted_x if pad_r > 0 or pad_b: x = x[:, :H, :W, :].contiguous() x = x.view(B, H * W, C) x = self.drop(x) return x def window_reverse(self, windows, H, W): """ Args: windows: (num_windows*B, window_size, window_size, C) window_size (int): Window size H (int): Height of image W (int): Width of image Returns: x: (B, H, W, C) """ window_size = self.window_size B = int(windows.shape[0] / (H * W / window_size / window_size)) x = windows.view(B, H // window_size, W // window_size, window_size, window_size, -1) x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1) return x def window_partition(self, x): """ Args: x: (B, H, W, C) window_size (int): window size Returns: windows: (num_windows*B, window_size, window_size, C) """ B, H, W, C = x.shape window_size = self.window_size x = x.view(B, H // window_size, window_size, W // window_size, window_size, C) windows = x.permute(0, 1, 3, 2, 4, 5).contiguous() windows = windows.view(-1, window_size, window_size, C) return 
windows class SwinBlock(BaseModule): """" Args: embed_dims (int): The feature dimension. num_heads (int): Parallel attention heads. feedforward_channels (int): The hidden dimension for FFNs. window size (int, optional): The local window scale. Default: 7. shift (bool): whether to shift window or not. Default False. qkv_bias (int, optional): enable bias for qkv if True. Default: True. qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set. Default: None. drop_rate (float, optional): Dropout rate. Default: 0. attn_drop_rate (float, optional): Attention dropout rate. Default: 0. drop_path_rate (float, optional): Stochastic depth rate. Default: 0.2. act_cfg (dict, optional): The config dict of activation function. Default: dict(type='GELU'). norm_cfg (dict, optional): The config dict of nomalization. Default: dict(type='LN'). init_cfg (dict | list | None, optional): The init config. Default: None. """ def __init__(self, embed_dims, num_heads, feedforward_channels, window_size=7, shift=False, qkv_bias=True, qk_scale=None, drop_rate=0., attn_drop_rate=0., drop_path_rate=0., act_cfg=dict(type='GELU'), norm_cfg=dict(type='LN'), init_cfg=None): super(SwinBlock, self).__init__() self.init_cfg = init_cfg self.norm1 = build_norm_layer(norm_cfg, embed_dims)[1] self.attn = ShiftWindowMSA( embed_dims=embed_dims, num_heads=num_heads, window_size=window_size, shift_size=window_size // 2 if shift else 0, qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop_rate=attn_drop_rate, proj_drop_rate=drop_rate, dropout_layer=dict(type='DropPath', drop_prob=drop_path_rate), init_cfg=None) self.norm2 = build_norm_layer(norm_cfg, embed_dims)[1] self.ffn = FFN( embed_dims=embed_dims, feedforward_channels=feedforward_channels, num_fcs=2, ffn_drop=drop_rate, dropout_layer=dict(type='DropPath', drop_prob=drop_path_rate), act_cfg=act_cfg, add_identity=True, init_cfg=None) def forward(self, x, hw_shape): identity = x x = self.norm1(x) x = self.attn(x, hw_shape) x = x + identity identity = x x = self.norm2(x) x = self.ffn(x, identity=identity) return x class SwinBlockSequence(BaseModule): """Implements one stage in Swin Transformer. Args: embed_dims (int): The feature dimension. num_heads (int): Parallel attention heads. feedforward_channels (int): The hidden dimension for FFNs. depth (int): The number of blocks in this stage. window size (int): The local window scale. Default: 7. qkv_bias (int): enable bias for qkv if True. Default: True. qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set. Default: None. drop_rate (float, optional): Dropout rate. Default: 0. attn_drop_rate (float, optional): Attention dropout rate. Default: 0. drop_path_rate (float, optional): Stochastic depth rate. Default: 0.2. downsample (BaseModule | None, optional): The downsample operation module. Default: None. act_cfg (dict, optional): The config dict of activation function. Default: dict(type='GELU'). norm_cfg (dict, optional): The config dict of nomalization. Default: dict(type='LN'). init_cfg (dict | list | None, optional): The init config. Default: None. 
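with_cp (bool, optional): Use torch.utils.checkpoint on each block to trade extra compute for lower memory during training. Default: True.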
""" def __init__(self, embed_dims, num_heads, feedforward_channels, depth, window_size=7, qkv_bias=True, qk_scale=None, drop_rate=0., attn_drop_rate=0., drop_path_rate=0., downsample=None, act_cfg=dict(type='GELU'), norm_cfg=dict(type='LN'), init_cfg=None, with_cp=True): super().__init__() self.init_cfg = init_cfg drop_path_rate = drop_path_rate if isinstance( drop_path_rate, list) else [deepcopy(drop_path_rate) for _ in range(depth)] self.blocks = ModuleList() for i in range(depth): block = SwinBlock( embed_dims=embed_dims, num_heads=num_heads, feedforward_channels=feedforward_channels, window_size=window_size, shift=False if i % 2 == 0 else True, qkv_bias=qkv_bias, qk_scale=qk_scale, drop_rate=drop_rate, attn_drop_rate=attn_drop_rate, drop_path_rate=drop_path_rate[i], act_cfg=act_cfg, norm_cfg=norm_cfg, init_cfg=None) self.blocks.append(block) self.downsample = downsample self.with_cp = with_cp def forward(self, x, hw_shape): for block in self.blocks: if self.with_cp: x = checkpoint.checkpoint(block, x, hw_shape) else: x = block(x, hw_shape) if self.downsample: x_down, down_hw_shape = self.downsample(x, hw_shape) return x_down, down_hw_shape, x, hw_shape else: return x, hw_shape, x, hw_shape @BACKBONES.register_module() class SwinTransformer(BaseModule): """ Swin Transformer A PyTorch implement of : `Swin Transformer: Hierarchical Vision Transformer using Shifted Windows` - https://arxiv.org/abs/2103.14030 Inspiration from https://github.com/microsoft/Swin-Transformer Args: pretrain_img_size (int | tuple[int]): The size of input image when pretrain. Defaults: 224. in_channels (int): The num of input channels. Defaults: 3. embed_dims (int): The feature dimension. Default: 96. patch_size (int | tuple[int]): Patch size. Default: 4. window_size (int): Window size. Default: 7. mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4. depths (tuple[int]): Depths of each Swin Transformer stage. Default: (2, 2, 6, 2). num_heads (tuple[int]): Parallel attention heads of each Swin Transformer stage. Default: (3, 6, 12, 24). strides (tuple[int]): The patch merging or patch embedding stride of each Swin Transformer stage. (In swin, we set kernel size equal to stride.) Default: (4, 2, 2, 2). out_indices (tuple[int]): Output from which stages. Default: (0, 1, 2, 3). qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set. Default: None. patch_norm (bool): If add a norm layer for patch embed and patch merging. Default: True. drop_rate (float): Dropout rate. Defaults: 0. attn_drop_rate (float): Attention dropout rate. Default: 0. drop_path_rate (float): Stochastic depth rate. Defaults: 0.1. use_abs_pos_embed (bool): If True, add absolute position embedding to the patch embedding. Defaults: False. act_cfg (dict): Config dict for activation layer. Default: dict(type='LN'). norm_cfg (dict): Config dict for normalization layer at output of backone. Defaults: dict(type='LN'). pretrain_style (str): Choose to use official or mmcls pretrain weights. Default: official. pretrained (str, optional): model pretrained path. Default: None. init_cfg (dict, optional): The Config for initialization. Defaults to None. 
""" def __init__(self, pretrain_img_size=224, in_channels=3, embed_dims=96, patch_size=4, window_size=7, mlp_ratio=4, depths=(2, 2, 6, 2), num_heads=(3, 6, 12, 24), strides=(4, 2, 2, 2), out_indices=(0, 1, 2, 3), qkv_bias=True, qk_scale=None, patch_norm=True, drop_rate=0., attn_drop_rate=0., drop_path_rate=0.1, use_abs_pos_embed=False, act_cfg=dict(type='GELU'), norm_cfg=dict(type='LN'), pretrain_style='official', pretrained=None, init_cfg=None, with_cp=True, return_stereo_feat=False, output_missing_index_as_none=False, frozen_stages=-1): super(SwinTransformer, self).__init__() if isinstance(pretrain_img_size, int): pretrain_img_size = to_2tuple(pretrain_img_size) elif isinstance(pretrain_img_size, tuple): if len(pretrain_img_size) == 1: pretrain_img_size = to_2tuple(pretrain_img_size[0]) assert len(pretrain_img_size) == 2, \ f'The size of image should have length 1 or 2, ' \ f'but got {len(pretrain_img_size)}' assert pretrain_style in ['official', 'mmcls'], 'We only support load ' 'official ckpt and mmcls ckpt.' if isinstance(pretrained, str) or pretrained is None: warnings.warn('DeprecationWarning: pretrained is a deprecated, ' 'please use "init_cfg" instead') else: raise TypeError('pretrained must be a str or None') num_layers = len(depths) self.out_indices = out_indices self.use_abs_pos_embed = use_abs_pos_embed self.pretrain_style = pretrain_style self.pretrained = pretrained self.init_cfg = init_cfg self.frozen_stages = frozen_stages assert strides[0] == patch_size, 'Use non-overlapping patch embed.' self.patch_embed = PatchEmbed( in_channels=in_channels, embed_dims=embed_dims, conv_type='Conv2d', kernel_size=patch_size, stride=strides[0], pad_to_patch_size=True, norm_cfg=norm_cfg if patch_norm else None, init_cfg=None) if self.use_abs_pos_embed: patch_row = pretrain_img_size[0] // patch_size patch_col = pretrain_img_size[1] // patch_size num_patches = patch_row * patch_col self.absolute_pos_embed = nn.Parameter( torch.zeros((1, num_patches, embed_dims))) self.drop_after_pos = nn.Dropout(p=drop_rate) # stochastic depth total_depth = sum(depths) dpr = [ x.item() for x in torch.linspace(0, drop_path_rate, total_depth) ] # stochastic depth decay rule self.stages = ModuleList() in_channels = embed_dims for i in range(num_layers): if i < num_layers - 1: downsample = PatchMerging( in_channels=in_channels, out_channels=2 * in_channels, stride=strides[i + 1], norm_cfg=norm_cfg if patch_norm else None, init_cfg=None) else: downsample = None stage = SwinBlockSequence( embed_dims=in_channels, num_heads=num_heads[i], feedforward_channels=mlp_ratio * in_channels, depth=depths[i], window_size=window_size, qkv_bias=qkv_bias, qk_scale=qk_scale, drop_rate=drop_rate, attn_drop_rate=attn_drop_rate, drop_path_rate=dpr[:depths[i]], downsample=downsample, act_cfg=act_cfg, norm_cfg=norm_cfg, init_cfg=None, with_cp=with_cp) self.stages.append(stage) dpr = dpr[depths[i]:] if downsample: in_channels = downsample.out_channels self.num_features = [int(embed_dims * 2**i) for i in range(num_layers)] # Add a norm layer for each output for i in out_indices: layer = build_norm_layer(norm_cfg, self.num_features[i])[1] layer_name = f'norm{i}' self.add_module(layer_name, layer) self.output_missing_index_as_none = output_missing_index_as_none self._freeze_stages() self.return_stereo_feat = return_stereo_feat def _freeze_stages(self): if self.frozen_stages >= 0: self.patch_embed.eval() for param in self.patch_embed.parameters(): param.requires_grad = False if self.frozen_stages >= 1 and self.use_abs_pos_embed: 
self.absolute_pos_embed.requires_grad = False if self.frozen_stages >= 2: self.drop_after_pos.eval() for i in range(0, self.frozen_stages - 1): m = self.stages[i] m.eval() for param in m.parameters(): param.requires_grad = False def init_weights(self): if self.pretrained is None: super().init_weights() if self.use_abs_pos_embed: trunc_normal_init(self.absolute_pos_embed, std=0.02) for m in self.modules(): if isinstance(m, Linear): trunc_normal_init(m.weight, std=.02) if m.bias is not None: constant_init(m.bias, 0) elif isinstance(m, LayerNorm): constant_init(m.bias, 0) constant_init(m.weight, 1.0) elif isinstance(self.pretrained, str): logger = get_root_logger() ckpt = _load_checkpoint( self.pretrained, logger=logger, map_location='cpu') if 'state_dict' in ckpt: state_dict = ckpt['state_dict'] elif 'model' in ckpt: state_dict = ckpt['model'] else: state_dict = ckpt if self.pretrain_style == 'official': state_dict = swin_convert(state_dict) # strip prefix of state_dict if list(state_dict.keys())[0].startswith('module.'): state_dict = {k[7:]: v for k, v in state_dict.items()} # if list(state_dict.keys())[0].startswith('backbone.'): # state_dict = {k[9:]: v for k, v in state_dict.items()} # reshape absolute position embedding if state_dict.get('absolute_pos_embed') is not None: absolute_pos_embed = state_dict['absolute_pos_embed'] N1, L, C1 = absolute_pos_embed.size() N2, C2, H, W = self.absolute_pos_embed.size() if N1 != N2 or C1 != C2 or L != H * W: logger.warning('Error in loading absolute_pos_embed, pass') else: state_dict['absolute_pos_embed'] = absolute_pos_embed.view( N2, H, W, C2).permute(0, 3, 1, 2).contiguous() # interpolate position bias table if needed relative_position_bias_table_keys = [ k for k in state_dict.keys() if 'relative_position_bias_table' in k ] for table_key in relative_position_bias_table_keys: table_pretrained = state_dict[table_key] table_current = self.state_dict()[table_key] L1, nH1 = table_pretrained.size() L2, nH2 = table_current.size() if nH1 != nH2: logger.warning(f'Error in loading {table_key}, pass') else: if L1 != L2: S1 = int(L1**0.5) S2 = int(L2**0.5) table_pretrained_resized = resize( table_pretrained.permute(1, 0).reshape( 1, nH1, S1, S1), size=(S2, S2), mode='bicubic') state_dict[table_key] = table_pretrained_resized.view( nH2, L2).permute(1, 0).contiguous() # load state_dict self.load_state_dict(state_dict, False) def forward(self, x): x = self.patch_embed(x) hw_shape = (self.patch_embed.DH, self.patch_embed.DW) if self.use_abs_pos_embed: x = x + self.absolute_pos_embed x = self.drop_after_pos(x) outs = [] for i, stage in enumerate(self.stages): x, hw_shape, out, out_hw_shape = stage(x, hw_shape) if i == 0 and self.return_stereo_feat: out = out.view(-1, *out_hw_shape, self.num_features[i]).permute(0, 3, 1, 2).contiguous() outs.append(out) if i in self.out_indices: norm_layer = getattr(self, f'norm{i}') out = norm_layer(out) out = out.view(-1, *out_hw_shape, self.num_features[i]).permute(0, 3, 1, 2).contiguous() outs.append(out) elif self.output_missing_index_as_none: outs.append(None) return outs def train(self, mode=True): """Convert the model into training mode while keep normalization layer freezed.""" super(SwinTransformer, self).train(mode) self._freeze_stages() ================================================ FILE: mmdet3d/models/backbones/vovnet.py ================================================ # ------------------------------------------------------------------------ # Copyright (c) 2022 megvii-model. All Rights Reserved. 
# ------------------------------------------------------------------------ # Modified from DETR3D (https://github.com/WangYueFt/detr3d) # Copyright (c) 2021 Wang, Yue # ------------------------------------------------------------------------ # Copyright (c) Youngwan Lee (ETRI) All Rights Reserved. # Copyright 2021 Toyota Research Institute. All rights reserved. # ------------------------------------------------------------------------ from collections import OrderedDict from mmcv.runner import BaseModule from mmdet.models.builder import BACKBONES import torch import torch.nn as nn import torch.nn.functional as F from torch.nn.modules.batchnorm import _BatchNorm import warnings import torch.utils.checkpoint as cp VoVNet19_slim_dw_eSE = { 'stem': [64, 64, 64], 'stage_conv_ch': [64, 80, 96, 112], 'stage_out_ch': [112, 256, 384, 512], "layer_per_block": 3, "block_per_stage": [1, 1, 1, 1], "eSE": True, "dw": True } VoVNet19_dw_eSE = { 'stem': [64, 64, 64], "stage_conv_ch": [128, 160, 192, 224], "stage_out_ch": [256, 512, 768, 1024], "layer_per_block": 3, "block_per_stage": [1, 1, 1, 1], "eSE": True, "dw": True } VoVNet19_slim_eSE = { 'stem': [64, 64, 128], 'stage_conv_ch': [64, 80, 96, 112], 'stage_out_ch': [112, 256, 384, 512], 'layer_per_block': 3, 'block_per_stage': [1, 1, 1, 1], 'eSE': True, "dw": False } VoVNet19_eSE = { 'stem': [64, 64, 128], "stage_conv_ch": [128, 160, 192, 224], "stage_out_ch": [256, 512, 768, 1024], "layer_per_block": 3, "block_per_stage": [1, 1, 1, 1], "eSE": True, "dw": False } VoVNet39_eSE = { 'stem': [64, 64, 128], "stage_conv_ch": [128, 160, 192, 224], "stage_out_ch": [256, 512, 768, 1024], "layer_per_block": 5, "block_per_stage": [1, 1, 2, 2], "eSE": True, "dw": False } VoVNet57_eSE = { 'stem': [64, 64, 128], "stage_conv_ch": [128, 160, 192, 224], "stage_out_ch": [256, 512, 768, 1024], "layer_per_block": 5, "block_per_stage": [1, 1, 4, 3], "eSE": True, "dw": False } VoVNet99_eSE = { 'stem': [64, 64, 128], "stage_conv_ch": [128, 160, 192, 224], "stage_out_ch": [256, 512, 768, 1024], "layer_per_block": 5, "block_per_stage": [1, 3, 9, 3], "eSE": True, "dw": False } _STAGE_SPECS = { "V-19-slim-dw-eSE": VoVNet19_slim_dw_eSE, "V-19-dw-eSE": VoVNet19_dw_eSE, "V-19-slim-eSE": VoVNet19_slim_eSE, "V-19-eSE": VoVNet19_eSE, "V-39-eSE": VoVNet39_eSE, "V-57-eSE": VoVNet57_eSE, "V-99-eSE": VoVNet99_eSE, } def dw_conv3x3(in_channels, out_channels, module_name, postfix, stride=1, kernel_size=3, padding=1): """3x3 convolution with padding""" return [ ( '{}_{}/dw_conv3x3'.format(module_name, postfix), nn.Conv2d( in_channels, out_channels, kernel_size=kernel_size, stride=stride, padding=padding, groups=out_channels, bias=False ) ), ( '{}_{}/pw_conv1x1'.format(module_name, postfix), nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, padding=0, groups=1, bias=False) ), ('{}_{}/pw_norm'.format(module_name, postfix), nn.BatchNorm2d(out_channels)), ('{}_{}/pw_relu'.format(module_name, postfix), nn.ReLU(inplace=True)), ] def conv3x3(in_channels, out_channels, module_name, postfix, stride=1, groups=1, kernel_size=3, padding=1): """3x3 convolution with padding""" return [ ( f"{module_name}_{postfix}/conv", nn.Conv2d( in_channels, out_channels, kernel_size=kernel_size, stride=stride, padding=padding, groups=groups, bias=False, ), ), (f"{module_name}_{postfix}/norm", nn.BatchNorm2d(out_channels)), (f"{module_name}_{postfix}/relu", nn.ReLU(inplace=True)), ] def conv1x1(in_channels, out_channels, module_name, postfix, stride=1, groups=1, kernel_size=1, padding=0): """1x1 convolution 
with padding""" return [ ( f"{module_name}_{postfix}/conv", nn.Conv2d( in_channels, out_channels, kernel_size=kernel_size, stride=stride, padding=padding, groups=groups, bias=False, ), ), (f"{module_name}_{postfix}/norm", nn.BatchNorm2d(out_channels)), (f"{module_name}_{postfix}/relu", nn.ReLU(inplace=True)), ] class Hsigmoid(nn.Module): def __init__(self, inplace=True): super(Hsigmoid, self).__init__() self.inplace = inplace def forward(self, x): return F.relu6(x + 3.0, inplace=self.inplace) / 6.0 class eSEModule(nn.Module): def __init__(self, channel, reduction=4): super(eSEModule, self).__init__() self.avg_pool = nn.AdaptiveAvgPool2d(1) self.fc = nn.Conv2d(channel, channel, kernel_size=1, padding=0) self.hsigmoid = Hsigmoid() def forward(self, x): input = x x = self.avg_pool(x) x = self.fc(x) x = self.hsigmoid(x) return input * x class _OSA_module(nn.Module): def __init__( self, in_ch, stage_ch, concat_ch, layer_per_block, module_name, SE=False, identity=False, depthwise=False, with_cp=True ): super(_OSA_module, self).__init__() self.identity = identity self.depthwise = depthwise self.isReduced = False self.use_checkpoint = with_cp self.layers = nn.ModuleList() in_channel = in_ch if self.depthwise and in_channel != stage_ch: self.isReduced = True self.conv_reduction = nn.Sequential( OrderedDict(conv1x1(in_channel, stage_ch, "{}_reduction".format(module_name), "0")) ) for i in range(layer_per_block): if self.depthwise: self.layers.append(nn.Sequential(OrderedDict(dw_conv3x3(stage_ch, stage_ch, module_name, i)))) else: self.layers.append(nn.Sequential(OrderedDict(conv3x3(in_channel, stage_ch, module_name, i)))) in_channel = stage_ch # feature aggregation in_channel = in_ch + layer_per_block * stage_ch self.concat = nn.Sequential(OrderedDict(conv1x1(in_channel, concat_ch, module_name, "concat"))) self.ese = eSEModule(concat_ch) def _forward(self, x): identity_feat = x output = [] output.append(x) if self.depthwise and self.isReduced: x = self.conv_reduction(x) for layer in self.layers: x = layer(x) output.append(x) x = torch.cat(output, dim=1) xt = self.concat(x) xt = self.ese(xt) if self.identity: xt = xt + identity_feat return xt def forward(self, x): if self.use_checkpoint and self.training: xt = cp.checkpoint(self._forward, x) else: xt = self._forward(x) return xt class _OSA_stage(nn.Sequential): def __init__( self, in_ch, stage_ch, concat_ch, block_per_stage, layer_per_block, stage_num, SE=False, depthwise=False ): super(_OSA_stage, self).__init__() if not stage_num == 2: self.add_module("Pooling", nn.MaxPool2d(kernel_size=3, stride=2, ceil_mode=True)) if block_per_stage != 1: SE = False module_name = f"OSA{stage_num}_1" self.add_module( module_name, _OSA_module(in_ch, stage_ch, concat_ch, layer_per_block, module_name, SE, depthwise=depthwise) ) for i in range(block_per_stage - 1): if i != block_per_stage - 2: # last block SE = False module_name = f"OSA{stage_num}_{i + 2}" self.add_module( module_name, _OSA_module( concat_ch, stage_ch, concat_ch, layer_per_block, module_name, SE, identity=True, depthwise=depthwise ), ) @BACKBONES.register_module() class VoVNetCP(BaseModule): def __init__(self, spec_name, input_ch=3, out_features=None, frozen_stages=-1, norm_eval=True, pretrained=None, init_cfg=None): """ Args: input_ch(int) : the number of input channel out_features (list[str]): name of the layers whose outputs should be returned in forward. Can be anything in "stem", "stage2" ... 
""" super(VoVNetCP, self).__init__(init_cfg) self.frozen_stages = frozen_stages self.norm_eval = norm_eval if isinstance(pretrained, str): warnings.warn('DeprecationWarning: pretrained is deprecated, ' 'please use "init_cfg" instead') self.init_cfg = dict(type='Pretrained', checkpoint=pretrained) stage_specs = _STAGE_SPECS[spec_name] stem_ch = stage_specs["stem"] config_stage_ch = stage_specs["stage_conv_ch"] config_concat_ch = stage_specs["stage_out_ch"] block_per_stage = stage_specs["block_per_stage"] layer_per_block = stage_specs["layer_per_block"] SE = stage_specs["eSE"] depthwise = stage_specs["dw"] self._out_features = out_features # Stem module conv_type = dw_conv3x3 if depthwise else conv3x3 stem = conv3x3(input_ch, stem_ch[0], "stem", "1", 2) stem += conv_type(stem_ch[0], stem_ch[1], "stem", "2", 1) stem += conv_type(stem_ch[1], stem_ch[2], "stem", "3", 2) self.add_module("stem", nn.Sequential((OrderedDict(stem)))) current_stirde = 4 self._out_feature_strides = {"stem": current_stirde, "stage2": current_stirde} self._out_feature_channels = {"stem": stem_ch[2]} stem_out_ch = [stem_ch[2]] in_ch_list = stem_out_ch + config_concat_ch[:-1] # OSA stages self.stage_names = [] for i in range(4): # num_stages name = "stage%d" % (i + 2) # stage 2 ... stage 5 self.stage_names.append(name) self.add_module( name, _OSA_stage( in_ch_list[i], config_stage_ch[i], config_concat_ch[i], block_per_stage[i], layer_per_block, i + 2, SE, depthwise, ), ) self._out_feature_channels[name] = config_concat_ch[i] if not i == 0: self._out_feature_strides[name] = current_stirde = int(current_stirde * 2) # initialize weights # self._initialize_weights() def _initialize_weights(self): for m in self.modules(): if isinstance(m, nn.Conv2d): nn.init.kaiming_normal_(m.weight) # def forward(self, x): # outputs = {} # x = self.stem(x) # if "stem" in self._out_features: # outputs["stem"] = x # for name in self.stage_names: # x = getattr(self, name)(x) # if name in self._out_features: # outputs[name] = x # return outputs def forward(self, x): outputs = [] x = self.stem(x) if "stem" in self._out_features: outputs.append(x) for name in self.stage_names: x = getattr(self, name)(x) if name in self._out_features: outputs.append(x) return outputs def _freeze_stages(self): if self.frozen_stages >= 0: m = getattr(self, 'stem') m.eval() for param in m.parameters(): param.requires_grad = False for i in range(1, self.frozen_stages + 1): m = getattr(self, f'stage{i+1}') m.eval() for param in m.parameters(): param.requires_grad = False def train(self, mode=True): """Convert the model into training mode while keep normalization layer freezed.""" super(VoVNetCP, self).train(mode) self._freeze_stages() if mode and self.norm_eval: for m in self.modules(): # trick: eval have effect on BatchNorm only if isinstance(m, _BatchNorm): m.eval() ================================================ FILE: mmdet3d/models/backbones/vovnet2.py ================================================ from collections import OrderedDict from mmcv.runner import BaseModule from mmdet.models.builder import BACKBONES import torch import torch.nn as nn import torch.nn.functional as F from torch.nn.modules.batchnorm import _BatchNorm import warnings import torch.utils.checkpoint as cp VoVNet19_slim_dw_eSE = { 'stem': [64, 64, 64], 'stage_conv_ch': [64, 80, 96, 112], 'stage_out_ch': [112, 256, 384, 512], "layer_per_block": 3, "block_per_stage": [1, 1, 1, 1], "eSE": True, "dw": True } VoVNet19_dw_eSE = { 'stem': [64, 64, 64], "stage_conv_ch": [128, 160, 192, 224], 
"stage_out_ch": [256, 512, 768, 1024], "layer_per_block": 3, "block_per_stage": [1, 1, 1, 1], "eSE": True, "dw": True } VoVNet19_slim_eSE = { 'stem': [64, 64, 128], 'stage_conv_ch': [64, 80, 96, 112], 'stage_out_ch': [112, 256, 384, 512], 'layer_per_block': 3, 'block_per_stage': [1, 1, 1, 1], 'eSE': True, "dw": False } VoVNet19_eSE = { 'stem': [64, 64, 128], "stage_conv_ch": [128, 160, 192, 224], "stage_out_ch": [256, 512, 768, 1024], "layer_per_block": 3, "block_per_stage": [1, 1, 1, 1], "eSE": True, "dw": False } VoVNet39_eSE = { 'stem': [64, 64, 128], "stage_conv_ch": [128, 160, 192, 224], "stage_out_ch": [256, 512, 768, 1024], "layer_per_block": 5, "block_per_stage": [1, 1, 2, 2], "eSE": True, "dw": False } VoVNet57_eSE = { 'stem': [64, 64, 128], "stage_conv_ch": [128, 160, 192, 224], "stage_out_ch": [256, 512, 768, 1024], "layer_per_block": 5, "block_per_stage": [1, 1, 4, 3], "eSE": True, "dw": False } VoVNet99_eSE = { 'stem': [64, 64, 128], "stage_conv_ch": [128, 160, 192, 224], "stage_out_ch": [256, 512, 768, 1024], "layer_per_block": 5, "block_per_stage": [1, 3, 9, 3], "eSE": True, "dw": False } _STAGE_SPECS = { "V-19-slim-dw-eSE": VoVNet19_slim_dw_eSE, "V-19-dw-eSE": VoVNet19_dw_eSE, "V-19-slim-eSE": VoVNet19_slim_eSE, "V-19-eSE": VoVNet19_eSE, "V-39-eSE": VoVNet39_eSE, "V-57-eSE": VoVNet57_eSE, "V-99-eSE": VoVNet99_eSE, } def dw_conv3x3(in_channels, out_channels, module_name, postfix, stride=1, kernel_size=3, padding=1): """3x3 convolution with padding""" return [ ( '{}_{}/dw_conv3x3'.format(module_name, postfix), nn.Conv2d( in_channels, out_channels, kernel_size=kernel_size, stride=stride, padding=padding, groups=out_channels, bias=False ) ), ( '{}_{}/pw_conv1x1'.format(module_name, postfix), nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, padding=0, groups=1, bias=False) ), ('{}_{}/pw_norm'.format(module_name, postfix), nn.BatchNorm2d(out_channels)), ('{}_{}/pw_relu'.format(module_name, postfix), nn.ReLU(inplace=True)), ] def conv3x3(in_channels, out_channels, module_name, postfix, stride=1, groups=1, kernel_size=3, padding=1): """3x3 convolution with padding""" return [ ( f"{module_name}_{postfix}/conv", nn.Conv2d( in_channels, out_channels, kernel_size=kernel_size, stride=stride, padding=padding, groups=groups, bias=False, ), ), (f"{module_name}_{postfix}/norm", nn.BatchNorm2d(out_channels)), (f"{module_name}_{postfix}/relu", nn.ReLU(inplace=True)), ] def conv1x1(in_channels, out_channels, module_name, postfix, stride=1, groups=1, kernel_size=1, padding=0): """1x1 convolution with padding""" return [ ( f"{module_name}_{postfix}/conv", nn.Conv2d( in_channels, out_channels, kernel_size=kernel_size, stride=stride, padding=padding, groups=groups, bias=False, ), ), (f"{module_name}_{postfix}/norm", nn.BatchNorm2d(out_channels)), (f"{module_name}_{postfix}/relu", nn.ReLU(inplace=True)), ] class Hsigmoid(nn.Module): def __init__(self, inplace=True): super(Hsigmoid, self).__init__() self.inplace = inplace def forward(self, x): return F.relu6(x + 3.0, inplace=self.inplace) / 6.0 class eSEModule(nn.Module): def __init__(self, channel, reduction=4): super(eSEModule, self).__init__() self.avg_pool = nn.AdaptiveAvgPool2d(1) self.fc = nn.Conv2d(channel, channel, kernel_size=1, padding=0) self.hsigmoid = Hsigmoid() def forward(self, x): input = x x = self.avg_pool(x) x = self.fc(x) x = self.hsigmoid(x) return input * x class _OSA_module(nn.Module): def __init__( self, in_ch, stage_ch, concat_ch, layer_per_block, module_name, SE=False, identity=False, depthwise=False, 
with_cp=True ): super(_OSA_module, self).__init__() self.identity = identity self.depthwise = depthwise self.isReduced = False self.use_checkpoint = with_cp self.layers = nn.ModuleList() in_channel = in_ch if self.depthwise and in_channel != stage_ch: self.isReduced = True self.conv_reduction = nn.Sequential( OrderedDict(conv1x1(in_channel, stage_ch, "{}_reduction".format(module_name), "0")) ) for i in range(layer_per_block): if self.depthwise: self.layers.append(nn.Sequential(OrderedDict(dw_conv3x3(stage_ch, stage_ch, module_name, i)))) else: self.layers.append(nn.Sequential(OrderedDict(conv3x3(in_channel, stage_ch, module_name, i)))) in_channel = stage_ch # feature aggregation in_channel = in_ch + layer_per_block * stage_ch self.concat = nn.Sequential(OrderedDict(conv1x1(in_channel, concat_ch, module_name, "concat"))) self.ese = eSEModule(concat_ch) def _forward(self, x): identity_feat = x output = [] output.append(x) if self.depthwise and self.isReduced: x = self.conv_reduction(x) for layer in self.layers: x = layer(x) output.append(x) x = torch.cat(output, dim=1) xt = self.concat(x) xt = self.ese(xt) if self.identity: xt = xt + identity_feat return xt def forward(self, x): if self.use_checkpoint and self.training: xt = cp.checkpoint(self._forward, x) else: xt = self._forward(x) return xt class _OSA_stage(nn.Sequential): def __init__( self, in_ch, stage_ch, concat_ch, block_per_stage, layer_per_block, stage_num, SE=False, depthwise=False ): super(_OSA_stage, self).__init__() if not stage_num == 2: self.add_module("Pooling", nn.MaxPool2d(kernel_size=3, stride=2, ceil_mode=True)) if block_per_stage != 1: SE = False module_name = f"OSA{stage_num}_1" self.add_module( module_name, _OSA_module(in_ch, stage_ch, concat_ch, layer_per_block, module_name, SE, depthwise=depthwise) ) for i in range(block_per_stage - 1): if i != block_per_stage - 2: # last block SE = False module_name = f"OSA{stage_num}_{i + 2}" self.add_module( module_name, _OSA_module( concat_ch, stage_ch, concat_ch, layer_per_block, module_name, SE, identity=True, depthwise=depthwise ), ) @BACKBONES.register_module() class VoVNet2(BaseModule): def __init__(self, spec_name, input_ch=3, out_features=None, frozen_stages=-1, norm_eval=True, pretrained=None, init_cfg=None): """ Args: input_ch(int) : the number of input channel out_features (list[str]): name of the layers whose outputs should be returned in forward. Can be anything in "stem", "stage2" ... 
""" super(VoVNet2, self).__init__(init_cfg) self.frozen_stages = frozen_stages self.norm_eval = norm_eval if isinstance(pretrained, str): warnings.warn('DeprecationWarning: pretrained is deprecated, ' 'please use "init_cfg" instead') self.init_cfg = dict(type='Pretrained', checkpoint=pretrained) stage_specs = _STAGE_SPECS[spec_name] stem_ch = stage_specs["stem"] config_stage_ch = stage_specs["stage_conv_ch"] config_concat_ch = stage_specs["stage_out_ch"] block_per_stage = stage_specs["block_per_stage"] layer_per_block = stage_specs["layer_per_block"] SE = stage_specs["eSE"] depthwise = stage_specs["dw"] self._out_features = out_features # Stem module conv_type = dw_conv3x3 if depthwise else conv3x3 stem = conv3x3(input_ch, stem_ch[0], "stem", "1", 2) stem += conv_type(stem_ch[0], stem_ch[1], "stem", "2", 1) stem += conv_type(stem_ch[1], stem_ch[2], "stem", "3", 2) self.add_module("stem", nn.Sequential((OrderedDict(stem)))) current_stirde = 4 self._out_feature_strides = {"stem": current_stirde, "stage2": current_stirde} self._out_feature_channels = {"stem": stem_ch[2]} stem_out_ch = [stem_ch[2]] in_ch_list = stem_out_ch + config_concat_ch[:-1] # OSA stages self.stage_names = [] for i in range(4): # num_stages name = "stage%d" % (i + 2) # stage 2 ... stage 5 self.stage_names.append(name) self.add_module( name, _OSA_stage( in_ch_list[i], config_stage_ch[i], config_concat_ch[i], block_per_stage[i], layer_per_block, i + 2, SE, depthwise, ), ) self._out_feature_channels[name] = config_concat_ch[i] if not i == 0: self._out_feature_strides[name] = current_stirde = int(current_stirde * 2) # initialize weights # self._initialize_weights() def _initialize_weights(self): for m in self.modules(): if isinstance(m, nn.Conv2d): nn.init.kaiming_normal_(m.weight) def forward(self, x): outputs = {} x = self.stem(x) if "stem" in self._out_features: outputs["stem"] = x for name in self.stage_names: x = getattr(self, name)(x) if name in self._out_features: outputs[name] = x return outputs def _freeze_stages(self): if self.frozen_stages >= 0: m = getattr(self, 'stem') m.eval() for param in m.parameters(): param.requires_grad = False for i in range(1, self.frozen_stages + 1): m = getattr(self, f'stage{i+1}') m.eval() for param in m.parameters(): param.requires_grad = False def train(self, mode=True): """Convert the model into training mode while keep normalization layer freezed.""" super(VoVNet2, self).train(mode) self._freeze_stages() if mode and self.norm_eval: for m in self.modules(): # trick: eval have effect on BatchNorm only if isinstance(m, _BatchNorm): m.eval() ================================================ FILE: mmdet3d/models/builder.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. 
import warnings from mmcv.cnn import MODELS as MMCV_MODELS from mmcv.utils import Registry from mmdet.models.builder import BACKBONES as MMDET_BACKBONES from mmdet.models.builder import DETECTORS as MMDET_DETECTORS from mmdet.models.builder import HEADS as MMDET_HEADS from mmdet.models.builder import LOSSES as MMDET_LOSSES from mmdet.models.builder import NECKS as MMDET_NECKS from mmdet.models.builder import ROI_EXTRACTORS as MMDET_ROI_EXTRACTORS from mmdet.models.builder import SHARED_HEADS as MMDET_SHARED_HEADS from mmseg.models.builder import LOSSES as MMSEG_LOSSES MODELS = Registry('models', parent=MMCV_MODELS) BACKBONES = MODELS NECKS = MODELS ROI_EXTRACTORS = MODELS SHARED_HEADS = MODELS HEADS = MODELS LOSSES = MODELS DETECTORS = MODELS VOXEL_ENCODERS = MODELS MIDDLE_ENCODERS = MODELS FUSION_LAYERS = MODELS SEGMENTORS = MODELS def build_backbone(cfg): """Build backbone.""" if cfg['type'] in BACKBONES._module_dict.keys(): return BACKBONES.build(cfg) else: return MMDET_BACKBONES.build(cfg) def build_neck(cfg): """Build neck.""" if cfg['type'] in NECKS._module_dict.keys(): return NECKS.build(cfg) else: return MMDET_NECKS.build(cfg) def build_roi_extractor(cfg): """Build RoI feature extractor.""" if cfg['type'] in ROI_EXTRACTORS._module_dict.keys(): return ROI_EXTRACTORS.build(cfg) else: return MMDET_ROI_EXTRACTORS.build(cfg) def build_shared_head(cfg): """Build shared head of detector.""" if cfg['type'] in SHARED_HEADS._module_dict.keys(): return SHARED_HEADS.build(cfg) else: return MMDET_SHARED_HEADS.build(cfg) def build_head(cfg): """Build head.""" if cfg['type'] in HEADS._module_dict.keys(): return HEADS.build(cfg) else: return MMDET_HEADS.build(cfg) def build_loss(cfg): """Build loss function.""" if cfg['type'] in LOSSES._module_dict.keys(): return LOSSES.build(cfg) elif cfg['type'] in MMDET_LOSSES._module_dict.keys(): return MMDET_LOSSES.build(cfg) else: return MMSEG_LOSSES.build(cfg) def build_detector(cfg, train_cfg=None, test_cfg=None): """Build detector.""" if train_cfg is not None or test_cfg is not None: warnings.warn( 'train_cfg and test_cfg is deprecated, ' 'please specify them in model', UserWarning) assert cfg.get('train_cfg') is None or train_cfg is None, \ 'train_cfg specified in both outer field and model field ' assert cfg.get('test_cfg') is None or test_cfg is None, \ 'test_cfg specified in both outer field and model field ' if cfg['type'] in DETECTORS._module_dict.keys(): return DETECTORS.build( cfg, default_args=dict(train_cfg=train_cfg, test_cfg=test_cfg)) else: return MMDET_DETECTORS.build( cfg, default_args=dict(train_cfg=train_cfg, test_cfg=test_cfg)) def build_segmentor(cfg, train_cfg=None, test_cfg=None): """Build segmentor.""" if train_cfg is not None or test_cfg is not None: warnings.warn( 'train_cfg and test_cfg is deprecated, ' 'please specify them in model', UserWarning) assert cfg.get('train_cfg') is None or train_cfg is None, \ 'train_cfg specified in both outer field and model field ' assert cfg.get('test_cfg') is None or test_cfg is None, \ 'test_cfg specified in both outer field and model field ' return SEGMENTORS.build( cfg, default_args=dict(train_cfg=train_cfg, test_cfg=test_cfg)) def build_model(cfg, train_cfg=None, test_cfg=None): """A function warpper for building 3D detector or segmentor according to cfg. Should be deprecated in the future. 
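Example:
    Illustrative call (the cfg layout is assumed, following the usual mmdet3d convention):

    >>> model = build_model(
    ...     cfg.model,
    ...     train_cfg=cfg.get('train_cfg'),
    ...     test_cfg=cfg.get('test_cfg'))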
""" if cfg.type in ['EncoderDecoder3D']: return build_segmentor(cfg, train_cfg=train_cfg, test_cfg=test_cfg) else: return build_detector(cfg, train_cfg=train_cfg, test_cfg=test_cfg) def build_voxel_encoder(cfg): """Build voxel encoder.""" return VOXEL_ENCODERS.build(cfg) def build_middle_encoder(cfg): """Build middle level encoder.""" return MIDDLE_ENCODERS.build(cfg) def build_fusion_layer(cfg): """Build fusion layer.""" return FUSION_LAYERS.build(cfg) ================================================ FILE: mmdet3d/models/decode_heads/__init__.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. from .dgcnn_head import DGCNNHead from .paconv_head import PAConvHead from .pointnet2_head import PointNet2Head __all__ = ['PointNet2Head', 'DGCNNHead', 'PAConvHead'] ================================================ FILE: mmdet3d/models/decode_heads/decode_head.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. from abc import ABCMeta, abstractmethod from mmcv.cnn import normal_init from mmcv.runner import BaseModule, auto_fp16, force_fp32 from torch import nn as nn from mmseg.models.builder import build_loss class Base3DDecodeHead(BaseModule, metaclass=ABCMeta): """Base class for BaseDecodeHead. Args: channels (int): Channels after modules, before conv_seg. num_classes (int): Number of classes. dropout_ratio (float, optional): Ratio of dropout layer. Default: 0.5. conv_cfg (dict, optional): Config of conv layers. Default: dict(type='Conv1d'). norm_cfg (dict, optional): Config of norm layers. Default: dict(type='BN1d'). act_cfg (dict, optional): Config of activation layers. Default: dict(type='ReLU'). loss_decode (dict, optional): Config of decode loss. Default: dict(type='CrossEntropyLoss'). ignore_index (int, optional): The label index to be ignored. When using masked BCE loss, ignore_index should be set to None. Default: 255. """ def __init__(self, channels, num_classes, dropout_ratio=0.5, conv_cfg=dict(type='Conv1d'), norm_cfg=dict(type='BN1d'), act_cfg=dict(type='ReLU'), loss_decode=dict( type='CrossEntropyLoss', use_sigmoid=False, class_weight=None, loss_weight=1.0), ignore_index=255, init_cfg=None): super(Base3DDecodeHead, self).__init__(init_cfg=init_cfg) self.channels = channels self.num_classes = num_classes self.dropout_ratio = dropout_ratio self.conv_cfg = conv_cfg self.norm_cfg = norm_cfg self.act_cfg = act_cfg self.loss_decode = build_loss(loss_decode) self.ignore_index = ignore_index self.conv_seg = nn.Conv1d(channels, num_classes, kernel_size=1) if dropout_ratio > 0: self.dropout = nn.Dropout(dropout_ratio) else: self.dropout = None self.fp16_enabled = False def init_weights(self): """Initialize weights of classification layer.""" super().init_weights() normal_init(self.conv_seg, mean=0, std=0.01) @auto_fp16() @abstractmethod def forward(self, inputs): """Placeholder of forward function.""" pass def forward_train(self, inputs, img_metas, pts_semantic_mask, train_cfg): """Forward function for training. Args: inputs (list[torch.Tensor]): List of multi-level point features. img_metas (list[dict]): Meta information of each sample. pts_semantic_mask (torch.Tensor): Semantic segmentation masks used if the architecture supports semantic segmentation task. train_cfg (dict): The training config. 
Returns: dict[str, Tensor]: a dictionary of loss components """ seg_logits = self.forward(inputs) losses = self.losses(seg_logits, pts_semantic_mask) return losses def forward_test(self, inputs, img_metas, test_cfg): """Forward function for testing. Args: inputs (list[Tensor]): List of multi-level point features. img_metas (list[dict]): Meta information of each sample. test_cfg (dict): The testing config. Returns: Tensor: Output segmentation map. """ return self.forward(inputs) def cls_seg(self, feat): """Classify each points.""" if self.dropout is not None: feat = self.dropout(feat) output = self.conv_seg(feat) return output @force_fp32(apply_to=('seg_logit', )) def losses(self, seg_logit, seg_label): """Compute semantic segmentation loss. Args: seg_logit (torch.Tensor): Predicted per-point segmentation logits of shape [B, num_classes, N]. seg_label (torch.Tensor): Ground-truth segmentation label of shape [B, N]. """ loss = dict() loss['loss_sem_seg'] = self.loss_decode( seg_logit, seg_label, ignore_index=self.ignore_index) return loss ================================================ FILE: mmdet3d/models/decode_heads/dgcnn_head.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. from mmcv.cnn.bricks import ConvModule from mmdet3d.ops import DGCNNFPModule from ..builder import HEADS from .decode_head import Base3DDecodeHead @HEADS.register_module() class DGCNNHead(Base3DDecodeHead): r"""DGCNN decoder head. Decoder head used in `DGCNN `_. Refer to the `reimplementation code `_. Args: fp_channels (tuple[int], optional): Tuple of mlp channels in feature propagation (FP) modules. Defaults to (1216, 512). """ def __init__(self, fp_channels=(1216, 512), **kwargs): super(DGCNNHead, self).__init__(**kwargs) self.FP_module = DGCNNFPModule( mlp_channels=fp_channels, act_cfg=self.act_cfg) # https://github.com/charlesq34/pointnet2/blob/master/models/pointnet2_sem_seg.py#L40 self.pre_seg_conv = ConvModule( fp_channels[-1], self.channels, kernel_size=1, bias=False, conv_cfg=self.conv_cfg, norm_cfg=self.norm_cfg, act_cfg=self.act_cfg) def _extract_input(self, feat_dict): """Extract inputs from features dictionary. Args: feat_dict (dict): Feature dict from backbone. Returns: torch.Tensor: points for decoder. """ fa_points = feat_dict['fa_points'] return fa_points def forward(self, feat_dict): """Forward pass. Args: feat_dict (dict): Feature dict from backbone. Returns: torch.Tensor: Segmentation map of shape [B, num_classes, N]. """ fa_points = self._extract_input(feat_dict) fp_points = self.FP_module(fa_points) fp_points = fp_points.transpose(1, 2).contiguous() output = self.pre_seg_conv(fp_points) output = self.cls_seg(output) return output ================================================ FILE: mmdet3d/models/decode_heads/paconv_head.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. from mmcv.cnn.bricks import ConvModule from ..builder import HEADS from .pointnet2_head import PointNet2Head @HEADS.register_module() class PAConvHead(PointNet2Head): r"""PAConv decoder head. Decoder head used in `PAConv `_. Refer to the `official code `_. Args: fp_channels (tuple[tuple[int]]): Tuple of mlp channels in FP modules. fp_norm_cfg (dict): Config of norm layers used in FP modules. Default: dict(type='BN2d'). 
""" def __init__(self, fp_channels=((768, 256, 256), (384, 256, 256), (320, 256, 128), (128 + 6, 128, 128, 128)), fp_norm_cfg=dict(type='BN2d'), **kwargs): super(PAConvHead, self).__init__(fp_channels, fp_norm_cfg, **kwargs) # https://github.com/CVMI-Lab/PAConv/blob/main/scene_seg/model/pointnet2/pointnet2_paconv_seg.py#L53 # PointNet++'s decoder conv has bias while PAConv's doesn't have # so we need to rebuild it here self.pre_seg_conv = ConvModule( fp_channels[-1][-1], self.channels, kernel_size=1, bias=False, conv_cfg=self.conv_cfg, norm_cfg=self.norm_cfg, act_cfg=self.act_cfg) def forward(self, feat_dict): """Forward pass. Args: feat_dict (dict): Feature dict from backbone. Returns: torch.Tensor: Segmentation map of shape [B, num_classes, N]. """ sa_xyz, sa_features = self._extract_input(feat_dict) # PointNet++ doesn't use the first level of `sa_features` as input # while PAConv inputs it through skip-connection fp_feature = sa_features[-1] for i in range(self.num_fp): # consume the points in a bottom-up manner fp_feature = self.FP_modules[i](sa_xyz[-(i + 2)], sa_xyz[-(i + 1)], sa_features[-(i + 2)], fp_feature) output = self.pre_seg_conv(fp_feature) output = self.cls_seg(output) return output ================================================ FILE: mmdet3d/models/decode_heads/pointnet2_head.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. from mmcv.cnn.bricks import ConvModule from torch import nn as nn from mmdet3d.ops import PointFPModule from ..builder import HEADS from .decode_head import Base3DDecodeHead @HEADS.register_module() class PointNet2Head(Base3DDecodeHead): r"""PointNet2 decoder head. Decoder head used in `PointNet++ `_. Refer to the `official code `_. Args: fp_channels (tuple[tuple[int]]): Tuple of mlp channels in FP modules. fp_norm_cfg (dict): Config of norm layers used in FP modules. Default: dict(type='BN2d'). """ def __init__(self, fp_channels=((768, 256, 256), (384, 256, 256), (320, 256, 128), (128, 128, 128, 128)), fp_norm_cfg=dict(type='BN2d'), **kwargs): super(PointNet2Head, self).__init__(**kwargs) self.num_fp = len(fp_channels) self.FP_modules = nn.ModuleList() for cur_fp_mlps in fp_channels: self.FP_modules.append( PointFPModule(mlp_channels=cur_fp_mlps, norm_cfg=fp_norm_cfg)) # https://github.com/charlesq34/pointnet2/blob/master/models/pointnet2_sem_seg.py#L40 self.pre_seg_conv = ConvModule( fp_channels[-1][-1], self.channels, kernel_size=1, bias=True, conv_cfg=self.conv_cfg, norm_cfg=self.norm_cfg, act_cfg=self.act_cfg) def _extract_input(self, feat_dict): """Extract inputs from features dictionary. Args: feat_dict (dict): Feature dict from backbone. Returns: list[torch.Tensor]: Coordinates of multiple levels of points. list[torch.Tensor]: Features of multiple levels of points. """ sa_xyz = feat_dict['sa_xyz'] sa_features = feat_dict['sa_features'] assert len(sa_xyz) == len(sa_features) return sa_xyz, sa_features def forward(self, feat_dict): """Forward pass. Args: feat_dict (dict): Feature dict from backbone. Returns: torch.Tensor: Segmentation map of shape [B, num_classes, N]. 
""" sa_xyz, sa_features = self._extract_input(feat_dict) # https://github.com/charlesq34/pointnet2/blob/master/models/pointnet2_sem_seg.py#L24 sa_features[0] = None fp_feature = sa_features[-1] for i in range(self.num_fp): # consume the points in a bottom-up manner fp_feature = self.FP_modules[i](sa_xyz[-(i + 2)], sa_xyz[-(i + 1)], sa_features[-(i + 2)], fp_feature) output = self.pre_seg_conv(fp_feature) output = self.cls_seg(output) return output ================================================ FILE: mmdet3d/models/dense_heads/__init__.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. from .anchor3d_head import Anchor3DHead from .anchor_free_mono3d_head import AnchorFreeMono3DHead from .base_conv_bbox_head import BaseConvBboxHead from .base_mono3d_dense_head import BaseMono3DDenseHead from .centerpoint_head import CenterHead from .fcaf3d_head import FCAF3DHead from .fcos_mono3d_head import FCOSMono3DHead from .free_anchor3d_head import FreeAnchor3DHead from .groupfree3d_head import GroupFree3DHead from .monoflex_head import MonoFlexHead from .parta2_rpn_head import PartA2RPNHead from .pgd_head import PGDHead from .point_rpn_head import PointRPNHead from .shape_aware_head import ShapeAwareHead from .smoke_mono3d_head import SMOKEMono3DHead from .ssd_3d_head import SSD3DHead from .vote_head import VoteHead __all__ = [ 'Anchor3DHead', 'FreeAnchor3DHead', 'PartA2RPNHead', 'VoteHead', 'SSD3DHead', 'BaseConvBboxHead', 'CenterHead', 'ShapeAwareHead', 'BaseMono3DDenseHead', 'AnchorFreeMono3DHead', 'FCOSMono3DHead', 'GroupFree3DHead', 'PointRPNHead', 'SMOKEMono3DHead', 'PGDHead', 'MonoFlexHead', 'FCAF3DHead' ] ================================================ FILE: mmdet3d/models/dense_heads/anchor3d_head.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import numpy as np import torch from mmcv.runner import BaseModule, force_fp32 from torch import nn as nn from mmdet3d.core import (PseudoSampler, box3d_multiclass_nms, limit_period, xywhr2xyxyr) from mmdet.core import (build_assigner, build_bbox_coder, build_prior_generator, build_sampler, multi_apply) from ..builder import HEADS, build_loss from .train_mixins import AnchorTrainMixin @HEADS.register_module() class Anchor3DHead(BaseModule, AnchorTrainMixin): """Anchor head for SECOND/PointPillars/MVXNet/PartA2. Args: num_classes (int): Number of classes. in_channels (int): Number of channels in the input feature map. train_cfg (dict): Train configs. test_cfg (dict): Test configs. feat_channels (int): Number of channels of the feature map. use_direction_classifier (bool): Whether to add a direction classifier. anchor_generator(dict): Config dict of anchor generator. assigner_per_size (bool): Whether to do assignment for each separate anchor size. assign_per_class (bool): Whether to do assignment for each class. diff_rad_by_sin (bool): Whether to change the difference into sin difference for box regression loss. dir_offset (float | int): The offset of BEV rotation angles. (TODO: may be moved into box coder) dir_limit_offset (float | int): The limited range of BEV rotation angles. (TODO: may be moved into box coder) bbox_coder (dict): Config dict of box coders. loss_cls (dict): Config of classification loss. loss_bbox (dict): Config of localization loss. loss_dir (dict): Config of direction classifier loss. 
""" def __init__(self, num_classes, in_channels, train_cfg, test_cfg, feat_channels=256, use_direction_classifier=True, anchor_generator=dict( type='Anchor3DRangeGenerator', range=[0, -39.68, -1.78, 69.12, 39.68, -1.78], strides=[2], sizes=[[3.9, 1.6, 1.56]], rotations=[0, 1.57], custom_values=[], reshape_out=False), assigner_per_size=False, assign_per_class=False, diff_rad_by_sin=True, dir_offset=-np.pi / 2, dir_limit_offset=0, bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder'), loss_cls=dict( type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), loss_bbox=dict( type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=2.0), loss_dir=dict(type='CrossEntropyLoss', loss_weight=0.2), init_cfg=None): super().__init__(init_cfg=init_cfg) self.in_channels = in_channels self.num_classes = num_classes self.feat_channels = feat_channels self.diff_rad_by_sin = diff_rad_by_sin self.use_direction_classifier = use_direction_classifier self.train_cfg = train_cfg self.test_cfg = test_cfg self.assigner_per_size = assigner_per_size self.assign_per_class = assign_per_class self.dir_offset = dir_offset self.dir_limit_offset = dir_limit_offset import warnings warnings.warn( 'dir_offset and dir_limit_offset will be depressed and be ' 'incorporated into box coder in the future') self.fp16_enabled = False # build anchor generator self.anchor_generator = build_prior_generator(anchor_generator) # In 3D detection, the anchor stride is connected with anchor size self.num_anchors = self.anchor_generator.num_base_anchors # build box coder self.bbox_coder = build_bbox_coder(bbox_coder) self.box_code_size = self.bbox_coder.code_size # build loss function self.use_sigmoid_cls = loss_cls.get('use_sigmoid', False) self.sampling = loss_cls['type'] not in ['FocalLoss', 'GHMC'] if not self.use_sigmoid_cls: self.num_classes += 1 self.loss_cls = build_loss(loss_cls) self.loss_bbox = build_loss(loss_bbox) self.loss_dir = build_loss(loss_dir) self.fp16_enabled = False self._init_layers() self._init_assigner_sampler() if init_cfg is None: self.init_cfg = dict( type='Normal', layer='Conv2d', std=0.01, override=dict( type='Normal', name='conv_cls', std=0.01, bias_prob=0.01)) def _init_assigner_sampler(self): """Initialize the target assigner and sampler of the head.""" if self.train_cfg is None: return if self.sampling: self.bbox_sampler = build_sampler(self.train_cfg.sampler) else: self.bbox_sampler = PseudoSampler() if isinstance(self.train_cfg.assigner, dict): self.bbox_assigner = build_assigner(self.train_cfg.assigner) elif isinstance(self.train_cfg.assigner, list): self.bbox_assigner = [ build_assigner(res) for res in self.train_cfg.assigner ] def _init_layers(self): """Initialize neural network layers of the head.""" self.cls_out_channels = self.num_anchors * self.num_classes self.conv_cls = nn.Conv2d(self.feat_channels, self.cls_out_channels, 1) self.conv_reg = nn.Conv2d(self.feat_channels, self.num_anchors * self.box_code_size, 1) if self.use_direction_classifier: self.conv_dir_cls = nn.Conv2d(self.feat_channels, self.num_anchors * 2, 1) def forward_single(self, x): """Forward function on a single-scale feature map. Args: x (torch.Tensor): Input features. Returns: tuple[torch.Tensor]: Contain score of each class, bbox regression and direction classification predictions. """ cls_score = self.conv_cls(x) bbox_pred = self.conv_reg(x) dir_cls_preds = None if self.use_direction_classifier: dir_cls_preds = self.conv_dir_cls(x) return cls_score, bbox_pred, dir_cls_preds def forward(self, feats): """Forward pass. 
Args: feats (list[torch.Tensor]): Multi-level features, e.g., features produced by FPN. Returns: tuple[list[torch.Tensor]]: Multi-level class score, bbox and direction predictions. """ return multi_apply(self.forward_single, feats) def get_anchors(self, featmap_sizes, input_metas, device='cuda'): """Get anchors according to feature map sizes. Args: featmap_sizes (list[tuple]): Multi-level feature map sizes. input_metas (list[dict]): contain pcd and img's meta info. device (str): device of current module. Returns: list[list[torch.Tensor]]: Anchors of each image, valid flags of each image. """ num_imgs = len(input_metas) # since feature map sizes of all images are the same, we only compute # anchors for one time multi_level_anchors = self.anchor_generator.grid_anchors( featmap_sizes, device=device) anchor_list = [multi_level_anchors for _ in range(num_imgs)] return anchor_list def loss_single(self, cls_score, bbox_pred, dir_cls_preds, labels, label_weights, bbox_targets, bbox_weights, dir_targets, dir_weights, num_total_samples): """Calculate loss of Single-level results. Args: cls_score (torch.Tensor): Class score in single-level. bbox_pred (torch.Tensor): Bbox prediction in single-level. dir_cls_preds (torch.Tensor): Predictions of direction class in single-level. labels (torch.Tensor): Labels of class. label_weights (torch.Tensor): Weights of class loss. bbox_targets (torch.Tensor): Targets of bbox predictions. bbox_weights (torch.Tensor): Weights of bbox loss. dir_targets (torch.Tensor): Targets of direction predictions. dir_weights (torch.Tensor): Weights of direction loss. num_total_samples (int): The number of valid samples. Returns: tuple[torch.Tensor]: Losses of class, bbox and direction, respectively. """ # classification loss if num_total_samples is None: num_total_samples = int(cls_score.shape[0]) labels = labels.reshape(-1) label_weights = label_weights.reshape(-1) cls_score = cls_score.permute(0, 2, 3, 1).reshape(-1, self.num_classes) assert labels.max().item() <= self.num_classes loss_cls = self.loss_cls( cls_score, labels, label_weights, avg_factor=num_total_samples) # regression loss bbox_pred = bbox_pred.permute(0, 2, 3, 1).reshape(-1, self.box_code_size) bbox_targets = bbox_targets.reshape(-1, self.box_code_size) bbox_weights = bbox_weights.reshape(-1, self.box_code_size) bg_class_ind = self.num_classes pos_inds = ((labels >= 0) & (labels < bg_class_ind)).nonzero( as_tuple=False).reshape(-1) num_pos = len(pos_inds) pos_bbox_pred = bbox_pred[pos_inds] pos_bbox_targets = bbox_targets[pos_inds] pos_bbox_weights = bbox_weights[pos_inds] # dir loss if self.use_direction_classifier: dir_cls_preds = dir_cls_preds.permute(0, 2, 3, 1).reshape(-1, 2) dir_targets = dir_targets.reshape(-1) dir_weights = dir_weights.reshape(-1) pos_dir_cls_preds = dir_cls_preds[pos_inds] pos_dir_targets = dir_targets[pos_inds] pos_dir_weights = dir_weights[pos_inds] if num_pos > 0: code_weight = self.train_cfg.get('code_weight', None) if code_weight: pos_bbox_weights = pos_bbox_weights * bbox_weights.new_tensor( code_weight) if self.diff_rad_by_sin: pos_bbox_pred, pos_bbox_targets = self.add_sin_difference( pos_bbox_pred, pos_bbox_targets) loss_bbox = self.loss_bbox( pos_bbox_pred, pos_bbox_targets, pos_bbox_weights, avg_factor=num_total_samples) # direction classification loss loss_dir = None if self.use_direction_classifier: loss_dir = self.loss_dir( pos_dir_cls_preds, pos_dir_targets, pos_dir_weights, avg_factor=num_total_samples) else: loss_bbox = pos_bbox_pred.sum() if 
self.use_direction_classifier: loss_dir = pos_dir_cls_preds.sum() return loss_cls, loss_bbox, loss_dir @staticmethod def add_sin_difference(boxes1, boxes2): """Convert the rotation difference to difference in sine function. Args: boxes1 (torch.Tensor): Original Boxes in shape (NxC), where C>=7 and the 7th dimension is rotation dimension. boxes2 (torch.Tensor): Target boxes in shape (NxC), where C>=7 and the 7th dimension is rotation dimension. Returns: tuple[torch.Tensor]: ``boxes1`` and ``boxes2`` whose 7th dimensions are changed. """ rad_pred_encoding = torch.sin(boxes1[..., 6:7]) * torch.cos( boxes2[..., 6:7]) rad_tg_encoding = torch.cos(boxes1[..., 6:7]) * torch.sin(boxes2[..., 6:7]) boxes1 = torch.cat( [boxes1[..., :6], rad_pred_encoding, boxes1[..., 7:]], dim=-1) boxes2 = torch.cat([boxes2[..., :6], rad_tg_encoding, boxes2[..., 7:]], dim=-1) return boxes1, boxes2 @force_fp32(apply_to=('cls_scores', 'bbox_preds', 'dir_cls_preds')) def loss(self, cls_scores, bbox_preds, dir_cls_preds, gt_bboxes, gt_labels, input_metas, gt_bboxes_ignore=None): """Calculate losses. Args: cls_scores (list[torch.Tensor]): Multi-level class scores. bbox_preds (list[torch.Tensor]): Multi-level bbox predictions. dir_cls_preds (list[torch.Tensor]): Multi-level direction class predictions. gt_bboxes (list[:obj:`BaseInstance3DBoxes`]): Gt bboxes of each sample. gt_labels (list[torch.Tensor]): Gt labels of each sample. input_metas (list[dict]): Contain pcd and img's meta info. gt_bboxes_ignore (list[torch.Tensor]): Specify which bounding boxes to ignore. Returns: dict[str, list[torch.Tensor]]: Classification, bbox, and direction losses of each level. - loss_cls (list[torch.Tensor]): Classification losses. - loss_bbox (list[torch.Tensor]): Box regression losses. - loss_dir (list[torch.Tensor]): Direction classification losses. """ featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores] assert len(featmap_sizes) == self.anchor_generator.num_levels device = cls_scores[0].device anchor_list = self.get_anchors( featmap_sizes, input_metas, device=device) label_channels = self.cls_out_channels if self.use_sigmoid_cls else 1 cls_reg_targets = self.anchor_target_3d( anchor_list, gt_bboxes, input_metas, gt_bboxes_ignore_list=gt_bboxes_ignore, gt_labels_list=gt_labels, num_classes=self.num_classes, label_channels=label_channels, sampling=self.sampling) if cls_reg_targets is None: return None (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list, dir_targets_list, dir_weights_list, num_total_pos, num_total_neg) = cls_reg_targets num_total_samples = ( num_total_pos + num_total_neg if self.sampling else num_total_pos) # num_total_samples = None losses_cls, losses_bbox, losses_dir = multi_apply( self.loss_single, cls_scores, bbox_preds, dir_cls_preds, labels_list, label_weights_list, bbox_targets_list, bbox_weights_list, dir_targets_list, dir_weights_list, num_total_samples=num_total_samples) return dict( loss_cls=losses_cls, loss_bbox=losses_bbox, loss_dir=losses_dir) def get_bboxes(self, cls_scores, bbox_preds, dir_cls_preds, input_metas, cfg=None, rescale=False): """Get bboxes of anchor head. Args: cls_scores (list[torch.Tensor]): Multi-level class scores. bbox_preds (list[torch.Tensor]): Multi-level bbox predictions. dir_cls_preds (list[torch.Tensor]): Multi-level direction class predictions. input_metas (list[dict]): Contain pcd and img's meta info. cfg (:obj:`ConfigDict`): Training or testing config. rescale (list[torch.Tensor]): Whether th rescale bbox. 
Returns: list[tuple]: Prediction resultes of batches. """ assert len(cls_scores) == len(bbox_preds) assert len(cls_scores) == len(dir_cls_preds) num_levels = len(cls_scores) featmap_sizes = [cls_scores[i].shape[-2:] for i in range(num_levels)] device = cls_scores[0].device mlvl_anchors = self.anchor_generator.grid_anchors( featmap_sizes, device=device) mlvl_anchors = [ anchor.reshape(-1, self.box_code_size) for anchor in mlvl_anchors ] result_list = [] for img_id in range(len(input_metas)): cls_score_list = [ cls_scores[i][img_id].detach() for i in range(num_levels) ] bbox_pred_list = [ bbox_preds[i][img_id].detach() for i in range(num_levels) ] dir_cls_pred_list = [ dir_cls_preds[i][img_id].detach() for i in range(num_levels) ] input_meta = input_metas[img_id] proposals = self.get_bboxes_single(cls_score_list, bbox_pred_list, dir_cls_pred_list, mlvl_anchors, input_meta, cfg, rescale) result_list.append(proposals) return result_list def get_bboxes_single(self, cls_scores, bbox_preds, dir_cls_preds, mlvl_anchors, input_meta, cfg=None, rescale=False): """Get bboxes of single branch. Args: cls_scores (torch.Tensor): Class score in single batch. bbox_preds (torch.Tensor): Bbox prediction in single batch. dir_cls_preds (torch.Tensor): Predictions of direction class in single batch. mlvl_anchors (List[torch.Tensor]): Multi-level anchors in single batch. input_meta (list[dict]): Contain pcd and img's meta info. cfg (:obj:`ConfigDict`): Training or testing config. rescale (list[torch.Tensor]): whether th rescale bbox. Returns: tuple: Contain predictions of single batch. - bboxes (:obj:`BaseInstance3DBoxes`): Predicted 3d bboxes. - scores (torch.Tensor): Class score of each bbox. - labels (torch.Tensor): Label of each bbox. """ cfg = self.test_cfg if cfg is None else cfg assert len(cls_scores) == len(bbox_preds) == len(mlvl_anchors) mlvl_bboxes = [] mlvl_scores = [] mlvl_dir_scores = [] for cls_score, bbox_pred, dir_cls_pred, anchors in zip( cls_scores, bbox_preds, dir_cls_preds, mlvl_anchors): assert cls_score.size()[-2:] == bbox_pred.size()[-2:] assert cls_score.size()[-2:] == dir_cls_pred.size()[-2:] dir_cls_pred = dir_cls_pred.permute(1, 2, 0).reshape(-1, 2) dir_cls_score = torch.max(dir_cls_pred, dim=-1)[1] cls_score = cls_score.permute(1, 2, 0).reshape(-1, self.num_classes) if self.use_sigmoid_cls: scores = cls_score.sigmoid() else: scores = cls_score.softmax(-1) bbox_pred = bbox_pred.permute(1, 2, 0).reshape(-1, self.box_code_size) nms_pre = cfg.get('nms_pre', -1) if nms_pre > 0 and scores.shape[0] > nms_pre: if self.use_sigmoid_cls: max_scores, _ = scores.max(dim=1) else: max_scores, _ = scores[:, :-1].max(dim=1) _, topk_inds = max_scores.topk(nms_pre) anchors = anchors[topk_inds, :] bbox_pred = bbox_pred[topk_inds, :] scores = scores[topk_inds, :] dir_cls_score = dir_cls_score[topk_inds] bboxes = self.bbox_coder.decode(anchors, bbox_pred) mlvl_bboxes.append(bboxes) mlvl_scores.append(scores) mlvl_dir_scores.append(dir_cls_score) mlvl_bboxes = torch.cat(mlvl_bboxes) mlvl_bboxes_for_nms = xywhr2xyxyr(input_meta['box_type_3d']( mlvl_bboxes, box_dim=self.box_code_size).bev) mlvl_scores = torch.cat(mlvl_scores) mlvl_dir_scores = torch.cat(mlvl_dir_scores) if self.use_sigmoid_cls: # Add a dummy background class to the front when using sigmoid padding = mlvl_scores.new_zeros(mlvl_scores.shape[0], 1) mlvl_scores = torch.cat([mlvl_scores, padding], dim=1) score_thr = cfg.get('score_thr', 0) results = box3d_multiclass_nms(mlvl_bboxes, mlvl_bboxes_for_nms, mlvl_scores, score_thr, cfg.max_num, cfg, 
mlvl_dir_scores) bboxes, scores, labels, dir_scores = results if bboxes.shape[0] > 0: dir_rot = limit_period(bboxes[..., 6] - self.dir_offset, self.dir_limit_offset, np.pi) bboxes[..., 6] = ( dir_rot + self.dir_offset + np.pi * dir_scores.to(bboxes.dtype)) bboxes = input_meta['box_type_3d'](bboxes, box_dim=self.box_code_size) return bboxes, scores, labels ================================================ FILE: mmdet3d/models/dense_heads/anchor_free_mono3d_head.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. from abc import abstractmethod import torch from mmcv.cnn import ConvModule, bias_init_with_prob, normal_init from mmcv.runner import force_fp32 from torch import nn as nn from mmdet.core import multi_apply from ..builder import HEADS, build_loss from .base_mono3d_dense_head import BaseMono3DDenseHead @HEADS.register_module() class AnchorFreeMono3DHead(BaseMono3DDenseHead): """Anchor-free head for monocular 3D object detection. Args: num_classes (int): Number of categories excluding the background category. in_channels (int): Number of channels in the input feature map. feat_channels (int, optional): Number of hidden channels. Used in child classes. Defaults to 256. stacked_convs (int, optional): Number of stacking convs of the head. strides (tuple, optional): Downsample factor of each feature map. dcn_on_last_conv (bool, optional): If true, use dcn in the last layer of towers. Default: False. conv_bias (bool | str, optional): If specified as `auto`, it will be decided by the norm_cfg. Bias of conv will be set as True if `norm_cfg` is None, otherwise False. Default: 'auto'. background_label (int, optional): Label ID of background, set as 0 for RPN and num_classes for other heads. It will automatically set as `num_classes` if None is given. use_direction_classifier (bool, optional): Whether to add a direction classifier. diff_rad_by_sin (bool, optional): Whether to change the difference into sin difference for box regression loss. Defaults to True. dir_offset (float, optional): Parameter used in direction classification. Defaults to 0. dir_limit_offset (float, optional): Parameter used in direction classification. Defaults to 0. loss_cls (dict, optional): Config of classification loss. loss_bbox (dict, optional): Config of localization loss. loss_dir (dict, optional): Config of direction classifier loss. loss_attr (dict, optional): Config of attribute classifier loss, which is only active when `pred_attrs=True`. bbox_code_size (int, optional): Dimensions of predicted bounding boxes. pred_attrs (bool, optional): Whether to predict attributes. Defaults to False. num_attrs (int, optional): The number of attributes to be predicted. Default: 9. pred_velo (bool, optional): Whether to predict velocity. Defaults to False. pred_bbox2d (bool, optional): Whether to predict 2D boxes. Defaults to False. group_reg_dims (tuple[int], optional): The dimension of each regression target group. Default: (2, 1, 3, 1, 2). cls_branch (tuple[int], optional): Channels for classification branch. Default: (128, 64). reg_branch (tuple[tuple], optional): Channels for regression branch. Default: ( (128, 64), # offset (128, 64), # depth (64, ), # size (64, ), # rot () # velo ), dir_branch (tuple[int], optional): Channels for direction classification branch. Default: (64, ). attr_branch (tuple[int], optional): Channels for classification branch. Default: (64, ). conv_cfg (dict, optional): Config dict for convolution layer. Default: None. 
norm_cfg (dict, optional): Config dict for normalization layer. Default: None. train_cfg (dict, optional): Training config of anchor head. test_cfg (dict, optional): Testing config of anchor head. """ # noqa: W605 _version = 1 def __init__( self, num_classes, in_channels, feat_channels=256, stacked_convs=4, strides=(4, 8, 16, 32, 64), dcn_on_last_conv=False, conv_bias='auto', background_label=None, use_direction_classifier=True, diff_rad_by_sin=True, dir_offset=0, dir_limit_offset=0, loss_cls=dict( type='FocalLoss', use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=1.0), loss_bbox=dict( type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0), loss_dir=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), loss_attr=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), bbox_code_size=9, # For nuscenes pred_attrs=False, num_attrs=9, # For nuscenes pred_velo=False, pred_bbox2d=False, group_reg_dims=(2, 1, 3, 1, 2), # offset, depth, size, rot, velo, cls_branch=(128, 64), reg_branch=( (128, 64), # offset (128, 64), # depth (64, ), # size (64, ), # rot () # velo ), dir_branch=(64, ), attr_branch=(64, ), conv_cfg=None, norm_cfg=None, train_cfg=None, test_cfg=None, init_cfg=None): super(AnchorFreeMono3DHead, self).__init__(init_cfg=init_cfg) self.num_classes = num_classes self.cls_out_channels = num_classes self.in_channels = in_channels self.feat_channels = feat_channels self.stacked_convs = stacked_convs self.strides = strides self.dcn_on_last_conv = dcn_on_last_conv assert conv_bias == 'auto' or isinstance(conv_bias, bool) self.conv_bias = conv_bias self.use_direction_classifier = use_direction_classifier self.diff_rad_by_sin = diff_rad_by_sin self.dir_offset = dir_offset self.dir_limit_offset = dir_limit_offset self.loss_cls = build_loss(loss_cls) self.loss_bbox = build_loss(loss_bbox) self.loss_dir = build_loss(loss_dir) self.bbox_code_size = bbox_code_size self.group_reg_dims = list(group_reg_dims) self.cls_branch = cls_branch self.reg_branch = reg_branch assert len(reg_branch) == len(group_reg_dims), 'The number of '\ 'element in reg_branch and group_reg_dims should be the same.' 
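        # Illustration of the pairing checked above (using the defaults from the
        # class signature): group_reg_dims=(2, 1, 3, 1, 2) lines up element-wise
        # with reg_branch=((128, 64), (128, 64), (64,), (64,), ()), i.e. the
        # offset / depth / size / rot / velo regression groups. An empty tuple
        # means that group gets no intermediate branch and is predicted by a
        # single 1x1 conv straight from `feat_channels` (see `_init_predictor`).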
self.pred_velo = pred_velo self.pred_bbox2d = pred_bbox2d self.out_channels = [] for reg_branch_channels in reg_branch: if len(reg_branch_channels) > 0: self.out_channels.append(reg_branch_channels[-1]) else: self.out_channels.append(-1) self.dir_branch = dir_branch self.train_cfg = train_cfg self.test_cfg = test_cfg self.conv_cfg = conv_cfg self.norm_cfg = norm_cfg self.fp16_enabled = False self.background_label = ( num_classes if background_label is None else background_label) # background_label should be either 0 or num_classes assert (self.background_label == 0 or self.background_label == num_classes) self.pred_attrs = pred_attrs self.attr_background_label = -1 self.num_attrs = num_attrs if self.pred_attrs: self.attr_background_label = num_attrs self.loss_attr = build_loss(loss_attr) self.attr_branch = attr_branch self._init_layers() def _init_layers(self): """Initialize layers of the head.""" self._init_cls_convs() self._init_reg_convs() self._init_predictor() def _init_cls_convs(self): """Initialize classification conv layers of the head.""" self.cls_convs = nn.ModuleList() for i in range(self.stacked_convs): chn = self.in_channels if i == 0 else self.feat_channels if self.dcn_on_last_conv and i == self.stacked_convs - 1: conv_cfg = dict(type='DCNv2') else: conv_cfg = self.conv_cfg self.cls_convs.append( ConvModule( chn, self.feat_channels, 3, stride=1, padding=1, conv_cfg=conv_cfg, norm_cfg=self.norm_cfg, bias=self.conv_bias)) def _init_reg_convs(self): """Initialize bbox regression conv layers of the head.""" self.reg_convs = nn.ModuleList() for i in range(self.stacked_convs): chn = self.in_channels if i == 0 else self.feat_channels if self.dcn_on_last_conv and i == self.stacked_convs - 1: conv_cfg = dict(type='DCNv2') else: conv_cfg = self.conv_cfg self.reg_convs.append( ConvModule( chn, self.feat_channels, 3, stride=1, padding=1, conv_cfg=conv_cfg, norm_cfg=self.norm_cfg, bias=self.conv_bias)) def _init_branch(self, conv_channels=(64), conv_strides=(1)): """Initialize conv layers as a prediction branch.""" conv_before_pred = nn.ModuleList() if isinstance(conv_channels, int): conv_channels = [self.feat_channels] + [conv_channels] conv_strides = [conv_strides] else: conv_channels = [self.feat_channels] + list(conv_channels) conv_strides = list(conv_strides) for i in range(len(conv_strides)): conv_before_pred.append( ConvModule( conv_channels[i], conv_channels[i + 1], 3, stride=conv_strides[i], padding=1, conv_cfg=self.conv_cfg, norm_cfg=self.norm_cfg, bias=self.conv_bias)) return conv_before_pred def _init_predictor(self): """Initialize predictor layers of the head.""" self.conv_cls_prev = self._init_branch( conv_channels=self.cls_branch, conv_strides=(1, ) * len(self.cls_branch)) self.conv_cls = nn.Conv2d(self.cls_branch[-1], self.cls_out_channels, 1) self.conv_reg_prevs = nn.ModuleList() self.conv_regs = nn.ModuleList() for i in range(len(self.group_reg_dims)): reg_dim = self.group_reg_dims[i] reg_branch_channels = self.reg_branch[i] out_channel = self.out_channels[i] if len(reg_branch_channels) > 0: self.conv_reg_prevs.append( self._init_branch( conv_channels=reg_branch_channels, conv_strides=(1, ) * len(reg_branch_channels))) self.conv_regs.append(nn.Conv2d(out_channel, reg_dim, 1)) else: self.conv_reg_prevs.append(None) self.conv_regs.append( nn.Conv2d(self.feat_channels, reg_dim, 1)) if self.use_direction_classifier: self.conv_dir_cls_prev = self._init_branch( conv_channels=self.dir_branch, conv_strides=(1, ) * len(self.dir_branch)) self.conv_dir_cls = 
nn.Conv2d(self.dir_branch[-1], 2, 1) if self.pred_attrs: self.conv_attr_prev = self._init_branch( conv_channels=self.attr_branch, conv_strides=(1, ) * len(self.attr_branch)) self.conv_attr = nn.Conv2d(self.attr_branch[-1], self.num_attrs, 1) def init_weights(self): """Initialize weights of the head. We currently still use the customized defined init_weights because the default init of DCN triggered by the init_cfg will init conv_offset.weight, which mistakenly affects the training stability. """ for modules in [self.cls_convs, self.reg_convs, self.conv_cls_prev]: for m in modules: if isinstance(m.conv, nn.Conv2d): normal_init(m.conv, std=0.01) for conv_reg_prev in self.conv_reg_prevs: if conv_reg_prev is None: continue for m in conv_reg_prev: if isinstance(m.conv, nn.Conv2d): normal_init(m.conv, std=0.01) if self.use_direction_classifier: for m in self.conv_dir_cls_prev: if isinstance(m.conv, nn.Conv2d): normal_init(m.conv, std=0.01) if self.pred_attrs: for m in self.conv_attr_prev: if isinstance(m.conv, nn.Conv2d): normal_init(m.conv, std=0.01) bias_cls = bias_init_with_prob(0.01) normal_init(self.conv_cls, std=0.01, bias=bias_cls) for conv_reg in self.conv_regs: normal_init(conv_reg, std=0.01) if self.use_direction_classifier: normal_init(self.conv_dir_cls, std=0.01, bias=bias_cls) if self.pred_attrs: normal_init(self.conv_attr, std=0.01, bias=bias_cls) def forward(self, feats): """Forward features from the upstream network. Args: feats (tuple[Tensor]): Features from the upstream network, each is a 4D-tensor. Returns: tuple: Usually contain classification scores, bbox predictions, and direction class predictions. cls_scores (list[Tensor]): Box scores for each scale level, each is a 4D-tensor, the channel number is num_points * num_classes. bbox_preds (list[Tensor]): Box energies / deltas for each scale level, each is a 4D-tensor, the channel number is num_points * bbox_code_size. dir_cls_preds (list[Tensor]): Box scores for direction class predictions on each scale level, each is a 4D-tensor, the channel number is num_points * 2. (bin = 2) attr_preds (list[Tensor]): Attribute scores for each scale level, each is a 4D-tensor, the channel number is num_points * num_attrs. """ return multi_apply(self.forward_single, feats)[:5] def forward_single(self, x): """Forward features of a single scale level. Args: x (Tensor): FPN feature maps of the specified stride. Returns: tuple: Scores for each class, bbox predictions, direction class, and attributes, features after classification and regression conv layers, some models needs these features like FCOS. 
""" cls_feat = x reg_feat = x for cls_layer in self.cls_convs: cls_feat = cls_layer(cls_feat) # clone the cls_feat for reusing the feature map afterwards clone_cls_feat = cls_feat.clone() for conv_cls_prev_layer in self.conv_cls_prev: clone_cls_feat = conv_cls_prev_layer(clone_cls_feat) cls_score = self.conv_cls(clone_cls_feat) for reg_layer in self.reg_convs: reg_feat = reg_layer(reg_feat) bbox_pred = [] for i in range(len(self.group_reg_dims)): # clone the reg_feat for reusing the feature map afterwards clone_reg_feat = reg_feat.clone() if len(self.reg_branch[i]) > 0: for conv_reg_prev_layer in self.conv_reg_prevs[i]: clone_reg_feat = conv_reg_prev_layer(clone_reg_feat) bbox_pred.append(self.conv_regs[i](clone_reg_feat)) bbox_pred = torch.cat(bbox_pred, dim=1) dir_cls_pred = None if self.use_direction_classifier: clone_reg_feat = reg_feat.clone() for conv_dir_cls_prev_layer in self.conv_dir_cls_prev: clone_reg_feat = conv_dir_cls_prev_layer(clone_reg_feat) dir_cls_pred = self.conv_dir_cls(clone_reg_feat) attr_pred = None if self.pred_attrs: # clone the cls_feat for reusing the feature map afterwards clone_cls_feat = cls_feat.clone() for conv_attr_prev_layer in self.conv_attr_prev: clone_cls_feat = conv_attr_prev_layer(clone_cls_feat) attr_pred = self.conv_attr(clone_cls_feat) return cls_score, bbox_pred, dir_cls_pred, attr_pred, cls_feat, \ reg_feat @abstractmethod @force_fp32(apply_to=('cls_scores', 'bbox_preds', 'dir_cls_preds')) def loss(self, cls_scores, bbox_preds, dir_cls_preds, attr_preds, gt_bboxes, gt_labels, gt_bboxes_3d, gt_labels_3d, centers2d, depths, attr_labels, img_metas, gt_bboxes_ignore=None): """Compute loss of the head. Args: cls_scores (list[Tensor]): Box scores for each scale level, each is a 4D-tensor, the channel number is num_points * num_classes. bbox_preds (list[Tensor]): Box energies / deltas for each scale level, each is a 4D-tensor, the channel number is num_points * bbox_code_size. dir_cls_preds (list[Tensor]): Box scores for direction class predictions on each scale level, each is a 4D-tensor, the channel number is num_points * 2. (bin = 2) attr_preds (list[Tensor]): Box scores for each scale level, each is a 4D-tensor, the channel number is num_points * num_attrs. gt_bboxes (list[Tensor]): Ground truth bboxes for each image with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. gt_labels (list[Tensor]): class indices corresponding to each box gt_bboxes_3d (list[Tensor]): 3D Ground truth bboxes for each image with shape (num_gts, bbox_code_size). gt_labels_3d (list[Tensor]): 3D class indices of each box. centers2d (list[Tensor]): Projected 3D centers onto 2D images. depths (list[Tensor]): Depth of projected centers on 2D images. attr_labels (list[Tensor], optional): Attribute indices corresponding to each box img_metas (list[dict]): Meta information of each image, e.g., image size, scaling factor, etc. gt_bboxes_ignore (list[Tensor]): specify which bounding boxes can be ignored when computing the loss. """ raise NotImplementedError @abstractmethod @force_fp32(apply_to=('cls_scores', 'bbox_preds', 'dir_cls_preds')) def get_bboxes(self, cls_scores, bbox_preds, dir_cls_preds, attr_preds, img_metas, cfg=None, rescale=None): """Transform network output for a batch into bbox predictions. 
Args: cls_scores (list[Tensor]): Box scores for each scale level Has shape (N, num_points * num_classes, H, W) bbox_preds (list[Tensor]): Box energies / deltas for each scale level with shape (N, num_points * bbox_code_size, H, W) dir_cls_preds (list[Tensor]): Box scores for direction class predictions on each scale level, each is a 4D-tensor, the channel number is num_points * 2. (bin = 2) attr_preds (list[Tensor]): Attribute scores for each scale level Has shape (N, num_points * num_attrs, H, W) img_metas (list[dict]): Meta information of each image, e.g., image size, scaling factor, etc. cfg (mmcv.Config): Test / postprocessing configuration, if None, test_cfg would be used rescale (bool): If True, return boxes in original image space """ raise NotImplementedError @abstractmethod def get_targets(self, points, gt_bboxes_list, gt_labels_list, gt_bboxes_3d_list, gt_labels_3d_list, centers2d_list, depths_list, attr_labels_list): """Compute regression, classification and centerss targets for points in multiple images. Args: points (list[Tensor]): Points of each fpn level, each has shape (num_points, 2). gt_bboxes_list (list[Tensor]): Ground truth bboxes of each image, each has shape (num_gt, 4). gt_labels_list (list[Tensor]): Ground truth labels of each box, each has shape (num_gt,). gt_bboxes_3d_list (list[Tensor]): 3D Ground truth bboxes of each image, each has shape (num_gt, bbox_code_size). gt_labels_3d_list (list[Tensor]): 3D Ground truth labels of each box, each has shape (num_gt,). centers2d_list (list[Tensor]): Projected 3D centers onto 2D image, each has shape (num_gt, 2). depths_list (list[Tensor]): Depth of projected 3D centers onto 2D image, each has shape (num_gt, 1). attr_labels_list (list[Tensor]): Attribute labels of each box, each has shape (num_gt,). """ raise NotImplementedError def _get_points_single(self, featmap_size, stride, dtype, device, flatten=False): """Get points of a single scale level.""" h, w = featmap_size x_range = torch.arange(w, dtype=dtype, device=device) y_range = torch.arange(h, dtype=dtype, device=device) y, x = torch.meshgrid(y_range, x_range) if flatten: y = y.flatten() x = x.flatten() return y, x def get_points(self, featmap_sizes, dtype, device, flatten=False): """Get points according to feature map sizes. Args: featmap_sizes (list[tuple]): Multi-level feature map sizes. dtype (torch.dtype): Type of points. device (torch.device): Device of points. Returns: tuple: points of each image. """ mlvl_points = [] for i in range(len(featmap_sizes)): mlvl_points.append( self._get_points_single(featmap_sizes[i], self.strides[i], dtype, device, flatten)) return mlvl_points ================================================ FILE: mmdet3d/models/dense_heads/base_conv_bbox_head.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. from mmcv.cnn import ConvModule from mmcv.cnn.bricks import build_conv_layer from mmcv.runner import BaseModule from torch import nn as nn from ..builder import HEADS @HEADS.register_module() class BaseConvBboxHead(BaseModule): r"""More general bbox head, with shared conv layers and two optional separated branches. .. 
code-block:: none /-> cls convs -> cls_score shared convs \-> reg convs -> bbox_pred """ def __init__(self, in_channels=0, shared_conv_channels=(), cls_conv_channels=(), num_cls_out_channels=0, reg_conv_channels=(), num_reg_out_channels=0, conv_cfg=dict(type='Conv1d'), norm_cfg=dict(type='BN1d'), act_cfg=dict(type='ReLU'), bias='auto', init_cfg=None, *args, **kwargs): super(BaseConvBboxHead, self).__init__( init_cfg=init_cfg, *args, **kwargs) assert in_channels > 0 assert num_cls_out_channels > 0 assert num_reg_out_channels > 0 self.in_channels = in_channels self.shared_conv_channels = shared_conv_channels self.cls_conv_channels = cls_conv_channels self.num_cls_out_channels = num_cls_out_channels self.reg_conv_channels = reg_conv_channels self.num_reg_out_channels = num_reg_out_channels self.conv_cfg = conv_cfg self.norm_cfg = norm_cfg self.act_cfg = act_cfg self.bias = bias # add shared convs if len(self.shared_conv_channels) > 0: self.shared_convs = self._add_conv_branch( self.in_channels, self.shared_conv_channels) out_channels = self.shared_conv_channels[-1] else: out_channels = self.in_channels # add cls specific branch prev_channel = out_channels if len(self.cls_conv_channels) > 0: self.cls_convs = self._add_conv_branch(prev_channel, self.cls_conv_channels) prev_channel = self.cls_conv_channels[-1] self.conv_cls = build_conv_layer( conv_cfg, in_channels=prev_channel, out_channels=num_cls_out_channels, kernel_size=1) # add reg specific branch prev_channel = out_channels if len(self.reg_conv_channels) > 0: self.reg_convs = self._add_conv_branch(prev_channel, self.reg_conv_channels) prev_channel = self.reg_conv_channels[-1] self.conv_reg = build_conv_layer( conv_cfg, in_channels=prev_channel, out_channels=num_reg_out_channels, kernel_size=1) def _add_conv_branch(self, in_channels, conv_channels): """Add shared or separable branch.""" conv_spec = [in_channels] + list(conv_channels) # add branch specific conv layers conv_layers = nn.Sequential() for i in range(len(conv_spec) - 1): conv_layers.add_module( f'layer{i}', ConvModule( conv_spec[i], conv_spec[i + 1], kernel_size=1, padding=0, conv_cfg=self.conv_cfg, norm_cfg=self.norm_cfg, act_cfg=self.act_cfg, bias=self.bias, inplace=True)) return conv_layers def forward(self, feats): """Forward. Args: feats (Tensor): Input features Returns: Tensor: Class scores predictions Tensor: Regression predictions """ # shared part if len(self.shared_conv_channels) > 0: x = self.shared_convs(feats) # separate branches x_cls = x x_reg = x if len(self.cls_conv_channels) > 0: x_cls = self.cls_convs(x_cls) cls_score = self.conv_cls(x_cls) if len(self.reg_conv_channels) > 0: x_reg = self.reg_convs(x_reg) bbox_pred = self.conv_reg(x_reg) return cls_score, bbox_pred ================================================ FILE: mmdet3d/models/dense_heads/base_mono3d_dense_head.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. 
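# Sketch of how the abstract base class below is typically specialized
# (`MyMono3DHead` is an illustrative name; in this package the concrete chain
# is BaseMono3DDenseHead -> AnchorFreeMono3DHead -> FCOSMono3DHead, etc.):
#
#     @HEADS.register_module()
#     class MyMono3DHead(BaseMono3DDenseHead):
#         def forward(self, feats):               # invoked as self(x) in forward_train
#             ...
#         def loss(self, *outs, **kwargs):        # abstract: must be implemented
#             ...
#         def get_bboxes(self, *outs, **kwargs):  # abstract: must be implemented
#             ...
#
# forward_train() then routes the forward outputs into loss() and, when a
# proposal_cfg is given, also returns get_bboxes() results as proposals.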
from abc import ABCMeta, abstractmethod from mmcv.runner import BaseModule class BaseMono3DDenseHead(BaseModule, metaclass=ABCMeta): """Base class for Monocular 3D DenseHeads.""" def __init__(self, init_cfg=None): super(BaseMono3DDenseHead, self).__init__(init_cfg=init_cfg) @abstractmethod def loss(self, **kwargs): """Compute losses of the head.""" pass @abstractmethod def get_bboxes(self, **kwargs): """Transform network output for a batch into bbox predictions.""" pass def forward_train(self, x, img_metas, gt_bboxes, gt_labels=None, gt_bboxes_3d=None, gt_labels_3d=None, centers2d=None, depths=None, attr_labels=None, gt_bboxes_ignore=None, proposal_cfg=None, **kwargs): """ Args: x (list[Tensor]): Features from FPN. img_metas (list[dict]): Meta information of each image, e.g., image size, scaling factor, etc. gt_bboxes (list[Tensor]): Ground truth bboxes of the image, shape (num_gts, 4). gt_labels (list[Tensor]): Ground truth labels of each box, shape (num_gts,). gt_bboxes_3d (list[Tensor]): 3D ground truth bboxes of the image, shape (num_gts, self.bbox_code_size). gt_labels_3d (list[Tensor]): 3D ground truth labels of each box, shape (num_gts,). centers2d (list[Tensor]): Projected 3D center of each box, shape (num_gts, 2). depths (list[Tensor]): Depth of projected 3D center of each box, shape (num_gts,). attr_labels (list[Tensor]): Attribute labels of each box, shape (num_gts,). gt_bboxes_ignore (list[Tensor]): Ground truth bboxes to be ignored, shape (num_ignored_gts, 4). proposal_cfg (mmcv.Config): Test / postprocessing configuration, if None, test_cfg would be used Returns: tuple: losses: (dict[str, Tensor]): A dictionary of loss components. proposal_list (list[Tensor]): Proposals of each image. """ outs = self(x) if gt_labels is None: loss_inputs = outs + (gt_bboxes, gt_bboxes_3d, centers2d, depths, attr_labels, img_metas) else: loss_inputs = outs + (gt_bboxes, gt_labels, gt_bboxes_3d, gt_labels_3d, centers2d, depths, attr_labels, img_metas) losses = self.loss(*loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore) if proposal_cfg is None: return losses else: proposal_list = self.get_bboxes(*outs, img_metas, cfg=proposal_cfg) return losses, proposal_list ================================================ FILE: mmdet3d/models/dense_heads/centerpoint_head.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import copy import torch from mmcv.cnn import ConvModule, build_conv_layer from mmcv.runner import BaseModule from torch import nn from mmdet3d.core import (circle_nms, draw_heatmap_gaussian, gaussian_radius, xywhr2xyxyr) from mmdet3d.core.post_processing import nms_bev from mmdet3d.models import builder from mmdet3d.models.utils import clip_sigmoid from mmdet.core import build_bbox_coder, multi_apply, reduce_mean from ..builder import HEADS, build_loss from mmdet3d.core import bbox3d2result, merge_aug_bboxes_3d from mmcv.runner import BaseModule, force_fp32 @HEADS.register_module() class SeparateHead(BaseModule): """SeparateHead for CenterHead. Args: in_channels (int): Input channels for conv_layer. heads (dict): Conv information. head_conv (int, optional): Output channels. Default: 64. final_kernel (int, optional): Kernel size for the last conv layer. Default: 1. init_bias (float, optional): Initial bias. Default: -2.19. conv_cfg (dict, optional): Config of conv layer. Default: dict(type='Conv2d') norm_cfg (dict, optional): Config of norm layer. Default: dict(type='BN2d'). bias (str, optional): Type of bias. Default: 'auto'. 
""" def __init__(self, in_channels, heads, head_conv=64, final_kernel=1, init_bias=-2.19, conv_cfg=dict(type='Conv2d'), norm_cfg=dict(type='BN2d'), bias='auto', init_cfg=None, **kwargs): assert init_cfg is None, 'To prevent abnormal initialization ' \ 'behavior, init_cfg is not allowed to be set' super(SeparateHead, self).__init__(init_cfg=init_cfg) self.heads = heads self.init_bias = init_bias for head in self.heads: classes, num_conv = self.heads[head] conv_layers = [] c_in = in_channels for i in range(num_conv - 1): conv_layers.append( ConvModule( c_in, head_conv, kernel_size=final_kernel, stride=1, padding=final_kernel // 2, bias=bias, conv_cfg=conv_cfg, norm_cfg=norm_cfg)) c_in = head_conv conv_layers.append( build_conv_layer( conv_cfg, head_conv, classes, kernel_size=final_kernel, stride=1, padding=final_kernel // 2, bias=True)) conv_layers = nn.Sequential(*conv_layers) self.__setattr__(head, conv_layers) if init_cfg is None: self.init_cfg = dict(type='Kaiming', layer='Conv2d') def init_weights(self): """Initialize weights.""" super().init_weights() for head in self.heads: if head == 'heatmap': self.__getattr__(head)[-1].bias.data.fill_(self.init_bias) @force_fp32() def forward(self, x): """Forward function for SepHead. Args: x (torch.Tensor): Input feature map with the shape of [B, 512, 128, 128]. Returns: dict[str: torch.Tensor]: contains the following keys: -reg (torch.Tensor): 2D regression value with the shape of [B, 2, H, W]. -height (torch.Tensor): Height value with the shape of [B, 1, H, W]. -dim (torch.Tensor): Size value with the shape of [B, 3, H, W]. -rot (torch.Tensor): Rotation value with the shape of [B, 2, H, W]. -vel (torch.Tensor): Velocity value with the shape of [B, 2, H, W]. -heatmap (torch.Tensor): Heatmap with the shape of [B, N, H, W]. """ ret_dict = dict() for head in self.heads: ret_dict[head] = self.__getattr__(head)(x) return ret_dict @HEADS.register_module() class DCNSeparateHead(BaseModule): r"""DCNSeparateHead for CenterHead. .. code-block:: none /-----> DCN for heatmap task -----> heatmap task. feature \-----> DCN for regression tasks -----> regression tasks Args: in_channels (int): Input channels for conv_layer. num_cls (int): Number of classes. heads (dict): Conv information. dcn_config (dict): Config of dcn layer. head_conv (int, optional): Output channels. Default: 64. final_kernel (int, optional): Kernel size for the last conv layer. Default: 1. init_bias (float, optional): Initial bias. Default: -2.19. conv_cfg (dict, optional): Config of conv layer. Default: dict(type='Conv2d') norm_cfg (dict, optional): Config of norm layer. Default: dict(type='BN2d'). bias (str, optional): Type of bias. Default: 'auto'. 
""" # noqa: W605 def __init__(self, in_channels, num_cls, heads, dcn_config, head_conv=64, final_kernel=1, init_bias=-2.19, conv_cfg=dict(type='Conv2d'), norm_cfg=dict(type='BN2d'), bias='auto', init_cfg=None, **kwargs): assert init_cfg is None, 'To prevent abnormal initialization ' \ 'behavior, init_cfg is not allowed to be set' super(DCNSeparateHead, self).__init__(init_cfg=init_cfg) if 'heatmap' in heads: heads.pop('heatmap') # feature adaptation with dcn # use separate features for classification / regression self.feature_adapt_cls = build_conv_layer(dcn_config) self.feature_adapt_reg = build_conv_layer(dcn_config) # heatmap prediction head cls_head = [ ConvModule( in_channels, head_conv, kernel_size=3, padding=1, conv_cfg=conv_cfg, bias=bias, norm_cfg=norm_cfg), build_conv_layer( conv_cfg, head_conv, num_cls, kernel_size=3, stride=1, padding=1, bias=bias) ] self.cls_head = nn.Sequential(*cls_head) self.init_bias = init_bias # other regression target self.task_head = SeparateHead( in_channels, heads, head_conv=head_conv, final_kernel=final_kernel, bias=bias) if init_cfg is None: self.init_cfg = dict(type='Kaiming', layer='Conv2d') def init_weights(self): """Initialize weights.""" super().init_weights() self.cls_head[-1].bias.data.fill_(self.init_bias) def forward(self, x): """Forward function for DCNSepHead. Args: x (torch.Tensor): Input feature map with the shape of [B, 512, 128, 128]. Returns: dict[str: torch.Tensor]: contains the following keys: -reg (torch.Tensor): 2D regression value with the shape of [B, 2, H, W]. -height (torch.Tensor): Height value with the shape of [B, 1, H, W]. -dim (torch.Tensor): Size value with the shape of [B, 3, H, W]. -rot (torch.Tensor): Rotation value with the shape of [B, 2, H, W]. -vel (torch.Tensor): Velocity value with the shape of [B, 2, H, W]. -heatmap (torch.Tensor): Heatmap with the shape of [B, N, H, W]. """ center_feat = self.feature_adapt_cls(x) reg_feat = self.feature_adapt_reg(x) cls_score = self.cls_head(center_feat) ret = self.task_head(reg_feat) ret['heatmap'] = cls_score return ret import torch.utils.checkpoint as cp @HEADS.register_module() class CenterHead(BaseModule): """CenterHead for CenterPoint. Args: in_channels (list[int] | int, optional): Channels of the input feature map. Default: [128]. tasks (list[dict], optional): Task information including class number and class names. Default: None. train_cfg (dict, optional): Train-time configs. Default: None. test_cfg (dict, optional): Test-time configs. Default: None. bbox_coder (dict, optional): Bbox coder configs. Default: None. common_heads (dict, optional): Conv information for common heads. Default: dict(). loss_cls (dict, optional): Config of classification loss function. Default: dict(type='GaussianFocalLoss', reduction='mean'). loss_bbox (dict, optional): Config of regression loss function. Default: dict(type='L1Loss', reduction='none'). separate_head (dict, optional): Config of separate head. Default: dict( type='SeparateHead', init_bias=-2.19, final_kernel=3) share_conv_channel (int, optional): Output channels for share_conv layer. Default: 64. num_heatmap_convs (int, optional): Number of conv layers for heatmap conv layer. Default: 2. conv_cfg (dict, optional): Config of conv layer. Default: dict(type='Conv2d') norm_cfg (dict, optional): Config of norm layer. Default: dict(type='BN2d'). bias (str, optional): Type of bias. Default: 'auto'. 
""" def __init__(self, in_channels=[128], tasks=None, train_cfg=None, test_cfg=None, bbox_coder=None, common_heads=dict(), with_cp=False, loss_cls=dict(type='GaussianFocalLoss', reduction='mean'), loss_bbox=dict( type='L1Loss', reduction='none', loss_weight=0.25), separate_head=dict( type='SeparateHead', init_bias=-2.19, final_kernel=3), share_conv_channel=64, num_heatmap_convs=2, conv_cfg=dict(type='Conv2d'), norm_cfg=dict(type='BN2d'), bias='auto', norm_bbox=True, init_cfg=None, voxel2bev=False, loss_weight_per_task=1.0, # balance differet tasks, such as seg, occupancy. task_specific=True): assert init_cfg is None, 'To prevent abnormal initialization ' \ 'behavior, init_cfg is not allowed to be set' super(CenterHead, self).__init__(init_cfg=init_cfg) self.voxel2bev = voxel2bev self.loss_weight_per_task = loss_weight_per_task if self.voxel2bev: self.voxel2bev_layer = nn.Conv3d(in_channels, in_channels, (1, 1, 8), (1, 1, 1), (0, 0, 0)) num_classes = [len(t['class_names']) for t in tasks] self.class_names = [t['class_names'] for t in tasks] self.train_cfg = train_cfg self.test_cfg = test_cfg self.in_channels = in_channels self.num_classes = num_classes self.norm_bbox = norm_bbox self.with_cp = with_cp self.loss_cls = build_loss(loss_cls) self.loss_bbox = build_loss(loss_bbox) self.bbox_coder = build_bbox_coder(bbox_coder) self.num_anchor_per_locs = [n for n in num_classes] self.fp16_enabled = False # a shared convolution self.shared_conv = ConvModule( in_channels, share_conv_channel, kernel_size=3, padding=1, conv_cfg=conv_cfg, norm_cfg=norm_cfg, bias=bias) self.task_heads = nn.ModuleList() for num_cls in num_classes: heads = copy.deepcopy(common_heads) heads.update(dict(heatmap=(num_cls, num_heatmap_convs))) separate_head.update( in_channels=share_conv_channel, heads=heads, num_cls=num_cls) self.task_heads.append(builder.build_head(separate_head)) self.with_velocity = 'vel' in common_heads.keys() self.task_specific = task_specific def forward_single(self, x): """Forward function for CenterPoint. Args: x (torch.Tensor): Input feature map with the shape of [B, 512, 128, 128]. Returns: list[dict]: Output results for tasks. """ ret_dicts = [] if self.with_cp: x = cp.checkpoint(self.shared_conv, x) else: x = self.shared_conv(x) for task in self.task_heads: ret_dicts.append(task(x)) return ret_dicts def forward(self, input_dict, *args, **kwargs): """Forward pass. Args: feats (list[torch.Tensor]): Multi-level features, e.g., features produced by FPN. Returns: tuple(list[dict]): Output results for tasks. """ if isinstance(input_dict, dict): if input_dict['img_bev_feat'][0].dim() == 5: mlvl_feats = [level.mean(-1) for level in input_dict['img_bev_feat']] else: mlvl_feats = input_dict['img_bev_feat'] if not isinstance(mlvl_feats, list): mlvl_feats = [mlvl_feats] elif isinstance(input_dict, list): mlvl_feats = input_dict return multi_apply(self.forward_single, mlvl_feats) def _gather_feat(self, feat, ind, mask=None): """Gather feature map. Given feature map and index, return indexed feature map. Args: feat (torch.tensor): Feature map with the shape of [B, H*W, 10]. ind (torch.Tensor): Index of the ground truth boxes with the shape of [B, max_obj]. mask (torch.Tensor, optional): Mask of the feature map with the shape of [B, max_obj]. Default: None. Returns: torch.Tensor: Feature map after gathering with the shape of [B, max_obj, 10]. 
""" dim = feat.size(2) ind = ind.unsqueeze(2).expand(ind.size(0), ind.size(1), dim) feat = feat.gather(1, ind) if mask is not None: mask = mask.unsqueeze(2).expand_as(feat) feat = feat[mask] feat = feat.view(-1, dim) return feat def get_targets(self, gt_bboxes_3d, gt_labels_3d): """Generate targets. How each output is transformed: Each nested list is transposed so that all same-index elements in each sub-list (1, ..., N) become the new sub-lists. [ [a0, a1, a2, ... ], [b0, b1, b2, ... ], ... ] ==> [ [a0, b0, ... ], [a1, b1, ... ], [a2, b2, ... ] ] The new transposed nested list is converted into a list of N tensors generated by concatenating tensors in the new sub-lists. [ tensor0, tensor1, tensor2, ... ] Args: gt_bboxes_3d (list[:obj:`LiDARInstance3DBoxes`]): Ground truth gt boxes. gt_labels_3d (list[torch.Tensor]): Labels of boxes. Returns: Returns: tuple[list[torch.Tensor]]: Tuple of target including the following results in order. - list[torch.Tensor]: Heatmap scores. - list[torch.Tensor]: Ground truth boxes. - list[torch.Tensor]: Indexes indicating the position of the valid boxes. - list[torch.Tensor]: Masks indicating which boxes are valid. """ heatmaps, anno_boxes, inds, masks = multi_apply( self.get_targets_single, gt_bboxes_3d, gt_labels_3d) # Transpose heatmaps heatmaps = list(map(list, zip(*heatmaps))) heatmaps = [torch.stack(hms_) for hms_ in heatmaps] # Transpose anno_boxes anno_boxes = list(map(list, zip(*anno_boxes))) anno_boxes = [torch.stack(anno_boxes_) for anno_boxes_ in anno_boxes] # Transpose inds inds = list(map(list, zip(*inds))) inds = [torch.stack(inds_) for inds_ in inds] # Transpose inds masks = list(map(list, zip(*masks))) masks = [torch.stack(masks_) for masks_ in masks] return heatmaps, anno_boxes, inds, masks def get_targets_single(self, gt_bboxes_3d, gt_labels_3d): """Generate training targets for a single sample. Args: gt_bboxes_3d (:obj:`LiDARInstance3DBoxes`): Ground truth gt boxes. gt_labels_3d (torch.Tensor): Labels of boxes. Returns: tuple[list[torch.Tensor]]: Tuple of target including the following results in order. - list[torch.Tensor]: Heatmap scores. - list[torch.Tensor]: Ground truth boxes. - list[torch.Tensor]: Indexes indicating the position of the valid boxes. - list[torch.Tensor]: Masks indicating which boxes are valid. """ device = gt_labels_3d.device gt_bboxes_3d = torch.cat( (gt_bboxes_3d.gravity_center, gt_bboxes_3d.tensor[:, 3:]), dim=1).to(device) max_objs = self.train_cfg['max_objs'] * self.train_cfg['dense_reg'] grid_size = torch.tensor(self.train_cfg['grid_size']) pc_range = torch.tensor(self.train_cfg['point_cloud_range']) voxel_size = torch.tensor(self.train_cfg['voxel_size']) feature_map_size = grid_size[:2] // self.train_cfg['out_size_factor'] # reorganize the gt_dict by tasks task_masks = [] flag = 0 for class_name in self.class_names: task_masks.append([ torch.where(gt_labels_3d == class_name.index(i) + flag) for i in class_name ]) flag += len(class_name) task_boxes = [] task_classes = [] flag2 = 0 for idx, mask in enumerate(task_masks): task_box = [] task_class = [] for m in mask: task_box.append(gt_bboxes_3d[m]) # 0 is background for each task, so we need to add 1 here. 
task_class.append(gt_labels_3d[m] + 1 - flag2) task_boxes.append(torch.cat(task_box, axis=0).to(device)) task_classes.append(torch.cat(task_class).long().to(device)) flag2 += len(mask) draw_gaussian = draw_heatmap_gaussian heatmaps, anno_boxes, inds, masks = [], [], [], [] for idx, task_head in enumerate(self.task_heads): heatmap = gt_bboxes_3d.new_zeros( (len(self.class_names[idx]), feature_map_size[1], feature_map_size[0])) if self.with_velocity: anno_box = gt_bboxes_3d.new_zeros((max_objs, 10), dtype=torch.float32) else: anno_box = gt_bboxes_3d.new_zeros((max_objs, 8), dtype=torch.float32) ind = gt_labels_3d.new_zeros((max_objs), dtype=torch.int64) mask = gt_bboxes_3d.new_zeros((max_objs), dtype=torch.uint8) num_objs = min(task_boxes[idx].shape[0], max_objs) for k in range(num_objs): cls_id = task_classes[idx][k] - 1 width = task_boxes[idx][k][3] length = task_boxes[idx][k][4] width = width / voxel_size[0] / self.train_cfg[ 'out_size_factor'] length = length / voxel_size[1] / self.train_cfg[ 'out_size_factor'] if width > 0 and length > 0: radius = gaussian_radius( (length, width), min_overlap=self.train_cfg['gaussian_overlap']) radius = max(self.train_cfg['min_radius'], int(radius)) # be really careful for the coordinate system of # your box annotation. x, y, z = task_boxes[idx][k][0], task_boxes[idx][k][ 1], task_boxes[idx][k][2] coor_x = ( x - pc_range[0] ) / voxel_size[0] / self.train_cfg['out_size_factor'] coor_y = ( y - pc_range[1] ) / voxel_size[1] / self.train_cfg['out_size_factor'] center = torch.tensor([coor_x, coor_y], dtype=torch.float32, device=device) center_int = center.to(torch.int32) # throw out not in range objects to avoid out of array # area when creating the heatmap if not (0 <= center_int[0] < feature_map_size[0] and 0 <= center_int[1] < feature_map_size[1]): continue draw_gaussian(heatmap[cls_id], center_int, radius) new_idx = k x, y = center_int[0], center_int[1] assert (y * feature_map_size[0] + x < feature_map_size[0] * feature_map_size[1]) ind[new_idx] = y * feature_map_size[0] + x mask[new_idx] = 1 # TODO: support other outdoor dataset rot = task_boxes[idx][k][6] box_dim = task_boxes[idx][k][3:6] if self.norm_bbox: box_dim = box_dim.log() if self.with_velocity: vx, vy = task_boxes[idx][k][7:] anno_box[new_idx] = torch.cat([ center - torch.tensor([x, y], device=device), z.unsqueeze(0), box_dim, torch.sin(rot).unsqueeze(0), torch.cos(rot).unsqueeze(0), vx.unsqueeze(0), vy.unsqueeze(0) ]) else: anno_box[new_idx] = torch.cat([ center - torch.tensor([x, y], device=device), z.unsqueeze(0), box_dim, torch.sin(rot).unsqueeze(0), torch.cos(rot).unsqueeze(0) ]) heatmaps.append(heatmap) anno_boxes.append(anno_box) masks.append(mask) inds.append(ind) return heatmaps, anno_boxes, inds, masks def loss(self, gt_bboxes_3d, gt_labels_3d, preds_dicts, img_metas=None, **kwargs): heatmaps, anno_boxes, inds, masks = self.get_targets( gt_bboxes_3d, gt_labels_3d) return self.loss_(heatmaps, anno_boxes, inds, masks, preds_dicts, **kwargs) @force_fp32() def loss_(self, heatmaps, anno_boxes, inds, masks, preds_dicts, **kwargs): """Loss function for CenterHead. Args: gt_bboxes_3d (list[:obj:`LiDARInstance3DBoxes`]): Ground truth gt boxes. gt_labels_3d (list[torch.Tensor]): Labels of boxes. preds_dicts (dict): Output of forward function. Returns: dict[str:torch.Tensor]: Loss of heatmap and bbox of each task. 
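        Example:
            When ``task_specific=True`` the 10-channel regression target
            built in ``get_targets_single`` is split into named groups
            before the L1 loss is applied. A sketch of that slicing:

            .. code-block:: python

                # channel layout of ``anno_box`` (with velocity):
                # [dx, dy, z, log-size x3, sin(rot), cos(rot), vx, vy]
                clip_index = [0, 2, 3, 6, 8, 10]
                names = ['xy', 'z', 'whl', 'yaw', 'vel']
                groups = {
                    name: slice(clip_index[i], clip_index[i + 1])
                    for i, name in enumerate(names)
                }
                # groups['whl'] == slice(3, 6), groups['vel'] == slice(8, 10)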
""" loss_dict = dict() if not self.task_specific: loss_dict['loss'] = 0 for task_id, preds_dict in enumerate(preds_dicts): # heatmap focal loss preds_dict[0]['heatmap'] = clip_sigmoid(preds_dict[0]['heatmap']) num_pos = heatmaps[task_id].eq(1).float().sum().item() cls_avg_factor = torch.clamp( reduce_mean(heatmaps[task_id].new_tensor(num_pos)), min=1).item() loss_heatmap = self.loss_cls( preds_dict[0]['heatmap'], heatmaps[task_id], avg_factor=cls_avg_factor) target_box = anno_boxes[task_id] # reconstruct the anno_box from multiple reg heads preds_dict[0]['anno_box'] = torch.cat( ( preds_dict[0]['reg'], preds_dict[0]['height'], preds_dict[0]['dim'], preds_dict[0]['rot'], preds_dict[0]['vel'], ), dim=1, ) # Regression loss for dimension, offset, height, rotation num = masks[task_id].float().sum() ind = inds[task_id] pred = preds_dict[0]['anno_box'].permute(0, 2, 3, 1).contiguous() pred = pred.view(pred.size(0), -1, pred.size(3)) pred = self._gather_feat(pred, ind) mask = masks[task_id].unsqueeze(2).expand_as(target_box).float() num = torch.clamp( reduce_mean(target_box.new_tensor(num)), min=1e-4).item() isnotnan = (~torch.isnan(target_box)).float() mask *= isnotnan code_weights = self.train_cfg['code_weights'] bbox_weights = mask * mask.new_tensor(code_weights) if self.task_specific: name_list = ['xy', 'z', 'whl', 'yaw', 'vel'] clip_index = [0, 2, 3, 6, 8, 10] for reg_task_id in range(len(name_list)): pred_tmp = pred[ ..., clip_index[reg_task_id]:clip_index[reg_task_id + 1]] target_box_tmp = target_box[ ..., clip_index[reg_task_id]:clip_index[reg_task_id + 1]] bbox_weights_tmp = bbox_weights[ ..., clip_index[reg_task_id]:clip_index[reg_task_id + 1]] loss_bbox_tmp = self.loss_bbox( pred_tmp, target_box_tmp, bbox_weights_tmp, avg_factor=(num + 1e-4)) loss_dict[f'task{task_id}.loss_%s' % (name_list[reg_task_id])] = loss_bbox_tmp * self.loss_weight_per_task loss_dict[f'task{task_id}.loss_heatmap'] = loss_heatmap * self.loss_weight_per_task else: loss_bbox = self.loss_bbox( pred, target_box, bbox_weights, avg_factor=num) loss_dict['loss'] += loss_bbox * self.loss_weight_per_task loss_dict['loss'] += loss_heatmap * self.loss_weight_per_task return loss_dict def get_bboxes(self, preds_dicts, img_metas, img=None, rescale=False): """Generate bboxes from bbox head predictions. Args: preds_dicts (tuple[list[dict]]): Prediction results. img_metas (list[dict]): Point cloud and image's meta info. Returns: list[dict]: Decoded bbox, scores and labels after nms. 
""" rets = [] for task_id, preds_dict in enumerate(preds_dicts): num_class_with_bg = self.num_classes[task_id] batch_size = preds_dict[0]['heatmap'].shape[0] batch_heatmap = preds_dict[0]['heatmap'].sigmoid() batch_reg = preds_dict[0]['reg'] batch_hei = preds_dict[0]['height'] if self.norm_bbox: batch_dim = torch.exp(preds_dict[0]['dim']) else: batch_dim = preds_dict[0]['dim'] batch_rots = preds_dict[0]['rot'][:, 0].unsqueeze(1) batch_rotc = preds_dict[0]['rot'][:, 1].unsqueeze(1) if 'vel' in preds_dict[0]: batch_vel = preds_dict[0]['vel'] else: batch_vel = None temp = self.bbox_coder.decode( batch_heatmap, batch_rots, batch_rotc, batch_hei, batch_dim, batch_vel, reg=batch_reg, task_id=task_id) batch_reg_preds = [box['bboxes'] for box in temp] batch_cls_preds = [box['scores'] for box in temp] batch_cls_labels = [box['labels'] for box in temp] nms_type = self.test_cfg.get('nms_type') if isinstance(nms_type, list): nms_type = nms_type[task_id] if nms_type == 'circle': ret_task = [] for i in range(batch_size): boxes3d = temp[i]['bboxes'] scores = temp[i]['scores'] labels = temp[i]['labels'] centers = boxes3d[:, [0, 1]] boxes = torch.cat([centers, scores.view(-1, 1)], dim=1) keep = torch.tensor( circle_nms( boxes.detach().cpu().numpy(), self.test_cfg['min_radius'][task_id], post_max_size=self.test_cfg['post_max_size']), dtype=torch.long, device=boxes.device) boxes3d = boxes3d[keep] scores = scores[keep] labels = labels[keep] ret = dict(bboxes=boxes3d, scores=scores, labels=labels) ret_task.append(ret) rets.append(ret_task) else: rets.append( self.get_task_detections(num_class_with_bg, batch_cls_preds, batch_reg_preds, batch_cls_labels, img_metas, task_id)) # Merge branches results num_samples = len(rets[0]) ret_list = [] for i in range(num_samples): for k in rets[0][i].keys(): if k == 'bboxes': bboxes = torch.cat([ret[i][k] for ret in rets]) bboxes[:, 2] = bboxes[:, 2] - bboxes[:, 5] * 0.5 bboxes = img_metas[i]['box_type_3d']( bboxes, self.bbox_coder.code_size) elif k == 'scores': scores = torch.cat([ret[i][k] for ret in rets]) elif k == 'labels': flag = 0 for j, num_class in enumerate(self.num_classes): rets[j][i][k] += flag flag += num_class labels = torch.cat([ret[i][k].int() for ret in rets]) ret_list.append(bbox3d2result(bboxes, scores, labels)) return ret_list def get_task_detections(self, num_class_with_bg, batch_cls_preds, batch_reg_preds, batch_cls_labels, img_metas, task_id): """Rotate nms for each task. Args: num_class_with_bg (int): Number of classes for the current task. batch_cls_preds (list[torch.Tensor]): Prediction score with the shape of [N]. batch_reg_preds (list[torch.Tensor]): Prediction bbox with the shape of [N, 9]. batch_cls_labels (list[torch.Tensor]): Prediction label with the shape of [N]. img_metas (list[dict]): Meta information of each sample. Returns: list[dict[str: torch.Tensor]]: contains the following keys: -bboxes (torch.Tensor): Prediction bboxes after nms with the shape of [N, 9]. -scores (torch.Tensor): Prediction scores after nms with the shape of [N]. -labels (torch.Tensor): Prediction labels after nms with the shape of [N]. 
""" predictions_dicts = [] post_center_range = self.test_cfg['post_center_limit_range'] if len(post_center_range) > 0: post_center_range = torch.tensor( post_center_range, dtype=batch_reg_preds[0].dtype, device=batch_reg_preds[0].device) for i, (box_preds, cls_preds, cls_labels) in enumerate( zip(batch_reg_preds, batch_cls_preds, batch_cls_labels)): default_val = [1.0 for _ in range(len(self.task_heads))] factor = self.test_cfg.get('nms_rescale_factor', default_val)[task_id] if isinstance(factor, list): for cid in range(len(factor)): box_preds[cls_labels == cid, 3:6] = \ box_preds[cls_labels == cid, 3:6] * factor[cid] else: box_preds[:, 3:6] = box_preds[:, 3:6] * factor # Apply NMS in birdeye view # get the highest score per prediction, then apply nms # to remove overlapped box. if num_class_with_bg == 1: top_scores = cls_preds.squeeze(-1) top_labels = torch.zeros( cls_preds.shape[0], device=cls_preds.device, dtype=torch.long) else: top_labels = cls_labels.long() top_scores = cls_preds.squeeze(-1) if self.test_cfg['score_threshold'] > 0.0: thresh = torch.tensor( [self.test_cfg['score_threshold']], device=cls_preds.device).type_as(cls_preds) top_scores_keep = top_scores >= thresh top_scores = top_scores.masked_select(top_scores_keep) if top_scores.shape[0] != 0: if self.test_cfg['score_threshold'] > 0.0: box_preds = box_preds[top_scores_keep] top_labels = top_labels[top_scores_keep] boxes_for_nms = img_metas[i]['box_type_3d']( box_preds[:, :], self.bbox_coder.code_size).bev # the nms in 3d detection just remove overlap boxes. if isinstance(self.test_cfg['nms_thr'], list): nms_thresh = self.test_cfg['nms_thr'][task_id] else: nms_thresh = self.test_cfg['nms_thr'] selected = nms_bev( boxes_for_nms, top_scores, thresh=nms_thresh, pre_max_size=self.test_cfg['pre_max_size'], post_max_size=self.test_cfg['post_max_size'], xyxyr2xywhr=False) else: selected = [] if isinstance(factor, list): for cid in range(len(factor)): box_preds[top_labels == cid, 3:6] = \ box_preds[top_labels == cid, 3:6] / factor[cid] else: box_preds[:, 3:6] = box_preds[:, 3:6] / factor # if selected is not None: selected_boxes = box_preds[selected] selected_labels = top_labels[selected] selected_scores = top_scores[selected] # finally generate predictions. if selected_boxes.shape[0] != 0: box_preds = selected_boxes scores = selected_scores label_preds = selected_labels final_box_preds = box_preds final_scores = scores final_labels = label_preds if post_center_range is not None: mask = (final_box_preds[:, :3] >= post_center_range[:3]).all(1) mask &= (final_box_preds[:, :3] <= post_center_range[3:]).all(1) predictions_dict = dict( bboxes=final_box_preds[mask], scores=final_scores[mask], labels=final_labels[mask]) else: predictions_dict = dict( bboxes=final_box_preds, scores=final_scores, labels=final_labels) else: dtype = batch_reg_preds[0].dtype device = batch_reg_preds[0].device predictions_dict = dict( bboxes=torch.zeros([0, self.bbox_coder.code_size], dtype=dtype, device=device), scores=torch.zeros([0], dtype=dtype, device=device), labels=torch.zeros([0], dtype=top_labels.dtype, device=device)) predictions_dicts.append(predictions_dict) return predictions_dicts ================================================ FILE: mmdet3d/models/dense_heads/centerpoint_head_single_task.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. 
import copy import torch from mmcv.cnn import ConvModule, build_conv_layer from mmcv.runner import BaseModule from torch import nn from mmdet3d.core import (circle_nms, draw_heatmap_gaussian, gaussian_radius, xywhr2xyxyr) from mmdet3d.core.post_processing import nms_bev from mmdet3d.models import builder from mmdet3d.models.utils import clip_sigmoid from mmdet.core import build_bbox_coder, multi_apply, reduce_mean from ..builder import HEADS, build_loss @HEADS.register_module() class SeparateHead(BaseModule): """SeparateHead for CenterHead. Args: in_channels (int): Input channels for conv_layer. heads (dict): Conv information. head_conv (int, optional): Output channels. Default: 64. final_kernel (int, optional): Kernel size for the last conv layer. Default: 1. init_bias (float, optional): Initial bias. Default: -2.19. conv_cfg (dict, optional): Config of conv layer. Default: dict(type='Conv2d') norm_cfg (dict, optional): Config of norm layer. Default: dict(type='BN2d'). bias (str, optional): Type of bias. Default: 'auto'. """ def __init__(self, in_channels, heads, head_conv=64, final_kernel=1, init_bias=-2.19, conv_cfg=dict(type='Conv2d'), norm_cfg=dict(type='BN2d'), bias='auto', init_cfg=None, **kwargs): assert init_cfg is None, 'To prevent abnormal initialization ' \ 'behavior, init_cfg is not allowed to be set' super(SeparateHead, self).__init__(init_cfg=init_cfg) self.heads = heads self.init_bias = init_bias for head in self.heads: classes, num_conv = self.heads[head] conv_layers = [] c_in = in_channels for i in range(num_conv - 1): conv_layers.append( ConvModule( c_in, head_conv, kernel_size=final_kernel, stride=1, padding=final_kernel // 2, bias=bias, conv_cfg=conv_cfg, norm_cfg=norm_cfg)) c_in = head_conv conv_layers.append( build_conv_layer( conv_cfg, head_conv, classes, kernel_size=final_kernel, stride=1, padding=final_kernel // 2, bias=True)) conv_layers = nn.Sequential(*conv_layers) self.__setattr__(head, conv_layers) if init_cfg is None: self.init_cfg = dict(type='Kaiming', layer='Conv2d') def init_weights(self): """Initialize weights.""" super().init_weights() for head in self.heads: if head == 'heatmap': self.__getattr__(head)[-1].bias.data.fill_(self.init_bias) def forward(self, x): """Forward function for SepHead. Args: x (torch.Tensor): Input feature map with the shape of [B, 512, 128, 128]. Returns: dict[str: torch.Tensor]: contains the following keys: -reg (torch.Tensor): 2D regression value with the shape of [B, 2, H, W]. -height (torch.Tensor): Height value with the shape of [B, 1, H, W]. -dim (torch.Tensor): Size value with the shape of [B, 3, H, W]. -rot (torch.Tensor): Rotation value with the shape of [B, 2, H, W]. -vel (torch.Tensor): Velocity value with the shape of [B, 2, H, W]. -heatmap (torch.Tensor): Heatmap with the shape of [B, N, H, W]. """ ret_dict = dict() for head in self.heads: ret_dict[head] = self.__getattr__(head)(x) return ret_dict @HEADS.register_module() class DCNSeparateHead(BaseModule): r"""DCNSeparateHead for CenterHead. .. code-block:: none /-----> DCN for heatmap task -----> heatmap task. feature \-----> DCN for regression tasks -----> regression tasks Args: in_channels (int): Input channels for conv_layer. num_cls (int): Number of classes. heads (dict): Conv information. dcn_config (dict): Config of dcn layer. head_conv (int, optional): Output channels. Default: 64. final_kernel (int, optional): Kernel size for the last conv layer. Default: 1. init_bias (float, optional): Initial bias. Default: -2.19. 
conv_cfg (dict, optional): Config of conv layer. Default: dict(type='Conv2d') norm_cfg (dict, optional): Config of norm layer. Default: dict(type='BN2d'). bias (str, optional): Type of bias. Default: 'auto'. """ # noqa: W605 def __init__(self, in_channels, num_cls, heads, dcn_config, head_conv=64, final_kernel=1, init_bias=-2.19, conv_cfg=dict(type='Conv2d'), norm_cfg=dict(type='BN2d'), bias='auto', init_cfg=None, **kwargs): assert init_cfg is None, 'To prevent abnormal initialization ' \ 'behavior, init_cfg is not allowed to be set' super(DCNSeparateHead, self).__init__(init_cfg=init_cfg) if 'heatmap' in heads: heads.pop('heatmap') # feature adaptation with dcn # use separate features for classification / regression self.feature_adapt_cls = build_conv_layer(dcn_config) self.feature_adapt_reg = build_conv_layer(dcn_config) # heatmap prediction head cls_head = [ ConvModule( in_channels, head_conv, kernel_size=3, padding=1, conv_cfg=conv_cfg, bias=bias, norm_cfg=norm_cfg), build_conv_layer( conv_cfg, head_conv, num_cls, kernel_size=3, stride=1, padding=1, bias=bias) ] self.cls_head = nn.Sequential(*cls_head) self.init_bias = init_bias # other regression target self.task_head = SeparateHead( in_channels, heads, head_conv=head_conv, final_kernel=final_kernel, bias=bias) if init_cfg is None: self.init_cfg = dict(type='Kaiming', layer='Conv2d') def init_weights(self): """Initialize weights.""" super().init_weights() self.cls_head[-1].bias.data.fill_(self.init_bias) def forward(self, x): """Forward function for DCNSepHead. Args: x (torch.Tensor): Input feature map with the shape of [B, 512, 128, 128]. Returns: dict[str: torch.Tensor]: contains the following keys: -reg (torch.Tensor): 2D regression value with the shape of [B, 2, H, W]. -height (torch.Tensor): Height value with the shape of [B, 1, H, W]. -dim (torch.Tensor): Size value with the shape of [B, 3, H, W]. -rot (torch.Tensor): Rotation value with the shape of [B, 2, H, W]. -vel (torch.Tensor): Velocity value with the shape of [B, 2, H, W]. -heatmap (torch.Tensor): Heatmap with the shape of [B, N, H, W]. """ center_feat = self.feature_adapt_cls(x) reg_feat = self.feature_adapt_reg(x) cls_score = self.cls_head(center_feat) ret = self.task_head(reg_feat) ret['heatmap'] = cls_score return ret @HEADS.register_module() class CenterHead(BaseModule): """CenterHead for CenterPoint. Args: in_channels (list[int] | int, optional): Channels of the input feature map. Default: [128]. tasks (list[dict], optional): Task information including class number and class names. Default: None. train_cfg (dict, optional): Train-time configs. Default: None. test_cfg (dict, optional): Test-time configs. Default: None. bbox_coder (dict, optional): Bbox coder configs. Default: None. common_heads (dict, optional): Conv information for common heads. Default: dict(). loss_cls (dict, optional): Config of classification loss function. Default: dict(type='GaussianFocalLoss', reduction='mean'). loss_bbox (dict, optional): Config of regression loss function. Default: dict(type='L1Loss', reduction='none'). separate_head (dict, optional): Config of separate head. Default: dict( type='SeparateHead', init_bias=-2.19, final_kernel=3) share_conv_channel (int, optional): Output channels for share_conv layer. Default: 64. num_heatmap_convs (int, optional): Number of conv layers for heatmap conv layer. Default: 2. conv_cfg (dict, optional): Config of conv layer. Default: dict(type='Conv2d') norm_cfg (dict, optional): Config of norm layer. Default: dict(type='BN2d'). 
bias (str, optional): Type of bias. Default: 'auto'. """ def __init__(self, in_channels=[128], tasks=None, train_cfg=None, test_cfg=None, bbox_coder=None, common_heads=dict(), loss_cls=dict(type='GaussianFocalLoss', reduction='mean'), loss_bbox=dict( type='L1Loss', reduction='none', loss_weight=0.25), separate_head=dict( type='SeparateHead', init_bias=-2.19, final_kernel=3), share_conv_channel=64, num_heatmap_convs=2, conv_cfg=dict(type='Conv2d'), norm_cfg=dict(type='BN2d'), bias='auto', norm_bbox=True, init_cfg=None, task_specific=True): assert init_cfg is None, 'To prevent abnormal initialization ' \ 'behavior, init_cfg is not allowed to be set' super(CenterHead, self).__init__(init_cfg=init_cfg) num_classes = [len(t['class_names']) for t in tasks] self.class_names = [t['class_names'] for t in tasks] self.train_cfg = train_cfg self.test_cfg = test_cfg self.in_channels = in_channels self.num_classes = num_classes self.norm_bbox = norm_bbox self.loss_cls = build_loss(loss_cls) self.loss_bbox = build_loss(loss_bbox) self.bbox_coder = build_bbox_coder(bbox_coder) self.num_anchor_per_locs = [n for n in num_classes] self.fp16_enabled = False # a shared convolution self.shared_conv = ConvModule( in_channels, share_conv_channel, kernel_size=3, padding=1, conv_cfg=conv_cfg, norm_cfg=norm_cfg, bias=bias) self.task_heads = nn.ModuleList() for num_cls in num_classes: heads = copy.deepcopy(common_heads) heads.update(dict(heatmap=(num_cls, num_heatmap_convs))) separate_head.update( in_channels=share_conv_channel, heads=heads, num_cls=num_cls) self.task_heads.append(builder.build_head(separate_head)) self.with_velocity = 'vel' in common_heads.keys() self.task_specific = task_specific def forward_single(self, x): """Forward function for CenterPoint. Args: x (torch.Tensor): Input feature map with the shape of [B, 512, 128, 128]. Returns: list[dict]: Output results for tasks. """ ret_dicts = [] x = self.shared_conv(x) for task in self.task_heads: ret_dicts.append(task(x)) return ret_dicts def forward(self, feats): """Forward pass. Args: feats (list[torch.Tensor]): Multi-level features, e.g., features produced by FPN. Returns: tuple(list[dict]): Output results for tasks. """ return multi_apply(self.forward_single, feats) def _gather_feat(self, feat, ind, mask=None): """Gather feature map. Given feature map and index, return indexed feature map. Args: feat (torch.tensor): Feature map with the shape of [B, H*W, 10]. ind (torch.Tensor): Index of the ground truth boxes with the shape of [B, max_obj]. mask (torch.Tensor, optional): Mask of the feature map with the shape of [B, max_obj]. Default: None. Returns: torch.Tensor: Feature map after gathering with the shape of [B, max_obj, 10]. """ dim = feat.size(2) ind = ind.unsqueeze(2).expand(ind.size(0), ind.size(1), dim) feat = feat.gather(1, ind) if mask is not None: mask = mask.unsqueeze(2).expand_as(feat) feat = feat[mask] feat = feat.view(-1, dim) return feat def get_targets(self, gt_bboxes_3d, gt_labels_3d): """Generate targets. How each output is transformed: Each nested list is transposed so that all same-index elements in each sub-list (1, ..., N) become the new sub-lists. [ [a0, a1, a2, ... ], [b0, b1, b2, ... ], ... ] ==> [ [a0, b0, ... ], [a1, b1, ... ], [a2, b2, ... ] ] The new transposed nested list is converted into a list of N tensors generated by concatenating tensors in the new sub-lists. [ tensor0, tensor1, tensor2, ... ] Args: gt_bboxes_3d (list[:obj:`LiDARInstance3DBoxes`]): Ground truth gt boxes. 
gt_labels_3d (list[torch.Tensor]): Labels of boxes. Returns: Returns: tuple[list[torch.Tensor]]: Tuple of target including the following results in order. - list[torch.Tensor]: Heatmap scores. - list[torch.Tensor]: Ground truth boxes. - list[torch.Tensor]: Indexes indicating the position of the valid boxes. - list[torch.Tensor]: Masks indicating which boxes are valid. """ heatmaps, anno_boxes, inds, masks = multi_apply( self.get_targets_single, gt_bboxes_3d, gt_labels_3d) # Transpose heatmaps heatmaps = list(map(list, zip(*heatmaps))) heatmaps = [torch.stack(hms_) for hms_ in heatmaps] # Transpose anno_boxes anno_boxes = list(map(list, zip(*anno_boxes))) anno_boxes = [torch.stack(anno_boxes_) for anno_boxes_ in anno_boxes] # Transpose inds inds = list(map(list, zip(*inds))) inds = [torch.stack(inds_) for inds_ in inds] # Transpose inds masks = list(map(list, zip(*masks))) masks = [torch.stack(masks_) for masks_ in masks] return heatmaps, anno_boxes, inds, masks def get_targets_single(self, gt_bboxes_3d, gt_labels_3d): """Generate training targets for a single sample. Args: gt_bboxes_3d (:obj:`LiDARInstance3DBoxes`): Ground truth gt boxes. gt_labels_3d (torch.Tensor): Labels of boxes. Returns: tuple[list[torch.Tensor]]: Tuple of target including the following results in order. - list[torch.Tensor]: Heatmap scores. - list[torch.Tensor]: Ground truth boxes. - list[torch.Tensor]: Indexes indicating the position of the valid boxes. - list[torch.Tensor]: Masks indicating which boxes are valid. """ device = gt_labels_3d.device gt_bboxes_3d = torch.cat( (gt_bboxes_3d.gravity_center, gt_bboxes_3d.tensor[:, 3:]), dim=1).to(device) max_objs = self.train_cfg['max_objs'] * self.train_cfg['dense_reg'] grid_size = torch.tensor(self.train_cfg['grid_size']) pc_range = torch.tensor(self.train_cfg['point_cloud_range']) voxel_size = torch.tensor(self.train_cfg['voxel_size']) feature_map_size = grid_size[:2] // self.train_cfg['out_size_factor'] # reorganize the gt_dict by tasks task_masks = [] flag = 0 for class_name in self.class_names: task_masks.append([ torch.where(gt_labels_3d == class_name.index(i) + flag) for i in class_name ]) flag += len(class_name) task_boxes = [] task_classes = [] flag2 = 0 for idx, mask in enumerate(task_masks): task_box = [] task_class = [] for m in mask: task_box.append(gt_bboxes_3d[m]) # 0 is background for each task, so we need to add 1 here. 
task_class.append(gt_labels_3d[m] + 1 - flag2) task_boxes.append(torch.cat(task_box, axis=0).to(device)) task_classes.append(torch.cat(task_class).long().to(device)) flag2 += len(mask) draw_gaussian = draw_heatmap_gaussian heatmaps, anno_boxes, inds, masks = [], [], [], [] for idx, task_head in enumerate(self.task_heads): heatmap = gt_bboxes_3d.new_zeros( (len(self.class_names[idx]), feature_map_size[1], feature_map_size[0])) if self.with_velocity: anno_box = gt_bboxes_3d.new_zeros((max_objs, 10), dtype=torch.float32) else: anno_box = gt_bboxes_3d.new_zeros((max_objs, 8), dtype=torch.float32) ind = gt_labels_3d.new_zeros((max_objs), dtype=torch.int64) mask = gt_bboxes_3d.new_zeros((max_objs), dtype=torch.uint8) num_objs = min(task_boxes[idx].shape[0], max_objs) for k in range(num_objs): cls_id = task_classes[idx][k] - 1 width = task_boxes[idx][k][3] length = task_boxes[idx][k][4] width = width / voxel_size[0] / self.train_cfg[ 'out_size_factor'] length = length / voxel_size[1] / self.train_cfg[ 'out_size_factor'] if width > 0 and length > 0: radius = gaussian_radius( (length, width), min_overlap=self.train_cfg['gaussian_overlap']) radius = max(self.train_cfg['min_radius'], int(radius)) # be really careful for the coordinate system of # your box annotation. x, y, z = task_boxes[idx][k][0], task_boxes[idx][k][ 1], task_boxes[idx][k][2] coor_x = ( x - pc_range[0] ) / voxel_size[0] / self.train_cfg['out_size_factor'] coor_y = ( y - pc_range[1] ) / voxel_size[1] / self.train_cfg['out_size_factor'] center = torch.tensor([coor_x, coor_y], dtype=torch.float32, device=device) center_int = center.to(torch.int32) # throw out not in range objects to avoid out of array # area when creating the heatmap if not (0 <= center_int[0] < feature_map_size[0] and 0 <= center_int[1] < feature_map_size[1]): continue draw_gaussian(heatmap[cls_id], center_int, radius) new_idx = k x, y = center_int[0], center_int[1] assert (y * feature_map_size[0] + x < feature_map_size[0] * feature_map_size[1]) ind[new_idx] = y * feature_map_size[0] + x mask[new_idx] = 1 # TODO: support other outdoor dataset rot = task_boxes[idx][k][6] box_dim = task_boxes[idx][k][3:6] if self.norm_bbox: box_dim = box_dim.log() if self.with_velocity: vx, vy = task_boxes[idx][k][7:] anno_box[new_idx] = torch.cat([ center - torch.tensor([x, y], device=device), z.unsqueeze(0), box_dim, torch.sin(rot).unsqueeze(0), torch.cos(rot).unsqueeze(0), vx.unsqueeze(0), vy.unsqueeze(0) ]) else: anno_box[new_idx] = torch.cat([ center - torch.tensor([x, y], device=device), z.unsqueeze(0), box_dim, torch.sin(rot).unsqueeze(0), torch.cos(rot).unsqueeze(0) ]) heatmaps.append(heatmap) anno_boxes.append(anno_box) masks.append(mask) inds.append(ind) return heatmaps, anno_boxes, inds, masks def loss(self, gt_bboxes_3d, gt_labels_3d, preds_dicts, **kwargs): """Loss function for CenterHead. Args: gt_bboxes_3d (list[:obj:`LiDARInstance3DBoxes`]): Ground truth gt boxes. gt_labels_3d (list[torch.Tensor]): Labels of boxes. preds_dicts (dict): Output of forward function. Returns: dict[str:torch.Tensor]: Loss of heatmap and bbox of each task. 
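        Example:
            The targets consumed by this loss are built in
            ``get_targets_single`` above: each ground-truth box splats a 2D
            Gaussian, with a radius derived from the box footprint, onto the
            heatmap of its class. A self-contained sketch of such a splat
            (an illustration of the idea, not the exact
            ``draw_heatmap_gaussian`` implementation):

            .. code-block:: python

                import torch

                def splat_gaussian(heatmap, center, radius):
                    # (2r+1, 2r+1) Gaussian kernel centred on ``center``
                    d = 2 * radius + 1
                    coords = torch.arange(d, dtype=torch.float32) - radius
                    g = torch.exp(-(coords.view(-1, 1) ** 2 + coords.view(1, -1) ** 2)
                                  / (2 * (d / 6.0) ** 2))
                    x, y = center
                    # no boundary handling here; the real helper clips at the edges
                    region = heatmap[y - radius:y + radius + 1,
                                     x - radius:x + radius + 1]
                    heatmap[y - radius:y + radius + 1,
                            x - radius:x + radius + 1] = torch.maximum(region, g)
                    return heatmap

                hm = splat_gaussian(torch.zeros(16, 16), center=(8, 8), radius=2)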
""" heatmaps, anno_boxes, inds, masks = self.get_targets( gt_bboxes_3d, gt_labels_3d) loss_dict = dict() if not self.task_specific: loss_dict['loss'] = 0 for task_id, preds_dict in enumerate(preds_dicts): # heatmap focal loss preds_dict[0]['heatmap'] = clip_sigmoid(preds_dict[0]['heatmap']) num_pos = heatmaps[task_id].eq(1).float().sum().item() cls_avg_factor = torch.clamp( reduce_mean(heatmaps[task_id].new_tensor(num_pos)), min=1).item() loss_heatmap = self.loss_cls( preds_dict[0]['heatmap'], heatmaps[task_id], avg_factor=cls_avg_factor) target_box = anno_boxes[task_id] # reconstruct the anno_box from multiple reg heads preds_dict[0]['anno_box'] = torch.cat( ( preds_dict[0]['reg'], preds_dict[0]['height'], preds_dict[0]['dim'], preds_dict[0]['rot'], preds_dict[0]['vel'], ), dim=1, ) # Regression loss for dimension, offset, height, rotation num = masks[task_id].float().sum() ind = inds[task_id] pred = preds_dict[0]['anno_box'].permute(0, 2, 3, 1).contiguous() pred = pred.view(pred.size(0), -1, pred.size(3)) pred = self._gather_feat(pred, ind) mask = masks[task_id].unsqueeze(2).expand_as(target_box).float() num = torch.clamp( reduce_mean(target_box.new_tensor(num)), min=1e-4).item() isnotnan = (~torch.isnan(target_box)).float() mask *= isnotnan code_weights = self.train_cfg['code_weights'] bbox_weights = mask * mask.new_tensor(code_weights) if self.task_specific: name_list = ['xy', 'z', 'whl', 'yaw', 'vel'] clip_index = [0, 2, 3, 6, 8, 10] for reg_task_id in range(len(name_list)): pred_tmp = pred[ ..., clip_index[reg_task_id]:clip_index[reg_task_id + 1]] target_box_tmp = target_box[ ..., clip_index[reg_task_id]:clip_index[reg_task_id + 1]] bbox_weights_tmp = bbox_weights[ ..., clip_index[reg_task_id]:clip_index[reg_task_id + 1]] loss_bbox_tmp = self.loss_bbox( pred_tmp, target_box_tmp, bbox_weights_tmp, avg_factor=(num + 1e-4)) loss_dict[f'task{task_id}.loss_%s' % (name_list[reg_task_id])] = loss_bbox_tmp loss_dict[f'task{task_id}.loss_heatmap'] = loss_heatmap else: loss_bbox = self.loss_bbox( pred, target_box, bbox_weights, avg_factor=num) loss_dict['loss'] += loss_bbox loss_dict['loss'] += loss_heatmap return loss_dict def get_bboxes(self, preds_dicts, img_metas, img=None, rescale=False): """Generate bboxes from bbox head predictions. Args: preds_dicts (tuple[list[dict]]): Prediction results. img_metas (list[dict]): Point cloud and image's meta info. Returns: list[dict]: Decoded bbox, scores and labels after nms. 
""" rets = [] for task_id, preds_dict in enumerate(preds_dicts): batch_size = preds_dict[0]['heatmap'].shape[0] batch_heatmap = preds_dict[0]['heatmap'].sigmoid() batch_reg = preds_dict[0]['reg'] batch_hei = preds_dict[0]['height'] if self.norm_bbox: batch_dim = torch.exp(preds_dict[0]['dim']) else: batch_dim = preds_dict[0]['dim'] batch_rots = preds_dict[0]['rot'][:, 0].unsqueeze(1) batch_rotc = preds_dict[0]['rot'][:, 1].unsqueeze(1) if 'vel' in preds_dict[0]: batch_vel = preds_dict[0]['vel'] else: batch_vel = None temp = self.bbox_coder.decode( batch_heatmap, batch_rots, batch_rotc, batch_hei, batch_dim, batch_vel, reg=batch_reg, task_id=task_id) batch_reg_preds = [box['bboxes'] for box in temp] batch_cls_preds = [box['scores'] for box in temp] batch_cls_labels = [box['labels'] for box in temp] nms_type = self.test_cfg.get('nms_type') if isinstance(nms_type, list): nms_type = nms_type[task_id] if nms_type == 'circle': ret_task = [] for i in range(batch_size): boxes3d = temp[i]['bboxes'] scores = temp[i]['scores'] labels = temp[i]['labels'] centers = boxes3d[:, [0, 1]] boxes = torch.cat([centers, scores.view(-1, 1)], dim=1) keep = torch.tensor( circle_nms( boxes.detach().cpu().numpy(), self.test_cfg['min_radius'][task_id], post_max_size=self.test_cfg['post_max_size']), dtype=torch.long, device=boxes.device) boxes3d = boxes3d[keep] scores = scores[keep] labels = labels[keep] ret = dict(bboxes=boxes3d, scores=scores, labels=labels) ret_task.append(ret) rets.append(ret_task) else: rets.append( self.get_task_detections(batch_cls_preds, batch_reg_preds, batch_cls_labels, img_metas, task_id)) # Merge branches results num_samples = len(rets[0]) ret_list = [] for i in range(num_samples): for k in rets[0][i].keys(): if k == 'bboxes': bboxes = torch.cat([ret[i][k] for ret in rets]) bboxes[:, 2] = bboxes[:, 2] - bboxes[:, 5] * 0.5 bboxes = img_metas[i]['box_type_3d']( bboxes, self.bbox_coder.code_size) elif k == 'scores': scores = torch.cat([ret[i][k] for ret in rets]) elif k == 'labels': flag = 0 for j, num_class in enumerate(self.num_classes): rets[j][i][k] += flag flag += num_class labels = torch.cat([ret[i][k].int() for ret in rets]) ret_list.append([bboxes, scores, labels]) return ret_list def get_task_detections(self, batch_cls_preds, batch_reg_preds, batch_cls_labels, img_metas, task_id): """Rotate nms for each task. Args: batch_cls_preds (list[torch.Tensor]): Prediction score with the shape of [N]. batch_reg_preds (list[torch.Tensor]): Prediction bbox with the shape of [N, 9]. batch_cls_labels (list[torch.Tensor]): Prediction label with the shape of [N]. img_metas (list[dict]): Meta information of each sample. Returns: list[dict[str: torch.Tensor]]: contains the following keys: -bboxes (torch.Tensor): Prediction bboxes after nms with the shape of [N, 9]. -scores (torch.Tensor): Prediction scores after nms with the shape of [N]. -labels (torch.Tensor): Prediction labels after nms with the shape of [N]. 
""" predictions_dicts = [] for i, (box_preds, cls_preds, cls_labels) in enumerate( zip(batch_reg_preds, batch_cls_preds, batch_cls_labels)): default_val = [1.0 for _ in range(len(self.task_heads))] factor = self.test_cfg.get('nms_rescale_factor', default_val)[task_id] if isinstance(factor, list): for cid in range(len(factor)): box_preds[cls_labels == cid, 3:6] = \ box_preds[cls_labels == cid, 3:6] * factor[cid] else: box_preds[:, 3:6] = box_preds[:, 3:6] * factor # Apply NMS in birdeye view top_labels = cls_labels.long() top_scores = cls_preds.squeeze(-1) if cls_preds.shape[0]>1 \ else cls_preds if top_scores.shape[0] != 0: boxes_for_nms = img_metas[i]['box_type_3d']( box_preds[:, :], self.bbox_coder.code_size).bev # the nms in 3d detection just remove overlap boxes. if isinstance(self.test_cfg['nms_thr'], list): nms_thresh = self.test_cfg['nms_thr'][task_id] else: nms_thresh = self.test_cfg['nms_thr'] selected = nms_bev( boxes_for_nms, top_scores, thresh=nms_thresh, pre_max_size=self.test_cfg['pre_max_size'], post_max_size=self.test_cfg['post_max_size'], xyxyr2xywhr=False) else: selected = [] if isinstance(factor, list): for cid in range(len(factor)): box_preds[top_labels == cid, 3:6] = \ box_preds[top_labels == cid, 3:6] / factor[cid] else: box_preds[:, 3:6] = box_preds[:, 3:6] / factor # if selected is not None: selected_boxes = box_preds[selected] selected_labels = top_labels[selected] selected_scores = top_scores[selected] # finally generate predictions. if selected_boxes.shape[0] != 0: predictions_dict = dict( bboxes=selected_boxes, scores=selected_scores, labels=selected_labels) else: dtype = batch_reg_preds[0].dtype device = batch_reg_preds[0].device predictions_dict = dict( bboxes=torch.zeros([0, self.bbox_coder.code_size], dtype=dtype, device=device), scores=torch.zeros([0], dtype=dtype, device=device), labels=torch.zeros([0], dtype=top_labels.dtype, device=device)) predictions_dicts.append(predictions_dict) return predictions_dicts ================================================ FILE: mmdet3d/models/dense_heads/fcaf3d_head.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. # Adapted from https://github.com/SamsungLabs/fcaf3d/blob/master/mmdet3d/models/dense_heads/fcaf3d_neck_with_head.py # noqa try: import MinkowskiEngine as ME except ImportError: # Please follow getting_started.md to install MinkowskiEngine. pass import torch from mmcv.cnn import Scale, bias_init_with_prob from mmcv.ops import nms3d, nms3d_normal from mmcv.runner.base_module import BaseModule from torch import nn from mmdet3d.core.bbox.structures import rotation_3d_in_axis from mmdet3d.models import HEADS, build_loss from mmdet.core import reduce_mean @HEADS.register_module() class FCAF3DHead(BaseModule): r"""Bbox head of `FCAF3D `_. Actually here we store both the sparse 3D FPN and a head. The neck and the head can not be simply separated as pruning score on the i-th level of FPN requires classification scores from i+1-th level of the head. Args: n_classes (int): Number of classes. in_channels (tuple[int]): Number of channels in input tensors. out_channels (int): Number of channels in the neck output tensors. n_reg_outs (int): Number of regression layer channels. voxel_size (float): Voxel size in meters. pts_prune_threshold (int): Pruning threshold on each feature level. pts_assign_threshold (int): Box to location assigner parameter. Assigner selects the maximum feature level with more locations inside the box than pts_assign_threshold. 
pts_center_threshold (int): Box to location assigner parameter. After feature level for the box is determined, assigner selects pts_center_threshold locations closest to the box center. center_loss (dict, optional): Config of centerness loss. bbox_loss (dict, optional): Config of bbox loss. cls_loss (dict, optional): Config of classification loss. train_cfg (dict, optional): Config for train stage. Defaults to None. test_cfg (dict, optional): Config for test stage. Defaults to None. init_cfg (dict, optional): Config for weight initialization. Defaults to None. """ def __init__(self, n_classes, in_channels, out_channels, n_reg_outs, voxel_size, pts_prune_threshold, pts_assign_threshold, pts_center_threshold, center_loss=dict(type='CrossEntropyLoss', use_sigmoid=True), bbox_loss=dict(type='AxisAlignedIoULoss'), cls_loss=dict(type='FocalLoss'), train_cfg=None, test_cfg=None, init_cfg=None): super(FCAF3DHead, self).__init__(init_cfg) self.voxel_size = voxel_size self.pts_prune_threshold = pts_prune_threshold self.pts_assign_threshold = pts_assign_threshold self.pts_center_threshold = pts_center_threshold self.center_loss = build_loss(center_loss) self.bbox_loss = build_loss(bbox_loss) self.cls_loss = build_loss(cls_loss) self.train_cfg = train_cfg self.test_cfg = test_cfg self._init_layers(in_channels, out_channels, n_reg_outs, n_classes) @staticmethod def _make_block(in_channels, out_channels): """Construct Conv-Norm-Act block. Args: in_channels (int): Number of input channels. out_channels (int): Number of output channels. Returns: torch.nn.Module: With corresponding layers. """ return nn.Sequential( ME.MinkowskiConvolution( in_channels, out_channels, kernel_size=3, dimension=3), ME.MinkowskiBatchNorm(out_channels), ME.MinkowskiELU()) @staticmethod def _make_up_block(in_channels, out_channels): """Construct DeConv-Norm-Act-Conv-Norm-Act block. Args: in_channels (int): Number of input channels. out_channels (int): Number of output channels. Returns: torch.nn.Module: With corresponding layers. """ return nn.Sequential( ME.MinkowskiGenerativeConvolutionTranspose( in_channels, out_channels, kernel_size=2, stride=2, dimension=3), ME.MinkowskiBatchNorm(out_channels), ME.MinkowskiELU(), ME.MinkowskiConvolution( out_channels, out_channels, kernel_size=3, dimension=3), ME.MinkowskiBatchNorm(out_channels), ME.MinkowskiELU()) def _init_layers(self, in_channels, out_channels, n_reg_outs, n_classes): """Initialize layers. Args: in_channels (tuple[int]): Number of channels in input tensors. out_channels (int): Number of channels in the neck output tensors. n_reg_outs (int): Number of regression layer channels. n_classes (int): Number of classes. """ # neck layers self.pruning = ME.MinkowskiPruning() for i in range(len(in_channels)): if i > 0: self.__setattr__( f'up_block_{i}', self._make_up_block(in_channels[i], in_channels[i - 1])) self.__setattr__(f'out_block_{i}', self._make_block(in_channels[i], out_channels)) # head layers self.conv_center = ME.MinkowskiConvolution( out_channels, 1, kernel_size=1, dimension=3) self.conv_reg = ME.MinkowskiConvolution( out_channels, n_reg_outs, kernel_size=1, dimension=3) self.conv_cls = ME.MinkowskiConvolution( out_channels, n_classes, kernel_size=1, bias=True, dimension=3) self.scales = nn.ModuleList( [Scale(1.) 
for _ in range(len(in_channels))]) def init_weights(self): """Initialize weights.""" nn.init.normal_(self.conv_center.kernel, std=.01) nn.init.normal_(self.conv_reg.kernel, std=.01) nn.init.normal_(self.conv_cls.kernel, std=.01) nn.init.constant_(self.conv_cls.bias, bias_init_with_prob(.01)) def forward(self, x): """Forward pass. Args: x (list[Tensor]): Features from the backbone. Returns: list[list[Tensor]]: Predictions of the head. """ center_preds, bbox_preds, cls_preds, points = [], [], [], [] inputs = x x = inputs[-1] prune_score = None for i in range(len(inputs) - 1, -1, -1): if i < len(inputs) - 1: x = self.__getattr__(f'up_block_{i + 1}')(x) x = inputs[i] + x x = self._prune(x, prune_score) out = self.__getattr__(f'out_block_{i}')(x) center_pred, bbox_pred, cls_pred, point, prune_score = \ self._forward_single(out, self.scales[i]) center_preds.append(center_pred) bbox_preds.append(bbox_pred) cls_preds.append(cls_pred) points.append(point) return center_preds[::-1], bbox_preds[::-1], cls_preds[::-1], \ points[::-1] def forward_train(self, x, gt_bboxes, gt_labels, input_metas): """Forward pass of the train stage. Args: x (list[SparseTensor]): Features from the backbone. gt_bboxes (list[:obj:`BaseInstance3DBoxes`]): Ground truth bboxes of each sample. gt_labels(list[torch.Tensor]): Labels of each sample. input_metas (list[dict]): Contains scene meta info for each sample. Returns: dict: Centerness, bbox and classification loss values. """ center_preds, bbox_preds, cls_preds, points = self(x) return self._loss(center_preds, bbox_preds, cls_preds, points, gt_bboxes, gt_labels, input_metas) def forward_test(self, x, input_metas): """Forward pass of the test stage. Args: x (list[SparseTensor]): Features from the backbone. input_metas (list[dict]): Contains scene meta info for each sample. Returns: list[list[Tensor]]: bboxes, scores and labels for each sample. """ center_preds, bbox_preds, cls_preds, points = self(x) return self._get_bboxes(center_preds, bbox_preds, cls_preds, points, input_metas) def _prune(self, x, scores): """Prunes the tensor by score thresholding. Args: x (SparseTensor): Tensor to be pruned. scores (SparseTensor): Scores for thresholding. Returns: SparseTensor: Pruned tensor. """ with torch.no_grad(): coordinates = x.C.float() interpolated_scores = scores.features_at_coordinates(coordinates) prune_mask = interpolated_scores.new_zeros( (len(interpolated_scores)), dtype=torch.bool) for permutation in x.decomposition_permutations: score = interpolated_scores[permutation] mask = score.new_zeros((len(score)), dtype=torch.bool) topk = min(len(score), self.pts_prune_threshold) ids = torch.topk(score.squeeze(1), topk, sorted=False).indices mask[ids] = True prune_mask[permutation[mask]] = True x = self.pruning(x, prune_mask) return x def _forward_single(self, x, scale): """Forward pass per level. Args: x (SparseTensor): Per level neck output tensor. scale (mmcv.cnn.Scale): Per level multiplication weight. Returns: tuple[Tensor]: Per level head predictions. 
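        Example:
            A sketch of how the regression output is assembled per level:
            the six face distances are passed through a learnable per-level
            ``Scale`` and exponentiated so they stay positive, while the two
            angle channels are left untouched.

            .. code-block:: python

                import torch

                reg_final = torch.randn(100, 8)   # 6 distances + 2 angle terms
                scale = 1.0                       # stands in for mmcv.cnn.Scale
                reg_distance = torch.exp(scale * reg_final[:, :6])
                reg_angle = reg_final[:, 6:]
                bbox_pred = torch.cat((reg_distance, reg_angle), dim=1)  # (100, 8)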
""" center_pred = self.conv_center(x).features scores = self.conv_cls(x) cls_pred = scores.features prune_scores = ME.SparseTensor( scores.features.max(dim=1, keepdim=True).values, coordinate_map_key=scores.coordinate_map_key, coordinate_manager=scores.coordinate_manager) reg_final = self.conv_reg(x).features reg_distance = torch.exp(scale(reg_final[:, :6])) reg_angle = reg_final[:, 6:] bbox_pred = torch.cat((reg_distance, reg_angle), dim=1) center_preds, bbox_preds, cls_preds, points = [], [], [], [] for permutation in x.decomposition_permutations: center_preds.append(center_pred[permutation]) bbox_preds.append(bbox_pred[permutation]) cls_preds.append(cls_pred[permutation]) points = x.decomposed_coordinates for i in range(len(points)): points[i] = points[i] * self.voxel_size return center_preds, bbox_preds, cls_preds, points, prune_scores def _loss_single(self, center_preds, bbox_preds, cls_preds, points, gt_bboxes, gt_labels, input_meta): """Per scene loss function. Args: center_preds (list[Tensor]): Centerness predictions for all levels. bbox_preds (list[Tensor]): Bbox predictions for all levels. cls_preds (list[Tensor]): Classification predictions for all levels. points (list[Tensor]): Final location coordinates for all levels. gt_bboxes (BaseInstance3DBoxes): Ground truth boxes. gt_labels (Tensor): Ground truth labels. input_meta (dict): Scene meta info. Returns: tuple[Tensor]: Centerness, bbox, and classification loss values. """ center_targets, bbox_targets, cls_targets = self._get_targets( points, gt_bboxes, gt_labels) center_preds = torch.cat(center_preds) bbox_preds = torch.cat(bbox_preds) cls_preds = torch.cat(cls_preds) points = torch.cat(points) # cls loss pos_inds = torch.nonzero(cls_targets >= 0).squeeze(1) n_pos = points.new_tensor(len(pos_inds)) n_pos = max(reduce_mean(n_pos), 1.) cls_loss = self.cls_loss(cls_preds, cls_targets, avg_factor=n_pos) # bbox and centerness losses pos_center_preds = center_preds[pos_inds] pos_bbox_preds = bbox_preds[pos_inds] pos_center_targets = center_targets[pos_inds].unsqueeze(1) pos_bbox_targets = bbox_targets[pos_inds] # reduce_mean is outside if / else block to prevent deadlock center_denorm = max( reduce_mean(pos_center_targets.sum().detach()), 1e-6) if len(pos_inds) > 0: pos_points = points[pos_inds] center_loss = self.center_loss( pos_center_preds, pos_center_targets, avg_factor=n_pos) bbox_loss = self.bbox_loss( self._bbox_to_loss( self._bbox_pred_to_bbox(pos_points, pos_bbox_preds)), self._bbox_to_loss(pos_bbox_targets), weight=pos_center_targets.squeeze(1), avg_factor=center_denorm) else: center_loss = pos_center_preds.sum() bbox_loss = pos_bbox_preds.sum() return center_loss, bbox_loss, cls_loss def _loss(self, center_preds, bbox_preds, cls_preds, points, gt_bboxes, gt_labels, input_metas): """Per scene loss function. Args: center_preds (list[list[Tensor]]): Centerness predictions for all scenes. bbox_preds (list[list[Tensor]]): Bbox predictions for all scenes. cls_preds (list[list[Tensor]]): Classification predictions for all scenes. points (list[list[Tensor]]): Final location coordinates for all scenes. gt_bboxes (list[BaseInstance3DBoxes]): Ground truth boxes for all scenes. gt_labels (list[Tensor]): Ground truth labels for all scenes. input_metas (list[dict]): Meta infos for all scenes. Returns: dict: Centerness, bbox, and classification loss values. 
""" center_losses, bbox_losses, cls_losses = [], [], [] for i in range(len(input_metas)): center_loss, bbox_loss, cls_loss = self._loss_single( center_preds=[x[i] for x in center_preds], bbox_preds=[x[i] for x in bbox_preds], cls_preds=[x[i] for x in cls_preds], points=[x[i] for x in points], input_meta=input_metas[i], gt_bboxes=gt_bboxes[i], gt_labels=gt_labels[i]) center_losses.append(center_loss) bbox_losses.append(bbox_loss) cls_losses.append(cls_loss) return dict( center_loss=torch.mean(torch.stack(center_losses)), bbox_loss=torch.mean(torch.stack(bbox_losses)), cls_loss=torch.mean(torch.stack(cls_losses))) def _get_bboxes_single(self, center_preds, bbox_preds, cls_preds, points, input_meta): """Generate boxes for a single scene. Args: center_preds (list[Tensor]): Centerness predictions for all levels. bbox_preds (list[Tensor]): Bbox predictions for all levels. cls_preds (list[Tensor]): Classification predictions for all levels. points (list[Tensor]): Final location coordinates for all levels. input_meta (dict): Scene meta info. Returns: tuple[Tensor]: Predicted bounding boxes, scores and labels. """ mlvl_bboxes, mlvl_scores = [], [] for center_pred, bbox_pred, cls_pred, point in zip( center_preds, bbox_preds, cls_preds, points): scores = cls_pred.sigmoid() * center_pred.sigmoid() max_scores, _ = scores.max(dim=1) if len(scores) > self.test_cfg.nms_pre > 0: _, ids = max_scores.topk(self.test_cfg.nms_pre) bbox_pred = bbox_pred[ids] scores = scores[ids] point = point[ids] bboxes = self._bbox_pred_to_bbox(point, bbox_pred) mlvl_bboxes.append(bboxes) mlvl_scores.append(scores) bboxes = torch.cat(mlvl_bboxes) scores = torch.cat(mlvl_scores) bboxes, scores, labels = self._single_scene_multiclass_nms( bboxes, scores, input_meta) return bboxes, scores, labels def _get_bboxes(self, center_preds, bbox_preds, cls_preds, points, input_metas): """Generate boxes for all scenes. Args: center_preds (list[list[Tensor]]): Centerness predictions for all scenes. bbox_preds (list[list[Tensor]]): Bbox predictions for all scenes. cls_preds (list[list[Tensor]]): Classification predictions for all scenes. points (list[list[Tensor]]): Final location coordinates for all scenes. input_metas (list[dict]): Meta infos for all scenes. Returns: list[tuple[Tensor]]: Predicted bboxes, scores, and labels for all scenes. """ results = [] for i in range(len(input_metas)): result = self._get_bboxes_single( center_preds=[x[i] for x in center_preds], bbox_preds=[x[i] for x in bbox_preds], cls_preds=[x[i] for x in cls_preds], points=[x[i] for x in points], input_meta=input_metas[i]) results.append(result) return results @staticmethod def _bbox_to_loss(bbox): """Transform box to the axis-aligned or rotated iou loss format. Args: bbox (Tensor): 3D box of shape (N, 6) or (N, 7). Returns: Tensor: Transformed 3D box of shape (N, 6) or (N, 7). """ # rotated iou loss accepts (x, y, z, w, h, l, heading) if bbox.shape[-1] != 6: return bbox # axis-aligned case: x, y, z, w, h, l -> x1, y1, z1, x2, y2, z2 return torch.stack( (bbox[..., 0] - bbox[..., 3] / 2, bbox[..., 1] - bbox[..., 4] / 2, bbox[..., 2] - bbox[..., 5] / 2, bbox[..., 0] + bbox[..., 3] / 2, bbox[..., 1] + bbox[..., 4] / 2, bbox[..., 2] + bbox[..., 5] / 2), dim=-1) @staticmethod def _bbox_pred_to_bbox(points, bbox_pred): """Transform predicted bbox parameters to bbox. Args: points (Tensor): Final locations of shape (N, 3) bbox_pred (Tensor): Predicted bbox parameters of shape (N, 6) or (N, 8). Returns: Tensor: Transformed 3D box of shape (N, 6) or (N, 7). 
""" if bbox_pred.shape[0] == 0: return bbox_pred x_center = points[:, 0] + (bbox_pred[:, 1] - bbox_pred[:, 0]) / 2 y_center = points[:, 1] + (bbox_pred[:, 3] - bbox_pred[:, 2]) / 2 z_center = points[:, 2] + (bbox_pred[:, 5] - bbox_pred[:, 4]) / 2 # dx_min, dx_max, dy_min, dy_max, dz_min, dz_max -> x, y, z, w, l, h base_bbox = torch.stack([ x_center, y_center, z_center, bbox_pred[:, 0] + bbox_pred[:, 1], bbox_pred[:, 2] + bbox_pred[:, 3], bbox_pred[:, 4] + bbox_pred[:, 5], ], -1) # axis-aligned case if bbox_pred.shape[1] == 6: return base_bbox # rotated case: ..., sin(2a)ln(q), cos(2a)ln(q) scale = bbox_pred[:, 0] + bbox_pred[:, 1] + \ bbox_pred[:, 2] + bbox_pred[:, 3] q = torch.exp( torch.sqrt( torch.pow(bbox_pred[:, 6], 2) + torch.pow(bbox_pred[:, 7], 2))) alpha = 0.5 * torch.atan2(bbox_pred[:, 6], bbox_pred[:, 7]) return torch.stack( (x_center, y_center, z_center, scale / (1 + q), scale / (1 + q) * q, bbox_pred[:, 5] + bbox_pred[:, 4], alpha), dim=-1) @staticmethod def _get_face_distances(points, boxes): """Calculate distances from point to box faces. Args: points (Tensor): Final locations of shape (N_points, N_boxes, 3). boxes (Tensor): 3D boxes of shape (N_points, N_boxes, 7) Returns: Tensor: Face distances of shape (N_points, N_boxes, 6), (dx_min, dx_max, dy_min, dy_max, dz_min, dz_max). """ shift = torch.stack( (points[..., 0] - boxes[..., 0], points[..., 1] - boxes[..., 1], points[..., 2] - boxes[..., 2]), dim=-1).permute(1, 0, 2) shift = rotation_3d_in_axis( shift, -boxes[0, :, 6], axis=2).permute(1, 0, 2) centers = boxes[..., :3] + shift dx_min = centers[..., 0] - boxes[..., 0] + boxes[..., 3] / 2 dx_max = boxes[..., 0] + boxes[..., 3] / 2 - centers[..., 0] dy_min = centers[..., 1] - boxes[..., 1] + boxes[..., 4] / 2 dy_max = boxes[..., 1] + boxes[..., 4] / 2 - centers[..., 1] dz_min = centers[..., 2] - boxes[..., 2] + boxes[..., 5] / 2 dz_max = boxes[..., 2] + boxes[..., 5] / 2 - centers[..., 2] return torch.stack((dx_min, dx_max, dy_min, dy_max, dz_min, dz_max), dim=-1) @staticmethod def _get_centerness(face_distances): """Compute point centerness w.r.t containing box. Args: face_distances (Tensor): Face distances of shape (B, N, 6), (dx_min, dx_max, dy_min, dy_max, dz_min, dz_max). Returns: Tensor: Centerness of shape (B, N). """ x_dims = face_distances[..., [0, 1]] y_dims = face_distances[..., [2, 3]] z_dims = face_distances[..., [4, 5]] centerness_targets = x_dims.min(dim=-1)[0] / x_dims.max(dim=-1)[0] * \ y_dims.min(dim=-1)[0] / y_dims.max(dim=-1)[0] * \ z_dims.min(dim=-1)[0] / z_dims.max(dim=-1)[0] return torch.sqrt(centerness_targets) @torch.no_grad() def _get_targets(self, points, gt_bboxes, gt_labels): """Compute targets for final locations for a single scene. Args: points (list[Tensor]): Final locations for all levels. gt_bboxes (BaseInstance3DBoxes): Ground truth boxes. gt_labels (Tensor): Ground truth labels. Returns: tuple[Tensor]: Centerness, bbox and classification targets for all locations. 
""" float_max = points[0].new_tensor(1e8) n_levels = len(points) levels = torch.cat([ points[i].new_tensor(i).expand(len(points[i])) for i in range(len(points)) ]) points = torch.cat(points) gt_bboxes = gt_bboxes.to(points.device) n_points = len(points) n_boxes = len(gt_bboxes) volumes = gt_bboxes.volume.unsqueeze(0).expand(n_points, n_boxes) # condition 1: point inside box boxes = torch.cat((gt_bboxes.gravity_center, gt_bboxes.tensor[:, 3:]), dim=1) boxes = boxes.expand(n_points, n_boxes, 7) points = points.unsqueeze(1).expand(n_points, n_boxes, 3) face_distances = self._get_face_distances(points, boxes) inside_box_condition = face_distances.min(dim=-1).values > 0 # condition 2: positive points per level >= limit # calculate positive points per scale n_pos_points_per_level = [] for i in range(n_levels): n_pos_points_per_level.append( torch.sum(inside_box_condition[levels == i], dim=0)) # find best level n_pos_points_per_level = torch.stack(n_pos_points_per_level, dim=0) lower_limit_mask = n_pos_points_per_level < self.pts_assign_threshold lower_index = torch.argmax(lower_limit_mask.int(), dim=0) - 1 lower_index = torch.where(lower_index < 0, 0, lower_index) all_upper_limit_mask = torch.all( torch.logical_not(lower_limit_mask), dim=0) best_level = torch.where(all_upper_limit_mask, n_levels - 1, lower_index) # keep only points with best level best_level = best_level.expand(n_points, n_boxes) levels = torch.unsqueeze(levels, 1).expand(n_points, n_boxes) level_condition = best_level == levels # condition 3: limit topk points per box by centerness centerness = self._get_centerness(face_distances) centerness = torch.where(inside_box_condition, centerness, torch.ones_like(centerness) * -1) centerness = torch.where(level_condition, centerness, torch.ones_like(centerness) * -1) top_centerness = torch.topk( centerness, min(self.pts_center_threshold + 1, len(centerness)), dim=0).values[-1] topk_condition = centerness > top_centerness.unsqueeze(0) # condition 4: min volume box per point volumes = torch.where(inside_box_condition, volumes, float_max) volumes = torch.where(level_condition, volumes, float_max) volumes = torch.where(topk_condition, volumes, float_max) min_volumes, min_inds = volumes.min(dim=1) center_targets = centerness[torch.arange(n_points), min_inds] bbox_targets = boxes[torch.arange(n_points), min_inds] if not gt_bboxes.with_yaw: bbox_targets = bbox_targets[:, :-1] cls_targets = gt_labels[min_inds] cls_targets = torch.where(min_volumes == float_max, -1, cls_targets) return center_targets, bbox_targets, cls_targets def _single_scene_multiclass_nms(self, bboxes, scores, input_meta): """Multi-class nms for a single scene. Args: bboxes (Tensor): Predicted boxes of shape (N_boxes, 6) or (N_boxes, 7). scores (Tensor): Predicted scores of shape (N_boxes, N_classes). input_meta (dict): Scene meta data. Returns: tuple[Tensor]: Predicted bboxes, scores and labels. 
""" n_classes = scores.shape[1] with_yaw = bboxes.shape[1] == 7 nms_bboxes, nms_scores, nms_labels = [], [], [] for i in range(n_classes): ids = scores[:, i] > self.test_cfg.score_thr if not ids.any(): continue class_scores = scores[ids, i] class_bboxes = bboxes[ids] if with_yaw: nms_function = nms3d else: class_bboxes = torch.cat( (class_bboxes, torch.zeros_like(class_bboxes[:, :1])), dim=1) nms_function = nms3d_normal nms_ids = nms_function(class_bboxes, class_scores, self.test_cfg.iou_thr) nms_bboxes.append(class_bboxes[nms_ids]) nms_scores.append(class_scores[nms_ids]) nms_labels.append( bboxes.new_full( class_scores[nms_ids].shape, i, dtype=torch.long)) if len(nms_bboxes): nms_bboxes = torch.cat(nms_bboxes, dim=0) nms_scores = torch.cat(nms_scores, dim=0) nms_labels = torch.cat(nms_labels, dim=0) else: nms_bboxes = bboxes.new_zeros((0, bboxes.shape[1])) nms_scores = bboxes.new_zeros((0, )) nms_labels = bboxes.new_zeros((0, )) if with_yaw: box_dim = 7 else: box_dim = 6 nms_bboxes = nms_bboxes[:, :6] nms_bboxes = input_meta['box_type_3d']( nms_bboxes, box_dim=box_dim, with_yaw=with_yaw, origin=(.5, .5, .5)) return nms_bboxes, nms_scores, nms_labels ================================================ FILE: mmdet3d/models/dense_heads/fcos_mono3d_head.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. from logging import warning import numpy as np import torch from mmcv.cnn import Scale, normal_init from mmcv.runner import force_fp32 from torch import nn as nn from mmdet3d.core import (box3d_multiclass_nms, limit_period, points_img2cam, xywhr2xyxyr) from mmdet.core import multi_apply from mmdet.core.bbox.builder import build_bbox_coder from ..builder import HEADS, build_loss from .anchor_free_mono3d_head import AnchorFreeMono3DHead INF = 1e8 @HEADS.register_module() class FCOSMono3DHead(AnchorFreeMono3DHead): """Anchor-free head used in FCOS3D. Args: num_classes (int): Number of categories excluding the background category. in_channels (int): Number of channels in the input feature map. regress_ranges (tuple[tuple[int, int]], optional): Regress range of multiple level points. center_sampling (bool, optional): If true, use center sampling. Default: True. center_sample_radius (float, optional): Radius of center sampling. Default: 1.5. norm_on_bbox (bool, optional): If true, normalize the regression targets with FPN strides. Default: True. centerness_on_reg (bool, optional): If true, position centerness on the regress branch. Please refer to https://github.com/tianzhi0549/FCOS/issues/89#issuecomment-516877042. Default: True. centerness_alpha (int, optional): Parameter used to adjust the intensity attenuation from the center to the periphery. Default: 2.5. loss_cls (dict, optional): Config of classification loss. loss_bbox (dict, optional): Config of localization loss. loss_dir (dict, optional): Config of direction classification loss. loss_attr (dict, optional): Config of attribute classification loss. loss_centerness (dict, optional): Config of centerness loss. norm_cfg (dict, optional): dictionary to construct and config norm layer. Default: norm_cfg=dict(type='GN', num_groups=32, requires_grad=True). centerness_branch (tuple[int], optional): Channels for centerness branch. Default: (64, ). 
""" # noqa: E501 def __init__(self, regress_ranges=((-1, 48), (48, 96), (96, 192), (192, 384), (384, INF)), center_sampling=True, center_sample_radius=1.5, norm_on_bbox=True, centerness_on_reg=True, centerness_alpha=2.5, loss_cls=dict( type='FocalLoss', use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=1.0), loss_bbox=dict( type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0), loss_dir=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), loss_attr=dict( type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), loss_centerness=dict( type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), bbox_coder=dict(type='FCOS3DBBoxCoder', code_size=9), norm_cfg=dict(type='GN', num_groups=32, requires_grad=True), centerness_branch=(64, ), init_cfg=None, **kwargs): self.regress_ranges = regress_ranges self.center_sampling = center_sampling self.center_sample_radius = center_sample_radius self.norm_on_bbox = norm_on_bbox self.centerness_on_reg = centerness_on_reg self.centerness_alpha = centerness_alpha self.centerness_branch = centerness_branch super().__init__( loss_cls=loss_cls, loss_bbox=loss_bbox, loss_dir=loss_dir, loss_attr=loss_attr, norm_cfg=norm_cfg, init_cfg=init_cfg, **kwargs) self.loss_centerness = build_loss(loss_centerness) bbox_coder['code_size'] = self.bbox_code_size self.bbox_coder = build_bbox_coder(bbox_coder) def _init_layers(self): """Initialize layers of the head.""" super()._init_layers() self.conv_centerness_prev = self._init_branch( conv_channels=self.centerness_branch, conv_strides=(1, ) * len(self.centerness_branch)) self.conv_centerness = nn.Conv2d(self.centerness_branch[-1], 1, 1) self.scale_dim = 3 # only for offset, depth and size regression self.scales = nn.ModuleList([ nn.ModuleList([Scale(1.0) for _ in range(self.scale_dim)]) for _ in self.strides ]) def init_weights(self): """Initialize weights of the head. We currently still use the customized init_weights because the default init of DCN triggered by the init_cfg will init conv_offset.weight, which mistakenly affects the training stability. """ super().init_weights() for m in self.conv_centerness_prev: if isinstance(m.conv, nn.Conv2d): normal_init(m.conv, std=0.01) normal_init(self.conv_centerness, std=0.01) def forward(self, feats): """Forward features from the upstream network. Args: feats (tuple[Tensor]): Features from the upstream network, each is a 4D-tensor. Returns: tuple: cls_scores (list[Tensor]): Box scores for each scale level, each is a 4D-tensor, the channel number is num_points * num_classes. bbox_preds (list[Tensor]): Box energies / deltas for each scale level, each is a 4D-tensor, the channel number is num_points * bbox_code_size. dir_cls_preds (list[Tensor]): Box scores for direction class predictions on each scale level, each is a 4D-tensor, the channel number is num_points * 2. (bin = 2). attr_preds (list[Tensor]): Attribute scores for each scale level, each is a 4D-tensor, the channel number is num_points * num_attrs. centernesses (list[Tensor]): Centerness for each scale level, each is a 4D-tensor, the channel number is num_points * 1. """ # Note: we use [:5] to filter feats and only return predictions return multi_apply(self.forward_single, feats, self.scales, self.strides)[:5] def forward_single(self, x, scale, stride): """Forward features of a single scale level. Args: x (Tensor): FPN feature maps of the specified stride. scale (:obj: `mmcv.cnn.Scale`): Learnable scale module to resize the bbox prediction. 
stride (int): The corresponding stride for feature maps, only used to normalize the bbox prediction when self.norm_on_bbox is True. Returns: tuple: scores for each class, bbox and direction class predictions, centerness predictions of input feature maps. """ cls_score, bbox_pred, dir_cls_pred, attr_pred, cls_feat, reg_feat = \ super().forward_single(x) if self.centerness_on_reg: clone_reg_feat = reg_feat.clone() for conv_centerness_prev_layer in self.conv_centerness_prev: clone_reg_feat = conv_centerness_prev_layer(clone_reg_feat) centerness = self.conv_centerness(clone_reg_feat) else: clone_cls_feat = cls_feat.clone() for conv_centerness_prev_layer in self.conv_centerness_prev: clone_cls_feat = conv_centerness_prev_layer(clone_cls_feat) centerness = self.conv_centerness(clone_cls_feat) bbox_pred = self.bbox_coder.decode(bbox_pred, scale, stride, self.training, cls_score) return cls_score, bbox_pred, dir_cls_pred, attr_pred, centerness, \ cls_feat, reg_feat @staticmethod def add_sin_difference(boxes1, boxes2): """Convert the rotation difference to difference in sine function. Args: boxes1 (torch.Tensor): Original Boxes in shape (NxC), where C>=7 and the 7th dimension is rotation dimension. boxes2 (torch.Tensor): Target boxes in shape (NxC), where C>=7 and the 7th dimension is rotation dimension. Returns: tuple[torch.Tensor]: ``boxes1`` and ``boxes2`` whose 7th dimensions are changed. """ rad_pred_encoding = torch.sin(boxes1[..., 6:7]) * torch.cos( boxes2[..., 6:7]) rad_tg_encoding = torch.cos(boxes1[..., 6:7]) * torch.sin(boxes2[..., 6:7]) boxes1 = torch.cat( [boxes1[..., :6], rad_pred_encoding, boxes1[..., 7:]], dim=-1) boxes2 = torch.cat([boxes2[..., :6], rad_tg_encoding, boxes2[..., 7:]], dim=-1) return boxes1, boxes2 @staticmethod def get_direction_target(reg_targets, dir_offset=0, dir_limit_offset=0.0, num_bins=2, one_hot=True): """Encode direction to 0 ~ num_bins-1. Args: reg_targets (torch.Tensor): Bbox regression targets. dir_offset (int, optional): Direction offset. Default to 0. dir_limit_offset (float, optional): Offset to set the direction range. Default to 0.0. num_bins (int, optional): Number of bins to divide 2*PI. Default to 2. one_hot (bool, optional): Whether to encode as one hot. Default to True. Returns: torch.Tensor: Encoded direction targets. """ rot_gt = reg_targets[..., 6] offset_rot = limit_period(rot_gt - dir_offset, dir_limit_offset, 2 * np.pi) dir_cls_targets = torch.floor(offset_rot / (2 * np.pi / num_bins)).long() dir_cls_targets = torch.clamp(dir_cls_targets, min=0, max=num_bins - 1) if one_hot: dir_targets = torch.zeros( *list(dir_cls_targets.shape), num_bins, dtype=reg_targets.dtype, device=dir_cls_targets.device) dir_targets.scatter_(dir_cls_targets.unsqueeze(dim=-1).long(), 1.0) dir_cls_targets = dir_targets return dir_cls_targets @force_fp32( apply_to=('cls_scores', 'bbox_preds', 'dir_cls_preds', 'attr_preds', 'centernesses')) def loss(self, cls_scores, bbox_preds, dir_cls_preds, attr_preds, centernesses, gt_bboxes, gt_labels, gt_bboxes_3d, gt_labels_3d, centers2d, depths, attr_labels, img_metas, gt_bboxes_ignore=None): """Compute loss of the head. Args: cls_scores (list[Tensor]): Box scores for each scale level, each is a 4D-tensor, the channel number is num_points * num_classes. bbox_preds (list[Tensor]): Box energies / deltas for each scale level, each is a 4D-tensor, the channel number is num_points * bbox_code_size. 
dir_cls_preds (list[Tensor]): Box scores for direction class predictions on each scale level, each is a 4D-tensor, the channel number is num_points * 2. (bin = 2) attr_preds (list[Tensor]): Attribute scores for each scale level, each is a 4D-tensor, the channel number is num_points * num_attrs. centernesses (list[Tensor]): Centerness for each scale level, each is a 4D-tensor, the channel number is num_points * 1. gt_bboxes (list[Tensor]): Ground truth bboxes for each image with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. gt_labels (list[Tensor]): class indices corresponding to each box gt_bboxes_3d (list[Tensor]): 3D boxes ground truth with shape of (num_gts, code_size). gt_labels_3d (list[Tensor]): same as gt_labels centers2d (list[Tensor]): 2D centers on the image with shape of (num_gts, 2). depths (list[Tensor]): Depth ground truth with shape of (num_gts, ). attr_labels (list[Tensor]): Attributes indices of each box. img_metas (list[dict]): Meta information of each image, e.g., image size, scaling factor, etc. gt_bboxes_ignore (list[Tensor]): specify which bounding boxes can be ignored when computing the loss. Returns: dict[str, Tensor]: A dictionary of loss components. """ assert len(cls_scores) == len(bbox_preds) == len(centernesses) == len( attr_preds) featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores] all_level_points = self.get_points(featmap_sizes, bbox_preds[0].dtype, bbox_preds[0].device) labels_3d, bbox_targets_3d, centerness_targets, attr_targets = \ self.get_targets( all_level_points, gt_bboxes, gt_labels, gt_bboxes_3d, gt_labels_3d, centers2d, depths, attr_labels) num_imgs = cls_scores[0].size(0) # flatten cls_scores, bbox_preds, dir_cls_preds and centerness flatten_cls_scores = [ cls_score.permute(0, 2, 3, 1).reshape(-1, self.cls_out_channels) for cls_score in cls_scores ] flatten_bbox_preds = [ bbox_pred.permute(0, 2, 3, 1).reshape(-1, sum(self.group_reg_dims)) for bbox_pred in bbox_preds ] flatten_dir_cls_preds = [ dir_cls_pred.permute(0, 2, 3, 1).reshape(-1, 2) for dir_cls_pred in dir_cls_preds ] flatten_centerness = [ centerness.permute(0, 2, 3, 1).reshape(-1) for centerness in centernesses ] flatten_cls_scores = torch.cat(flatten_cls_scores) flatten_bbox_preds = torch.cat(flatten_bbox_preds) flatten_dir_cls_preds = torch.cat(flatten_dir_cls_preds) flatten_centerness = torch.cat(flatten_centerness) flatten_labels_3d = torch.cat(labels_3d) flatten_bbox_targets_3d = torch.cat(bbox_targets_3d) flatten_centerness_targets = torch.cat(centerness_targets) # FG cat_id: [0, num_classes -1], BG cat_id: num_classes bg_class_ind = self.num_classes pos_inds = ((flatten_labels_3d >= 0) & (flatten_labels_3d < bg_class_ind)).nonzero().reshape(-1) num_pos = len(pos_inds) loss_cls = self.loss_cls( flatten_cls_scores, flatten_labels_3d, avg_factor=num_pos + num_imgs) # avoid num_pos is 0 pos_bbox_preds = flatten_bbox_preds[pos_inds] pos_dir_cls_preds = flatten_dir_cls_preds[pos_inds] pos_centerness = flatten_centerness[pos_inds] if self.pred_attrs: flatten_attr_preds = [ attr_pred.permute(0, 2, 3, 1).reshape(-1, self.num_attrs) for attr_pred in attr_preds ] flatten_attr_preds = torch.cat(flatten_attr_preds) flatten_attr_targets = torch.cat(attr_targets) pos_attr_preds = flatten_attr_preds[pos_inds] if num_pos > 0: pos_bbox_targets_3d = flatten_bbox_targets_3d[pos_inds] pos_centerness_targets = flatten_centerness_targets[pos_inds] if self.pred_attrs: pos_attr_targets = flatten_attr_targets[pos_inds] bbox_weights = pos_centerness_targets.new_ones( 
len(pos_centerness_targets), sum(self.group_reg_dims)) equal_weights = pos_centerness_targets.new_ones( pos_centerness_targets.shape) code_weight = self.train_cfg.get('code_weight', None) if code_weight: assert len(code_weight) == sum(self.group_reg_dims) bbox_weights = bbox_weights * bbox_weights.new_tensor( code_weight) if self.use_direction_classifier: pos_dir_cls_targets = self.get_direction_target( pos_bbox_targets_3d, self.dir_offset, self.dir_limit_offset, one_hot=False) if self.diff_rad_by_sin: pos_bbox_preds, pos_bbox_targets_3d = self.add_sin_difference( pos_bbox_preds, pos_bbox_targets_3d) loss_offset = self.loss_bbox( pos_bbox_preds[:, :2], pos_bbox_targets_3d[:, :2], weight=bbox_weights[:, :2], avg_factor=equal_weights.sum()) loss_depth = self.loss_bbox( pos_bbox_preds[:, 2], pos_bbox_targets_3d[:, 2], weight=bbox_weights[:, 2], avg_factor=equal_weights.sum()) loss_size = self.loss_bbox( pos_bbox_preds[:, 3:6], pos_bbox_targets_3d[:, 3:6], weight=bbox_weights[:, 3:6], avg_factor=equal_weights.sum()) loss_rotsin = self.loss_bbox( pos_bbox_preds[:, 6], pos_bbox_targets_3d[:, 6], weight=bbox_weights[:, 6], avg_factor=equal_weights.sum()) loss_velo = None if self.pred_velo: loss_velo = self.loss_bbox( pos_bbox_preds[:, 7:9], pos_bbox_targets_3d[:, 7:9], weight=bbox_weights[:, 7:9], avg_factor=equal_weights.sum()) loss_centerness = self.loss_centerness(pos_centerness, pos_centerness_targets) # direction classification loss loss_dir = None # TODO: add more check for use_direction_classifier if self.use_direction_classifier: loss_dir = self.loss_dir( pos_dir_cls_preds, pos_dir_cls_targets, equal_weights, avg_factor=equal_weights.sum()) # attribute classification loss loss_attr = None if self.pred_attrs: loss_attr = self.loss_attr( pos_attr_preds, pos_attr_targets, pos_centerness_targets, avg_factor=pos_centerness_targets.sum()) else: # need absolute due to possible negative delta x/y loss_offset = pos_bbox_preds[:, :2].sum() loss_depth = pos_bbox_preds[:, 2].sum() loss_size = pos_bbox_preds[:, 3:6].sum() loss_rotsin = pos_bbox_preds[:, 6].sum() loss_velo = None if self.pred_velo: loss_velo = pos_bbox_preds[:, 7:9].sum() loss_centerness = pos_centerness.sum() loss_dir = None if self.use_direction_classifier: loss_dir = pos_dir_cls_preds.sum() loss_attr = None if self.pred_attrs: loss_attr = pos_attr_preds.sum() loss_dict = dict( loss_cls=loss_cls, loss_offset=loss_offset, loss_depth=loss_depth, loss_size=loss_size, loss_rotsin=loss_rotsin, loss_centerness=loss_centerness) if loss_velo is not None: loss_dict['loss_velo'] = loss_velo if loss_dir is not None: loss_dict['loss_dir'] = loss_dir if loss_attr is not None: loss_dict['loss_attr'] = loss_attr return loss_dict @force_fp32( apply_to=('cls_scores', 'bbox_preds', 'dir_cls_preds', 'attr_preds', 'centernesses')) def get_bboxes(self, cls_scores, bbox_preds, dir_cls_preds, attr_preds, centernesses, img_metas, cfg=None, rescale=None): """Transform network output for a batch into bbox predictions. Args: cls_scores (list[Tensor]): Box scores for each scale level Has shape (N, num_points * num_classes, H, W) bbox_preds (list[Tensor]): Box energies / deltas for each scale level with shape (N, num_points * 4, H, W) dir_cls_preds (list[Tensor]): Box scores for direction class predictions on each scale level, each is a 4D-tensor, the channel number is num_points * 2. 
(bin = 2) attr_preds (list[Tensor]): Attribute scores for each scale level Has shape (N, num_points * num_attrs, H, W) centernesses (list[Tensor]): Centerness for each scale level with shape (N, num_points * 1, H, W) img_metas (list[dict]): Meta information of each image, e.g., image size, scaling factor, etc. cfg (mmcv.Config): Test / postprocessing configuration, if None, test_cfg would be used rescale (bool): If True, return boxes in original image space Returns: list[tuple[Tensor, Tensor]]: Each item in result_list is 2-tuple. The first item is an (n, 5) tensor, where the first 4 columns are bounding box positions (tl_x, tl_y, br_x, br_y) and the 5-th column is a score between 0 and 1. The second item is a (n,) tensor where each item is the predicted class label of the corresponding box. """ assert len(cls_scores) == len(bbox_preds) == len(dir_cls_preds) == \ len(centernesses) == len(attr_preds) num_levels = len(cls_scores) featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores] mlvl_points = self.get_points(featmap_sizes, bbox_preds[0].dtype, bbox_preds[0].device) result_list = [] for img_id in range(len(img_metas)): cls_score_list = [ cls_scores[i][img_id].detach() for i in range(num_levels) ] bbox_pred_list = [ bbox_preds[i][img_id].detach() for i in range(num_levels) ] if self.use_direction_classifier: dir_cls_pred_list = [ dir_cls_preds[i][img_id].detach() for i in range(num_levels) ] else: dir_cls_pred_list = [ cls_scores[i][img_id].new_full( [2, *cls_scores[i][img_id].shape[1:]], 0).detach() for i in range(num_levels) ] if self.pred_attrs: attr_pred_list = [ attr_preds[i][img_id].detach() for i in range(num_levels) ] else: attr_pred_list = [ cls_scores[i][img_id].new_full( [self.num_attrs, *cls_scores[i][img_id].shape[1:]], self.attr_background_label).detach() for i in range(num_levels) ] centerness_pred_list = [ centernesses[i][img_id].detach() for i in range(num_levels) ] input_meta = img_metas[img_id] det_bboxes = self._get_bboxes_single( cls_score_list, bbox_pred_list, dir_cls_pred_list, attr_pred_list, centerness_pred_list, mlvl_points, input_meta, cfg, rescale) result_list.append(det_bboxes) return result_list def _get_bboxes_single(self, cls_scores, bbox_preds, dir_cls_preds, attr_preds, centernesses, mlvl_points, input_meta, cfg, rescale=False): """Transform outputs for a single batch item into bbox predictions. Args: cls_scores (list[Tensor]): Box scores for a single scale level Has shape (num_points * num_classes, H, W). bbox_preds (list[Tensor]): Box energies / deltas for a single scale level with shape (num_points * bbox_code_size, H, W). dir_cls_preds (list[Tensor]): Box scores for direction class predictions on a single scale level with shape (num_points * 2, H, W) attr_preds (list[Tensor]): Attribute scores for each scale level Has shape (N, num_points * num_attrs, H, W) centernesses (list[Tensor]): Centerness for a single scale level with shape (num_points, H, W). mlvl_points (list[Tensor]): Box reference for a single scale level with shape (num_total_points, 2). input_meta (dict): Metadata of input image. cfg (mmcv.Config): Test / postprocessing configuration, if None, test_cfg would be used. rescale (bool): If True, return boxes in original image space. Returns: tuples[Tensor]: Predicted 3D boxes, scores, labels and attributes. 
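Note:
    For ranking, the sigmoid class scores are multiplied by the sigmoid
    centerness, only the top ``nms_pre`` locations per level are kept, the
    predicted 2D offsets are converted back to centers and lifted to 3D
    with ``points_img2cam``, and ``box3d_multiclass_nms`` yields the final
    boxes, scores, labels and (optionally) attributes.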
""" view = np.array(input_meta['cam2img']) scale_factor = input_meta['scale_factor'] cfg = self.test_cfg if cfg is None else cfg assert len(cls_scores) == len(bbox_preds) == len(mlvl_points) mlvl_centers2d = [] mlvl_bboxes = [] mlvl_scores = [] mlvl_dir_scores = [] mlvl_attr_scores = [] mlvl_centerness = [] for cls_score, bbox_pred, dir_cls_pred, attr_pred, centerness, \ points in zip(cls_scores, bbox_preds, dir_cls_preds, attr_preds, centernesses, mlvl_points): assert cls_score.size()[-2:] == bbox_pred.size()[-2:] scores = cls_score.permute(1, 2, 0).reshape( -1, self.cls_out_channels).sigmoid() dir_cls_pred = dir_cls_pred.permute(1, 2, 0).reshape(-1, 2) dir_cls_score = torch.max(dir_cls_pred, dim=-1)[1] attr_pred = attr_pred.permute(1, 2, 0).reshape(-1, self.num_attrs) attr_score = torch.max(attr_pred, dim=-1)[1] centerness = centerness.permute(1, 2, 0).reshape(-1).sigmoid() bbox_pred = bbox_pred.permute(1, 2, 0).reshape(-1, sum(self.group_reg_dims)) bbox_pred = bbox_pred[:, :self.bbox_code_size] nms_pre = cfg.get('nms_pre', -1) if nms_pre > 0 and scores.shape[0] > nms_pre: max_scores, _ = (scores * centerness[:, None]).max(dim=1) _, topk_inds = max_scores.topk(nms_pre) points = points[topk_inds, :] bbox_pred = bbox_pred[topk_inds, :] scores = scores[topk_inds, :] dir_cls_pred = dir_cls_pred[topk_inds, :] centerness = centerness[topk_inds] dir_cls_score = dir_cls_score[topk_inds] attr_score = attr_score[topk_inds] # change the offset to actual center predictions bbox_pred[:, :2] = points - bbox_pred[:, :2] if rescale: bbox_pred[:, :2] /= bbox_pred[:, :2].new_tensor(scale_factor) pred_center2d = bbox_pred[:, :3].clone() bbox_pred[:, :3] = points_img2cam(bbox_pred[:, :3], view) mlvl_centers2d.append(pred_center2d) mlvl_bboxes.append(bbox_pred) mlvl_scores.append(scores) mlvl_dir_scores.append(dir_cls_score) mlvl_attr_scores.append(attr_score) mlvl_centerness.append(centerness) mlvl_centers2d = torch.cat(mlvl_centers2d) mlvl_bboxes = torch.cat(mlvl_bboxes) mlvl_dir_scores = torch.cat(mlvl_dir_scores) # change local yaw to global yaw for 3D nms cam2img = mlvl_centers2d.new_zeros((4, 4)) cam2img[:view.shape[0], :view.shape[1]] = \ mlvl_centers2d.new_tensor(view) mlvl_bboxes = self.bbox_coder.decode_yaw(mlvl_bboxes, mlvl_centers2d, mlvl_dir_scores, self.dir_offset, cam2img) mlvl_bboxes_for_nms = xywhr2xyxyr(input_meta['box_type_3d']( mlvl_bboxes, box_dim=self.bbox_code_size, origin=(0.5, 0.5, 0.5)).bev) mlvl_scores = torch.cat(mlvl_scores) padding = mlvl_scores.new_zeros(mlvl_scores.shape[0], 1) # remind that we set FG labels to [0, num_class-1] since mmdet v2.0 # BG cat_id: num_class mlvl_scores = torch.cat([mlvl_scores, padding], dim=1) mlvl_attr_scores = torch.cat(mlvl_attr_scores) mlvl_centerness = torch.cat(mlvl_centerness) # no scale_factors in box3d_multiclass_nms # Then we multiply it from outside mlvl_nms_scores = mlvl_scores * mlvl_centerness[:, None] results = box3d_multiclass_nms(mlvl_bboxes, mlvl_bboxes_for_nms, mlvl_nms_scores, cfg.score_thr, cfg.max_per_img, cfg, mlvl_dir_scores, mlvl_attr_scores) bboxes, scores, labels, dir_scores, attrs = results attrs = attrs.to(labels.dtype) # change data type to int bboxes = input_meta['box_type_3d']( bboxes, box_dim=self.bbox_code_size, origin=(0.5, 0.5, 0.5)) # Note that the predictions use origin (0.5, 0.5, 0.5) # Due to the ground truth centers2d are the gravity center of objects # v0.10.0 fix inplace operation to the input tensor of cam_box3d # So here we also need to add origin=(0.5, 0.5, 0.5) if not self.pred_attrs: attrs = None 
return bboxes, scores, labels, attrs @staticmethod def pts2Dto3D(points, view): """ Args: points (torch.Tensor): points in 2D images, [N, 3], 3 corresponds with x, y in the image and depth. view (np.ndarray): camera intrinsic, [3, 3] Returns: torch.Tensor: points in 3D space. [N, 3], 3 corresponds with x, y, z in 3D space. """ warning.warn('DeprecationWarning: This static method has been moved ' 'out of this class to mmdet3d/core. The function ' 'pts2Dto3D will be deprecated.') assert view.shape[0] <= 4 assert view.shape[1] <= 4 assert points.shape[1] == 3 points2D = points[:, :2] depths = points[:, 2].view(-1, 1) unnorm_points2D = torch.cat([points2D * depths, depths], dim=1) viewpad = torch.eye(4, dtype=points2D.dtype, device=points2D.device) viewpad[:view.shape[0], :view.shape[1]] = points2D.new_tensor(view) inv_viewpad = torch.inverse(viewpad).transpose(0, 1) # Do operation in homogeneous coordinates. nbr_points = unnorm_points2D.shape[0] homo_points2D = torch.cat( [unnorm_points2D, points2D.new_ones((nbr_points, 1))], dim=1) points3D = torch.mm(homo_points2D, inv_viewpad)[:, :3] return points3D def _get_points_single(self, featmap_size, stride, dtype, device, flatten=False): """Get points according to feature map sizes.""" y, x = super()._get_points_single(featmap_size, stride, dtype, device) points = torch.stack((x.reshape(-1) * stride, y.reshape(-1) * stride), dim=-1) + stride // 2 return points def get_targets(self, points, gt_bboxes_list, gt_labels_list, gt_bboxes_3d_list, gt_labels_3d_list, centers2d_list, depths_list, attr_labels_list): """Compute regression, classification and centerss targets for points in multiple images. Args: points (list[Tensor]): Points of each fpn level, each has shape (num_points, 2). gt_bboxes_list (list[Tensor]): Ground truth bboxes of each image, each has shape (num_gt, 4). gt_labels_list (list[Tensor]): Ground truth labels of each box, each has shape (num_gt,). gt_bboxes_3d_list (list[Tensor]): 3D Ground truth bboxes of each image, each has shape (num_gt, bbox_code_size). gt_labels_3d_list (list[Tensor]): 3D Ground truth labels of each box, each has shape (num_gt,). centers2d_list (list[Tensor]): Projected 3D centers onto 2D image, each has shape (num_gt, 2). depths_list (list[Tensor]): Depth of projected 3D centers onto 2D image, each has shape (num_gt, 1). attr_labels_list (list[Tensor]): Attribute labels of each box, each has shape (num_gt,). Returns: tuple: concat_lvl_labels (list[Tensor]): Labels of each level. concat_lvl_bbox_targets (list[Tensor]): BBox targets of each level. 
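Note:
    As implemented below, four per-level lists are actually returned: 3D
    labels, 3D bbox targets (with the 2D offset part divided by the level
    stride when ``norm_on_bbox`` is True), centerness targets and
    attribute targets.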
""" assert len(points) == len(self.regress_ranges) num_levels = len(points) # expand regress ranges to align with points expanded_regress_ranges = [ points[i].new_tensor(self.regress_ranges[i])[None].expand_as( points[i]) for i in range(num_levels) ] # concat all levels points and regress ranges concat_regress_ranges = torch.cat(expanded_regress_ranges, dim=0) concat_points = torch.cat(points, dim=0) # the number of points per img, per lvl num_points = [center.size(0) for center in points] if attr_labels_list is None: attr_labels_list = [ gt_labels.new_full(gt_labels.shape, self.attr_background_label) for gt_labels in gt_labels_list ] # get labels and bbox_targets of each image _, _, labels_3d_list, bbox_targets_3d_list, centerness_targets_list, \ attr_targets_list = multi_apply( self._get_target_single, gt_bboxes_list, gt_labels_list, gt_bboxes_3d_list, gt_labels_3d_list, centers2d_list, depths_list, attr_labels_list, points=concat_points, regress_ranges=concat_regress_ranges, num_points_per_lvl=num_points) # split to per img, per level labels_3d_list = [ labels_3d.split(num_points, 0) for labels_3d in labels_3d_list ] bbox_targets_3d_list = [ bbox_targets_3d.split(num_points, 0) for bbox_targets_3d in bbox_targets_3d_list ] centerness_targets_list = [ centerness_targets.split(num_points, 0) for centerness_targets in centerness_targets_list ] attr_targets_list = [ attr_targets.split(num_points, 0) for attr_targets in attr_targets_list ] # concat per level image concat_lvl_labels_3d = [] concat_lvl_bbox_targets_3d = [] concat_lvl_centerness_targets = [] concat_lvl_attr_targets = [] for i in range(num_levels): concat_lvl_labels_3d.append( torch.cat([labels[i] for labels in labels_3d_list])) concat_lvl_centerness_targets.append( torch.cat([ centerness_targets[i] for centerness_targets in centerness_targets_list ])) bbox_targets_3d = torch.cat([ bbox_targets_3d[i] for bbox_targets_3d in bbox_targets_3d_list ]) concat_lvl_attr_targets.append( torch.cat( [attr_targets[i] for attr_targets in attr_targets_list])) if self.norm_on_bbox: bbox_targets_3d[:, : 2] = bbox_targets_3d[:, :2] / self.strides[i] concat_lvl_bbox_targets_3d.append(bbox_targets_3d) return concat_lvl_labels_3d, concat_lvl_bbox_targets_3d, \ concat_lvl_centerness_targets, concat_lvl_attr_targets def _get_target_single(self, gt_bboxes, gt_labels, gt_bboxes_3d, gt_labels_3d, centers2d, depths, attr_labels, points, regress_ranges, num_points_per_lvl): """Compute regression and classification targets for a single image.""" num_points = points.size(0) num_gts = gt_labels.size(0) if not isinstance(gt_bboxes_3d, torch.Tensor): gt_bboxes_3d = gt_bboxes_3d.tensor.to(gt_bboxes.device) if num_gts == 0: return gt_labels.new_full((num_points,), self.background_label), \ gt_bboxes.new_zeros((num_points, 4)), \ gt_labels_3d.new_full( (num_points,), self.background_label), \ gt_bboxes_3d.new_zeros((num_points, self.bbox_code_size)), \ gt_bboxes_3d.new_zeros((num_points,)), \ attr_labels.new_full( (num_points,), self.attr_background_label) # change orientation to local yaw gt_bboxes_3d[..., 6] = -torch.atan2( gt_bboxes_3d[..., 0], gt_bboxes_3d[..., 2]) + gt_bboxes_3d[..., 6] areas = (gt_bboxes[:, 2] - gt_bboxes[:, 0]) * ( gt_bboxes[:, 3] - gt_bboxes[:, 1]) areas = areas[None].repeat(num_points, 1) regress_ranges = regress_ranges[:, None, :].expand( num_points, num_gts, 2) gt_bboxes = gt_bboxes[None].expand(num_points, num_gts, 4) centers2d = centers2d[None].expand(num_points, num_gts, 2) gt_bboxes_3d = gt_bboxes_3d[None].expand(num_points, 
num_gts, self.bbox_code_size) depths = depths[None, :, None].expand(num_points, num_gts, 1) xs, ys = points[:, 0], points[:, 1] xs = xs[:, None].expand(num_points, num_gts) ys = ys[:, None].expand(num_points, num_gts) delta_xs = (xs - centers2d[..., 0])[..., None] delta_ys = (ys - centers2d[..., 1])[..., None] bbox_targets_3d = torch.cat( (delta_xs, delta_ys, depths, gt_bboxes_3d[..., 3:]), dim=-1) left = xs - gt_bboxes[..., 0] right = gt_bboxes[..., 2] - xs top = ys - gt_bboxes[..., 1] bottom = gt_bboxes[..., 3] - ys bbox_targets = torch.stack((left, top, right, bottom), -1) assert self.center_sampling is True, 'Setting center_sampling to '\ 'False has not been implemented for FCOS3D.' # condition1: inside a `center bbox` radius = self.center_sample_radius center_xs = centers2d[..., 0] center_ys = centers2d[..., 1] center_gts = torch.zeros_like(gt_bboxes) stride = center_xs.new_zeros(center_xs.shape) # project the points on current lvl back to the `original` sizes lvl_begin = 0 for lvl_idx, num_points_lvl in enumerate(num_points_per_lvl): lvl_end = lvl_begin + num_points_lvl stride[lvl_begin:lvl_end] = self.strides[lvl_idx] * radius lvl_begin = lvl_end center_gts[..., 0] = center_xs - stride center_gts[..., 1] = center_ys - stride center_gts[..., 2] = center_xs + stride center_gts[..., 3] = center_ys + stride cb_dist_left = xs - center_gts[..., 0] cb_dist_right = center_gts[..., 2] - xs cb_dist_top = ys - center_gts[..., 1] cb_dist_bottom = center_gts[..., 3] - ys center_bbox = torch.stack( (cb_dist_left, cb_dist_top, cb_dist_right, cb_dist_bottom), -1) inside_gt_bbox_mask = center_bbox.min(-1)[0] > 0 # condition2: limit the regression range for each location max_regress_distance = bbox_targets.max(-1)[0] inside_regress_range = ( (max_regress_distance >= regress_ranges[..., 0]) & (max_regress_distance <= regress_ranges[..., 1])) # center-based criterion to deal with ambiguity dists = torch.sqrt(torch.sum(bbox_targets_3d[..., :2]**2, dim=-1)) dists[inside_gt_bbox_mask == 0] = INF dists[inside_regress_range == 0] = INF min_dist, min_dist_inds = dists.min(dim=1) labels = gt_labels[min_dist_inds] labels_3d = gt_labels_3d[min_dist_inds] attr_labels = attr_labels[min_dist_inds] labels[min_dist == INF] = self.background_label # set as BG labels_3d[min_dist == INF] = self.background_label # set as BG attr_labels[min_dist == INF] = self.attr_background_label bbox_targets = bbox_targets[range(num_points), min_dist_inds] bbox_targets_3d = bbox_targets_3d[range(num_points), min_dist_inds] relative_dists = torch.sqrt( torch.sum(bbox_targets_3d[..., :2]**2, dim=-1)) / (1.414 * stride[:, 0]) # [N, 1] / [N, 1] centerness_targets = torch.exp(-self.centerness_alpha * relative_dists) return labels, bbox_targets, labels_3d, bbox_targets_3d, \ centerness_targets, attr_labels ================================================ FILE: mmdet3d/models/dense_heads/free_anchor3d_head.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import torch from mmcv.runner import force_fp32 from torch.nn import functional as F from mmdet3d.core.bbox import bbox_overlaps_nearest_3d from ..builder import HEADS from .anchor3d_head import Anchor3DHead from .train_mixins import get_direction_target @HEADS.register_module() class FreeAnchor3DHead(Anchor3DHead): r"""`FreeAnchor `_ head for 3D detection. Note: This implementation is directly modified from the `mmdet implementation `_. 
We find it also works on 3D detection with minor modification, i.e., different hyper-parameters and a additional direction classifier. Args: pre_anchor_topk (int): Number of boxes that be token in each bag. bbox_thr (float): The threshold of the saturated linear function. It is usually the same with the IoU threshold used in NMS. gamma (float): Gamma parameter in focal loss. alpha (float): Alpha parameter in focal loss. kwargs (dict): Other arguments are the same as those in :class:`Anchor3DHead`. """ # noqa: E501 def __init__(self, pre_anchor_topk=50, bbox_thr=0.6, gamma=2.0, alpha=0.5, init_cfg=None, **kwargs): super().__init__(init_cfg=init_cfg, **kwargs) self.pre_anchor_topk = pre_anchor_topk self.bbox_thr = bbox_thr self.gamma = gamma self.alpha = alpha @force_fp32(apply_to=('cls_scores', 'bbox_preds', 'dir_cls_preds')) def loss(self, cls_scores, bbox_preds, dir_cls_preds, gt_bboxes, gt_labels, input_metas, gt_bboxes_ignore=None): """Calculate loss of FreeAnchor head. Args: cls_scores (list[torch.Tensor]): Classification scores of different samples. bbox_preds (list[torch.Tensor]): Box predictions of different samples dir_cls_preds (list[torch.Tensor]): Direction predictions of different samples gt_bboxes (list[:obj:`BaseInstance3DBoxes`]): Ground truth boxes. gt_labels (list[torch.Tensor]): Ground truth labels. input_metas (list[dict]): List of input meta information. gt_bboxes_ignore (list[:obj:`BaseInstance3DBoxes`], optional): Ground truth boxes that should be ignored. Defaults to None. Returns: dict[str, torch.Tensor]: Loss items. - positive_bag_loss (torch.Tensor): Loss of positive samples. - negative_bag_loss (torch.Tensor): Loss of negative samples. """ featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores] assert len(featmap_sizes) == self.anchor_generator.num_levels anchor_list = self.get_anchors(featmap_sizes, input_metas) anchors = [torch.cat(anchor) for anchor in anchor_list] # concatenate each level cls_scores = [ cls_score.permute(0, 2, 3, 1).reshape( cls_score.size(0), -1, self.num_classes) for cls_score in cls_scores ] bbox_preds = [ bbox_pred.permute(0, 2, 3, 1).reshape( bbox_pred.size(0), -1, self.box_code_size) for bbox_pred in bbox_preds ] dir_cls_preds = [ dir_cls_pred.permute(0, 2, 3, 1).reshape(dir_cls_pred.size(0), -1, 2) for dir_cls_pred in dir_cls_preds ] cls_scores = torch.cat(cls_scores, dim=1) bbox_preds = torch.cat(bbox_preds, dim=1) dir_cls_preds = torch.cat(dir_cls_preds, dim=1) cls_prob = torch.sigmoid(cls_scores) box_prob = [] num_pos = 0 positive_losses = [] for _, (anchors_, gt_labels_, gt_bboxes_, cls_prob_, bbox_preds_, dir_cls_preds_) in enumerate( zip(anchors, gt_labels, gt_bboxes, cls_prob, bbox_preds, dir_cls_preds)): gt_bboxes_ = gt_bboxes_.tensor.to(anchors_.device) with torch.no_grad(): # box_localization: a_{j}^{loc}, shape: [j, 4] pred_boxes = self.bbox_coder.decode(anchors_, bbox_preds_) # object_box_iou: IoU_{ij}^{loc}, shape: [i, j] object_box_iou = bbox_overlaps_nearest_3d( gt_bboxes_, pred_boxes) # object_box_prob: P{a_{j} -> b_{i}}, shape: [i, j] t1 = self.bbox_thr t2 = object_box_iou.max( dim=1, keepdim=True).values.clamp(min=t1 + 1e-6) object_box_prob = ((object_box_iou - t1) / (t2 - t1)).clamp( min=0, max=1) # object_cls_box_prob: P{a_{j} -> b_{i}}, shape: [i, c, j] num_obj = gt_labels_.size(0) indices = torch.stack( [torch.arange(num_obj).type_as(gt_labels_), gt_labels_], dim=0) object_cls_box_prob = torch.sparse_coo_tensor( indices, object_box_prob) # image_box_iou: P{a_{j} \in A_{+}}, shape: [c, j] """ from "start" 
to "end" implement: image_box_iou = torch.sparse.max(object_cls_box_prob, dim=0).t() """ # start box_cls_prob = torch.sparse.sum( object_cls_box_prob, dim=0).to_dense() indices = torch.nonzero(box_cls_prob, as_tuple=False).t_() if indices.numel() == 0: image_box_prob = torch.zeros( anchors_.size(0), self.num_classes).type_as(object_box_prob) else: nonzero_box_prob = torch.where( (gt_labels_.unsqueeze(dim=-1) == indices[0]), object_box_prob[:, indices[1]], torch.tensor( [0]).type_as(object_box_prob)).max(dim=0).values # upmap to shape [j, c] image_box_prob = torch.sparse_coo_tensor( indices.flip([0]), nonzero_box_prob, size=(anchors_.size(0), self.num_classes)).to_dense() # end box_prob.append(image_box_prob) # construct bags for objects match_quality_matrix = bbox_overlaps_nearest_3d( gt_bboxes_, anchors_) _, matched = torch.topk( match_quality_matrix, self.pre_anchor_topk, dim=1, sorted=False) del match_quality_matrix # matched_cls_prob: P_{ij}^{cls} matched_cls_prob = torch.gather( cls_prob_[matched], 2, gt_labels_.view(-1, 1, 1).repeat(1, self.pre_anchor_topk, 1)).squeeze(2) # matched_box_prob: P_{ij}^{loc} matched_anchors = anchors_[matched] matched_object_targets = self.bbox_coder.encode( matched_anchors, gt_bboxes_.unsqueeze(dim=1).expand_as(matched_anchors)) # direction classification loss loss_dir = None if self.use_direction_classifier: # also calculate direction prob: P_{ij}^{dir} matched_dir_targets = get_direction_target( matched_anchors, matched_object_targets, self.dir_offset, self.dir_limit_offset, one_hot=False) loss_dir = self.loss_dir( dir_cls_preds_[matched].transpose(-2, -1), matched_dir_targets, reduction_override='none') # generate bbox weights if self.diff_rad_by_sin: bbox_preds_[matched], matched_object_targets = \ self.add_sin_difference( bbox_preds_[matched], matched_object_targets) bbox_weights = matched_anchors.new_ones(matched_anchors.size()) # Use pop is not right, check performance code_weight = self.train_cfg.get('code_weight', None) if code_weight: bbox_weights = bbox_weights * bbox_weights.new_tensor( code_weight) loss_bbox = self.loss_bbox( bbox_preds_[matched], matched_object_targets, bbox_weights, reduction_override='none').sum(-1) if loss_dir is not None: loss_bbox += loss_dir matched_box_prob = torch.exp(-loss_bbox) # positive_losses: {-log( Mean-max(P_{ij}^{cls} * P_{ij}^{loc}) )} num_pos += len(gt_bboxes_) positive_losses.append( self.positive_bag_loss(matched_cls_prob, matched_box_prob)) positive_loss = torch.cat(positive_losses).sum() / max(1, num_pos) # box_prob: P{a_{j} \in A_{+}} box_prob = torch.stack(box_prob, dim=0) # negative_loss: # \sum_{j}{ FL((1 - P{a_{j} \in A_{+}}) * (1 - P_{j}^{bg})) } / n||B|| negative_loss = self.negative_bag_loss(cls_prob, box_prob).sum() / max( 1, num_pos * self.pre_anchor_topk) losses = { 'positive_bag_loss': positive_loss, 'negative_bag_loss': negative_loss } return losses def positive_bag_loss(self, matched_cls_prob, matched_box_prob): """Generate positive bag loss. Args: matched_cls_prob (torch.Tensor): Classification probability of matched positive samples. matched_box_prob (torch.Tensor): Bounding box probability of matched positive samples. Returns: torch.Tensor: Loss of positive samples. 
""" # bag_prob = Mean-max(matched_prob) matched_prob = matched_cls_prob * matched_box_prob weight = 1 / torch.clamp(1 - matched_prob, 1e-12, None) weight /= weight.sum(dim=1).unsqueeze(dim=-1) bag_prob = (weight * matched_prob).sum(dim=1) # positive_bag_loss = -self.alpha * log(bag_prob) bag_prob = bag_prob.clamp(0, 1) # to avoid bug of BCE, check return self.alpha * F.binary_cross_entropy( bag_prob, torch.ones_like(bag_prob), reduction='none') def negative_bag_loss(self, cls_prob, box_prob): """Generate negative bag loss. Args: cls_prob (torch.Tensor): Classification probability of negative samples. box_prob (torch.Tensor): Bounding box probability of negative samples. Returns: torch.Tensor: Loss of negative samples. """ prob = cls_prob * (1 - box_prob) prob = prob.clamp(0, 1) # to avoid bug of BCE, check negative_bag_loss = prob**self.gamma * F.binary_cross_entropy( prob, torch.zeros_like(prob), reduction='none') return (1 - self.alpha) * negative_bag_loss ================================================ FILE: mmdet3d/models/dense_heads/groupfree3d_head.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import copy import numpy as np import torch from mmcv import ConfigDict from mmcv.cnn import ConvModule, xavier_init from mmcv.cnn.bricks.transformer import (build_positional_encoding, build_transformer_layer) from mmcv.ops import PointsSampler as Points_Sampler from mmcv.ops import gather_points from mmcv.runner import BaseModule, force_fp32 from torch import nn as nn from torch.nn import functional as F from mmdet3d.core.post_processing import aligned_3d_nms from mmdet.core import build_bbox_coder, multi_apply from ..builder import HEADS, build_loss from .base_conv_bbox_head import BaseConvBboxHead EPS = 1e-6 class PointsObjClsModule(BaseModule): """object candidate point prediction from seed point features. Args: in_channel (int): number of channels of seed point features. num_convs (int, optional): number of conv layers. Default: 3. conv_cfg (dict, optional): Config of convolution. Default: dict(type='Conv1d'). norm_cfg (dict, optional): Config of normalization. Default: dict(type='BN1d'). act_cfg (dict, optional): Config of activation. Default: dict(type='ReLU'). """ def __init__(self, in_channel, num_convs=3, conv_cfg=dict(type='Conv1d'), norm_cfg=dict(type='BN1d'), act_cfg=dict(type='ReLU'), init_cfg=None): super().__init__(init_cfg=init_cfg) conv_channels = [in_channel for _ in range(num_convs - 1)] conv_channels.append(1) self.mlp = nn.Sequential() prev_channels = in_channel for i in range(num_convs): self.mlp.add_module( f'layer{i}', ConvModule( prev_channels, conv_channels[i], 1, padding=0, conv_cfg=conv_cfg, norm_cfg=norm_cfg if i < num_convs - 1 else None, act_cfg=act_cfg if i < num_convs - 1 else None, bias=True, inplace=True)) prev_channels = conv_channels[i] def forward(self, seed_features): """Forward pass. Args: seed_features (torch.Tensor): seed features, dims: (batch_size, feature_dim, num_seed) Returns: torch.Tensor: objectness logits, dim: (batch_size, 1, num_seed) """ return self.mlp(seed_features) class GeneralSamplingModule(nn.Module): """Sampling Points. Sampling points with given index. """ def forward(self, xyz, features, sample_inds): """Forward pass. Args: xyz: (B, N, 3) the coordinates of the features. features (Tensor): (B, C, N) features to sample. sample_inds (Tensor): (B, M) the given index, where M is the number of points. 
Returns: Tensor: (B, M, 3) coordinates of sampled features Tensor: (B, C, M) the sampled features. Tensor: (B, M) the given index. """ xyz_t = xyz.transpose(1, 2).contiguous() new_xyz = gather_points(xyz_t, sample_inds).transpose(1, 2).contiguous() new_features = gather_points(features, sample_inds).contiguous() return new_xyz, new_features, sample_inds @HEADS.register_module() class GroupFree3DHead(BaseModule): r"""Bbox head of `Group-Free 3D `_. Args: num_classes (int): The number of class. in_channels (int): The dims of input features from backbone. bbox_coder (:obj:`BaseBBoxCoder`): Bbox coder for encoding and decoding boxes. num_decoder_layers (int): The number of transformer decoder layers. transformerlayers (dict): Config for transformer decoder. train_cfg (dict): Config for training. test_cfg (dict): Config for testing. num_proposal (int): The number of initial sampling candidates. pred_layer_cfg (dict): Config of classfication and regression prediction layers. size_cls_agnostic (bool): Whether the predicted size is class-agnostic. gt_per_seed (int): the number of candidate instance each point belongs to. sampling_objectness_loss (dict): Config of initial sampling objectness loss. objectness_loss (dict): Config of objectness loss. center_loss (dict): Config of center loss. dir_class_loss (dict): Config of direction classification loss. dir_res_loss (dict): Config of direction residual regression loss. size_class_loss (dict): Config of size classification loss. size_res_loss (dict): Config of size residual regression loss. size_reg_loss (dict): Config of class-agnostic size regression loss. semantic_loss (dict): Config of point-wise semantic segmentation loss. """ def __init__(self, num_classes, in_channels, bbox_coder, num_decoder_layers, transformerlayers, decoder_self_posembeds=dict( type='ConvBNPositionalEncoding', input_channel=6, num_pos_feats=288), decoder_cross_posembeds=dict( type='ConvBNPositionalEncoding', input_channel=3, num_pos_feats=288), train_cfg=None, test_cfg=None, num_proposal=128, pred_layer_cfg=None, size_cls_agnostic=True, gt_per_seed=3, sampling_objectness_loss=None, objectness_loss=None, center_loss=None, dir_class_loss=None, dir_res_loss=None, size_class_loss=None, size_res_loss=None, size_reg_loss=None, semantic_loss=None, init_cfg=None): super(GroupFree3DHead, self).__init__(init_cfg=init_cfg) self.num_classes = num_classes self.train_cfg = train_cfg self.test_cfg = test_cfg self.num_proposal = num_proposal self.in_channels = in_channels self.num_decoder_layers = num_decoder_layers self.size_cls_agnostic = size_cls_agnostic self.gt_per_seed = gt_per_seed # Transformer decoder layers if isinstance(transformerlayers, ConfigDict): transformerlayers = [ copy.deepcopy(transformerlayers) for _ in range(num_decoder_layers) ] else: assert isinstance(transformerlayers, list) and \ len(transformerlayers) == num_decoder_layers self.decoder_layers = nn.ModuleList() for i in range(self.num_decoder_layers): self.decoder_layers.append( build_transformer_layer(transformerlayers[i])) self.embed_dims = self.decoder_layers[0].embed_dims assert self.embed_dims == decoder_self_posembeds['num_pos_feats'] assert self.embed_dims == decoder_cross_posembeds['num_pos_feats'] # bbox_coder self.bbox_coder = build_bbox_coder(bbox_coder) self.num_sizes = self.bbox_coder.num_sizes self.num_dir_bins = self.bbox_coder.num_dir_bins # Initial object candidate sampling self.gsample_module = GeneralSamplingModule() self.fps_module = Points_Sampler([self.num_proposal]) self.points_obj_cls = 
PointsObjClsModule(self.in_channels) self.fp16_enabled = False # initial candidate prediction self.conv_pred = BaseConvBboxHead( **pred_layer_cfg, num_cls_out_channels=self._get_cls_out_channels(), num_reg_out_channels=self._get_reg_out_channels()) # query proj and key proj self.decoder_query_proj = nn.Conv1d( self.embed_dims, self.embed_dims, kernel_size=1) self.decoder_key_proj = nn.Conv1d( self.embed_dims, self.embed_dims, kernel_size=1) # query position embed self.decoder_self_posembeds = nn.ModuleList() for _ in range(self.num_decoder_layers): self.decoder_self_posembeds.append( build_positional_encoding(decoder_self_posembeds)) # key position embed self.decoder_cross_posembeds = nn.ModuleList() for _ in range(self.num_decoder_layers): self.decoder_cross_posembeds.append( build_positional_encoding(decoder_cross_posembeds)) # Prediction Head self.prediction_heads = nn.ModuleList() for i in range(self.num_decoder_layers): self.prediction_heads.append( BaseConvBboxHead( **pred_layer_cfg, num_cls_out_channels=self._get_cls_out_channels(), num_reg_out_channels=self._get_reg_out_channels())) self.sampling_objectness_loss = build_loss(sampling_objectness_loss) self.objectness_loss = build_loss(objectness_loss) self.center_loss = build_loss(center_loss) self.dir_res_loss = build_loss(dir_res_loss) self.dir_class_loss = build_loss(dir_class_loss) self.semantic_loss = build_loss(semantic_loss) if self.size_cls_agnostic: self.size_reg_loss = build_loss(size_reg_loss) else: self.size_res_loss = build_loss(size_res_loss) self.size_class_loss = build_loss(size_class_loss) def init_weights(self): """Initialize weights of transformer decoder in GroupFree3DHead.""" # initialize transformer for m in self.decoder_layers.parameters(): if m.dim() > 1: xavier_init(m, distribution='uniform') for m in self.decoder_self_posembeds.parameters(): if m.dim() > 1: xavier_init(m, distribution='uniform') for m in self.decoder_cross_posembeds.parameters(): if m.dim() > 1: xavier_init(m, distribution='uniform') def _get_cls_out_channels(self): """Return the channel number of classification outputs.""" # Class numbers (k) + objectness (1) return self.num_classes + 1 def _get_reg_out_channels(self): """Return the channel number of regression outputs.""" # center residual (3), # heading class+residual (num_dir_bins*2), # size class+residual(num_sizes*4 or 3) if self.size_cls_agnostic: return 6 + self.num_dir_bins * 2 else: return 3 + self.num_dir_bins * 2 + self.num_sizes * 4 def _extract_input(self, feat_dict): """Extract inputs from features dictionary. Args: feat_dict (dict): Feature dict from backbone. Returns: torch.Tensor: Coordinates of input points. torch.Tensor: Features of input points. torch.Tensor: Indices of input points. """ seed_points = feat_dict['fp_xyz'][-1] seed_features = feat_dict['fp_features'][-1] seed_indices = feat_dict['fp_indices'][-1] return seed_points, seed_features, seed_indices def forward(self, feat_dict, sample_mod): """Forward pass. Note: The forward of GroupFree3DHead is divided into 2 steps: 1. Initial object candidates sampling. 2. Iterative object box prediction by transformer decoder. Args: feat_dict (dict): Feature dict from backbone. sample_mod (str): sample mode for initial candidates sampling. Returns: results (dict): Predictions of GroupFree3D head. """ assert sample_mod in ['fps', 'kps'] seed_xyz, seed_features, seed_indices = self._extract_input(feat_dict) results = dict( seed_points=seed_xyz, seed_features=seed_features, seed_indices=seed_indices) # 1. 
Initial object candidates sampling. if sample_mod == 'fps': sample_inds = self.fps_module(seed_xyz, seed_features) elif sample_mod == 'kps': points_obj_cls_logits = self.points_obj_cls( seed_features) # (batch_size, 1, num_seed) points_obj_cls_scores = points_obj_cls_logits.sigmoid().squeeze(1) sample_inds = torch.topk(points_obj_cls_scores, self.num_proposal)[1].int() results['seeds_obj_cls_logits'] = points_obj_cls_logits else: raise NotImplementedError( f'Sample mode {sample_mod} is not supported!') candidate_xyz, candidate_features, sample_inds = self.gsample_module( seed_xyz, seed_features, sample_inds) results['query_points_xyz'] = candidate_xyz # (B, M, 3) results['query_points_feature'] = candidate_features # (B, C, M) results['query_points_sample_inds'] = sample_inds.long() # (B, M) prefix = 'proposal.' cls_predictions, reg_predictions = self.conv_pred(candidate_features) decode_res = self.bbox_coder.split_pred(cls_predictions, reg_predictions, candidate_xyz, prefix) results.update(decode_res) bbox3d = self.bbox_coder.decode(results, prefix) # 2. Iterative object box prediction by transformer decoder. base_bbox3d = bbox3d[:, :, :6].detach().clone() query = self.decoder_query_proj(candidate_features).permute(2, 0, 1) key = self.decoder_key_proj(seed_features).permute(2, 0, 1) value = key # transformer decoder results['num_decoder_layers'] = 0 for i in range(self.num_decoder_layers): prefix = f's{i}.' query_pos = self.decoder_self_posembeds[i](base_bbox3d).permute( 2, 0, 1) key_pos = self.decoder_cross_posembeds[i](seed_xyz).permute( 2, 0, 1) query = self.decoder_layers[i]( query, key, value, query_pos=query_pos, key_pos=key_pos).permute(1, 2, 0) results[f'{prefix}query'] = query cls_predictions, reg_predictions = self.prediction_heads[i](query) decode_res = self.bbox_coder.split_pred(cls_predictions, reg_predictions, candidate_xyz, prefix) # TODO: should save bbox3d instead of decode_res? results.update(decode_res) bbox3d = self.bbox_coder.decode(results, prefix) results[f'{prefix}bbox3d'] = bbox3d base_bbox3d = bbox3d[:, :, :6].detach().clone() query = query.permute(2, 0, 1) results['num_decoder_layers'] += 1 return results @force_fp32(apply_to=('bbox_preds', )) def loss(self, bbox_preds, points, gt_bboxes_3d, gt_labels_3d, pts_semantic_mask=None, pts_instance_mask=None, img_metas=None, gt_bboxes_ignore=None, ret_target=False): """Compute loss. Args: bbox_preds (dict): Predictions from forward of vote head. points (list[torch.Tensor]): Input points. gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth bboxes of each sample. gt_labels_3d (list[torch.Tensor]): Labels of each sample. pts_semantic_mask (list[torch.Tensor]): Point-wise semantic mask. pts_instance_mask (list[torch.Tensor]): Point-wise instance mask. img_metas (list[dict]): Contain pcd and img's meta info. gt_bboxes_ignore (list[torch.Tensor]): Specify which bounding. ret_target (Bool): Return targets or not. Returns: dict: Losses of GroupFree3D. 
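Note:
    Besides a single ``sampling_objectness_loss``, the returned dict holds
    one set of losses per prediction stage: keys are prefixed with
    ``proposal.`` for the initial candidates and with ``s0.``, ``s1.``, ...
    for each transformer decoder layer, and every per-stage loss is
    divided by the number of stages.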
""" targets = self.get_targets(points, gt_bboxes_3d, gt_labels_3d, pts_semantic_mask, pts_instance_mask, bbox_preds) (sampling_targets, sampling_weights, assigned_size_targets, size_class_targets, size_res_targets, dir_class_targets, dir_res_targets, center_targets, assigned_center_targets, mask_targets, valid_gt_masks, objectness_targets, objectness_weights, box_loss_weights, valid_gt_weights) = targets batch_size, proposal_num = size_class_targets.shape[:2] losses = dict() # calculate objectness classification loss sampling_obj_score = bbox_preds['seeds_obj_cls_logits'].reshape(-1, 1) sampling_objectness_loss = self.sampling_objectness_loss( sampling_obj_score, 1 - sampling_targets.reshape(-1), sampling_weights.reshape(-1), avg_factor=batch_size) losses['sampling_objectness_loss'] = sampling_objectness_loss prefixes = ['proposal.'] + [ f's{i}.' for i in range(bbox_preds['num_decoder_layers']) ] num_stages = len(prefixes) for prefix in prefixes: # calculate objectness loss obj_score = bbox_preds[f'{prefix}obj_scores'].transpose(2, 1) objectness_loss = self.objectness_loss( obj_score.reshape(-1, 1), 1 - objectness_targets.reshape(-1), objectness_weights.reshape(-1), avg_factor=batch_size) losses[f'{prefix}objectness_loss'] = objectness_loss / num_stages # calculate center loss box_loss_weights_expand = box_loss_weights.unsqueeze(-1).expand( -1, -1, 3) center_loss = self.center_loss( bbox_preds[f'{prefix}center'], assigned_center_targets, weight=box_loss_weights_expand) losses[f'{prefix}center_loss'] = center_loss / num_stages # calculate direction class loss dir_class_loss = self.dir_class_loss( bbox_preds[f'{prefix}dir_class'].transpose(2, 1), dir_class_targets, weight=box_loss_weights) losses[f'{prefix}dir_class_loss'] = dir_class_loss / num_stages # calculate direction residual loss heading_label_one_hot = size_class_targets.new_zeros( (batch_size, proposal_num, self.num_dir_bins)) heading_label_one_hot.scatter_(2, dir_class_targets.unsqueeze(-1), 1) dir_res_norm = torch.sum( bbox_preds[f'{prefix}dir_res_norm'] * heading_label_one_hot, -1) dir_res_loss = self.dir_res_loss( dir_res_norm, dir_res_targets, weight=box_loss_weights) losses[f'{prefix}dir_res_loss'] = dir_res_loss / num_stages if self.size_cls_agnostic: # calculate class-agnostic size loss size_reg_loss = self.size_reg_loss( bbox_preds[f'{prefix}size'], assigned_size_targets, weight=box_loss_weights_expand) losses[f'{prefix}size_reg_loss'] = size_reg_loss / num_stages else: # calculate size class loss size_class_loss = self.size_class_loss( bbox_preds[f'{prefix}size_class'].transpose(2, 1), size_class_targets, weight=box_loss_weights) losses[ f'{prefix}size_class_loss'] = size_class_loss / num_stages # calculate size residual loss one_hot_size_targets = size_class_targets.new_zeros( (batch_size, proposal_num, self.num_sizes)) one_hot_size_targets.scatter_(2, size_class_targets.unsqueeze(-1), 1) one_hot_size_targets_expand = one_hot_size_targets.unsqueeze( -1).expand(-1, -1, -1, 3).contiguous() size_residual_norm = torch.sum( bbox_preds[f'{prefix}size_res_norm'] * one_hot_size_targets_expand, 2) box_loss_weights_expand = box_loss_weights.unsqueeze( -1).expand(-1, -1, 3) size_res_loss = self.size_res_loss( size_residual_norm, size_res_targets, weight=box_loss_weights_expand) losses[f'{prefix}size_res_loss'] = size_res_loss / num_stages # calculate semantic loss semantic_loss = self.semantic_loss( bbox_preds[f'{prefix}sem_scores'].transpose(2, 1), mask_targets, weight=box_loss_weights) losses[f'{prefix}semantic_loss'] = 
semantic_loss / num_stages if ret_target: losses['targets'] = targets return losses def get_targets(self, points, gt_bboxes_3d, gt_labels_3d, pts_semantic_mask=None, pts_instance_mask=None, bbox_preds=None, max_gt_num=64): """Generate targets of GroupFree3D head. Args: points (list[torch.Tensor]): Points of each batch. gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth bboxes of each batch. gt_labels_3d (list[torch.Tensor]): Labels of each batch. pts_semantic_mask (list[torch.Tensor]): Point-wise semantic label of each batch. pts_instance_mask (list[torch.Tensor]): Point-wise instance label of each batch. bbox_preds (torch.Tensor): Bounding box predictions of vote head. max_gt_num (int): Max number of GTs for single batch. Returns: tuple[torch.Tensor]: Targets of GroupFree3D head. """ # find empty example valid_gt_masks = list() gt_num = list() for index in range(len(gt_labels_3d)): if len(gt_labels_3d[index]) == 0: fake_box = gt_bboxes_3d[index].tensor.new_zeros( 1, gt_bboxes_3d[index].tensor.shape[-1]) gt_bboxes_3d[index] = gt_bboxes_3d[index].new_box(fake_box) gt_labels_3d[index] = gt_labels_3d[index].new_zeros(1) valid_gt_masks.append(gt_labels_3d[index].new_zeros(1)) gt_num.append(1) else: valid_gt_masks.append(gt_labels_3d[index].new_ones( gt_labels_3d[index].shape)) gt_num.append(gt_labels_3d[index].shape[0]) # max_gt_num = max(gt_num) max_gt_nums = [max_gt_num for _ in range(len(gt_labels_3d))] if pts_semantic_mask is None: pts_semantic_mask = [None for i in range(len(gt_labels_3d))] pts_instance_mask = [None for i in range(len(gt_labels_3d))] seed_points = [ bbox_preds['seed_points'][i] for i in range(len(gt_labels_3d)) ] seed_indices = [ bbox_preds['seed_indices'][i] for i in range(len(gt_labels_3d)) ] candidate_indices = [ bbox_preds['query_points_sample_inds'][i] for i in range(len(gt_labels_3d)) ] (sampling_targets, assigned_size_targets, size_class_targets, size_res_targets, dir_class_targets, dir_res_targets, center_targets, assigned_center_targets, mask_targets, objectness_targets, objectness_masks) = multi_apply(self.get_targets_single, points, gt_bboxes_3d, gt_labels_3d, pts_semantic_mask, pts_instance_mask, max_gt_nums, seed_points, seed_indices, candidate_indices) # pad targets as original code of GroupFree3D. 
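# Illustrative sketch (toy values; not part of the original GroupFree3D code):
# the loop below pads each sample's valid-GT mask up to the fixed `max_gt_num`
# with torch.nn.functional.pad, appending zeros on the right:
#   >>> import torch
#   >>> import torch.nn.functional as F
#   >>> valid_gt_mask = torch.ones(3)        # a sample with 3 real GT boxes
#   >>> F.pad(valid_gt_mask, (0, 5 - 3))     # pad to max_gt_num = 5
#   tensor([1., 1., 1., 0., 0.])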
for index in range(len(gt_labels_3d)): pad_num = max_gt_num - gt_labels_3d[index].shape[0] valid_gt_masks[index] = F.pad(valid_gt_masks[index], (0, pad_num)) sampling_targets = torch.stack(sampling_targets) sampling_weights = (sampling_targets >= 0).float() sampling_normalizer = sampling_weights.sum(dim=1, keepdim=True).float() sampling_weights /= sampling_normalizer.clamp(min=1.0) assigned_size_targets = torch.stack(assigned_size_targets) center_targets = torch.stack(center_targets) valid_gt_masks = torch.stack(valid_gt_masks) assigned_center_targets = torch.stack(assigned_center_targets) objectness_targets = torch.stack(objectness_targets) objectness_weights = torch.stack(objectness_masks) cls_normalizer = objectness_weights.sum(dim=1, keepdim=True).float() objectness_weights /= cls_normalizer.clamp(min=1.0) box_loss_weights = objectness_targets.float() / ( objectness_targets.sum().float() + EPS) valid_gt_weights = valid_gt_masks.float() / ( valid_gt_masks.sum().float() + EPS) dir_class_targets = torch.stack(dir_class_targets) dir_res_targets = torch.stack(dir_res_targets) size_class_targets = torch.stack(size_class_targets) size_res_targets = torch.stack(size_res_targets) mask_targets = torch.stack(mask_targets) return (sampling_targets, sampling_weights, assigned_size_targets, size_class_targets, size_res_targets, dir_class_targets, dir_res_targets, center_targets, assigned_center_targets, mask_targets, valid_gt_masks, objectness_targets, objectness_weights, box_loss_weights, valid_gt_weights) def get_targets_single(self, points, gt_bboxes_3d, gt_labels_3d, pts_semantic_mask=None, pts_instance_mask=None, max_gt_nums=None, seed_points=None, seed_indices=None, candidate_indices=None, seed_points_obj_topk=4): """Generate targets of GroupFree3D head for single batch. Args: points (torch.Tensor): Points of each batch. gt_bboxes_3d (:obj:`BaseInstance3DBoxes`): Ground truth boxes of each batch. gt_labels_3d (torch.Tensor): Labels of each batch. pts_semantic_mask (torch.Tensor): Point-wise semantic label of each batch. pts_instance_mask (torch.Tensor): Point-wise instance label of each batch. max_gt_nums (int): Max number of GTs for single batch. seed_points (torch.Tensor): Coordinates of seed points. seed_indices (torch.Tensor): Indices of seed points. candidate_indices (torch.Tensor): Indices of object candidates. seed_points_obj_topk (int): k value of k-Closest Points Sampling. Returns: tuple[torch.Tensor]: Targets of GroupFree3D head. 
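Example (a toy sketch of the k-Closest Points Sampling selection performed in this method; the distance values below are assumed)::

    >>> import torch
    >>> # normalized distances from one GT box to 6 seed points
    >>> dist = torch.tensor([[0.3, 2.0, 0.1, 5.0, 0.7, 9.0]])
    >>> torch.topk(dist, k=4, largest=False)[1]
    tensor([[2, 0, 4, 1]])
    >>> # the k closest seeds per GT become positive sampling targets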
""" assert self.bbox_coder.with_rot or pts_semantic_mask is not None gt_bboxes_3d = gt_bboxes_3d.to(points.device) # generate center, dir, size target (center_targets, size_targets, size_class_targets, size_res_targets, dir_class_targets, dir_res_targets) = self.bbox_coder.encode(gt_bboxes_3d, gt_labels_3d) # pad targets as original code of GroupFree3D pad_num = max_gt_nums - gt_labels_3d.shape[0] box_label_mask = points.new_zeros([max_gt_nums]) box_label_mask[:gt_labels_3d.shape[0]] = 1 gt_bboxes_pad = F.pad(gt_bboxes_3d.tensor, (0, 0, 0, pad_num)) gt_bboxes_pad[gt_labels_3d.shape[0]:, 0:3] += 1000 gt_bboxes_3d = gt_bboxes_3d.new_box(gt_bboxes_pad) gt_labels_3d = F.pad(gt_labels_3d, (0, pad_num)) center_targets = F.pad(center_targets, (0, 0, 0, pad_num), value=1000) size_targets = F.pad(size_targets, (0, 0, 0, pad_num)) size_class_targets = F.pad(size_class_targets, (0, pad_num)) size_res_targets = F.pad(size_res_targets, (0, 0, 0, pad_num)) dir_class_targets = F.pad(dir_class_targets, (0, pad_num)) dir_res_targets = F.pad(dir_res_targets, (0, pad_num)) # 0. generate pts_instance_label and pts_obj_mask num_points = points.shape[0] pts_obj_mask = points.new_zeros([num_points], dtype=torch.long) pts_instance_label = points.new_zeros([num_points], dtype=torch.long) - 1 if self.bbox_coder.with_rot: vote_targets = points.new_zeros([num_points, 4 * self.gt_per_seed]) vote_target_idx = points.new_zeros([num_points], dtype=torch.long) box_indices_all = gt_bboxes_3d.points_in_boxes_part(points) for i in range(gt_labels_3d.shape[0]): box_indices = box_indices_all[:, i] indices = torch.nonzero( box_indices, as_tuple=False).squeeze(-1) selected_points = points[indices] pts_obj_mask[indices] = 1 vote_targets_tmp = vote_targets[indices] votes = gt_bboxes_3d.gravity_center[i].unsqueeze( 0) - selected_points[:, :3] for j in range(self.gt_per_seed): column_indices = torch.nonzero( vote_target_idx[indices] == j, as_tuple=False).squeeze(-1) vote_targets_tmp[column_indices, int(j * 3):int(j * 3 + 3)] = votes[column_indices] vote_targets_tmp[column_indices, j + 3 * self.gt_per_seed] = i if j == 0: vote_targets_tmp[ column_indices, :3 * self.gt_per_seed] = votes[column_indices].repeat( 1, self.gt_per_seed) vote_targets_tmp[column_indices, 3 * self.gt_per_seed:] = i vote_targets[indices] = vote_targets_tmp vote_target_idx[indices] = torch.clamp( vote_target_idx[indices] + 1, max=2) dist = points.new_zeros([num_points, self.gt_per_seed]) + 1000 for j in range(self.gt_per_seed): dist[:, j] = (vote_targets[:, 3 * j:3 * j + 3]**2).sum(-1) instance_indices = torch.argmin( dist, dim=-1).unsqueeze(-1) + 3 * self.gt_per_seed instance_lable = torch.gather(vote_targets, 1, instance_indices).squeeze(-1) pts_instance_label = instance_lable.long() pts_instance_label[pts_obj_mask == 0] = -1 elif pts_semantic_mask is not None: for i in torch.unique(pts_instance_mask): indices = torch.nonzero( pts_instance_mask == i, as_tuple=False).squeeze(-1) if pts_semantic_mask[indices[0]] < self.num_classes: selected_points = points[indices, :3] center = 0.5 * ( selected_points.min(0)[0] + selected_points.max(0)[0]) delta_xyz = center - center_targets instance_lable = torch.argmin((delta_xyz**2).sum(-1)) pts_instance_label[indices] = instance_lable pts_obj_mask[indices] = 1 else: raise NotImplementedError # 1. 
generate objectness targets in sampling head gt_num = gt_labels_3d.shape[0] num_seed = seed_points.shape[0] num_candidate = candidate_indices.shape[0] object_assignment = torch.gather(pts_instance_label, 0, seed_indices) # set background points to the last gt bbox as original code object_assignment[object_assignment < 0] = gt_num - 1 object_assignment_one_hot = gt_bboxes_3d.tensor.new_zeros( (num_seed, gt_num)) object_assignment_one_hot.scatter_(1, object_assignment.unsqueeze(-1), 1) # (num_seed, gt_num) delta_xyz = seed_points.unsqueeze( 1) - gt_bboxes_3d.gravity_center.unsqueeze( 0) # (num_seed, gt_num, 3) delta_xyz = delta_xyz / (gt_bboxes_3d.dims.unsqueeze(0) + EPS) new_dist = torch.sum(delta_xyz**2, dim=-1) euclidean_dist1 = torch.sqrt(new_dist + EPS) euclidean_dist1 = euclidean_dist1 * object_assignment_one_hot + 100 * ( 1 - object_assignment_one_hot) # (gt_num, num_seed) euclidean_dist1 = euclidean_dist1.permute(1, 0) # gt_num x topk topk_inds = torch.topk( euclidean_dist1, seed_points_obj_topk, largest=False)[1] * box_label_mask[:, None] + \ (box_label_mask[:, None] - 1) topk_inds = topk_inds.long() topk_inds = topk_inds.view(-1).contiguous() sampling_targets = torch.zeros( num_seed + 1, dtype=torch.long).to(points.device) sampling_targets[topk_inds] = 1 sampling_targets = sampling_targets[:num_seed] # pts_instance_label objectness_label_mask = torch.gather(pts_instance_label, 0, seed_indices) # num_seed sampling_targets[objectness_label_mask < 0] = 0 # 2. objectness target seed_obj_gt = torch.gather(pts_obj_mask, 0, seed_indices) # num_seed objectness_targets = torch.gather(seed_obj_gt, 0, candidate_indices) # num_candidate # 3. box target seed_instance_label = torch.gather(pts_instance_label, 0, seed_indices) # num_seed query_points_instance_label = torch.gather( seed_instance_label, 0, candidate_indices) # num_candidate # Set assignment # (num_candidate, ) with values in 0,1,...,gt_num-1 assignment = query_points_instance_label # set background points to the last gt bbox as original code assignment[assignment < 0] = gt_num - 1 assignment_expand = assignment.unsqueeze(1).expand(-1, 3) assigned_center_targets = center_targets[assignment] assigned_size_targets = size_targets[assignment] dir_class_targets = dir_class_targets[assignment] dir_res_targets = dir_res_targets[assignment] dir_res_targets /= (np.pi / self.num_dir_bins) size_class_targets = size_class_targets[assignment] size_res_targets = \ torch.gather(size_res_targets, 0, assignment_expand) one_hot_size_targets = gt_bboxes_3d.tensor.new_zeros( (num_candidate, self.num_sizes)) one_hot_size_targets.scatter_(1, size_class_targets.unsqueeze(-1), 1) one_hot_size_targets = one_hot_size_targets.unsqueeze(-1).expand( -1, -1, 3) # (num_candidate,num_size_cluster,3) mean_sizes = size_res_targets.new_tensor( self.bbox_coder.mean_sizes).unsqueeze(0) pos_mean_sizes = torch.sum(one_hot_size_targets * mean_sizes, 1) size_res_targets /= pos_mean_sizes mask_targets = gt_labels_3d[assignment].long() objectness_masks = points.new_ones((num_candidate)) return (sampling_targets, assigned_size_targets, size_class_targets, size_res_targets, dir_class_targets, dir_res_targets, center_targets, assigned_center_targets, mask_targets, objectness_targets, objectness_masks) def get_bboxes(self, points, bbox_preds, input_metas, rescale=False, use_nms=True): """Generate bboxes from GroupFree3D head predictions. Args: points (torch.Tensor): Input points. bbox_preds (dict): Predictions from GroupFree3D head. 
input_metas (list[dict]): Point cloud and image's meta info. rescale (bool): Whether to rescale bboxes. use_nms (bool): Whether to apply NMS, skip nms postprocessing while using GroupFree3D head in rpn stage. Returns: list[tuple[torch.Tensor]]: Bounding boxes, scores and labels. """ # support multi-stage predictions assert self.test_cfg['prediction_stages'] in \ ['last', 'all', 'last_three'] prefixes = list() if self.test_cfg['prediction_stages'] == 'last': prefixes = [f's{self.num_decoder_layers - 1}.'] elif self.test_cfg['prediction_stages'] == 'all': prefixes = ['proposal.'] + \ [f's{i}.' for i in range(self.num_decoder_layers)] elif self.test_cfg['prediction_stages'] == 'last_three': prefixes = [ f's{i}.' for i in range(self.num_decoder_layers - 3, self.num_decoder_layers) ] else: raise NotImplementedError obj_scores = list() sem_scores = list() bbox3d = list() for prefix in prefixes: # decode boxes obj_score = bbox_preds[f'{prefix}obj_scores'][..., -1].sigmoid() sem_score = bbox_preds[f'{prefix}sem_scores'].softmax(-1) bbox = self.bbox_coder.decode(bbox_preds, prefix) obj_scores.append(obj_score) sem_scores.append(sem_score) bbox3d.append(bbox) obj_scores = torch.cat(obj_scores, dim=1) sem_scores = torch.cat(sem_scores, dim=1) bbox3d = torch.cat(bbox3d, dim=1) if use_nms: batch_size = bbox3d.shape[0] results = list() for b in range(batch_size): bbox_selected, score_selected, labels = \ self.multiclass_nms_single(obj_scores[b], sem_scores[b], bbox3d[b], points[b, ..., :3], input_metas[b]) bbox = input_metas[b]['box_type_3d']( bbox_selected, box_dim=bbox_selected.shape[-1], with_yaw=self.bbox_coder.with_rot) results.append((bbox, score_selected, labels)) return results else: return bbox3d def multiclass_nms_single(self, obj_scores, sem_scores, bbox, points, input_meta): """Multi-class nms in single batch. Args: obj_scores (torch.Tensor): Objectness score of bounding boxes. sem_scores (torch.Tensor): semantic class score of bounding boxes. bbox (torch.Tensor): Predicted bounding boxes. points (torch.Tensor): Input points. input_meta (dict): Point cloud and image's meta info. Returns: tuple[torch.Tensor]: Bounding boxes, scores and labels. 
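Example (an illustrative sketch of how the final per-class score is formed below when ``per_class_proposal`` is set; the score values are assumed)::

    >>> import torch
    >>> obj_scores = torch.tensor([0.9, 0.4])
    >>> sem_scores = torch.tensor([[0.7, 0.3], [0.2, 0.8]])
    >>> obj_scores * sem_scores[:, 0]  # scores reported for class 0
    tensor([0.6300, 0.0800])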
""" bbox = input_meta['box_type_3d']( bbox, box_dim=bbox.shape[-1], with_yaw=self.bbox_coder.with_rot, origin=(0.5, 0.5, 0.5)) box_indices = bbox.points_in_boxes_all(points) corner3d = bbox.corners minmax_box3d = corner3d.new(torch.Size((corner3d.shape[0], 6))) minmax_box3d[:, :3] = torch.min(corner3d, dim=1)[0] minmax_box3d[:, 3:] = torch.max(corner3d, dim=1)[0] nonempty_box_mask = box_indices.T.sum(1) > 5 bbox_classes = torch.argmax(sem_scores, -1) nms_selected = aligned_3d_nms(minmax_box3d[nonempty_box_mask], obj_scores[nonempty_box_mask], bbox_classes[nonempty_box_mask], self.test_cfg.nms_thr) # filter empty boxes and boxes with low score scores_mask = (obj_scores > self.test_cfg.score_thr) nonempty_box_inds = torch.nonzero( nonempty_box_mask, as_tuple=False).flatten() nonempty_mask = torch.zeros_like(bbox_classes).scatter( 0, nonempty_box_inds[nms_selected], 1) selected = (nonempty_mask.bool() & scores_mask.bool()) if self.test_cfg.per_class_proposal: bbox_selected, score_selected, labels = [], [], [] for k in range(sem_scores.shape[-1]): bbox_selected.append(bbox[selected].tensor) score_selected.append(obj_scores[selected] * sem_scores[selected][:, k]) labels.append( torch.zeros_like(bbox_classes[selected]).fill_(k)) bbox_selected = torch.cat(bbox_selected, 0) score_selected = torch.cat(score_selected, 0) labels = torch.cat(labels, 0) else: bbox_selected = bbox[selected].tensor score_selected = obj_scores[selected] labels = bbox_classes[selected] return bbox_selected, score_selected, labels ================================================ FILE: mmdet3d/models/dense_heads/monoflex_head.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import torch from mmcv.cnn import xavier_init from torch import nn as nn from mmdet3d.core.utils import get_ellip_gaussian_2D from mmdet3d.models.model_utils import EdgeFusionModule from mmdet3d.models.utils import (filter_outside_objs, get_edge_indices, get_keypoints, handle_proj_objs) from mmdet.core import multi_apply from mmdet.core.bbox.builder import build_bbox_coder from mmdet.models.utils import gaussian_radius, gen_gaussian_target from mmdet.models.utils.gaussian_target import (get_local_maximum, get_topk_from_heatmap, transpose_and_gather_feat) from ..builder import HEADS, build_loss from .anchor_free_mono3d_head import AnchorFreeMono3DHead @HEADS.register_module() class MonoFlexHead(AnchorFreeMono3DHead): r"""MonoFlex head used in `MonoFlex `_ .. code-block:: none / --> 3 x 3 conv --> 1 x 1 conv --> [edge fusion] --> cls | | --> 3 x 3 conv --> 1 x 1 conv --> 2d bbox | | --> 3 x 3 conv --> 1 x 1 conv --> [edge fusion] --> 2d offsets | | --> 3 x 3 conv --> 1 x 1 conv --> keypoints offsets | | --> 3 x 3 conv --> 1 x 1 conv --> keypoints uncertainty feature | --> 3 x 3 conv --> 1 x 1 conv --> keypoints uncertainty | | --> 3 x 3 conv --> 1 x 1 conv --> 3d dimensions | | |--- 1 x 1 conv --> ori cls | --> 3 x 3 conv --| | |--- 1 x 1 conv --> ori offsets | | --> 3 x 3 conv --> 1 x 1 conv --> depth | \ --> 3 x 3 conv --> 1 x 1 conv --> depth uncertainty Args: use_edge_fusion (bool): Whether to use edge fusion module while feature extraction. edge_fusion_inds (list[tuple]): Indices of feature to use edge fusion. edge_heatmap_ratio (float): Ratio of generating target heatmap. filter_outside_objs (bool, optional): Whether to filter the outside objects. Default: True. loss_cls (dict, optional): Config of classification loss. Default: loss_cls=dict(type='GaussionFocalLoss', loss_weight=1.0). 
loss_bbox (dict, optional): Config of localization loss. Default: loss_bbox=dict(type='IOULoss', loss_weight=10.0). loss_dir (dict, optional): Config of direction classification loss. Default: dict(type='MultibinLoss', loss_weight=0.1). loss_keypoints (dict, optional): Config of keypoints loss. Default: dict(type='L1Loss', loss_weight=0.1). loss_dims: (dict, optional): Config of dimensions loss. Default: dict(type='L1Loss', loss_weight=0.1). loss_offsets2d: (dict, optional): Config of offsets2d loss. Default: dict(type='L1Loss', loss_weight=0.1). loss_direct_depth: (dict, optional): Config of directly regression depth loss. Default: dict(type='L1Loss', loss_weight=0.1). loss_keypoints_depth: (dict, optional): Config of keypoints decoded depth loss. Default: dict(type='L1Loss', loss_weight=0.1). loss_combined_depth: (dict, optional): Config of combined depth loss. Default: dict(type='L1Loss', loss_weight=0.1). loss_attr (dict, optional): Config of attribute classification loss. In MonoFlex, Default: None. bbox_coder (dict, optional): Bbox coder for encoding and decoding boxes. Default: dict(type='MonoFlexCoder', code_size=7). norm_cfg (dict, optional): Dictionary to construct and config norm layer. Default: norm_cfg=dict(type='GN', num_groups=32, requires_grad=True). init_cfg (dict): Initialization config dict. Default: None. """ # noqa: E501 def __init__(self, num_classes, in_channels, use_edge_fusion, edge_fusion_inds, edge_heatmap_ratio, filter_outside_objs=True, loss_cls=dict(type='GaussianFocalLoss', loss_weight=1.0), loss_bbox=dict(type='IoULoss', loss_weight=0.1), loss_dir=dict(type='MultiBinLoss', loss_weight=0.1), loss_keypoints=dict(type='L1Loss', loss_weight=0.1), loss_dims=dict(type='L1Loss', loss_weight=0.1), loss_offsets2d=dict(type='L1Loss', loss_weight=0.1), loss_direct_depth=dict(type='L1Loss', loss_weight=0.1), loss_keypoints_depth=dict(type='L1Loss', loss_weight=0.1), loss_combined_depth=dict(type='L1Loss', loss_weight=0.1), loss_attr=None, bbox_coder=dict(type='MonoFlexCoder', code_size=7), norm_cfg=dict(type='BN'), init_cfg=None, init_bias=-2.19, **kwargs): self.use_edge_fusion = use_edge_fusion self.edge_fusion_inds = edge_fusion_inds super().__init__( num_classes, in_channels, loss_cls=loss_cls, loss_bbox=loss_bbox, loss_dir=loss_dir, loss_attr=loss_attr, norm_cfg=norm_cfg, init_cfg=init_cfg, **kwargs) self.filter_outside_objs = filter_outside_objs self.edge_heatmap_ratio = edge_heatmap_ratio self.init_bias = init_bias self.loss_dir = build_loss(loss_dir) self.loss_keypoints = build_loss(loss_keypoints) self.loss_dims = build_loss(loss_dims) self.loss_offsets2d = build_loss(loss_offsets2d) self.loss_direct_depth = build_loss(loss_direct_depth) self.loss_keypoints_depth = build_loss(loss_keypoints_depth) self.loss_combined_depth = build_loss(loss_combined_depth) self.bbox_coder = build_bbox_coder(bbox_coder) def _init_edge_module(self): """Initialize edge fusion module for feature extraction.""" self.edge_fuse_cls = EdgeFusionModule(self.num_classes, 256) for i in range(len(self.edge_fusion_inds)): reg_inds, out_inds = self.edge_fusion_inds[i] out_channels = self.group_reg_dims[reg_inds][out_inds] fusion_layer = EdgeFusionModule(out_channels, 256) layer_name = f'edge_fuse_reg_{reg_inds}_{out_inds}' self.add_module(layer_name, fusion_layer) def init_weights(self): """Initialize weights.""" super().init_weights() self.conv_cls.bias.data.fill_(self.init_bias) xavier_init(self.conv_regs[4][0], gain=0.01) xavier_init(self.conv_regs[7][0], gain=0.01) for m in 
self.conv_regs.modules(): if isinstance(m, nn.Conv2d): if m.bias is not None: nn.init.constant_(m.bias, 0) def _init_predictor(self): """Initialize predictor layers of the head.""" self.conv_cls_prev = self._init_branch( conv_channels=self.cls_branch, conv_strides=(1, ) * len(self.cls_branch)) self.conv_cls = nn.Conv2d(self.cls_branch[-1], self.cls_out_channels, 1) # init regression head self.conv_reg_prevs = nn.ModuleList() # init output head self.conv_regs = nn.ModuleList() # group_reg_dims: # ((4, ), (2, ), (20, ), (3, ), (3, ), (8, 8), (1, ), (1, )) for i in range(len(self.group_reg_dims)): reg_dims = self.group_reg_dims[i] reg_branch_channels = self.reg_branch[i] out_channel = self.out_channels[i] reg_list = nn.ModuleList() if len(reg_branch_channels) > 0: self.conv_reg_prevs.append( self._init_branch( conv_channels=reg_branch_channels, conv_strides=(1, ) * len(reg_branch_channels))) for reg_dim in reg_dims: reg_list.append(nn.Conv2d(out_channel, reg_dim, 1)) self.conv_regs.append(reg_list) else: self.conv_reg_prevs.append(None) for reg_dim in reg_dims: reg_list.append(nn.Conv2d(self.feat_channels, reg_dim, 1)) self.conv_regs.append(reg_list) def _init_layers(self): """Initialize layers of the head.""" self._init_predictor() if self.use_edge_fusion: self._init_edge_module() def forward_train(self, x, input_metas, gt_bboxes, gt_labels, gt_bboxes_3d, gt_labels_3d, centers2d, depths, attr_labels, gt_bboxes_ignore, proposal_cfg, **kwargs): """ Args: x (list[Tensor]): Features from FPN. input_metas (list[dict]): Meta information of each image, e.g., image size, scaling factor, etc. gt_bboxes (list[Tensor]): Ground truth bboxes of the image, shape (num_gts, 4). gt_labels (list[Tensor]): Ground truth labels of each box, shape (num_gts,). gt_bboxes_3d (list[Tensor]): 3D ground truth bboxes of the image, shape (num_gts, self.bbox_code_size). gt_labels_3d (list[Tensor]): 3D ground truth labels of each box, shape (num_gts,). centers2d (list[Tensor]): Projected 3D center of each box, shape (num_gts, 2). depths (list[Tensor]): Depth of projected 3D center of each box, shape (num_gts,). attr_labels (list[Tensor]): Attribute labels of each box, shape (num_gts,). gt_bboxes_ignore (list[Tensor]): Ground truth bboxes to be ignored, shape (num_ignored_gts, 4). proposal_cfg (mmcv.Config): Test / postprocessing configuration, if None, test_cfg would be used Returns: tuple: losses: (dict[str, Tensor]): A dictionary of loss components. proposal_list (list[Tensor]): Proposals of each image. """ outs = self(x, input_metas) if gt_labels is None: loss_inputs = outs + (gt_bboxes, gt_bboxes_3d, centers2d, depths, attr_labels, input_metas) else: loss_inputs = outs + (gt_bboxes, gt_labels, gt_bboxes_3d, gt_labels_3d, centers2d, depths, attr_labels, input_metas) losses = self.loss(*loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore) if proposal_cfg is None: return losses else: proposal_list = self.get_bboxes( *outs, input_metas, cfg=proposal_cfg) return losses, proposal_list def forward(self, feats, input_metas): """Forward features from the upstream network. Args: feats (list[Tensor]): Features from the upstream network, each is a 4D-tensor. input_metas (list[dict]): Meta information of each image, e.g., image size, scaling factor, etc. Returns: tuple: cls_scores (list[Tensor]): Box scores for each scale level, each is a 4D-tensor, the channel number is num_points * num_classes. bbox_preds (list[Tensor]): Box energies / deltas for each scale level, each is a 4D-tensor, the channel number is num_points * bbox_code_size. 
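Example (a minimal sketch of the ``multi_apply`` helper that this method uses to run ``forward_single`` once per feature level; ``add_and_mul`` is a made-up toy function)::

    >>> from mmdet.core import multi_apply
    >>> def add_and_mul(a, b):
    ...     return a + b, a * b
    >>> multi_apply(add_and_mul, [1, 2], [3, 4])
    ([4, 6], [3, 8])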
""" mlvl_input_metas = [input_metas for i in range(len(feats))] return multi_apply(self.forward_single, feats, mlvl_input_metas) def forward_single(self, x, input_metas): """Forward features of a single scale level. Args: x (Tensor): Feature maps from a specific FPN feature level. input_metas (list[dict]): Meta information of each image, e.g., image size, scaling factor, etc. Returns: tuple: Scores for each class, bbox predictions. """ img_h, img_w = input_metas[0]['pad_shape'][:2] batch_size, _, feat_h, feat_w = x.shape downsample_ratio = img_h / feat_h for conv_cls_prev_layer in self.conv_cls_prev: cls_feat = conv_cls_prev_layer(x) out_cls = self.conv_cls(cls_feat) if self.use_edge_fusion: # calculate the edge indices for the batch data edge_indices_list = get_edge_indices( input_metas, downsample_ratio, device=x.device) edge_lens = [ edge_indices.shape[0] for edge_indices in edge_indices_list ] max_edge_len = max(edge_lens) edge_indices = x.new_zeros((batch_size, max_edge_len, 2), dtype=torch.long) for i in range(batch_size): edge_indices[i, :edge_lens[i]] = edge_indices_list[i] # cls feature map edge fusion out_cls = self.edge_fuse_cls(cls_feat, out_cls, edge_indices, edge_lens, feat_h, feat_w) bbox_pred = [] for i in range(len(self.group_reg_dims)): reg_feat = x.clone() # feature regression head if len(self.reg_branch[i]) > 0: for conv_reg_prev_layer in self.conv_reg_prevs[i]: reg_feat = conv_reg_prev_layer(reg_feat) for j, conv_reg in enumerate(self.conv_regs[i]): out_reg = conv_reg(reg_feat) # Use Edge Fusion Module if self.use_edge_fusion and (i, j) in self.edge_fusion_inds: # reg feature map edge fusion out_reg = getattr(self, 'edge_fuse_reg_{}_{}'.format( i, j))(reg_feat, out_reg, edge_indices, edge_lens, feat_h, feat_w) bbox_pred.append(out_reg) bbox_pred = torch.cat(bbox_pred, dim=1) cls_score = out_cls.sigmoid() # turn to 0-1 cls_score = cls_score.clamp(min=1e-4, max=1 - 1e-4) return cls_score, bbox_pred def get_bboxes(self, cls_scores, bbox_preds, input_metas): """Generate bboxes from bbox head predictions. Args: cls_scores (list[Tensor]): Box scores for each scale level. bbox_preds (list[Tensor]): Box regression for each scale. input_metas (list[dict]): Meta information of each image, e.g., image size, scaling factor, etc. rescale (bool): If True, return boxes in original image space. Returns: list[tuple[:obj:`CameraInstance3DBoxes`, Tensor, Tensor, None]]: Each item in result_list is 4-tuple. """ assert len(cls_scores) == len(bbox_preds) == 1 cam2imgs = torch.stack([ cls_scores[0].new_tensor(input_meta['cam2img']) for input_meta in input_metas ]) batch_bboxes, batch_scores, batch_topk_labels = self.decode_heatmap( cls_scores[0], bbox_preds[0], input_metas, cam2imgs=cam2imgs, topk=100, kernel=3) result_list = [] for img_id in range(len(input_metas)): bboxes = batch_bboxes[img_id] scores = batch_scores[img_id] labels = batch_topk_labels[img_id] keep_idx = scores > 0.25 bboxes = bboxes[keep_idx] scores = scores[keep_idx] labels = labels[keep_idx] bboxes = input_metas[img_id]['box_type_3d']( bboxes, box_dim=self.bbox_code_size, origin=(0.5, 0.5, 0.5)) attrs = None result_list.append((bboxes, scores, labels, attrs)) return result_list def decode_heatmap(self, cls_score, reg_pred, input_metas, cam2imgs, topk=100, kernel=3): """Transform outputs into detections raw bbox predictions. Args: class_score (Tensor): Center predict heatmap, shape (B, num_classes, H, W). reg_pred (Tensor): Box regression map. shape (B, channel, H , W). 
input_metas (List[dict]): Meta information of each image, e.g., image size, scaling factor, etc. cam2imgs (Tensor): Camera intrinsic matrix. shape (N, 4, 4) topk (int, optional): Get top k center keypoints from heatmap. Default 100. kernel (int, optional): Max pooling kernel for extract local maximum pixels. Default 3. Returns: tuple[torch.Tensor]: Decoded output of SMOKEHead, containing the following Tensors: - batch_bboxes (Tensor): Coords of each 3D box. shape (B, k, 7) - batch_scores (Tensor): Scores of each 3D box. shape (B, k) - batch_topk_labels (Tensor): Categories of each 3D box. shape (B, k) """ img_h, img_w = input_metas[0]['pad_shape'][:2] batch_size, _, feat_h, feat_w = cls_score.shape downsample_ratio = img_h / feat_h center_heatmap_pred = get_local_maximum(cls_score, kernel=kernel) *batch_dets, topk_ys, topk_xs = get_topk_from_heatmap( center_heatmap_pred, k=topk) batch_scores, batch_index, batch_topk_labels = batch_dets regression = transpose_and_gather_feat(reg_pred, batch_index) regression = regression.view(-1, 8) pred_base_centers2d = torch.cat( [topk_xs.view(-1, 1), topk_ys.view(-1, 1).float()], dim=1) preds = self.bbox_coder.decode(regression, batch_topk_labels, downsample_ratio, cam2imgs) pred_locations = self.bbox_coder.decode_location( pred_base_centers2d, preds['offsets2d'], preds['combined_depth'], cam2imgs, downsample_ratio) pred_yaws = self.bbox_coder.decode_orientation( preds['orientations']).unsqueeze(-1) pred_dims = preds['dimensions'] batch_bboxes = torch.cat((pred_locations, pred_dims, pred_yaws), dim=1) batch_bboxes = batch_bboxes.view(batch_size, -1, self.bbox_code_size) return batch_bboxes, batch_scores, batch_topk_labels def get_predictions(self, pred_reg, labels3d, centers2d, reg_mask, batch_indices, input_metas, downsample_ratio): """Prepare predictions for computing loss. Args: pred_reg (Tensor): Box regression map. shape (B, channel, H , W). labels3d (Tensor): Labels of each 3D box. shape (B * max_objs, ) centers2d (Tensor): Coords of each projected 3D box center on image. shape (N, 2) reg_mask (Tensor): Indexes of the existence of the 3D box. shape (B * max_objs, ) batch_indices (Tenosr): Batch indices of the 3D box. shape (N, 3) input_metas (list[dict]): Meta information of each image, e.g., image size, scaling factor, etc. downsample_ratio (int): The stride of feature map. Returns: dict: The predictions for computing loss. """ batch, channel = pred_reg.shape[0], pred_reg.shape[1] w = pred_reg.shape[3] cam2imgs = torch.stack([ centers2d.new_tensor(input_meta['cam2img']) for input_meta in input_metas ]) # (batch_size, 4, 4) -> (N, 4, 4) cam2imgs = cam2imgs[batch_indices, :, :] centers2d_inds = centers2d[:, 1] * w + centers2d[:, 0] centers2d_inds = centers2d_inds.view(batch, -1) pred_regression = transpose_and_gather_feat(pred_reg, centers2d_inds) pred_regression_pois = pred_regression.view(-1, channel)[reg_mask] preds = self.bbox_coder.decode(pred_regression_pois, labels3d, downsample_ratio, cam2imgs) return preds def get_targets(self, gt_bboxes_list, gt_labels_list, gt_bboxes_3d_list, gt_labels_3d_list, centers2d_list, depths_list, feat_shape, img_shape, input_metas): """Get training targets for batch images. `` Args: gt_bboxes_list (list[Tensor]): Ground truth bboxes of each image, shape (num_gt, 4). gt_labels_list (list[Tensor]): Ground truth labels of each box, shape (num_gt,). gt_bboxes_3d_list (list[:obj:`CameraInstance3DBoxes`]): 3D Ground truth bboxes of each image, shape (num_gt, bbox_code_size). 
gt_labels_3d_list (list[Tensor]): 3D Ground truth labels of each box, shape (num_gt,). centers2d_list (list[Tensor]): Projected 3D centers onto 2D image, shape (num_gt, 2). depths_list (list[Tensor]): Depth of projected 3D centers onto 2D image, each has shape (num_gt, 1). feat_shape (tuple[int]): Feature map shape with value, shape (B, _, H, W). img_shape (tuple[int]): Image shape in [h, w] format. input_metas (list[dict]): Meta information of each image, e.g., image size, scaling factor, etc. Returns: tuple[Tensor, dict]: The Tensor value is the targets of center heatmap, the dict has components below: - base_centers2d_target (Tensor): Coords of each projected 3D box center on image. shape (B * max_objs, 2), [dtype: int] - labels3d (Tensor): Labels of each 3D box. shape (N, ) - reg_mask (Tensor): Mask of the existence of the 3D box. shape (B * max_objs, ) - batch_indices (Tensor): Batch id of the 3D box. shape (N, ) - depth_target (Tensor): Depth target of each 3D box. shape (N, ) - keypoints2d_target (Tensor): Keypoints of each projected 3D box on image. shape (N, 10, 2) - keypoints_mask (Tensor): Keypoints mask of each projected 3D box on image. shape (N, 10) - keypoints_depth_mask (Tensor): Depths decoded from keypoints of each 3D box. shape (N, 3) - orientations_target (Tensor): Orientation (encoded local yaw) target of each 3D box. shape (N, ) - offsets2d_target (Tensor): Offsets target of each projected 3D box. shape (N, 2) - dimensions_target (Tensor): Dimensions target of each 3D box. shape (N, 3) - downsample_ratio (int): The stride of feature map. """ img_h, img_w = img_shape[:2] batch_size, _, feat_h, feat_w = feat_shape width_ratio = float(feat_w / img_w) # 1/4 height_ratio = float(feat_h / img_h) # 1/4 assert width_ratio == height_ratio # Whether to filter the objects which are not in FOV. if self.filter_outside_objs: filter_outside_objs(gt_bboxes_list, gt_labels_list, gt_bboxes_3d_list, gt_labels_3d_list, centers2d_list, input_metas) # transform centers2d to base centers2d for regression and # heatmap generation. 
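# Illustrative sketch (toy values; not part of the original MonoFlex code): for
# an object whose projected center stays inside the image, the decomposition
# noted in the comment below splits the float center into an integer base
# center plus a fractional offset:
#   >>> import torch
#   >>> center2d = torch.tensor([123.7, 45.2])
#   >>> base_center2d = center2d.int()    # tensor([123, 45], dtype=torch.int32)
#   >>> center2d - base_center2d          # offsets2d
#   tensor([0.7000, 0.2000])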
# centers2d = int(base_centers2d) + offsets2d base_centers2d_list, offsets2d_list, trunc_mask_list = \ handle_proj_objs(centers2d_list, gt_bboxes_list, input_metas) keypoints2d_list, keypoints_mask_list, keypoints_depth_mask_list = \ get_keypoints(gt_bboxes_3d_list, centers2d_list, input_metas) center_heatmap_target = gt_bboxes_list[-1].new_zeros( [batch_size, self.num_classes, feat_h, feat_w]) for batch_id in range(batch_size): # project gt_bboxes from input image to feat map gt_bboxes = gt_bboxes_list[batch_id] * width_ratio gt_labels = gt_labels_list[batch_id] # project base centers2d from input image to feat map gt_base_centers2d = base_centers2d_list[batch_id] * width_ratio trunc_masks = trunc_mask_list[batch_id] for j, base_center2d in enumerate(gt_base_centers2d): if trunc_masks[j]: # for outside objects, generate ellipse heatmap base_center2d_x_int, base_center2d_y_int = \ base_center2d.int() scale_box_w = min(base_center2d_x_int - gt_bboxes[j][0], gt_bboxes[j][2] - base_center2d_x_int) scale_box_h = min(base_center2d_y_int - gt_bboxes[j][1], gt_bboxes[j][3] - base_center2d_y_int) radius_x = scale_box_w * self.edge_heatmap_ratio radius_y = scale_box_h * self.edge_heatmap_ratio radius_x, radius_y = max(0, int(radius_x)), max( 0, int(radius_y)) assert min(radius_x, radius_y) == 0 ind = gt_labels[j] get_ellip_gaussian_2D( center_heatmap_target[batch_id, ind], [base_center2d_x_int, base_center2d_y_int], radius_x, radius_y) else: base_center2d_x_int, base_center2d_y_int = \ base_center2d.int() scale_box_h = (gt_bboxes[j][3] - gt_bboxes[j][1]) scale_box_w = (gt_bboxes[j][2] - gt_bboxes[j][0]) radius = gaussian_radius([scale_box_h, scale_box_w], min_overlap=0.7) radius = max(0, int(radius)) ind = gt_labels[j] gen_gaussian_target( center_heatmap_target[batch_id, ind], [base_center2d_x_int, base_center2d_y_int], radius) avg_factor = max(1, center_heatmap_target.eq(1).sum()) num_ctrs = [centers2d.shape[0] for centers2d in centers2d_list] max_objs = max(num_ctrs) batch_indices = [ centers2d_list[0].new_full((num_ctrs[i], ), i) for i in range(batch_size) ] batch_indices = torch.cat(batch_indices, dim=0) reg_mask = torch.zeros( (batch_size, max_objs), dtype=torch.bool).to(base_centers2d_list[0].device) gt_bboxes_3d = input_metas['box_type_3d'].cat(gt_bboxes_3d_list) gt_bboxes_3d = gt_bboxes_3d.to(base_centers2d_list[0].device) # encode original local yaw to multibin format orienations_target = self.bbox_coder.encode(gt_bboxes_3d) batch_base_centers2d = base_centers2d_list[0].new_zeros( (batch_size, max_objs, 2)) for i in range(batch_size): reg_mask[i, :num_ctrs[i]] = 1 batch_base_centers2d[i, :num_ctrs[i]] = base_centers2d_list[i] flatten_reg_mask = reg_mask.flatten() # transform base centers2d from input scale to output scale batch_base_centers2d = batch_base_centers2d.view(-1, 2) * width_ratio dimensions_target = gt_bboxes_3d.tensor[:, 3:6] labels_3d = torch.cat(gt_labels_3d_list) keypoints2d_target = torch.cat(keypoints2d_list) keypoints_mask = torch.cat(keypoints_mask_list) keypoints_depth_mask = torch.cat(keypoints_depth_mask_list) offsets2d_target = torch.cat(offsets2d_list) bboxes2d = torch.cat(gt_bboxes_list) # transform FCOS style bbox into [x1, y1, x2, y2] format. 
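# Illustrative sketch (assumed reading of the comment above; toy values, not
# part of the original MonoFlex code): negating the first two entries turns
# FCOS-style (left, top, right, bottom) distances into signed offsets in
# [x1, y1, x2, y2]-like order:
#   >>> import torch
#   >>> fcos_box = torch.tensor([[10., 5., 20., 15.]])
#   >>> torch.cat([fcos_box[:, 0:2] * -1, fcos_box[:, 2:]], dim=-1)
#   tensor([[-10.,  -5.,  20.,  15.]])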
bboxes2d_target = torch.cat([bboxes2d[:, 0:2] * -1, bboxes2d[:, 2:]], dim=-1) depths = torch.cat(depths_list) target_labels = dict( base_centers2d_target=batch_base_centers2d.int(), labels3d=labels_3d, reg_mask=flatten_reg_mask, batch_indices=batch_indices, bboxes2d_target=bboxes2d_target, depth_target=depths, keypoints2d_target=keypoints2d_target, keypoints_mask=keypoints_mask, keypoints_depth_mask=keypoints_depth_mask, orienations_target=orienations_target, offsets2d_target=offsets2d_target, dimensions_target=dimensions_target, downsample_ratio=1 / width_ratio) return center_heatmap_target, avg_factor, target_labels def loss(self, cls_scores, bbox_preds, gt_bboxes, gt_labels, gt_bboxes_3d, gt_labels_3d, centers2d, depths, attr_labels, input_metas, gt_bboxes_ignore=None): """Compute loss of the head. Args: cls_scores (list[Tensor]): Box scores for each scale level. shape (num_gt, 4). bbox_preds (list[Tensor]): Box dims is a 4D-tensor, the channel number is bbox_code_size. shape (B, 7, H, W). gt_bboxes (list[Tensor]): Ground truth bboxes for each image. shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. gt_labels (list[Tensor]): Class indices corresponding to each box. shape (num_gts, ). gt_bboxes_3d (list[:obj:`CameraInstance3DBoxes`]): 3D boxes ground truth. it is the flipped gt_bboxes gt_labels_3d (list[Tensor]): Same as gt_labels. centers2d (list[Tensor]): 2D centers on the image. shape (num_gts, 2). depths (list[Tensor]): Depth ground truth. shape (num_gts, ). attr_labels (list[Tensor]): Attributes indices of each box. In kitti it's None. input_metas (list[dict]): Meta information of each image, e.g., image size, scaling factor, etc. gt_bboxes_ignore (None | list[Tensor]): Specify which bounding boxes can be ignored when computing the loss. Default: None. Returns: dict[str, Tensor]: A dictionary of loss components. """ assert len(cls_scores) == len(bbox_preds) == 1 assert attr_labels is None assert gt_bboxes_ignore is None center2d_heatmap = cls_scores[0] pred_reg = bbox_preds[0] center2d_heatmap_target, avg_factor, target_labels = \ self.get_targets(gt_bboxes, gt_labels, gt_bboxes_3d, gt_labels_3d, centers2d, depths, center2d_heatmap.shape, input_metas[0]['pad_shape'], input_metas) preds = self.get_predictions( pred_reg=pred_reg, labels3d=target_labels['labels3d'], centers2d=target_labels['base_centers2d_target'], reg_mask=target_labels['reg_mask'], batch_indices=target_labels['batch_indices'], input_metas=input_metas, downsample_ratio=target_labels['downsample_ratio']) # heatmap loss loss_cls = self.loss_cls( center2d_heatmap, center2d_heatmap_target, avg_factor=avg_factor) # bbox2d regression loss loss_bbox = self.loss_bbox(preds['bboxes2d'], target_labels['bboxes2d_target']) # keypoints loss, the keypoints in predictions and target are all # local coordinates. 
Check the mask dtype should be bool, not int # or float to ensure the indexing is bool index keypoints2d_mask = target_labels['keypoints2d_mask'] loss_keypoints = self.loss_keypoints( preds['keypoints2d'][keypoints2d_mask], target_labels['keypoints2d_target'][keypoints2d_mask]) # orientations loss loss_dir = self.loss_dir(preds['orientations'], target_labels['orientations_target']) # dimensions loss loss_dims = self.loss_dims(preds['dimensions'], target_labels['dimensions_target']) # offsets for center heatmap loss_offsets2d = self.loss_offsets2d(preds['offsets2d'], target_labels['offsets2d_target']) # directly regressed depth loss with direct depth uncertainty loss direct_depth_weights = torch.exp(-preds['direct_depth_uncertainty']) loss_weight_1 = self.loss_direct_depth.loss_weight loss_direct_depth = self.loss_direct_depth( preds['direct_depth'], target_labels['depth_target'], direct_depth_weights) loss_uncertainty_1 =\ preds['direct_depth_uncertainty'] * loss_weight_1 loss_direct_depth = loss_direct_depth + loss_uncertainty_1.mean() # keypoints decoded depth loss with keypoints depth uncertainty loss depth_mask = target_labels['keypoints_depth_mask'] depth_target = target_labels['depth_target'].unsqueeze(-1).repeat(1, 3) valid_keypoints_depth_uncertainty = preds[ 'keypoints_depth_uncertainty'][depth_mask] valid_keypoints_depth_weights = torch.exp( -valid_keypoints_depth_uncertainty) loss_keypoints_depth = self.loss_keypoint_depth( preds['keypoints_depth'][depth_mask], depth_target[depth_mask], valid_keypoints_depth_weights) loss_weight_2 = self.loss_keypoints_depth.loss_weight loss_uncertainty_2 =\ valid_keypoints_depth_uncertainty * loss_weight_2 loss_keypoints_depth = loss_keypoints_depth + loss_uncertainty_2.mean() # combined depth loss for optimiaze the uncertainty loss_combined_depth = self.loss_combined_depth( preds['combined_depth'], target_labels['depth_target']) loss_dict = dict( loss_cls=loss_cls, loss_bbox=loss_bbox, loss_keypoints=loss_keypoints, loss_dir=loss_dir, loss_dims=loss_dims, loss_offsets2d=loss_offsets2d, loss_direct_depth=loss_direct_depth, loss_keypoints_depth=loss_keypoints_depth, loss_combined_depth=loss_combined_depth) return loss_dict ================================================ FILE: mmdet3d/models/dense_heads/parta2_rpn_head.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import numpy as np import torch from mmcv.runner import force_fp32 from mmdet3d.core import limit_period, xywhr2xyxyr from mmdet3d.core.post_processing import nms_bev, nms_normal_bev from ..builder import HEADS from .anchor3d_head import Anchor3DHead @HEADS.register_module() class PartA2RPNHead(Anchor3DHead): """RPN head for PartA2. Note: The main difference between the PartA2 RPN head and the Anchor3DHead lies in their output during inference. PartA2 RPN head further returns the original classification score for the second stage since the bbox head in RoI head does not do classification task. Different from RPN heads in 2D detectors, this RPN head does multi-class classification task and uses FocalLoss like the SECOND and PointPillars do. But this head uses class agnostic nms rather than multi-class nms. Args: num_classes (int): Number of classes. in_channels (int): Number of channels in the input feature map. train_cfg (dict): Train configs. test_cfg (dict): Test configs. feat_channels (int): Number of channels of the feature map. use_direction_classifier (bool): Whether to add a direction classifier. 
anchor_generator(dict): Config dict of anchor generator. assigner_per_size (bool): Whether to do assignment for each separate anchor size. assign_per_class (bool): Whether to do assignment for each class. diff_rad_by_sin (bool): Whether to change the difference into sin difference for box regression loss. dir_offset (float | int): The offset of BEV rotation angles (TODO: may be moved into box coder) dir_limit_offset (float | int): The limited range of BEV rotation angles. (TODO: may be moved into box coder) bbox_coder (dict): Config dict of box coders. loss_cls (dict): Config of classification loss. loss_bbox (dict): Config of localization loss. loss_dir (dict): Config of direction classifier loss. """ def __init__(self, num_classes, in_channels, train_cfg, test_cfg, feat_channels=256, use_direction_classifier=True, anchor_generator=dict( type='Anchor3DRangeGenerator', range=[0, -39.68, -1.78, 69.12, 39.68, -1.78], strides=[2], sizes=[[3.9, 1.6, 1.56]], rotations=[0, 1.57], custom_values=[], reshape_out=False), assigner_per_size=False, assign_per_class=False, diff_rad_by_sin=True, dir_offset=-np.pi / 2, dir_limit_offset=0, bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder'), loss_cls=dict( type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0), loss_bbox=dict( type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=2.0), loss_dir=dict(type='CrossEntropyLoss', loss_weight=0.2), init_cfg=None): super().__init__(num_classes, in_channels, train_cfg, test_cfg, feat_channels, use_direction_classifier, anchor_generator, assigner_per_size, assign_per_class, diff_rad_by_sin, dir_offset, dir_limit_offset, bbox_coder, loss_cls, loss_bbox, loss_dir, init_cfg) @force_fp32(apply_to=('cls_scores', 'bbox_preds', 'dir_cls_preds')) def loss(self, cls_scores, bbox_preds, dir_cls_preds, gt_bboxes, gt_labels, input_metas, gt_bboxes_ignore=None): """Calculate losses. Args: cls_scores (list[torch.Tensor]): Multi-level class scores. bbox_preds (list[torch.Tensor]): Multi-level bbox predictions. dir_cls_preds (list[torch.Tensor]): Multi-level direction class predictions. gt_bboxes (list[:obj:`BaseInstance3DBoxes`]): Ground truth boxes of each sample. gt_labels (list[torch.Tensor]): Labels of each sample. input_metas (list[dict]): Point cloud and image's meta info. gt_bboxes_ignore (list[torch.Tensor]): Specify which bounding. Returns: dict[str, list[torch.Tensor]]: Classification, bbox, and direction losses of each level. - loss_rpn_cls (list[torch.Tensor]): Classification losses. - loss_rpn_bbox (list[torch.Tensor]): Box regression losses. - loss_rpn_dir (list[torch.Tensor]): Direction classification losses. """ loss_dict = super().loss(cls_scores, bbox_preds, dir_cls_preds, gt_bboxes, gt_labels, input_metas, gt_bboxes_ignore) # change the loss key names to avoid conflict return dict( loss_rpn_cls=loss_dict['loss_cls'], loss_rpn_bbox=loss_dict['loss_bbox'], loss_rpn_dir=loss_dict['loss_dir']) def get_bboxes_single(self, cls_scores, bbox_preds, dir_cls_preds, mlvl_anchors, input_meta, cfg, rescale=False): """Get bboxes of single branch. Args: cls_scores (torch.Tensor): Class score in single batch. bbox_preds (torch.Tensor): Bbox prediction in single batch. dir_cls_preds (torch.Tensor): Predictions of direction class in single batch. mlvl_anchors (List[torch.Tensor]): Multi-level anchors in single batch. input_meta (list[dict]): Contain pcd and img's meta info. cfg (:obj:`ConfigDict`): Training or testing config. rescale (list[torch.Tensor]): whether th rescale bbox. 
Returns: dict: Predictions of single batch containing the following keys: - boxes_3d (:obj:`BaseInstance3DBoxes`): Predicted 3d bboxes. - scores_3d (torch.Tensor): Score of each bbox. - labels_3d (torch.Tensor): Label of each bbox. - cls_preds (torch.Tensor): Class score of each bbox. """ assert len(cls_scores) == len(bbox_preds) == len(mlvl_anchors) mlvl_bboxes = [] mlvl_max_scores = [] mlvl_label_pred = [] mlvl_dir_scores = [] mlvl_cls_score = [] for cls_score, bbox_pred, dir_cls_pred, anchors in zip( cls_scores, bbox_preds, dir_cls_preds, mlvl_anchors): assert cls_score.size()[-2:] == bbox_pred.size()[-2:] assert cls_score.size()[-2:] == dir_cls_pred.size()[-2:] dir_cls_pred = dir_cls_pred.permute(1, 2, 0).reshape(-1, 2) dir_cls_score = torch.max(dir_cls_pred, dim=-1)[1] cls_score = cls_score.permute(1, 2, 0).reshape(-1, self.num_classes) if self.use_sigmoid_cls: scores = cls_score.sigmoid() else: scores = cls_score.softmax(-1) bbox_pred = bbox_pred.permute(1, 2, 0).reshape(-1, self.box_code_size) nms_pre = cfg.get('nms_pre', -1) if self.use_sigmoid_cls: max_scores, pred_labels = scores.max(dim=1) else: max_scores, pred_labels = scores[:, :-1].max(dim=1) # get topk if nms_pre > 0 and scores.shape[0] > nms_pre: topk_scores, topk_inds = max_scores.topk(nms_pre) anchors = anchors[topk_inds, :] bbox_pred = bbox_pred[topk_inds, :] max_scores = topk_scores cls_score = scores[topk_inds, :] dir_cls_score = dir_cls_score[topk_inds] pred_labels = pred_labels[topk_inds] bboxes = self.bbox_coder.decode(anchors, bbox_pred) mlvl_bboxes.append(bboxes) mlvl_max_scores.append(max_scores) mlvl_cls_score.append(cls_score) mlvl_label_pred.append(pred_labels) mlvl_dir_scores.append(dir_cls_score) mlvl_bboxes = torch.cat(mlvl_bboxes) mlvl_bboxes_for_nms = xywhr2xyxyr(input_meta['box_type_3d']( mlvl_bboxes, box_dim=self.box_code_size).bev) mlvl_max_scores = torch.cat(mlvl_max_scores) mlvl_label_pred = torch.cat(mlvl_label_pred) mlvl_dir_scores = torch.cat(mlvl_dir_scores) # shape [k, num_class] before sigmoid # PartA2 need to keep raw classification score # because the bbox head in the second stage does not have # classification branch, # roi head need this score as classification score mlvl_cls_score = torch.cat(mlvl_cls_score) score_thr = cfg.get('score_thr', 0) result = self.class_agnostic_nms(mlvl_bboxes, mlvl_bboxes_for_nms, mlvl_max_scores, mlvl_label_pred, mlvl_cls_score, mlvl_dir_scores, score_thr, cfg.nms_post, cfg, input_meta) return result def class_agnostic_nms(self, mlvl_bboxes, mlvl_bboxes_for_nms, mlvl_max_scores, mlvl_label_pred, mlvl_cls_score, mlvl_dir_scores, score_thr, max_num, cfg, input_meta): """Class agnostic nms for single batch. Args: mlvl_bboxes (torch.Tensor): Bboxes from Multi-level. mlvl_bboxes_for_nms (torch.Tensor): Bboxes for nms (bev or minmax boxes) from Multi-level. mlvl_max_scores (torch.Tensor): Max scores of Multi-level bbox. mlvl_label_pred (torch.Tensor): Class predictions of Multi-level bbox. mlvl_cls_score (torch.Tensor): Class scores of Multi-level bbox. mlvl_dir_scores (torch.Tensor): Direction scores of Multi-level bbox. score_thr (int): Score threshold. max_num (int): Max number of bboxes after nms. cfg (:obj:`ConfigDict`): Training or testing config. input_meta (dict): Contain pcd and img's meta info. Returns: dict: Predictions of single batch. Contain the keys: - boxes_3d (:obj:`BaseInstance3DBoxes`): Predicted 3d bboxes. - scores_3d (torch.Tensor): Score of each bbox. - labels_3d (torch.Tensor): Label of each bbox. 
- cls_preds (torch.Tensor): Class score of each bbox. """ bboxes = [] scores = [] labels = [] dir_scores = [] cls_scores = [] score_thr_inds = mlvl_max_scores > score_thr _scores = mlvl_max_scores[score_thr_inds] _bboxes_for_nms = mlvl_bboxes_for_nms[score_thr_inds, :] if cfg.use_rotate_nms: nms_func = nms_bev else: nms_func = nms_normal_bev selected = nms_func(_bboxes_for_nms, _scores, cfg.nms_thr) _mlvl_bboxes = mlvl_bboxes[score_thr_inds, :] _mlvl_dir_scores = mlvl_dir_scores[score_thr_inds] _mlvl_label_pred = mlvl_label_pred[score_thr_inds] _mlvl_cls_score = mlvl_cls_score[score_thr_inds] if len(selected) > 0: bboxes.append(_mlvl_bboxes[selected]) scores.append(_scores[selected]) labels.append(_mlvl_label_pred[selected]) cls_scores.append(_mlvl_cls_score[selected]) dir_scores.append(_mlvl_dir_scores[selected]) dir_rot = limit_period(bboxes[-1][..., 6] - self.dir_offset, self.dir_limit_offset, np.pi) bboxes[-1][..., 6] = ( dir_rot + self.dir_offset + np.pi * dir_scores[-1].to(bboxes[-1].dtype)) if bboxes: bboxes = torch.cat(bboxes, dim=0) scores = torch.cat(scores, dim=0) cls_scores = torch.cat(cls_scores, dim=0) labels = torch.cat(labels, dim=0) if bboxes.shape[0] > max_num: _, inds = scores.sort(descending=True) inds = inds[:max_num] bboxes = bboxes[inds, :] labels = labels[inds] scores = scores[inds] cls_scores = cls_scores[inds] bboxes = input_meta['box_type_3d']( bboxes, box_dim=self.box_code_size) return dict( boxes_3d=bboxes, scores_3d=scores, labels_3d=labels, cls_preds=cls_scores # raw scores [max_num, cls_num] ) else: return dict( boxes_3d=input_meta['box_type_3d']( mlvl_bboxes.new_zeros([0, self.box_code_size]), box_dim=self.box_code_size), scores_3d=mlvl_bboxes.new_zeros([0]), labels_3d=mlvl_bboxes.new_zeros([0]), cls_preds=mlvl_bboxes.new_zeros([0, mlvl_cls_score.shape[-1]])) ================================================ FILE: mmdet3d/models/dense_heads/pgd_head.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import numpy as np import torch from mmcv.cnn import Scale, bias_init_with_prob, normal_init from mmcv.runner import force_fp32 from torch import nn as nn from torch.nn import functional as F from mmdet3d.core import box3d_multiclass_nms, xywhr2xyxyr from mmdet3d.core.bbox import points_cam2img, points_img2cam from mmdet.core import distance2bbox, multi_apply from ..builder import HEADS, build_loss from .fcos_mono3d_head import FCOSMono3DHead @HEADS.register_module() class PGDHead(FCOSMono3DHead): r"""Anchor-free head used in `PGD `_. Args: use_depth_classifer (bool, optional): Whether to use depth classifier. Defaults to True. use_only_reg_proj (bool, optional): Whether to use only direct regressed depth in the re-projection (to make the network easier to learn). Defaults to False. weight_dim (int, optional): Dimension of the location-aware weight map. Defaults to -1. weight_branch (tuple[tuple[int]], optional): Feature map channels of the convolutional branch for weight map. Defaults to ((256, ), ). depth_branch (tuple[int], optional): Feature map channels of the branch for probabilistic depth estimation. Defaults to (64, ), depth_range (tuple[float], optional): Range of depth estimation. Defaults to (0, 70), depth_unit (int, optional): Unit of depth range division. Defaults to 10. division (str, optional): Depth division method. Options include 'uniform', 'linear', 'log', 'loguniform'. Defaults to 'uniform'. depth_bins (int, optional): Discrete bins of depth division. Defaults to 8. 
loss_depth (dict, optional): Depth loss. Defaults to dict( type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0). loss_bbox2d (dict, optional): Loss for 2D box estimation. Defaults to dict(type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0). loss_consistency (dict, optional): Consistency loss. Defaults to dict(type='GIoULoss', loss_weight=1.0), pred_velo (bool, optional): Whether to predict velocity. Defaults to False. pred_bbox2d (bool, optional): Whether to predict 2D bounding boxes. Defaults to True. pred_keypoints (bool, optional): Whether to predict keypoints. Defaults to False, bbox_coder (dict, optional): Bounding box coder. Defaults to dict(type='PGDBBoxCoder', base_depths=((28.01, 16.32), ), base_dims=((0.8, 1.73, 0.6), (1.76, 1.73, 0.6), (3.9, 1.56, 1.6)), code_size=7). """ def __init__(self, use_depth_classifier=True, use_onlyreg_proj=False, weight_dim=-1, weight_branch=((256, ), ), depth_branch=(64, ), depth_range=(0, 70), depth_unit=10, division='uniform', depth_bins=8, loss_depth=dict( type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0), loss_bbox2d=dict( type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=1.0), loss_consistency=dict(type='GIoULoss', loss_weight=1.0), pred_bbox2d=True, pred_keypoints=False, bbox_coder=dict( type='PGDBBoxCoder', base_depths=((28.01, 16.32), ), base_dims=((0.8, 1.73, 0.6), (1.76, 1.73, 0.6), (3.9, 1.56, 1.6)), code_size=7), **kwargs): self.use_depth_classifier = use_depth_classifier self.use_onlyreg_proj = use_onlyreg_proj self.depth_branch = depth_branch self.pred_keypoints = pred_keypoints self.weight_dim = weight_dim self.weight_branch = weight_branch self.weight_out_channels = [] for weight_branch_channels in weight_branch: if len(weight_branch_channels) > 0: self.weight_out_channels.append(weight_branch_channels[-1]) else: self.weight_out_channels.append(-1) self.depth_range = depth_range self.depth_unit = depth_unit self.division = division if self.division == 'uniform': self.num_depth_cls = int( (depth_range[1] - depth_range[0]) / depth_unit) + 1 if self.num_depth_cls != depth_bins: print('Warning: The number of bins computed from ' + 'depth_unit is different from given parameter! 
' + 'Depth_unit will be considered with priority in ' + 'Uniform Division.') else: self.num_depth_cls = depth_bins super().__init__( pred_bbox2d=pred_bbox2d, bbox_coder=bbox_coder, **kwargs) self.loss_depth = build_loss(loss_depth) if self.pred_bbox2d: self.loss_bbox2d = build_loss(loss_bbox2d) self.loss_consistency = build_loss(loss_consistency) if self.pred_keypoints: self.kpts_start = 9 if self.pred_velo else 7 def _init_layers(self): """Initialize layers of the head.""" super()._init_layers() if self.pred_bbox2d: self.scale_dim += 1 if self.pred_keypoints: self.scale_dim += 1 self.scales = nn.ModuleList([ nn.ModuleList([Scale(1.0) for _ in range(self.scale_dim)]) for _ in self.strides ]) def _init_predictor(self): """Initialize predictor layers of the head.""" super()._init_predictor() if self.use_depth_classifier: self.conv_depth_cls_prev = self._init_branch( conv_channels=self.depth_branch, conv_strides=(1, ) * len(self.depth_branch)) self.conv_depth_cls = nn.Conv2d(self.depth_branch[-1], self.num_depth_cls, 1) # Data-agnostic single param lambda for local depth fusion self.fuse_lambda = nn.Parameter(torch.tensor(10e-5)) if self.weight_dim != -1: self.conv_weight_prevs = nn.ModuleList() self.conv_weights = nn.ModuleList() for i in range(self.weight_dim): weight_branch_channels = self.weight_branch[i] weight_out_channel = self.weight_out_channels[i] if len(weight_branch_channels) > 0: self.conv_weight_prevs.append( self._init_branch( conv_channels=weight_branch_channels, conv_strides=(1, ) * len(weight_branch_channels))) self.conv_weights.append( nn.Conv2d(weight_out_channel, 1, 1)) else: self.conv_weight_prevs.append(None) self.conv_weights.append( nn.Conv2d(self.feat_channels, 1, 1)) def init_weights(self): """Initialize weights of the head. We currently still use the customized defined init_weights because the default init of DCN triggered by the init_cfg will init conv_offset.weight, which mistakenly affects the training stability. """ super().init_weights() bias_cls = bias_init_with_prob(0.01) if self.use_depth_classifier: for m in self.conv_depth_cls_prev: if isinstance(m.conv, nn.Conv2d): normal_init(m.conv, std=0.01) normal_init(self.conv_depth_cls, std=0.01, bias=bias_cls) if self.weight_dim != -1: for conv_weight_prev in self.conv_weight_prevs: if conv_weight_prev is None: continue for m in conv_weight_prev: if isinstance(m.conv, nn.Conv2d): normal_init(m.conv, std=0.01) for conv_weight in self.conv_weights: normal_init(conv_weight, std=0.01) def forward(self, feats): """Forward features from the upstream network. Args: feats (tuple[Tensor]): Features from the upstream network, each is a 4D-tensor. Returns: tuple: cls_scores (list[Tensor]): Box scores for each scale level, each is a 4D-tensor, the channel number is num_points * num_classes. bbox_preds (list[Tensor]): Box energies / deltas for each scale level, each is a 4D-tensor, the channel number is num_points * bbox_code_size. dir_cls_preds (list[Tensor]): Box scores for direction class predictions on each scale level, each is a 4D-tensor, the channel number is num_points * 2. (bin = 2). weight (list[Tensor]): Location-aware weight maps on each scale level, each is a 4D-tensor, the channel number is num_points * 1. depth_cls_preds (list[Tensor]): Box scores for depth class predictions on each scale level, each is a 4D-tensor, the channel number is num_points * self.num_depth_cls. attr_preds (list[Tensor]): Attribute scores for each scale level, each is a 4D-tensor, the channel number is num_points * num_attrs. 
centernesses (list[Tensor]): Centerness for each scale level, each is a 4D-tensor, the channel number is num_points * 1. """ return multi_apply(self.forward_single, feats, self.scales, self.strides) def forward_single(self, x, scale, stride): """Forward features of a single scale level. Args: x (Tensor): FPN feature maps of the specified stride. scale (:obj: `mmcv.cnn.Scale`): Learnable scale module to resize the bbox prediction. stride (int): The corresponding stride for feature maps, only used to normalize the bbox prediction when self.norm_on_bbox is True. Returns: tuple: scores for each class, bbox and direction class predictions, depth class predictions, location-aware weights, attribute and centerness predictions of input feature maps. """ cls_score, bbox_pred, dir_cls_pred, attr_pred, centerness, cls_feat, \ reg_feat = super().forward_single(x, scale, stride) max_regress_range = stride * self.regress_ranges[0][1] / \ self.strides[0] bbox_pred = self.bbox_coder.decode_2d(bbox_pred, scale, stride, max_regress_range, self.training, self.pred_keypoints, self.pred_bbox2d) depth_cls_pred = None if self.use_depth_classifier: clone_reg_feat = reg_feat.clone() for conv_depth_cls_prev_layer in self.conv_depth_cls_prev: clone_reg_feat = conv_depth_cls_prev_layer(clone_reg_feat) depth_cls_pred = self.conv_depth_cls(clone_reg_feat) weight = None if self.weight_dim != -1: weight = [] for i in range(self.weight_dim): clone_reg_feat = reg_feat.clone() if len(self.weight_branch[i]) > 0: for conv_weight_prev_layer in self.conv_weight_prevs[i]: clone_reg_feat = conv_weight_prev_layer(clone_reg_feat) weight.append(self.conv_weights[i](clone_reg_feat)) weight = torch.cat(weight, dim=1) return cls_score, bbox_pred, dir_cls_pred, depth_cls_pred, weight, \ attr_pred, centerness def get_proj_bbox2d(self, bbox_preds, pos_dir_cls_preds, labels_3d, bbox_targets_3d, pos_points, pos_inds, img_metas, pos_depth_cls_preds=None, pos_weights=None, pos_cls_scores=None, with_kpts=False): """Decode box predictions and get projected 2D attributes. Args: bbox_preds (list[Tensor]): Box predictions for each scale level, each is a 4D-tensor, the channel number is num_points * bbox_code_size. pos_dir_cls_preds (Tensor): Box scores for direction class predictions of positive boxes on all the scale levels in shape (num_pos_points, 2). labels_3d (list[Tensor]): 3D box category labels for each scale level, each is a 4D-tensor. bbox_targets_3d (list[Tensor]): 3D box targets for each scale level, each is a 4D-tensor, the channel number is num_points * bbox_code_size. pos_points (Tensor): Foreground points. pos_inds (Tensor): Index of foreground points from flattened tensors. img_metas (list[dict]): Meta information of each image, e.g., image size, scaling factor, etc. pos_depth_cls_preds (Tensor, optional): Probabilistic depth map of positive boxes on all the scale levels in shape (num_pos_points, self.num_depth_cls). Defaults to None. pos_weights (Tensor, optional): Location-aware weights of positive boxes in shape (num_pos_points, self.weight_dim). Defaults to None. pos_cls_scores (Tensor, optional): Classification scores of positive boxes in shape (num_pos_points, self.num_classes). Defaults to None. with_kpts (bool, optional): Whether to output keypoints targets. Defaults to False. Returns: tuple[Tensor]: Exterior 2D boxes from projected 3D boxes, predicted 2D boxes and keypoint targets (if necessary). 
""" views = [np.array(img_meta['cam2img']) for img_meta in img_metas] num_imgs = len(img_metas) img_idx = [] for label in labels_3d: for idx in range(num_imgs): img_idx.append( labels_3d[0].new_ones(int(len(label) / num_imgs)) * idx) img_idx = torch.cat(img_idx) pos_img_idx = img_idx[pos_inds] flatten_strided_bbox_preds = [] flatten_strided_bbox2d_preds = [] flatten_bbox_targets_3d = [] flatten_strides = [] for stride_idx, bbox_pred in enumerate(bbox_preds): flatten_bbox_pred = bbox_pred.permute(0, 2, 3, 1).reshape( -1, sum(self.group_reg_dims)) flatten_bbox_pred[:, :2] *= self.strides[stride_idx] flatten_bbox_pred[:, -4:] *= self.strides[stride_idx] flatten_strided_bbox_preds.append( flatten_bbox_pred[:, :self.bbox_coder.bbox_code_size]) flatten_strided_bbox2d_preds.append(flatten_bbox_pred[:, -4:]) bbox_target_3d = bbox_targets_3d[stride_idx].clone() bbox_target_3d[:, :2] *= self.strides[stride_idx] bbox_target_3d[:, -4:] *= self.strides[stride_idx] flatten_bbox_targets_3d.append(bbox_target_3d) flatten_stride = flatten_bbox_pred.new_ones( *flatten_bbox_pred.shape[:-1], 1) * self.strides[stride_idx] flatten_strides.append(flatten_stride) flatten_strided_bbox_preds = torch.cat(flatten_strided_bbox_preds) flatten_strided_bbox2d_preds = torch.cat(flatten_strided_bbox2d_preds) flatten_bbox_targets_3d = torch.cat(flatten_bbox_targets_3d) flatten_strides = torch.cat(flatten_strides) pos_strided_bbox_preds = flatten_strided_bbox_preds[pos_inds] pos_strided_bbox2d_preds = flatten_strided_bbox2d_preds[pos_inds] pos_bbox_targets_3d = flatten_bbox_targets_3d[pos_inds] pos_strides = flatten_strides[pos_inds] pos_decoded_bbox2d_preds = distance2bbox(pos_points, pos_strided_bbox2d_preds) pos_strided_bbox_preds[:, :2] = \ pos_points - pos_strided_bbox_preds[:, :2] pos_bbox_targets_3d[:, :2] = \ pos_points - pos_bbox_targets_3d[:, :2] if self.use_depth_classifier and (not self.use_onlyreg_proj): pos_prob_depth_preds = self.bbox_coder.decode_prob_depth( pos_depth_cls_preds, self.depth_range, self.depth_unit, self.division, self.num_depth_cls) sig_alpha = torch.sigmoid(self.fuse_lambda) pos_strided_bbox_preds[:, 2] = \ sig_alpha * pos_strided_bbox_preds.clone()[:, 2] + \ (1 - sig_alpha) * pos_prob_depth_preds box_corners_in_image = pos_strided_bbox_preds.new_zeros( (*pos_strided_bbox_preds.shape[:-1], 8, 2)) box_corners_in_image_gt = pos_strided_bbox_preds.new_zeros( (*pos_strided_bbox_preds.shape[:-1], 8, 2)) for idx in range(num_imgs): mask = (pos_img_idx == idx) if pos_strided_bbox_preds[mask].shape[0] == 0: continue cam2img = torch.eye( 4, dtype=pos_strided_bbox_preds.dtype, device=pos_strided_bbox_preds.device) view_shape = views[idx].shape cam2img[:view_shape[0], :view_shape[1]] = \ pos_strided_bbox_preds.new_tensor(views[idx]) centers2d_preds = pos_strided_bbox_preds.clone()[mask, :2] centers2d_targets = pos_bbox_targets_3d.clone()[mask, :2] centers3d_targets = points_img2cam(pos_bbox_targets_3d[mask, :3], views[idx]) # use predicted depth to re-project the 2.5D centers pos_strided_bbox_preds[mask, :3] = points_img2cam( pos_strided_bbox_preds[mask, :3], views[idx]) pos_bbox_targets_3d[mask, :3] = centers3d_targets # depth fixed when computing re-project 3D bboxes pos_strided_bbox_preds[mask, 2] = \ pos_bbox_targets_3d.clone()[mask, 2] # decode yaws if self.use_direction_classifier: pos_dir_cls_scores = torch.max( pos_dir_cls_preds[mask], dim=-1)[1] pos_strided_bbox_preds[mask] = self.bbox_coder.decode_yaw( pos_strided_bbox_preds[mask], centers2d_preds, pos_dir_cls_scores, self.dir_offset, cam2img) 
pos_bbox_targets_3d[mask, 6] = torch.atan2( centers2d_targets[:, 0] - cam2img[0, 2], cam2img[0, 0]) + pos_bbox_targets_3d[mask, 6] corners = img_metas[0]['box_type_3d']( pos_strided_bbox_preds[mask], box_dim=self.bbox_coder.bbox_code_size, origin=(0.5, 0.5, 0.5)).corners box_corners_in_image[mask] = points_cam2img(corners, cam2img) corners_gt = img_metas[0]['box_type_3d']( pos_bbox_targets_3d[mask, :self.bbox_code_size], box_dim=self.bbox_coder.bbox_code_size, origin=(0.5, 0.5, 0.5)).corners box_corners_in_image_gt[mask] = points_cam2img(corners_gt, cam2img) minxy = torch.min(box_corners_in_image, dim=1)[0] maxxy = torch.max(box_corners_in_image, dim=1)[0] proj_bbox2d_preds = torch.cat([minxy, maxxy], dim=1) outputs = (proj_bbox2d_preds, pos_decoded_bbox2d_preds) if with_kpts: norm_strides = pos_strides * self.regress_ranges[0][1] / \ self.strides[0] kpts_targets = box_corners_in_image_gt - pos_points[..., None, :] kpts_targets = kpts_targets.view( (*pos_strided_bbox_preds.shape[:-1], 16)) kpts_targets /= norm_strides outputs += (kpts_targets, ) return outputs def get_pos_predictions(self, bbox_preds, dir_cls_preds, depth_cls_preds, weights, attr_preds, centernesses, pos_inds, img_metas): """Flatten predictions and get positive ones. Args: bbox_preds (list[Tensor]): Box energies / deltas for each scale level, each is a 4D-tensor, the channel number is num_points * bbox_code_size. dir_cls_preds (list[Tensor]): Box scores for direction class predictions on each scale level, each is a 4D-tensor, the channel number is num_points * 2. (bin = 2) depth_cls_preds (list[Tensor]): Box scores for direction class predictions on each scale level, each is a 4D-tensor, the channel number is num_points * self.num_depth_cls. attr_preds (list[Tensor]): Attribute scores for each scale level, each is a 4D-tensor, the channel number is num_points * num_attrs. centernesses (list[Tensor]): Centerness for each scale level, each is a 4D-tensor, the channel number is num_points * 1. pos_inds (Tensor): Index of foreground points from flattened tensors. img_metas (list[dict]): Meta information of each image, e.g., image size, scaling factor, etc. Returns: tuple[Tensor]: Box predictions, direction classes, probabilistic depth maps, location-aware weight maps, attributes and centerness predictions. 
""" flatten_bbox_preds = [ bbox_pred.permute(0, 2, 3, 1).reshape(-1, sum(self.group_reg_dims)) for bbox_pred in bbox_preds ] flatten_dir_cls_preds = [ dir_cls_pred.permute(0, 2, 3, 1).reshape(-1, 2) for dir_cls_pred in dir_cls_preds ] flatten_centerness = [ centerness.permute(0, 2, 3, 1).reshape(-1) for centerness in centernesses ] flatten_bbox_preds = torch.cat(flatten_bbox_preds) flatten_dir_cls_preds = torch.cat(flatten_dir_cls_preds) flatten_centerness = torch.cat(flatten_centerness) pos_bbox_preds = flatten_bbox_preds[pos_inds] pos_dir_cls_preds = flatten_dir_cls_preds[pos_inds] pos_centerness = flatten_centerness[pos_inds] pos_depth_cls_preds = None if self.use_depth_classifier: flatten_depth_cls_preds = [ depth_cls_pred.permute(0, 2, 3, 1).reshape(-1, self.num_depth_cls) for depth_cls_pred in depth_cls_preds ] flatten_depth_cls_preds = torch.cat(flatten_depth_cls_preds) pos_depth_cls_preds = flatten_depth_cls_preds[pos_inds] pos_weights = None if self.weight_dim != -1: flatten_weights = [ weight.permute(0, 2, 3, 1).reshape(-1, self.weight_dim) for weight in weights ] flatten_weights = torch.cat(flatten_weights) pos_weights = flatten_weights[pos_inds] pos_attr_preds = None if self.pred_attrs: flatten_attr_preds = [ attr_pred.permute(0, 2, 3, 1).reshape(-1, self.num_attrs) for attr_pred in attr_preds ] flatten_attr_preds = torch.cat(flatten_attr_preds) pos_attr_preds = flatten_attr_preds[pos_inds] return pos_bbox_preds, pos_dir_cls_preds, pos_depth_cls_preds, \ pos_weights, pos_attr_preds, pos_centerness @force_fp32( apply_to=('cls_scores', 'bbox_preds', 'dir_cls_preds', 'depth_cls_preds', 'weights', 'attr_preds', 'centernesses')) def loss(self, cls_scores, bbox_preds, dir_cls_preds, depth_cls_preds, weights, attr_preds, centernesses, gt_bboxes, gt_labels, gt_bboxes_3d, gt_labels_3d, centers2d, depths, attr_labels, img_metas, gt_bboxes_ignore=None): """Compute loss of the head. Args: cls_scores (list[Tensor]): Box scores for each scale level, each is a 4D-tensor, the channel number is num_points * num_classes. bbox_preds (list[Tensor]): Box energies / deltas for each scale level, each is a 4D-tensor, the channel number is num_points * bbox_code_size. dir_cls_preds (list[Tensor]): Box scores for direction class predictions on each scale level, each is a 4D-tensor, the channel number is num_points * 2. (bin = 2) depth_cls_preds (list[Tensor]): Box scores for direction class predictions on each scale level, each is a 4D-tensor, the channel number is num_points * self.num_depth_cls. weights (list[Tensor]): Location-aware weights for each scale level, each is a 4D-tensor, the channel number is num_points * self.weight_dim. attr_preds (list[Tensor]): Attribute scores for each scale level, each is a 4D-tensor, the channel number is num_points * num_attrs. centernesses (list[Tensor]): Centerness for each scale level, each is a 4D-tensor, the channel number is num_points * 1. gt_bboxes (list[Tensor]): Ground truth bboxes for each image with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. gt_labels (list[Tensor]): class indices corresponding to each box gt_bboxes_3d (list[Tensor]): 3D boxes ground truth with shape of (num_gts, code_size). gt_labels_3d (list[Tensor]): same as gt_labels centers2d (list[Tensor]): 2D centers on the image with shape of (num_gts, 2). depths (list[Tensor]): Depth ground truth with shape of (num_gts, ). attr_labels (list[Tensor]): Attributes indices of each box. img_metas (list[dict]): Meta information of each image, e.g., image size, scaling factor, etc. 
gt_bboxes_ignore (list[Tensor]): specify which bounding boxes can be ignored when computing the loss. Defaults to None. Returns: dict[str, Tensor]: A dictionary of loss components. """ assert len(cls_scores) == len(bbox_preds) == len(dir_cls_preds) == \ len(depth_cls_preds) == len(weights) == len(centernesses) == \ len(attr_preds), 'The length of cls_scores, bbox_preds, ' \ 'dir_cls_preds, depth_cls_preds, weights, centernesses, and' \ f'attr_preds: {len(cls_scores)}, {len(bbox_preds)}, ' \ f'{len(dir_cls_preds)}, {len(depth_cls_preds)}, {len(weights)}' \ f'{len(centernesses)}, {len(attr_preds)} are inconsistent.' featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores] all_level_points = self.get_points(featmap_sizes, bbox_preds[0].dtype, bbox_preds[0].device) labels_3d, bbox_targets_3d, centerness_targets, attr_targets = \ self.get_targets( all_level_points, gt_bboxes, gt_labels, gt_bboxes_3d, gt_labels_3d, centers2d, depths, attr_labels) num_imgs = cls_scores[0].size(0) # flatten cls_scores and targets flatten_cls_scores = [ cls_score.permute(0, 2, 3, 1).reshape(-1, self.cls_out_channels) for cls_score in cls_scores ] flatten_cls_scores = torch.cat(flatten_cls_scores) flatten_labels_3d = torch.cat(labels_3d) flatten_bbox_targets_3d = torch.cat(bbox_targets_3d) flatten_centerness_targets = torch.cat(centerness_targets) flatten_points = torch.cat( [points.repeat(num_imgs, 1) for points in all_level_points]) if self.pred_attrs: flatten_attr_targets = torch.cat(attr_targets) # FG cat_id: [0, num_classes -1], BG cat_id: num_classes bg_class_ind = self.num_classes pos_inds = ((flatten_labels_3d >= 0) & (flatten_labels_3d < bg_class_ind)).nonzero().reshape(-1) num_pos = len(pos_inds) loss_dict = dict() loss_dict['loss_cls'] = self.loss_cls( flatten_cls_scores, flatten_labels_3d, avg_factor=num_pos + num_imgs) # avoid num_pos is 0 pos_bbox_preds, pos_dir_cls_preds, pos_depth_cls_preds, pos_weights, \ pos_attr_preds, pos_centerness = self.get_pos_predictions( bbox_preds, dir_cls_preds, depth_cls_preds, weights, attr_preds, centernesses, pos_inds, img_metas) if num_pos > 0: pos_bbox_targets_3d = flatten_bbox_targets_3d[pos_inds] pos_centerness_targets = flatten_centerness_targets[pos_inds] pos_points = flatten_points[pos_inds] if self.pred_attrs: pos_attr_targets = flatten_attr_targets[pos_inds] if self.use_direction_classifier: pos_dir_cls_targets = self.get_direction_target( pos_bbox_targets_3d, self.dir_offset, one_hot=False) bbox_weights = pos_centerness_targets.new_ones( len(pos_centerness_targets), sum(self.group_reg_dims)) equal_weights = pos_centerness_targets.new_ones( pos_centerness_targets.shape) code_weight = self.train_cfg.get('code_weight', None) if code_weight: assert len(code_weight) == sum(self.group_reg_dims) bbox_weights = bbox_weights * bbox_weights.new_tensor( code_weight) if self.diff_rad_by_sin: pos_bbox_preds, pos_bbox_targets_3d = self.add_sin_difference( pos_bbox_preds, pos_bbox_targets_3d) loss_dict['loss_offset'] = self.loss_bbox( pos_bbox_preds[:, :2], pos_bbox_targets_3d[:, :2], weight=bbox_weights[:, :2], avg_factor=equal_weights.sum()) loss_dict['loss_size'] = self.loss_bbox( pos_bbox_preds[:, 3:6], pos_bbox_targets_3d[:, 3:6], weight=bbox_weights[:, 3:6], avg_factor=equal_weights.sum()) loss_dict['loss_rotsin'] = self.loss_bbox( pos_bbox_preds[:, 6], pos_bbox_targets_3d[:, 6], weight=bbox_weights[:, 6], avg_factor=equal_weights.sum()) if self.pred_velo: loss_dict['loss_velo'] = self.loss_bbox( pos_bbox_preds[:, 7:9], pos_bbox_targets_3d[:, 7:9], 
weight=bbox_weights[:, 7:9], avg_factor=equal_weights.sum()) proj_bbox2d_inputs = (bbox_preds, pos_dir_cls_preds, labels_3d, bbox_targets_3d, pos_points, pos_inds, img_metas) # direction classification loss # TODO: add more check for use_direction_classifier if self.use_direction_classifier: loss_dict['loss_dir'] = self.loss_dir( pos_dir_cls_preds, pos_dir_cls_targets, equal_weights, avg_factor=equal_weights.sum()) # init depth loss with the one computed from direct regression loss_dict['loss_depth'] = self.loss_bbox( pos_bbox_preds[:, 2], pos_bbox_targets_3d[:, 2], weight=bbox_weights[:, 2], avg_factor=equal_weights.sum()) # depth classification loss if self.use_depth_classifier: pos_prob_depth_preds = self.bbox_coder.decode_prob_depth( pos_depth_cls_preds, self.depth_range, self.depth_unit, self.division, self.num_depth_cls) sig_alpha = torch.sigmoid(self.fuse_lambda) if self.weight_dim != -1: loss_fuse_depth = self.loss_depth( sig_alpha * pos_bbox_preds[:, 2] + (1 - sig_alpha) * pos_prob_depth_preds, pos_bbox_targets_3d[:, 2], sigma=pos_weights[:, 0], weight=bbox_weights[:, 2], avg_factor=equal_weights.sum()) else: loss_fuse_depth = self.loss_depth( sig_alpha * pos_bbox_preds[:, 2] + (1 - sig_alpha) * pos_prob_depth_preds, pos_bbox_targets_3d[:, 2], weight=bbox_weights[:, 2], avg_factor=equal_weights.sum()) loss_dict['loss_depth'] = loss_fuse_depth proj_bbox2d_inputs += (pos_depth_cls_preds, ) if self.pred_keypoints: # use smoothL1 to compute consistency loss for keypoints # normalize the offsets with strides proj_bbox2d_preds, pos_decoded_bbox2d_preds, kpts_targets = \ self.get_proj_bbox2d(*proj_bbox2d_inputs, with_kpts=True) loss_dict['loss_kpts'] = self.loss_bbox( pos_bbox_preds[:, self.kpts_start:self.kpts_start + 16], kpts_targets, weight=bbox_weights[:, self.kpts_start:self.kpts_start + 16], avg_factor=equal_weights.sum()) if self.pred_bbox2d: loss_dict['loss_bbox2d'] = self.loss_bbox2d( pos_bbox_preds[:, -4:], pos_bbox_targets_3d[:, -4:], weight=bbox_weights[:, -4:], avg_factor=equal_weights.sum()) if not self.pred_keypoints: proj_bbox2d_preds, pos_decoded_bbox2d_preds = \ self.get_proj_bbox2d(*proj_bbox2d_inputs) loss_dict['loss_consistency'] = self.loss_consistency( proj_bbox2d_preds, pos_decoded_bbox2d_preds, weight=bbox_weights[:, -4:], avg_factor=equal_weights.sum()) loss_dict['loss_centerness'] = self.loss_centerness( pos_centerness, pos_centerness_targets) # attribute classification loss if self.pred_attrs: loss_dict['loss_attr'] = self.loss_attr( pos_attr_preds, pos_attr_targets, pos_centerness_targets, avg_factor=pos_centerness_targets.sum()) else: # need absolute due to possible negative delta x/y loss_dict['loss_offset'] = pos_bbox_preds[:, :2].sum() loss_dict['loss_size'] = pos_bbox_preds[:, 3:6].sum() loss_dict['loss_rotsin'] = pos_bbox_preds[:, 6].sum() loss_dict['loss_depth'] = pos_bbox_preds[:, 2].sum() if self.pred_velo: loss_dict['loss_velo'] = pos_bbox_preds[:, 7:9].sum() if self.pred_keypoints: loss_dict['loss_kpts'] = pos_bbox_preds[:, self.kpts_start:self. 
kpts_start + 16].sum() if self.pred_bbox2d: loss_dict['loss_bbox2d'] = pos_bbox_preds[:, -4:].sum() loss_dict['loss_consistency'] = pos_bbox_preds[:, -4:].sum() loss_dict['loss_centerness'] = pos_centerness.sum() if self.use_direction_classifier: loss_dict['loss_dir'] = pos_dir_cls_preds.sum() if self.use_depth_classifier: sig_alpha = torch.sigmoid(self.fuse_lambda) loss_fuse_depth = \ sig_alpha * pos_bbox_preds[:, 2].sum() + \ (1 - sig_alpha) * pos_depth_cls_preds.sum() if self.weight_dim != -1: loss_fuse_depth *= torch.exp(-pos_weights[:, 0].sum()) loss_dict['loss_depth'] = loss_fuse_depth if self.pred_attrs: loss_dict['loss_attr'] = pos_attr_preds.sum() return loss_dict @force_fp32( apply_to=('cls_scores', 'bbox_preds', 'dir_cls_preds', 'depth_cls_preds', 'weights', 'attr_preds', 'centernesses')) def get_bboxes(self, cls_scores, bbox_preds, dir_cls_preds, depth_cls_preds, weights, attr_preds, centernesses, img_metas, cfg=None, rescale=None): """Transform network output for a batch into bbox predictions. Args: cls_scores (list[Tensor]): Box scores for each scale level Has shape (N, num_points * num_classes, H, W) bbox_preds (list[Tensor]): Box energies / deltas for each scale level with shape (N, num_points * 4, H, W) dir_cls_preds (list[Tensor]): Box scores for direction class predictions on each scale level, each is a 4D-tensor, the channel number is num_points * 2. (bin = 2) depth_cls_preds (list[Tensor]): Box scores for direction class predictions on each scale level, each is a 4D-tensor, the channel number is num_points * self.num_depth_cls. weights (list[Tensor]): Location-aware weights for each scale level, each is a 4D-tensor, the channel number is num_points * self.weight_dim. attr_preds (list[Tensor]): Attribute scores for each scale level Has shape (N, num_points * num_attrs, H, W) centernesses (list[Tensor]): Centerness for each scale level with shape (N, num_points * 1, H, W) img_metas (list[dict]): Meta information of each image, e.g., image size, scaling factor, etc. cfg (mmcv.Config, optional): Test / postprocessing configuration, if None, test_cfg would be used. Defaults to None. rescale (bool, optional): If True, return boxes in original image space. Defaults to None. Returns: list[tuple[Tensor]]: Each item in result_list is a tuple, which consists of predicted 3D boxes, scores, labels, attributes and 2D boxes (if necessary). """ assert len(cls_scores) == len(bbox_preds) == len(dir_cls_preds) == \ len(depth_cls_preds) == len(weights) == len(centernesses) == \ len(attr_preds), 'The length of cls_scores, bbox_preds, ' \ 'dir_cls_preds, depth_cls_preds, weights, centernesses, and' \ f'attr_preds: {len(cls_scores)}, {len(bbox_preds)}, ' \ f'{len(dir_cls_preds)}, {len(depth_cls_preds)}, {len(weights)}' \ f'{len(centernesses)}, {len(attr_preds)} are inconsistent.' 
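# NOTE (added comment): the batched post-processing below loops over images,
# detaches every per-level prediction and substitutes zero-filled (or
# background-label) placeholders for disabled branches (direction / depth
# classifier, weights, attributes) before delegating to `_get_bboxes_single`.
# The per-image step reads its thresholds from `test_cfg`; an illustrative,
# not authoritative, layout of the keys it consumes would be
#     cfg = dict(nms_pre=1000, score_thr=0.05, max_per_img=200)
# where `nms_pre` caps candidates before NMS and the remaining keys are
# forwarded to `box3d_multiclass_nms`.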
num_levels = len(cls_scores) featmap_sizes = [featmap.size()[-2:] for featmap in cls_scores] mlvl_points = self.get_points(featmap_sizes, bbox_preds[0].dtype, bbox_preds[0].device) result_list = [] for img_id in range(len(img_metas)): cls_score_list = [ cls_scores[i][img_id].detach() for i in range(num_levels) ] bbox_pred_list = [ bbox_preds[i][img_id].detach() for i in range(num_levels) ] if self.use_direction_classifier: dir_cls_pred_list = [ dir_cls_preds[i][img_id].detach() for i in range(num_levels) ] else: dir_cls_pred_list = [ cls_scores[i][img_id].new_full( [2, *cls_scores[i][img_id].shape[1:]], 0).detach() for i in range(num_levels) ] if self.use_depth_classifier: depth_cls_pred_list = [ depth_cls_preds[i][img_id].detach() for i in range(num_levels) ] else: depth_cls_pred_list = [ cls_scores[i][img_id].new_full( [self.num_depth_cls, *cls_scores[i][img_id].shape[1:]], 0).detach() for i in range(num_levels) ] if self.weight_dim != -1: weight_list = [ weights[i][img_id].detach() for i in range(num_levels) ] else: weight_list = [ cls_scores[i][img_id].new_full( [1, *cls_scores[i][img_id].shape[1:]], 0).detach() for i in range(num_levels) ] if self.pred_attrs: attr_pred_list = [ attr_preds[i][img_id].detach() for i in range(num_levels) ] else: attr_pred_list = [ cls_scores[i][img_id].new_full( [self.num_attrs, *cls_scores[i][img_id].shape[1:]], self.attr_background_label).detach() for i in range(num_levels) ] centerness_pred_list = [ centernesses[i][img_id].detach() for i in range(num_levels) ] input_meta = img_metas[img_id] det_bboxes = self._get_bboxes_single( cls_score_list, bbox_pred_list, dir_cls_pred_list, depth_cls_pred_list, weight_list, attr_pred_list, centerness_pred_list, mlvl_points, input_meta, cfg, rescale) result_list.append(det_bboxes) return result_list def _get_bboxes_single(self, cls_scores, bbox_preds, dir_cls_preds, depth_cls_preds, weights, attr_preds, centernesses, mlvl_points, input_meta, cfg, rescale=False): """Transform outputs for a single batch item into bbox predictions. Args: cls_scores (list[Tensor]): Box scores for a single scale level Has shape (num_points * num_classes, H, W). bbox_preds (list[Tensor]): Box energies / deltas for a single scale level with shape (num_points * bbox_code_size, H, W). dir_cls_preds (list[Tensor]): Box scores for direction class predictions on a single scale level with shape (num_points * 2, H, W) depth_cls_preds (list[Tensor]): Box scores for probabilistic depth predictions on a single scale level with shape (num_points * self.num_depth_cls, H, W) weights (list[Tensor]): Location-aware weight maps on a single scale level with shape (num_points * self.weight_dim, H, W). attr_preds (list[Tensor]): Attribute scores for each scale level Has shape (N, num_points * num_attrs, H, W) centernesses (list[Tensor]): Centerness for a single scale level with shape (num_points, H, W). mlvl_points (list[Tensor]): Box reference for a single scale level with shape (num_total_points, 2). input_meta (dict): Metadata of input image. cfg (mmcv.Config): Test / postprocessing configuration, if None, test_cfg would be used. rescale (bool, optional): If True, return boxes in original image space. Defaults to False. Returns: tuples[Tensor]: Predicted 3D boxes, scores, labels, attributes and 2D boxes (if necessary). 
""" view = np.array(input_meta['cam2img']) scale_factor = input_meta['scale_factor'] cfg = self.test_cfg if cfg is None else cfg assert len(cls_scores) == len(bbox_preds) == len(mlvl_points) mlvl_centers2d = [] mlvl_bboxes = [] mlvl_scores = [] mlvl_dir_scores = [] mlvl_attr_scores = [] mlvl_centerness = [] mlvl_depth_cls_scores = [] mlvl_depth_uncertainty = [] mlvl_bboxes2d = None if self.pred_bbox2d: mlvl_bboxes2d = [] for cls_score, bbox_pred, dir_cls_pred, depth_cls_pred, weight, \ attr_pred, centerness, points in zip( cls_scores, bbox_preds, dir_cls_preds, depth_cls_preds, weights, attr_preds, centernesses, mlvl_points): assert cls_score.size()[-2:] == bbox_pred.size()[-2:] scores = cls_score.permute(1, 2, 0).reshape( -1, self.cls_out_channels).sigmoid() dir_cls_pred = dir_cls_pred.permute(1, 2, 0).reshape(-1, 2) dir_cls_score = torch.max(dir_cls_pred, dim=-1)[1] depth_cls_pred = depth_cls_pred.permute(1, 2, 0).reshape( -1, self.num_depth_cls) depth_cls_score = F.softmax( depth_cls_pred, dim=-1).topk( k=2, dim=-1)[0].mean(dim=-1) if self.weight_dim != -1: weight = weight.permute(1, 2, 0).reshape(-1, self.weight_dim) else: weight = weight.permute(1, 2, 0).reshape(-1, 1) depth_uncertainty = torch.exp(-weight[:, -1]) attr_pred = attr_pred.permute(1, 2, 0).reshape(-1, self.num_attrs) attr_score = torch.max(attr_pred, dim=-1)[1] centerness = centerness.permute(1, 2, 0).reshape(-1).sigmoid() bbox_pred = bbox_pred.permute(1, 2, 0).reshape(-1, sum(self.group_reg_dims)) bbox_pred3d = bbox_pred[:, :self.bbox_coder.bbox_code_size] if self.pred_bbox2d: bbox_pred2d = bbox_pred[:, -4:] nms_pre = cfg.get('nms_pre', -1) if nms_pre > 0 and scores.shape[0] > nms_pre: merged_scores = scores * centerness[:, None] if self.use_depth_classifier: merged_scores *= depth_cls_score[:, None] if self.weight_dim != -1: merged_scores *= depth_uncertainty[:, None] max_scores, _ = merged_scores.max(dim=1) _, topk_inds = max_scores.topk(nms_pre) points = points[topk_inds, :] bbox_pred3d = bbox_pred3d[topk_inds, :] scores = scores[topk_inds, :] dir_cls_pred = dir_cls_pred[topk_inds, :] depth_cls_pred = depth_cls_pred[topk_inds, :] centerness = centerness[topk_inds] dir_cls_score = dir_cls_score[topk_inds] depth_cls_score = depth_cls_score[topk_inds] depth_uncertainty = depth_uncertainty[topk_inds] attr_score = attr_score[topk_inds] if self.pred_bbox2d: bbox_pred2d = bbox_pred2d[topk_inds, :] # change the offset to actual center predictions bbox_pred3d[:, :2] = points - bbox_pred3d[:, :2] if rescale: bbox_pred3d[:, :2] /= bbox_pred3d[:, :2].new_tensor( scale_factor) if self.pred_bbox2d: bbox_pred2d /= bbox_pred2d.new_tensor(scale_factor) if self.use_depth_classifier: prob_depth_pred = self.bbox_coder.decode_prob_depth( depth_cls_pred, self.depth_range, self.depth_unit, self.division, self.num_depth_cls) sig_alpha = torch.sigmoid(self.fuse_lambda) bbox_pred3d[:, 2] = sig_alpha * bbox_pred3d[:, 2] + \ (1 - sig_alpha) * prob_depth_pred pred_center2d = bbox_pred3d[:, :3].clone() bbox_pred3d[:, :3] = points_img2cam(bbox_pred3d[:, :3], view) mlvl_centers2d.append(pred_center2d) mlvl_bboxes.append(bbox_pred3d) mlvl_scores.append(scores) mlvl_dir_scores.append(dir_cls_score) mlvl_depth_cls_scores.append(depth_cls_score) mlvl_attr_scores.append(attr_score) mlvl_centerness.append(centerness) mlvl_depth_uncertainty.append(depth_uncertainty) if self.pred_bbox2d: bbox_pred2d = distance2bbox( points, bbox_pred2d, max_shape=input_meta['img_shape']) mlvl_bboxes2d.append(bbox_pred2d) mlvl_centers2d = torch.cat(mlvl_centers2d) 
mlvl_bboxes = torch.cat(mlvl_bboxes) mlvl_dir_scores = torch.cat(mlvl_dir_scores) if self.pred_bbox2d: mlvl_bboxes2d = torch.cat(mlvl_bboxes2d) # change local yaw to global yaw for 3D nms cam2img = torch.eye( 4, dtype=mlvl_centers2d.dtype, device=mlvl_centers2d.device) cam2img[:view.shape[0], :view.shape[1]] = \ mlvl_centers2d.new_tensor(view) mlvl_bboxes = self.bbox_coder.decode_yaw(mlvl_bboxes, mlvl_centers2d, mlvl_dir_scores, self.dir_offset, cam2img) mlvl_bboxes_for_nms = xywhr2xyxyr(input_meta['box_type_3d']( mlvl_bboxes, box_dim=self.bbox_coder.bbox_code_size, origin=(0.5, 0.5, 0.5)).bev) mlvl_scores = torch.cat(mlvl_scores) padding = mlvl_scores.new_zeros(mlvl_scores.shape[0], 1) # remind that we set FG labels to [0, num_class-1] since mmdet v2.0 # BG cat_id: num_class mlvl_scores = torch.cat([mlvl_scores, padding], dim=1) mlvl_attr_scores = torch.cat(mlvl_attr_scores) mlvl_centerness = torch.cat(mlvl_centerness) # no scale_factors in box3d_multiclass_nms # Then we multiply it from outside mlvl_nms_scores = mlvl_scores * mlvl_centerness[:, None] if self.use_depth_classifier: # multiply the depth confidence mlvl_depth_cls_scores = torch.cat(mlvl_depth_cls_scores) mlvl_nms_scores *= mlvl_depth_cls_scores[:, None] if self.weight_dim != -1: mlvl_depth_uncertainty = torch.cat(mlvl_depth_uncertainty) mlvl_nms_scores *= mlvl_depth_uncertainty[:, None] results = box3d_multiclass_nms(mlvl_bboxes, mlvl_bboxes_for_nms, mlvl_nms_scores, cfg.score_thr, cfg.max_per_img, cfg, mlvl_dir_scores, mlvl_attr_scores, mlvl_bboxes2d) bboxes, scores, labels, dir_scores, attrs = results[0:5] attrs = attrs.to(labels.dtype) # change data type to int bboxes = input_meta['box_type_3d']( bboxes, box_dim=self.bbox_coder.bbox_code_size, origin=(0.5, 0.5, 0.5)) # Note that the predictions use origin (0.5, 0.5, 0.5) # Due to the ground truth centers2d are the gravity center of objects # v0.10.0 fix inplace operation to the input tensor of cam_box3d # So here we also need to add origin=(0.5, 0.5, 0.5) if not self.pred_attrs: attrs = None outputs = (bboxes, scores, labels, attrs) if self.pred_bbox2d: bboxes2d = results[-1] bboxes2d = torch.cat([bboxes2d, scores[:, None]], dim=1) outputs = outputs + (bboxes2d, ) return outputs def get_targets(self, points, gt_bboxes_list, gt_labels_list, gt_bboxes_3d_list, gt_labels_3d_list, centers2d_list, depths_list, attr_labels_list): """Compute regression, classification and centerss targets for points in multiple images. Args: points (list[Tensor]): Points of each fpn level, each has shape (num_points, 2). gt_bboxes_list (list[Tensor]): Ground truth bboxes of each image, each has shape (num_gt, 4). gt_labels_list (list[Tensor]): Ground truth labels of each box, each has shape (num_gt,). gt_bboxes_3d_list (list[Tensor]): 3D Ground truth bboxes of each image, each has shape (num_gt, bbox_code_size). gt_labels_3d_list (list[Tensor]): 3D Ground truth labels of each box, each has shape (num_gt,). centers2d_list (list[Tensor]): Projected 3D centers onto 2D image, each has shape (num_gt, 2). depths_list (list[Tensor]): Depth of projected 3D centers onto 2D image, each has shape (num_gt, 1). attr_labels_list (list[Tensor]): Attribute labels of each box, each has shape (num_gt,). Returns: tuple: concat_lvl_labels (list[Tensor]): Labels of each level. \ concat_lvl_bbox_targets (list[Tensor]): BBox targets of each \ level. 
""" assert len(points) == len(self.regress_ranges) num_levels = len(points) # expand regress ranges to align with points expanded_regress_ranges = [ points[i].new_tensor(self.regress_ranges[i])[None].expand_as( points[i]) for i in range(num_levels) ] # concat all levels points and regress ranges concat_regress_ranges = torch.cat(expanded_regress_ranges, dim=0) concat_points = torch.cat(points, dim=0) # the number of points per img, per lvl num_points = [center.size(0) for center in points] if attr_labels_list is None: attr_labels_list = [ gt_labels.new_full(gt_labels.shape, self.attr_background_label) for gt_labels in gt_labels_list ] # get labels and bbox_targets of each image _, bbox_targets_list, labels_3d_list, bbox_targets_3d_list, \ centerness_targets_list, attr_targets_list = multi_apply( self._get_target_single, gt_bboxes_list, gt_labels_list, gt_bboxes_3d_list, gt_labels_3d_list, centers2d_list, depths_list, attr_labels_list, points=concat_points, regress_ranges=concat_regress_ranges, num_points_per_lvl=num_points) # split to per img, per level bbox_targets_list = [ bbox_targets.split(num_points, 0) for bbox_targets in bbox_targets_list ] labels_3d_list = [ labels_3d.split(num_points, 0) for labels_3d in labels_3d_list ] bbox_targets_3d_list = [ bbox_targets_3d.split(num_points, 0) for bbox_targets_3d in bbox_targets_3d_list ] centerness_targets_list = [ centerness_targets.split(num_points, 0) for centerness_targets in centerness_targets_list ] attr_targets_list = [ attr_targets.split(num_points, 0) for attr_targets in attr_targets_list ] # concat per level image concat_lvl_labels_3d = [] concat_lvl_bbox_targets_3d = [] concat_lvl_centerness_targets = [] concat_lvl_attr_targets = [] for i in range(num_levels): concat_lvl_labels_3d.append( torch.cat([labels[i] for labels in labels_3d_list])) concat_lvl_centerness_targets.append( torch.cat([ centerness_targets[i] for centerness_targets in centerness_targets_list ])) bbox_targets_3d = torch.cat([ bbox_targets_3d[i] for bbox_targets_3d in bbox_targets_3d_list ]) if self.pred_bbox2d: bbox_targets = torch.cat( [bbox_targets[i] for bbox_targets in bbox_targets_list]) bbox_targets_3d = torch.cat([bbox_targets_3d, bbox_targets], dim=1) concat_lvl_attr_targets.append( torch.cat( [attr_targets[i] for attr_targets in attr_targets_list])) if self.norm_on_bbox: bbox_targets_3d[:, :2] = \ bbox_targets_3d[:, :2] / self.strides[i] if self.pred_bbox2d: bbox_targets_3d[:, -4:] = \ bbox_targets_3d[:, -4:] / self.strides[i] concat_lvl_bbox_targets_3d.append(bbox_targets_3d) return concat_lvl_labels_3d, concat_lvl_bbox_targets_3d, \ concat_lvl_centerness_targets, concat_lvl_attr_targets ================================================ FILE: mmdet3d/models/dense_heads/point_rpn_head.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import torch from mmcv.runner import BaseModule, force_fp32 from torch import nn as nn from mmdet3d.core import xywhr2xyxyr from mmdet3d.core.bbox.structures import (DepthInstance3DBoxes, LiDARInstance3DBoxes) from mmdet3d.core.post_processing import nms_bev, nms_normal_bev from mmdet.core import build_bbox_coder, multi_apply from ..builder import HEADS, build_loss @HEADS.register_module() class PointRPNHead(BaseModule): """RPN module for PointRCNN. Args: num_classes (int): Number of classes. train_cfg (dict): Train configs. test_cfg (dict): Test configs. pred_layer_cfg (dict, optional): Config of classification and regression prediction layers. Defaults to None. 
enlarge_width (float, optional): Enlarge bbox for each side to ignore close points. Defaults to 0.1. cls_loss (dict, optional): Config of direction classification loss. Defaults to None. bbox_loss (dict, optional): Config of localization loss. Defaults to None. bbox_coder (dict, optional): Config dict of box coders. Defaults to None. init_cfg (dict, optional): Config of initialization. Defaults to None. """ def __init__(self, num_classes, train_cfg, test_cfg, pred_layer_cfg=None, enlarge_width=0.1, cls_loss=None, bbox_loss=None, bbox_coder=None, init_cfg=None): super().__init__(init_cfg=init_cfg) self.num_classes = num_classes self.train_cfg = train_cfg self.test_cfg = test_cfg self.enlarge_width = enlarge_width # build loss function self.bbox_loss = build_loss(bbox_loss) self.cls_loss = build_loss(cls_loss) # build box coder self.bbox_coder = build_bbox_coder(bbox_coder) # build pred conv self.cls_layers = self._make_fc_layers( fc_cfg=pred_layer_cfg.cls_linear_channels, input_channels=pred_layer_cfg.in_channels, output_channels=self._get_cls_out_channels()) self.reg_layers = self._make_fc_layers( fc_cfg=pred_layer_cfg.reg_linear_channels, input_channels=pred_layer_cfg.in_channels, output_channels=self._get_reg_out_channels()) def _make_fc_layers(self, fc_cfg, input_channels, output_channels): """Make fully connect layers. Args: fc_cfg (dict): Config of fully connect. input_channels (int): Input channels for fc_layers. output_channels (int): Input channels for fc_layers. Returns: nn.Sequential: Fully connect layers. """ fc_layers = [] c_in = input_channels for k in range(0, fc_cfg.__len__()): fc_layers.extend([ nn.Linear(c_in, fc_cfg[k], bias=False), nn.BatchNorm1d(fc_cfg[k]), nn.ReLU(), ]) c_in = fc_cfg[k] fc_layers.append(nn.Linear(c_in, output_channels, bias=True)) return nn.Sequential(*fc_layers) def _get_cls_out_channels(self): """Return the channel number of classification outputs.""" # Class numbers (k) + objectness (1) return self.num_classes def _get_reg_out_channels(self): """Return the channel number of regression outputs.""" # Bbox classification and regression # (center residual (3), size regression (3) # torch.cos(yaw) (1), torch.sin(yaw) (1) return self.bbox_coder.code_size def forward(self, feat_dict): """Forward pass. Args: feat_dict (dict): Feature dict from backbone. Returns: tuple[list[torch.Tensor]]: Predicted boxes and classification scores. """ point_features = feat_dict['fp_features'] point_features = point_features.permute(0, 2, 1).contiguous() batch_size = point_features.shape[0] feat_cls = point_features.view(-1, point_features.shape[-1]) feat_reg = point_features.view(-1, point_features.shape[-1]) point_cls_preds = self.cls_layers(feat_cls).reshape( batch_size, -1, self._get_cls_out_channels()) point_box_preds = self.reg_layers(feat_reg).reshape( batch_size, -1, self._get_reg_out_channels()) return point_box_preds, point_cls_preds @force_fp32(apply_to=('bbox_preds')) def loss(self, bbox_preds, cls_preds, points, gt_bboxes_3d, gt_labels_3d, img_metas=None): """Compute loss. Args: bbox_preds (dict): Predictions from forward of PointRCNN RPN_Head. cls_preds (dict): Classification from forward of PointRCNN RPN_Head. points (list[torch.Tensor]): Input points. gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth bboxes of each sample. gt_labels_3d (list[torch.Tensor]): Labels of each sample. img_metas (list[dict], Optional): Contain pcd and img's meta info. Defaults to None. Returns: dict: Losses of PointRCNN RPN module. 
""" targets = self.get_targets(points, gt_bboxes_3d, gt_labels_3d) (bbox_targets, mask_targets, positive_mask, negative_mask, box_loss_weights, point_targets) = targets # bbox loss bbox_loss = self.bbox_loss(bbox_preds, bbox_targets, box_loss_weights.unsqueeze(-1)) # calculate semantic loss semantic_points = cls_preds.reshape(-1, self.num_classes) semantic_targets = mask_targets semantic_targets[negative_mask] = self.num_classes semantic_points_label = semantic_targets # for ignore, but now we do not have ignored label semantic_loss_weight = negative_mask.float() + positive_mask.float() semantic_loss = self.cls_loss(semantic_points, semantic_points_label.reshape(-1), semantic_loss_weight.reshape(-1)) semantic_loss /= positive_mask.float().sum() losses = dict(bbox_loss=bbox_loss, semantic_loss=semantic_loss) return losses def get_targets(self, points, gt_bboxes_3d, gt_labels_3d): """Generate targets of PointRCNN RPN head. Args: points (list[torch.Tensor]): Points of each batch. gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth bboxes of each batch. gt_labels_3d (list[torch.Tensor]): Labels of each batch. Returns: tuple[torch.Tensor]: Targets of PointRCNN RPN head. """ # find empty example for index in range(len(gt_labels_3d)): if len(gt_labels_3d[index]) == 0: fake_box = gt_bboxes_3d[index].tensor.new_zeros( 1, gt_bboxes_3d[index].tensor.shape[-1]) gt_bboxes_3d[index] = gt_bboxes_3d[index].new_box(fake_box) gt_labels_3d[index] = gt_labels_3d[index].new_zeros(1) (bbox_targets, mask_targets, positive_mask, negative_mask, point_targets) = multi_apply(self.get_targets_single, points, gt_bboxes_3d, gt_labels_3d) bbox_targets = torch.stack(bbox_targets) mask_targets = torch.stack(mask_targets) positive_mask = torch.stack(positive_mask) negative_mask = torch.stack(negative_mask) box_loss_weights = positive_mask / (positive_mask.sum() + 1e-6) return (bbox_targets, mask_targets, positive_mask, negative_mask, box_loss_weights, point_targets) def get_targets_single(self, points, gt_bboxes_3d, gt_labels_3d): """Generate targets of PointRCNN RPN head for single batch. Args: points (torch.Tensor): Points of each batch. gt_bboxes_3d (:obj:`BaseInstance3DBoxes`): Ground truth boxes of each batch. gt_labels_3d (torch.Tensor): Labels of each batch. Returns: tuple[torch.Tensor]: Targets of ssd3d head. """ gt_bboxes_3d = gt_bboxes_3d.to(points.device) valid_gt = gt_labels_3d != -1 gt_bboxes_3d = gt_bboxes_3d[valid_gt] gt_labels_3d = gt_labels_3d[valid_gt] # transform the bbox coordinate to the point cloud coordinate gt_bboxes_3d_tensor = gt_bboxes_3d.tensor.clone() gt_bboxes_3d_tensor[..., 2] += gt_bboxes_3d_tensor[..., 5] / 2 points_mask, assignment = self._assign_targets_by_points_inside( gt_bboxes_3d, points) gt_bboxes_3d_tensor = gt_bboxes_3d_tensor[assignment] mask_targets = gt_labels_3d[assignment] bbox_targets = self.bbox_coder.encode(gt_bboxes_3d_tensor, points[..., 0:3], mask_targets) positive_mask = (points_mask.max(1)[0] > 0) # add ignore_mask extend_gt_bboxes_3d = gt_bboxes_3d.enlarged_box(self.enlarge_width) points_mask, _ = self._assign_targets_by_points_inside( extend_gt_bboxes_3d, points) negative_mask = (points_mask.max(1)[0] == 0) point_targets = points[..., 0:3] return (bbox_targets, mask_targets, positive_mask, negative_mask, point_targets) def get_bboxes(self, points, bbox_preds, cls_preds, input_metas, rescale=False): """Generate bboxes from RPN head predictions. Args: points (torch.Tensor): Input points. bbox_preds (dict): Regression predictions from PointRCNN head. 
cls_preds (dict): Class scores predictions from PointRCNN head. input_metas (list[dict]): Point cloud and image's meta info. rescale (bool, optional): Whether to rescale bboxes. Defaults to False. Returns: list[tuple[torch.Tensor]]: Bounding boxes, scores and labels. """ sem_scores = cls_preds.sigmoid() obj_scores = sem_scores.max(-1)[0] object_class = sem_scores.argmax(dim=-1) batch_size = sem_scores.shape[0] results = list() for b in range(batch_size): bbox3d = self.bbox_coder.decode(bbox_preds[b], points[b, ..., :3], object_class[b]) bbox_selected, score_selected, labels, cls_preds_selected = \ self.class_agnostic_nms(obj_scores[b], sem_scores[b], bbox3d, points[b, ..., :3], input_metas[b]) bbox = input_metas[b]['box_type_3d']( bbox_selected.clone(), box_dim=bbox_selected.shape[-1], with_yaw=True) results.append((bbox, score_selected, labels, cls_preds_selected)) return results def class_agnostic_nms(self, obj_scores, sem_scores, bbox, points, input_meta): """Class agnostic nms. Args: obj_scores (torch.Tensor): Objectness score of bounding boxes. sem_scores (torch.Tensor): Semantic class score of bounding boxes. bbox (torch.Tensor): Predicted bounding boxes. Returns: tuple[torch.Tensor]: Bounding boxes, scores and labels. """ nms_cfg = self.test_cfg.nms_cfg if not self.training \ else self.train_cfg.nms_cfg if nms_cfg.use_rotate_nms: nms_func = nms_bev else: nms_func = nms_normal_bev num_bbox = bbox.shape[0] bbox = input_meta['box_type_3d']( bbox.clone(), box_dim=bbox.shape[-1], with_yaw=True, origin=(0.5, 0.5, 0.5)) if isinstance(bbox, LiDARInstance3DBoxes): box_idx = bbox.points_in_boxes(points) box_indices = box_idx.new_zeros([num_bbox + 1]) box_idx[box_idx == -1] = num_bbox box_indices.scatter_add_(0, box_idx.long(), box_idx.new_ones(box_idx.shape)) box_indices = box_indices[:-1] nonempty_box_mask = box_indices >= 0 elif isinstance(bbox, DepthInstance3DBoxes): box_indices = bbox.points_in_boxes(points) nonempty_box_mask = box_indices.T.sum(1) >= 0 else: raise NotImplementedError('Unsupported bbox type!') bbox = bbox[nonempty_box_mask] if self.test_cfg.score_thr is not None: score_thr = self.test_cfg.score_thr keep = (obj_scores >= score_thr) obj_scores = obj_scores[keep] sem_scores = sem_scores[keep] bbox = bbox.tensor[keep] if obj_scores.shape[0] > 0: topk = min(nms_cfg.nms_pre, obj_scores.shape[0]) obj_scores_nms, indices = torch.topk(obj_scores, k=topk) bbox_for_nms = xywhr2xyxyr(bbox[indices].bev) sem_scores_nms = sem_scores[indices] keep = nms_func(bbox_for_nms, obj_scores_nms, nms_cfg.iou_thr) keep = keep[:nms_cfg.nms_post] bbox_selected = bbox.tensor[indices][keep] score_selected = obj_scores_nms[keep] cls_preds = sem_scores_nms[keep] labels = torch.argmax(cls_preds, -1) else: bbox_selected = bbox.tensor score_selected = obj_scores.new_zeros([0]) labels = obj_scores.new_zeros([0]) cls_preds = obj_scores.new_zeros([0, sem_scores.shape[-1]]) return bbox_selected, score_selected, labels, cls_preds def _assign_targets_by_points_inside(self, bboxes_3d, points): """Compute assignment by checking whether point is inside bbox. Args: bboxes_3d (:obj:`BaseInstance3DBoxes`): Instance of bounding boxes. points (torch.Tensor): Points of a batch. Returns: tuple[torch.Tensor]: Flags indicating whether each point is inside bbox and the index of box where each point are in. 
""" # TODO: align points_in_boxes function in each box_structures num_bbox = bboxes_3d.tensor.shape[0] if isinstance(bboxes_3d, LiDARInstance3DBoxes): assignment = bboxes_3d.points_in_boxes(points[:, 0:3]).long() points_mask = assignment.new_zeros( [assignment.shape[0], num_bbox + 1]) assignment[assignment == -1] = num_bbox points_mask.scatter_(1, assignment.unsqueeze(1), 1) points_mask = points_mask[:, :-1] assignment[assignment == num_bbox] = num_bbox - 1 elif isinstance(bboxes_3d, DepthInstance3DBoxes): points_mask = bboxes_3d.points_in_boxes(points) assignment = points_mask.argmax(dim=-1) else: raise NotImplementedError('Unsupported bbox type!') return points_mask, assignment ================================================ FILE: mmdet3d/models/dense_heads/shape_aware_head.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import warnings import numpy as np import torch from mmcv.cnn import ConvModule from mmcv.runner import BaseModule from torch import nn as nn from mmdet3d.core import box3d_multiclass_nms, limit_period, xywhr2xyxyr from mmdet.core import multi_apply from ..builder import HEADS, build_head from .anchor3d_head import Anchor3DHead @HEADS.register_module() class BaseShapeHead(BaseModule): """Base Shape-aware Head in Shape Signature Network. Note: This base shape-aware grouping head uses default settings for small objects. For large and huge objects, it is recommended to use heavier heads, like (64, 64, 64) and (128, 128, 64, 64, 64) in shared conv channels, (2, 1, 1) and (2, 1, 2, 1, 1) in shared conv strides. For tiny objects, we can use smaller heads, like (32, 32) channels and (1, 1) strides. Args: num_cls (int): Number of classes. num_base_anchors (int): Number of anchors per location. box_code_size (int): The dimension of boxes to be encoded. in_channels (int): Input channels for convolutional layers. shared_conv_channels (tuple, optional): Channels for shared convolutional layers. Default: (64, 64). shared_conv_strides (tuple, optional): Strides for shared convolutional layers. Default: (1, 1). use_direction_classifier (bool, optional): Whether to use direction classifier. Default: True. conv_cfg (dict, optional): Config of conv layer. Default: dict(type='Conv2d') norm_cfg (dict, optional): Config of norm layer. Default: dict(type='BN2d'). bias (bool | str, optional): Type of bias. Default: False. """ def __init__(self, num_cls, num_base_anchors, box_code_size, in_channels, shared_conv_channels=(64, 64), shared_conv_strides=(1, 1), use_direction_classifier=True, conv_cfg=dict(type='Conv2d'), norm_cfg=dict(type='BN2d'), bias=False, init_cfg=None): super().__init__(init_cfg=init_cfg) self.num_cls = num_cls self.num_base_anchors = num_base_anchors self.use_direction_classifier = use_direction_classifier self.box_code_size = box_code_size assert len(shared_conv_channels) == len(shared_conv_strides), \ 'Lengths of channels and strides list should be equal.' 
self.shared_conv_channels = [in_channels] + list(shared_conv_channels) self.shared_conv_strides = list(shared_conv_strides) shared_conv = [] for i in range(len(self.shared_conv_strides)): shared_conv.append( ConvModule( self.shared_conv_channels[i], self.shared_conv_channels[i + 1], kernel_size=3, stride=self.shared_conv_strides[i], padding=1, conv_cfg=conv_cfg, bias=bias, norm_cfg=norm_cfg)) self.shared_conv = nn.Sequential(*shared_conv) out_channels = self.shared_conv_channels[-1] self.conv_cls = nn.Conv2d(out_channels, num_base_anchors * num_cls, 1) self.conv_reg = nn.Conv2d(out_channels, num_base_anchors * box_code_size, 1) if use_direction_classifier: self.conv_dir_cls = nn.Conv2d(out_channels, num_base_anchors * 2, 1) if init_cfg is None: if use_direction_classifier: self.init_cfg = dict( type='Kaiming', layer='Conv2d', override=[ dict(type='Normal', name='conv_reg', std=0.01), dict( type='Normal', name='conv_cls', std=0.01, bias_prob=0.01), dict( type='Normal', name='conv_dir_cls', std=0.01, bias_prob=0.01) ]) else: self.init_cfg = dict( type='Kaiming', layer='Conv2d', override=[ dict(type='Normal', name='conv_reg', std=0.01), dict( type='Normal', name='conv_cls', std=0.01, bias_prob=0.01) ]) def forward(self, x): """Forward function for SmallHead. Args: x (torch.Tensor): Input feature map with the shape of [B, C, H, W]. Returns: dict[torch.Tensor]: Contain score of each class, bbox regression and direction classification predictions. Note that all the returned tensors are reshaped as [bs*num_base_anchors*H*W, num_cls/box_code_size/dir_bins]. It is more convenient to concat anchors for different classes even though they have different feature map sizes. """ x = self.shared_conv(x) cls_score = self.conv_cls(x) bbox_pred = self.conv_reg(x) featmap_size = bbox_pred.shape[-2:] H, W = featmap_size B = bbox_pred.shape[0] cls_score = cls_score.view(-1, self.num_base_anchors, self.num_cls, H, W).permute(0, 1, 3, 4, 2).reshape(B, -1, self.num_cls) bbox_pred = bbox_pred.view(-1, self.num_base_anchors, self.box_code_size, H, W).permute( 0, 1, 3, 4, 2).reshape(B, -1, self.box_code_size) dir_cls_preds = None if self.use_direction_classifier: dir_cls_preds = self.conv_dir_cls(x) dir_cls_preds = dir_cls_preds.view(-1, self.num_base_anchors, 2, H, W).permute(0, 1, 3, 4, 2).reshape(B, -1, 2) ret = dict( cls_score=cls_score, bbox_pred=bbox_pred, dir_cls_preds=dir_cls_preds, featmap_size=featmap_size) return ret @HEADS.register_module() class ShapeAwareHead(Anchor3DHead): """Shape-aware grouping head for SSN. Args: tasks (dict): Shape-aware groups of multi-class objects. assign_per_class (bool, optional): Whether to do assignment for each class. Default: True. kwargs (dict): Other arguments are the same as those in :class:`Anchor3DHead`. 
""" def __init__(self, tasks, assign_per_class=True, init_cfg=None, **kwargs): self.tasks = tasks self.featmap_sizes = [] super().__init__( assign_per_class=assign_per_class, init_cfg=init_cfg, **kwargs) def init_weights(self): if not self._is_init: for m in self.heads: if hasattr(m, 'init_weights'): m.init_weights() self._is_init = True else: warnings.warn(f'init_weights of {self.__class__.__name__} has ' f'been called more than once.') def _init_layers(self): """Initialize neural network layers of the head.""" self.heads = nn.ModuleList() cls_ptr = 0 for task in self.tasks: sizes = self.anchor_generator.sizes[cls_ptr:cls_ptr + task['num_class']] num_size = torch.tensor(sizes).reshape(-1, 3).size(0) num_rot = len(self.anchor_generator.rotations) num_base_anchors = num_rot * num_size branch = dict( type='BaseShapeHead', num_cls=self.num_classes, num_base_anchors=num_base_anchors, box_code_size=self.box_code_size, in_channels=self.in_channels, shared_conv_channels=task['shared_conv_channels'], shared_conv_strides=task['shared_conv_strides']) self.heads.append(build_head(branch)) cls_ptr += task['num_class'] def forward_single(self, x): """Forward function on a single-scale feature map. Args: x (torch.Tensor): Input features. Returns: tuple[torch.Tensor]: Contain score of each class, bbox regression and direction classification predictions. """ results = [] for head in self.heads: results.append(head(x)) cls_score = torch.cat([result['cls_score'] for result in results], dim=1) bbox_pred = torch.cat([result['bbox_pred'] for result in results], dim=1) dir_cls_preds = None if self.use_direction_classifier: dir_cls_preds = torch.cat( [result['dir_cls_preds'] for result in results], dim=1) self.featmap_sizes = [] for i, task in enumerate(self.tasks): for _ in range(task['num_class']): self.featmap_sizes.append(results[i]['featmap_size']) assert len(self.featmap_sizes) == len(self.anchor_generator.ranges), \ 'Length of feature map sizes must be equal to length of ' + \ 'different ranges of anchor generator.' return cls_score, bbox_pred, dir_cls_preds def loss_single(self, cls_score, bbox_pred, dir_cls_preds, labels, label_weights, bbox_targets, bbox_weights, dir_targets, dir_weights, num_total_samples): """Calculate loss of Single-level results. Args: cls_score (torch.Tensor): Class score in single-level. bbox_pred (torch.Tensor): Bbox prediction in single-level. dir_cls_preds (torch.Tensor): Predictions of direction class in single-level. labels (torch.Tensor): Labels of class. label_weights (torch.Tensor): Weights of class loss. bbox_targets (torch.Tensor): Targets of bbox predictions. bbox_weights (torch.Tensor): Weights of bbox loss. dir_targets (torch.Tensor): Targets of direction predictions. dir_weights (torch.Tensor): Weights of direction loss. num_total_samples (int): The number of valid samples. Returns: tuple[torch.Tensor]: Losses of class, bbox and direction, respectively. 
""" # classification loss if num_total_samples is None: num_total_samples = int(cls_score.shape[0]) labels = labels.reshape(-1) label_weights = label_weights.reshape(-1) cls_score = cls_score.reshape(-1, self.num_classes) loss_cls = self.loss_cls( cls_score, labels, label_weights, avg_factor=num_total_samples) # regression loss bbox_targets = bbox_targets.reshape(-1, self.box_code_size) bbox_weights = bbox_weights.reshape(-1, self.box_code_size) code_weight = self.train_cfg.get('code_weight', None) if code_weight: bbox_weights = bbox_weights * bbox_weights.new_tensor(code_weight) bbox_pred = bbox_pred.reshape(-1, self.box_code_size) if self.diff_rad_by_sin: bbox_pred, bbox_targets = self.add_sin_difference( bbox_pred, bbox_targets) loss_bbox = self.loss_bbox( bbox_pred, bbox_targets, bbox_weights, avg_factor=num_total_samples) # direction classification loss loss_dir = None if self.use_direction_classifier: dir_cls_preds = dir_cls_preds.reshape(-1, 2) dir_targets = dir_targets.reshape(-1) dir_weights = dir_weights.reshape(-1) loss_dir = self.loss_dir( dir_cls_preds, dir_targets, dir_weights, avg_factor=num_total_samples) return loss_cls, loss_bbox, loss_dir def loss(self, cls_scores, bbox_preds, dir_cls_preds, gt_bboxes, gt_labels, input_metas, gt_bboxes_ignore=None): """Calculate losses. Args: cls_scores (list[torch.Tensor]): Multi-level class scores. bbox_preds (list[torch.Tensor]): Multi-level bbox predictions. dir_cls_preds (list[torch.Tensor]): Multi-level direction class predictions. gt_bboxes (list[:obj:`BaseInstance3DBoxes`]): Gt bboxes of each sample. gt_labels (list[torch.Tensor]): Gt labels of each sample. input_metas (list[dict]): Contain pcd and img's meta info. gt_bboxes_ignore (list[torch.Tensor]): Specify which bounding. Returns: dict[str, list[torch.Tensor]]: Classification, bbox, and direction losses of each level. - loss_cls (list[torch.Tensor]): Classification losses. - loss_bbox (list[torch.Tensor]): Box regression losses. - loss_dir (list[torch.Tensor]): Direction classification losses. """ device = cls_scores[0].device anchor_list = self.get_anchors( self.featmap_sizes, input_metas, device=device) cls_reg_targets = self.anchor_target_3d( anchor_list, gt_bboxes, input_metas, gt_bboxes_ignore_list=gt_bboxes_ignore, gt_labels_list=gt_labels, num_classes=self.num_classes, sampling=self.sampling) if cls_reg_targets is None: return None (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list, dir_targets_list, dir_weights_list, num_total_pos, num_total_neg) = cls_reg_targets num_total_samples = ( num_total_pos + num_total_neg if self.sampling else num_total_pos) # num_total_samples = None losses_cls, losses_bbox, losses_dir = multi_apply( self.loss_single, cls_scores, bbox_preds, dir_cls_preds, labels_list, label_weights_list, bbox_targets_list, bbox_weights_list, dir_targets_list, dir_weights_list, num_total_samples=num_total_samples) return dict( loss_cls=losses_cls, loss_bbox=losses_bbox, loss_dir=losses_dir) def get_bboxes(self, cls_scores, bbox_preds, dir_cls_preds, input_metas, cfg=None, rescale=False): """Get bboxes of anchor head. Args: cls_scores (list[torch.Tensor]): Multi-level class scores. bbox_preds (list[torch.Tensor]): Multi-level bbox predictions. dir_cls_preds (list[torch.Tensor]): Multi-level direction class predictions. input_metas (list[dict]): Contain pcd and img's meta info. cfg (:obj:`ConfigDict`, optional): Training or testing config. Default: None. rescale (list[torch.Tensor], optional): Whether to rescale bbox. Default: False. 
Returns: list[tuple]: Prediction resultes of batches. """ assert len(cls_scores) == len(bbox_preds) assert len(cls_scores) == len(dir_cls_preds) num_levels = len(cls_scores) assert num_levels == 1, 'Only support single level inference.' device = cls_scores[0].device mlvl_anchors = self.anchor_generator.grid_anchors( self.featmap_sizes, device=device) # `anchor` is a list of anchors for different classes mlvl_anchors = [torch.cat(anchor, dim=0) for anchor in mlvl_anchors] result_list = [] for img_id in range(len(input_metas)): cls_score_list = [ cls_scores[i][img_id].detach() for i in range(num_levels) ] bbox_pred_list = [ bbox_preds[i][img_id].detach() for i in range(num_levels) ] dir_cls_pred_list = [ dir_cls_preds[i][img_id].detach() for i in range(num_levels) ] input_meta = input_metas[img_id] proposals = self.get_bboxes_single(cls_score_list, bbox_pred_list, dir_cls_pred_list, mlvl_anchors, input_meta, cfg, rescale) result_list.append(proposals) return result_list def get_bboxes_single(self, cls_scores, bbox_preds, dir_cls_preds, mlvl_anchors, input_meta, cfg=None, rescale=False): """Get bboxes of single branch. Args: cls_scores (torch.Tensor): Class score in single batch. bbox_preds (torch.Tensor): Bbox prediction in single batch. dir_cls_preds (torch.Tensor): Predictions of direction class in single batch. mlvl_anchors (List[torch.Tensor]): Multi-level anchors in single batch. input_meta (list[dict]): Contain pcd and img's meta info. cfg (:obj:`ConfigDict`): Training or testing config. rescale (list[torch.Tensor], optional): whether to rescale bbox. Default: False. Returns: tuple: Contain predictions of single batch. - bboxes (:obj:`BaseInstance3DBoxes`): Predicted 3d bboxes. - scores (torch.Tensor): Class score of each bbox. - labels (torch.Tensor): Label of each bbox. 
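        Note:
            After NMS the yaw of each kept box is recovered from the
            direction classification result as
            ``limit_period(yaw - dir_offset, dir_limit_offset, np.pi)
            + dir_offset + np.pi * dir_score``, and the boxes are wrapped
            back into ``input_meta['box_type_3d']``.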
""" cfg = self.test_cfg if cfg is None else cfg assert len(cls_scores) == len(bbox_preds) == len(mlvl_anchors) mlvl_bboxes = [] mlvl_scores = [] mlvl_dir_scores = [] for cls_score, bbox_pred, dir_cls_pred, anchors in zip( cls_scores, bbox_preds, dir_cls_preds, mlvl_anchors): assert cls_score.size()[-2] == bbox_pred.size()[-2] assert cls_score.size()[-2] == dir_cls_pred.size()[-2] dir_cls_score = torch.max(dir_cls_pred, dim=-1)[1] if self.use_sigmoid_cls: scores = cls_score.sigmoid() else: scores = cls_score.softmax(-1) nms_pre = cfg.get('nms_pre', -1) if nms_pre > 0 and scores.shape[0] > nms_pre: if self.use_sigmoid_cls: max_scores, _ = scores.max(dim=1) else: max_scores, _ = scores[:, :-1].max(dim=1) _, topk_inds = max_scores.topk(nms_pre) anchors = anchors[topk_inds, :] bbox_pred = bbox_pred[topk_inds, :] scores = scores[topk_inds, :] dir_cls_score = dir_cls_score[topk_inds] bboxes = self.bbox_coder.decode(anchors, bbox_pred) mlvl_bboxes.append(bboxes) mlvl_scores.append(scores) mlvl_dir_scores.append(dir_cls_score) mlvl_bboxes = torch.cat(mlvl_bboxes) mlvl_bboxes_for_nms = xywhr2xyxyr(input_meta['box_type_3d']( mlvl_bboxes, box_dim=self.box_code_size).bev) mlvl_scores = torch.cat(mlvl_scores) mlvl_dir_scores = torch.cat(mlvl_dir_scores) if self.use_sigmoid_cls: # Add a dummy background class to the front when using sigmoid padding = mlvl_scores.new_zeros(mlvl_scores.shape[0], 1) mlvl_scores = torch.cat([mlvl_scores, padding], dim=1) score_thr = cfg.get('score_thr', 0) results = box3d_multiclass_nms(mlvl_bboxes, mlvl_bboxes_for_nms, mlvl_scores, score_thr, cfg.max_num, cfg, mlvl_dir_scores) bboxes, scores, labels, dir_scores = results if bboxes.shape[0] > 0: dir_rot = limit_period(bboxes[..., 6] - self.dir_offset, self.dir_limit_offset, np.pi) bboxes[..., 6] = ( dir_rot + self.dir_offset + np.pi * dir_scores.to(bboxes.dtype)) bboxes = input_meta['box_type_3d'](bboxes, box_dim=self.box_code_size) return bboxes, scores, labels ================================================ FILE: mmdet3d/models/dense_heads/smoke_mono3d_head.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import torch from torch.nn import functional as F from mmdet.core import multi_apply from mmdet.core.bbox.builder import build_bbox_coder from mmdet.models.utils import gaussian_radius, gen_gaussian_target from mmdet.models.utils.gaussian_target import (get_local_maximum, get_topk_from_heatmap, transpose_and_gather_feat) from ..builder import HEADS from .anchor_free_mono3d_head import AnchorFreeMono3DHead @HEADS.register_module() class SMOKEMono3DHead(AnchorFreeMono3DHead): r"""Anchor-free head used in `SMOKE `_ .. code-block:: none /-----> 3*3 conv -----> 1*1 conv -----> cls feature \-----> 3*3 conv -----> 1*1 conv -----> reg Args: num_classes (int): Number of categories excluding the background category. in_channels (int): Number of channels in the input feature map. dim_channel (list[int]): indices of dimension offset preds in regression heatmap channels. ori_channel (list[int]): indices of orientation offset pred in regression heatmap channels. bbox_coder (:obj:`CameraInstance3DBoxes`): Bbox coder for encoding and decoding boxes. loss_cls (dict, optional): Config of classification loss. Default: loss_cls=dict(type='GaussionFocalLoss', loss_weight=1.0). loss_bbox (dict, optional): Config of localization loss. Default: loss_bbox=dict(type='L1Loss', loss_weight=10.0). loss_dir (dict, optional): Config of direction classification loss. In SMOKE, Default: None. 
loss_attr (dict, optional): Config of attribute classification loss. In SMOKE, Default: None. loss_centerness (dict): Config of centerness loss. norm_cfg (dict): Dictionary to construct and config norm layer. Default: norm_cfg=dict(type='GN', num_groups=32, requires_grad=True). init_cfg (dict): Initialization config dict. Default: None. """ # noqa: E501 def __init__(self, num_classes, in_channels, dim_channel, ori_channel, bbox_coder, loss_cls=dict(type='GaussionFocalLoss', loss_weight=1.0), loss_bbox=dict(type='L1Loss', loss_weight=0.1), loss_dir=None, loss_attr=None, norm_cfg=dict(type='GN', num_groups=32, requires_grad=True), init_cfg=None, **kwargs): super().__init__( num_classes, in_channels, loss_cls=loss_cls, loss_bbox=loss_bbox, loss_dir=loss_dir, loss_attr=loss_attr, norm_cfg=norm_cfg, init_cfg=init_cfg, **kwargs) self.dim_channel = dim_channel self.ori_channel = ori_channel self.bbox_coder = build_bbox_coder(bbox_coder) def forward(self, feats): """Forward features from the upstream network. Args: feats (tuple[Tensor]): Features from the upstream network, each is a 4D-tensor. Returns: tuple: cls_scores (list[Tensor]): Box scores for each scale level, each is a 4D-tensor, the channel number is num_points * num_classes. bbox_preds (list[Tensor]): Box energies / deltas for each scale level, each is a 4D-tensor, the channel number is num_points * bbox_code_size. """ return multi_apply(self.forward_single, feats) def forward_single(self, x): """Forward features of a single scale level. Args: x (Tensor): Input feature map. Returns: tuple: Scores for each class, bbox of input feature maps. """ cls_score, bbox_pred, dir_cls_pred, attr_pred, cls_feat, reg_feat = \ super().forward_single(x) cls_score = cls_score.sigmoid() # turn to 0-1 cls_score = cls_score.clamp(min=1e-4, max=1 - 1e-4) # (N, C, H, W) offset_dims = bbox_pred[:, self.dim_channel, ...] bbox_pred[:, self.dim_channel, ...] = offset_dims.sigmoid() - 0.5 # (N, C, H, W) vector_ori = bbox_pred[:, self.ori_channel, ...] bbox_pred[:, self.ori_channel, ...] = F.normalize(vector_ori) return cls_score, bbox_pred def get_bboxes(self, cls_scores, bbox_preds, img_metas, rescale=None): """Generate bboxes from bbox head predictions. Args: cls_scores (list[Tensor]): Box scores for each scale level. bbox_preds (list[Tensor]): Box regression for each scale. img_metas (list[dict]): Meta information of each image, e.g., image size, scaling factor, etc. rescale (bool): If True, return boxes in original image space. Returns: list[tuple[:obj:`CameraInstance3DBoxes`, Tensor, Tensor, None]]: Each item in result_list is 4-tuple. 
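            The 4-tuple is ``(bboxes, scores, labels, attrs)``; for SMOKE
            ``attrs`` is always ``None``. Detections are decoded from the
            top-100 local maxima of the heatmap and kept only if their score
            exceeds the fixed threshold of 0.25.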
""" assert len(cls_scores) == len(bbox_preds) == 1 cam2imgs = torch.stack([ cls_scores[0].new_tensor(img_meta['cam2img']) for img_meta in img_metas ]) trans_mats = torch.stack([ cls_scores[0].new_tensor(img_meta['trans_mat']) for img_meta in img_metas ]) batch_bboxes, batch_scores, batch_topk_labels = self.decode_heatmap( cls_scores[0], bbox_preds[0], img_metas, cam2imgs=cam2imgs, trans_mats=trans_mats, topk=100, kernel=3) result_list = [] for img_id in range(len(img_metas)): bboxes = batch_bboxes[img_id] scores = batch_scores[img_id] labels = batch_topk_labels[img_id] keep_idx = scores > 0.25 bboxes = bboxes[keep_idx] scores = scores[keep_idx] labels = labels[keep_idx] bboxes = img_metas[img_id]['box_type_3d']( bboxes, box_dim=self.bbox_code_size, origin=(0.5, 0.5, 0.5)) attrs = None result_list.append((bboxes, scores, labels, attrs)) return result_list def decode_heatmap(self, cls_score, reg_pred, img_metas, cam2imgs, trans_mats, topk=100, kernel=3): """Transform outputs into detections raw bbox predictions. Args: class_score (Tensor): Center predict heatmap, shape (B, num_classes, H, W). reg_pred (Tensor): Box regression map. shape (B, channel, H , W). img_metas (List[dict]): Meta information of each image, e.g., image size, scaling factor, etc. cam2imgs (Tensor): Camera intrinsic matrixs. shape (B, 4, 4) trans_mats (Tensor): Transformation matrix from original image to feature map. shape: (batch, 3, 3) topk (int): Get top k center keypoints from heatmap. Default 100. kernel (int): Max pooling kernel for extract local maximum pixels. Default 3. Returns: tuple[torch.Tensor]: Decoded output of SMOKEHead, containing the following Tensors: - batch_bboxes (Tensor): Coords of each 3D box. shape (B, k, 7) - batch_scores (Tensor): Scores of each 3D box. shape (B, k) - batch_topk_labels (Tensor): Categories of each 3D box. shape (B, k) """ img_h, img_w = img_metas[0]['pad_shape'][:2] bs, _, feat_h, feat_w = cls_score.shape center_heatmap_pred = get_local_maximum(cls_score, kernel=kernel) *batch_dets, topk_ys, topk_xs = get_topk_from_heatmap( center_heatmap_pred, k=topk) batch_scores, batch_index, batch_topk_labels = batch_dets regression = transpose_and_gather_feat(reg_pred, batch_index) regression = regression.view(-1, 8) points = torch.cat([topk_xs.view(-1, 1), topk_ys.view(-1, 1).float()], dim=1) locations, dimensions, orientations = self.bbox_coder.decode( regression, points, batch_topk_labels, cam2imgs, trans_mats) batch_bboxes = torch.cat((locations, dimensions, orientations), dim=1) batch_bboxes = batch_bboxes.view(bs, -1, self.bbox_code_size) return batch_bboxes, batch_scores, batch_topk_labels def get_predictions(self, labels3d, centers2d, gt_locations, gt_dimensions, gt_orientations, indices, img_metas, pred_reg): """Prepare predictions for computing loss. Args: labels3d (Tensor): Labels of each 3D box. shape (B, max_objs, ) centers2d (Tensor): Coords of each projected 3D box center on image. shape (B * max_objs, 2) gt_locations (Tensor): Coords of each 3D box's location. shape (B * max_objs, 3) gt_dimensions (Tensor): Dimensions of each 3D box. shape (N, 3) gt_orientations (Tensor): Orientation(yaw) of each 3D box. shape (N, 1) indices (Tensor): Indices of the existence of the 3D box. shape (B * max_objs, ) img_metas (list[dict]): Meta information of each image, e.g., image size, scaling factor, etc. pre_reg (Tensor): Box regression map. shape (B, channel, H , W). 
Returns: dict: the dict has components below: - bbox3d_yaws (:obj:`CameraInstance3DBoxes`): bbox calculated using pred orientations. - bbox3d_dims (:obj:`CameraInstance3DBoxes`): bbox calculated using pred dimensions. - bbox3d_locs (:obj:`CameraInstance3DBoxes`): bbox calculated using pred locations. """ batch, channel = pred_reg.shape[0], pred_reg.shape[1] w = pred_reg.shape[3] cam2imgs = torch.stack([ gt_locations.new_tensor(img_meta['cam2img']) for img_meta in img_metas ]) trans_mats = torch.stack([ gt_locations.new_tensor(img_meta['trans_mat']) for img_meta in img_metas ]) centers2d_inds = centers2d[:, 1] * w + centers2d[:, 0] centers2d_inds = centers2d_inds.view(batch, -1) pred_regression = transpose_and_gather_feat(pred_reg, centers2d_inds) pred_regression_pois = pred_regression.view(-1, channel) locations, dimensions, orientations = self.bbox_coder.decode( pred_regression_pois, centers2d, labels3d, cam2imgs, trans_mats, gt_locations) locations, dimensions, orientations = locations[indices], dimensions[ indices], orientations[indices] locations[:, 1] += dimensions[:, 1] / 2 gt_locations = gt_locations[indices] assert len(locations) == len(gt_locations) assert len(dimensions) == len(gt_dimensions) assert len(orientations) == len(gt_orientations) bbox3d_yaws = self.bbox_coder.encode(gt_locations, gt_dimensions, orientations, img_metas) bbox3d_dims = self.bbox_coder.encode(gt_locations, dimensions, gt_orientations, img_metas) bbox3d_locs = self.bbox_coder.encode(locations, gt_dimensions, gt_orientations, img_metas) pred_bboxes = dict(ori=bbox3d_yaws, dim=bbox3d_dims, loc=bbox3d_locs) return pred_bboxes def get_targets(self, gt_bboxes, gt_labels, gt_bboxes_3d, gt_labels_3d, centers2d, feat_shape, img_shape, img_metas): """Get training targets for batch images. Args: gt_bboxes (list[Tensor]): Ground truth bboxes of each image, shape (num_gt, 4). gt_labels (list[Tensor]): Ground truth labels of each box, shape (num_gt,). gt_bboxes_3d (list[:obj:`CameraInstance3DBoxes`]): 3D Ground truth bboxes of each image, shape (num_gt, bbox_code_size). gt_labels_3d (list[Tensor]): 3D Ground truth labels of each box, shape (num_gt,). centers2d (list[Tensor]): Projected 3D centers onto 2D image, shape (num_gt, 2). feat_shape (tuple[int]): Feature map shape with value, shape (B, _, H, W). img_shape (tuple[int]): Image shape in [h, w] format. img_metas (list[dict]): Meta information of each image, e.g., image size, scaling factor, etc. Returns: tuple[Tensor, dict]: The Tensor value is the targets of center heatmap, the dict has components below: - gt_centers2d (Tensor): Coords of each projected 3D box center on image. shape (B * max_objs, 2) - gt_labels3d (Tensor): Labels of each 3D box. shape (B, max_objs, ) - indices (Tensor): Indices of the existence of the 3D box. shape (B * max_objs, ) - affine_indices (Tensor): Indices of the affine of the 3D box. shape (N, ) - gt_locs (Tensor): Coords of each 3D box's location. shape (N, 3) - gt_dims (Tensor): Dimensions of each 3D box. shape (N, 3) - gt_yaws (Tensor): Orientation(yaw) of each 3D box. shape (N, 1) - gt_cors (Tensor): Coords of the corners of each 3D box. 
shape (N, 8, 3) """ reg_mask = torch.stack([ gt_bboxes[0].new_tensor( not img_meta['affine_aug'], dtype=torch.bool) for img_meta in img_metas ]) img_h, img_w = img_shape[:2] bs, _, feat_h, feat_w = feat_shape width_ratio = float(feat_w / img_w) # 1/4 height_ratio = float(feat_h / img_h) # 1/4 assert width_ratio == height_ratio center_heatmap_target = gt_bboxes[-1].new_zeros( [bs, self.num_classes, feat_h, feat_w]) gt_centers2d = centers2d.copy() for batch_id in range(bs): gt_bbox = gt_bboxes[batch_id] gt_label = gt_labels[batch_id] # project centers2d from input image to feat map gt_center2d = gt_centers2d[batch_id] * width_ratio for j, center in enumerate(gt_center2d): center_x_int, center_y_int = center.int() scale_box_h = (gt_bbox[j][3] - gt_bbox[j][1]) * height_ratio scale_box_w = (gt_bbox[j][2] - gt_bbox[j][0]) * width_ratio radius = gaussian_radius([scale_box_h, scale_box_w], min_overlap=0.7) radius = max(0, int(radius)) ind = gt_label[j] gen_gaussian_target(center_heatmap_target[batch_id, ind], [center_x_int, center_y_int], radius) avg_factor = max(1, center_heatmap_target.eq(1).sum()) num_ctrs = [center2d.shape[0] for center2d in centers2d] max_objs = max(num_ctrs) reg_inds = torch.cat( [reg_mask[i].repeat(num_ctrs[i]) for i in range(bs)]) inds = torch.zeros((bs, max_objs), dtype=torch.bool).to(centers2d[0].device) # put gt 3d bboxes to gpu gt_bboxes_3d = [ gt_bbox_3d.to(centers2d[0].device) for gt_bbox_3d in gt_bboxes_3d ] batch_centers2d = centers2d[0].new_zeros((bs, max_objs, 2)) batch_labels_3d = gt_labels_3d[0].new_zeros((bs, max_objs)) batch_gt_locations = \ gt_bboxes_3d[0].tensor.new_zeros((bs, max_objs, 3)) for i in range(bs): inds[i, :num_ctrs[i]] = 1 batch_centers2d[i, :num_ctrs[i]] = centers2d[i] batch_labels_3d[i, :num_ctrs[i]] = gt_labels_3d[i] batch_gt_locations[i, :num_ctrs[i]] = \ gt_bboxes_3d[i].tensor[:, :3] inds = inds.flatten() batch_centers2d = batch_centers2d.view(-1, 2) * width_ratio batch_gt_locations = batch_gt_locations.view(-1, 3) # filter the empty image, without gt_bboxes_3d gt_bboxes_3d = [ gt_bbox_3d for gt_bbox_3d in gt_bboxes_3d if gt_bbox_3d.tensor.shape[0] > 0 ] gt_dimensions = torch.cat( [gt_bbox_3d.tensor[:, 3:6] for gt_bbox_3d in gt_bboxes_3d]) gt_orientations = torch.cat([ gt_bbox_3d.tensor[:, 6].unsqueeze(-1) for gt_bbox_3d in gt_bboxes_3d ]) gt_corners = torch.cat( [gt_bbox_3d.corners for gt_bbox_3d in gt_bboxes_3d]) target_labels = dict( gt_centers2d=batch_centers2d.long(), gt_labels3d=batch_labels_3d, indices=inds, reg_indices=reg_inds, gt_locs=batch_gt_locations, gt_dims=gt_dimensions, gt_yaws=gt_orientations, gt_cors=gt_corners) return center_heatmap_target, avg_factor, target_labels def loss(self, cls_scores, bbox_preds, gt_bboxes, gt_labels, gt_bboxes_3d, gt_labels_3d, centers2d, depths, attr_labels, img_metas, gt_bboxes_ignore=None): """Compute loss of the head. Args: cls_scores (list[Tensor]): Box scores for each scale level. shape (num_gt, 4). bbox_preds (list[Tensor]): Box dims is a 4D-tensor, the channel number is bbox_code_size. shape (B, 7, H, W). gt_bboxes (list[Tensor]): Ground truth bboxes for each image. shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. gt_labels (list[Tensor]): Class indices corresponding to each box. shape (num_gts, ). gt_bboxes_3d (list[:obj:`CameraInstance3DBoxes`]): 3D boxes ground truth. it is the flipped gt_bboxes gt_labels_3d (list[Tensor]): Same as gt_labels. centers2d (list[Tensor]): 2D centers on the image. shape (num_gts, 2). depths (list[Tensor]): Depth ground truth. shape (num_gts, ). 
attr_labels (list[Tensor]): Attributes indices of each box. In kitti it's None. img_metas (list[dict]): Meta information of each image, e.g., image size, scaling factor, etc. gt_bboxes_ignore (None | list[Tensor]): Specify which bounding boxes can be ignored when computing the loss. Default: None. Returns: dict[str, Tensor]: A dictionary of loss components. """ assert len(cls_scores) == len(bbox_preds) == 1 assert attr_labels is None assert gt_bboxes_ignore is None center2d_heatmap = cls_scores[0] pred_reg = bbox_preds[0] center2d_heatmap_target, avg_factor, target_labels = \ self.get_targets(gt_bboxes, gt_labels, gt_bboxes_3d, gt_labels_3d, centers2d, center2d_heatmap.shape, img_metas[0]['pad_shape'], img_metas) pred_bboxes = self.get_predictions( labels3d=target_labels['gt_labels3d'], centers2d=target_labels['gt_centers2d'], gt_locations=target_labels['gt_locs'], gt_dimensions=target_labels['gt_dims'], gt_orientations=target_labels['gt_yaws'], indices=target_labels['indices'], img_metas=img_metas, pred_reg=pred_reg) loss_cls = self.loss_cls( center2d_heatmap, center2d_heatmap_target, avg_factor=avg_factor) reg_inds = target_labels['reg_indices'] loss_bbox_oris = self.loss_bbox( pred_bboxes['ori'].corners[reg_inds, ...], target_labels['gt_cors'][reg_inds, ...]) loss_bbox_dims = self.loss_bbox( pred_bboxes['dim'].corners[reg_inds, ...], target_labels['gt_cors'][reg_inds, ...]) loss_bbox_locs = self.loss_bbox( pred_bboxes['loc'].corners[reg_inds, ...], target_labels['gt_cors'][reg_inds, ...]) loss_bbox = loss_bbox_dims + loss_bbox_locs + loss_bbox_oris loss_dict = dict(loss_cls=loss_cls, loss_bbox=loss_bbox) return loss_dict ================================================ FILE: mmdet3d/models/dense_heads/ssd_3d_head.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import torch from mmcv.ops.nms import batched_nms from mmcv.runner import force_fp32 from torch.nn import functional as F from mmdet3d.core.bbox.structures import (DepthInstance3DBoxes, LiDARInstance3DBoxes, rotation_3d_in_axis) from mmdet.core import multi_apply from ..builder import HEADS, build_loss from .vote_head import VoteHead @HEADS.register_module() class SSD3DHead(VoteHead): r"""Bbox head of `3DSSD `_. Args: num_classes (int): The number of class. bbox_coder (:obj:`BaseBBoxCoder`): Bbox coder for encoding and decoding boxes. in_channels (int): The number of input feature channel. train_cfg (dict): Config for training. test_cfg (dict): Config for testing. vote_module_cfg (dict): Config of VoteModule for point-wise votes. vote_aggregation_cfg (dict): Config of vote aggregation layer. pred_layer_cfg (dict): Config of classfication and regression prediction layers. conv_cfg (dict): Config of convolution in prediction layer. norm_cfg (dict): Config of BN in prediction layer. act_cfg (dict): Config of activation in prediction layer. objectness_loss (dict): Config of objectness loss. center_loss (dict): Config of center loss. dir_class_loss (dict): Config of direction classification loss. dir_res_loss (dict): Config of direction residual regression loss. size_res_loss (dict): Config of size residual regression loss. corner_loss (dict): Config of bbox corners regression loss. vote_loss (dict): Config of candidate points regression loss. 
""" def __init__(self, num_classes, bbox_coder, in_channels=256, train_cfg=None, test_cfg=None, vote_module_cfg=None, vote_aggregation_cfg=None, pred_layer_cfg=None, conv_cfg=dict(type='Conv1d'), norm_cfg=dict(type='BN1d'), act_cfg=dict(type='ReLU'), objectness_loss=None, center_loss=None, dir_class_loss=None, dir_res_loss=None, size_res_loss=None, corner_loss=None, vote_loss=None, init_cfg=None): super(SSD3DHead, self).__init__( num_classes, bbox_coder, train_cfg=train_cfg, test_cfg=test_cfg, vote_module_cfg=vote_module_cfg, vote_aggregation_cfg=vote_aggregation_cfg, pred_layer_cfg=pred_layer_cfg, conv_cfg=conv_cfg, norm_cfg=norm_cfg, objectness_loss=objectness_loss, center_loss=center_loss, dir_class_loss=dir_class_loss, dir_res_loss=dir_res_loss, size_class_loss=None, size_res_loss=size_res_loss, semantic_loss=None, init_cfg=init_cfg) self.corner_loss = build_loss(corner_loss) self.vote_loss = build_loss(vote_loss) self.num_candidates = vote_module_cfg['num_points'] def _get_cls_out_channels(self): """Return the channel number of classification outputs.""" # Class numbers (k) + objectness (1) return self.num_classes def _get_reg_out_channels(self): """Return the channel number of regression outputs.""" # Bbox classification and regression # (center residual (3), size regression (3) # heading class+residual (num_dir_bins*2)), return 3 + 3 + self.num_dir_bins * 2 def _extract_input(self, feat_dict): """Extract inputs from features dictionary. Args: feat_dict (dict): Feature dict from backbone. Returns: torch.Tensor: Coordinates of input points. torch.Tensor: Features of input points. torch.Tensor: Indices of input points. """ seed_points = feat_dict['sa_xyz'][-1] seed_features = feat_dict['sa_features'][-1] seed_indices = feat_dict['sa_indices'][-1] return seed_points, seed_features, seed_indices @force_fp32(apply_to=('bbox_preds', )) def loss(self, bbox_preds, points, gt_bboxes_3d, gt_labels_3d, pts_semantic_mask=None, pts_instance_mask=None, img_metas=None, gt_bboxes_ignore=None): """Compute loss. Args: bbox_preds (dict): Predictions from forward of SSD3DHead. points (list[torch.Tensor]): Input points. gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth bboxes of each sample. gt_labels_3d (list[torch.Tensor]): Labels of each sample. pts_semantic_mask (list[torch.Tensor]): Point-wise semantic mask. pts_instance_mask (list[torch.Tensor]): Point-wise instance mask. img_metas (list[dict]): Contain pcd and img's meta info. gt_bboxes_ignore (list[torch.Tensor]): Specify which bounding. Returns: dict: Losses of 3DSSD. 
""" targets = self.get_targets(points, gt_bboxes_3d, gt_labels_3d, pts_semantic_mask, pts_instance_mask, bbox_preds) (vote_targets, center_targets, size_res_targets, dir_class_targets, dir_res_targets, mask_targets, centerness_targets, corner3d_targets, vote_mask, positive_mask, negative_mask, centerness_weights, box_loss_weights, heading_res_loss_weight) = targets # calculate centerness loss centerness_loss = self.objectness_loss( bbox_preds['obj_scores'].transpose(2, 1), centerness_targets, weight=centerness_weights) # calculate center loss center_loss = self.center_loss( bbox_preds['center_offset'], center_targets, weight=box_loss_weights.unsqueeze(-1)) # calculate direction class loss dir_class_loss = self.dir_class_loss( bbox_preds['dir_class'].transpose(1, 2), dir_class_targets, weight=box_loss_weights) # calculate direction residual loss dir_res_loss = self.dir_res_loss( bbox_preds['dir_res_norm'], dir_res_targets.unsqueeze(-1).repeat(1, 1, self.num_dir_bins), weight=heading_res_loss_weight) # calculate size residual loss size_loss = self.size_res_loss( bbox_preds['size'], size_res_targets, weight=box_loss_weights.unsqueeze(-1)) # calculate corner loss one_hot_dir_class_targets = dir_class_targets.new_zeros( bbox_preds['dir_class'].shape) one_hot_dir_class_targets.scatter_(2, dir_class_targets.unsqueeze(-1), 1) pred_bbox3d = self.bbox_coder.decode( dict( center=bbox_preds['center'], dir_res=bbox_preds['dir_res'], dir_class=one_hot_dir_class_targets, size=bbox_preds['size'])) pred_bbox3d = pred_bbox3d.reshape(-1, pred_bbox3d.shape[-1]) pred_bbox3d = img_metas[0]['box_type_3d']( pred_bbox3d.clone(), box_dim=pred_bbox3d.shape[-1], with_yaw=self.bbox_coder.with_rot, origin=(0.5, 0.5, 0.5)) pred_corners3d = pred_bbox3d.corners.reshape(-1, 8, 3) corner_loss = self.corner_loss( pred_corners3d, corner3d_targets.reshape(-1, 8, 3), weight=box_loss_weights.view(-1, 1, 1)) # calculate vote loss vote_loss = self.vote_loss( bbox_preds['vote_offset'].transpose(1, 2), vote_targets, weight=vote_mask.unsqueeze(-1)) losses = dict( centerness_loss=centerness_loss, center_loss=center_loss, dir_class_loss=dir_class_loss, dir_res_loss=dir_res_loss, size_res_loss=size_loss, corner_loss=corner_loss, vote_loss=vote_loss) return losses def get_targets(self, points, gt_bboxes_3d, gt_labels_3d, pts_semantic_mask=None, pts_instance_mask=None, bbox_preds=None): """Generate targets of ssd3d head. Args: points (list[torch.Tensor]): Points of each batch. gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth bboxes of each batch. gt_labels_3d (list[torch.Tensor]): Labels of each batch. pts_semantic_mask (list[torch.Tensor]): Point-wise semantic label of each batch. pts_instance_mask (list[torch.Tensor]): Point-wise instance label of each batch. bbox_preds (torch.Tensor): Bounding box predictions of ssd3d head. Returns: tuple[torch.Tensor]: Targets of ssd3d head. 
""" # find empty example for index in range(len(gt_labels_3d)): if len(gt_labels_3d[index]) == 0: fake_box = gt_bboxes_3d[index].tensor.new_zeros( 1, gt_bboxes_3d[index].tensor.shape[-1]) gt_bboxes_3d[index] = gt_bboxes_3d[index].new_box(fake_box) gt_labels_3d[index] = gt_labels_3d[index].new_zeros(1) if pts_semantic_mask is None: pts_semantic_mask = [None for i in range(len(gt_labels_3d))] pts_instance_mask = [None for i in range(len(gt_labels_3d))] aggregated_points = [ bbox_preds['aggregated_points'][i] for i in range(len(gt_labels_3d)) ] seed_points = [ bbox_preds['seed_points'][i, :self.num_candidates].detach() for i in range(len(gt_labels_3d)) ] (vote_targets, center_targets, size_res_targets, dir_class_targets, dir_res_targets, mask_targets, centerness_targets, corner3d_targets, vote_mask, positive_mask, negative_mask) = multi_apply( self.get_targets_single, points, gt_bboxes_3d, gt_labels_3d, pts_semantic_mask, pts_instance_mask, aggregated_points, seed_points) center_targets = torch.stack(center_targets) positive_mask = torch.stack(positive_mask) negative_mask = torch.stack(negative_mask) dir_class_targets = torch.stack(dir_class_targets) dir_res_targets = torch.stack(dir_res_targets) size_res_targets = torch.stack(size_res_targets) mask_targets = torch.stack(mask_targets) centerness_targets = torch.stack(centerness_targets).detach() corner3d_targets = torch.stack(corner3d_targets) vote_targets = torch.stack(vote_targets) vote_mask = torch.stack(vote_mask) center_targets -= bbox_preds['aggregated_points'] centerness_weights = (positive_mask + negative_mask).unsqueeze(-1).repeat( 1, 1, self.num_classes).float() centerness_weights = centerness_weights / \ (centerness_weights.sum() + 1e-6) vote_mask = vote_mask / (vote_mask.sum() + 1e-6) box_loss_weights = positive_mask / (positive_mask.sum() + 1e-6) batch_size, proposal_num = dir_class_targets.shape[:2] heading_label_one_hot = dir_class_targets.new_zeros( (batch_size, proposal_num, self.num_dir_bins)) heading_label_one_hot.scatter_(2, dir_class_targets.unsqueeze(-1), 1) heading_res_loss_weight = heading_label_one_hot * \ box_loss_weights.unsqueeze(-1) return (vote_targets, center_targets, size_res_targets, dir_class_targets, dir_res_targets, mask_targets, centerness_targets, corner3d_targets, vote_mask, positive_mask, negative_mask, centerness_weights, box_loss_weights, heading_res_loss_weight) def get_targets_single(self, points, gt_bboxes_3d, gt_labels_3d, pts_semantic_mask=None, pts_instance_mask=None, aggregated_points=None, seed_points=None): """Generate targets of ssd3d head for single batch. Args: points (torch.Tensor): Points of each batch. gt_bboxes_3d (:obj:`BaseInstance3DBoxes`): Ground truth boxes of each batch. gt_labels_3d (torch.Tensor): Labels of each batch. pts_semantic_mask (torch.Tensor): Point-wise semantic label of each batch. pts_instance_mask (torch.Tensor): Point-wise instance label of each batch. aggregated_points (torch.Tensor): Aggregated points from candidate points layer. seed_points (torch.Tensor): Seed points of candidate points. Returns: tuple[torch.Tensor]: Targets of ssd3d head. 
""" assert self.bbox_coder.with_rot or pts_semantic_mask is not None gt_bboxes_3d = gt_bboxes_3d.to(points.device) valid_gt = gt_labels_3d != -1 gt_bboxes_3d = gt_bboxes_3d[valid_gt] gt_labels_3d = gt_labels_3d[valid_gt] # Generate fake GT for empty scene if valid_gt.sum() == 0: vote_targets = points.new_zeros(self.num_candidates, 3) center_targets = points.new_zeros(self.num_candidates, 3) size_res_targets = points.new_zeros(self.num_candidates, 3) dir_class_targets = points.new_zeros( self.num_candidates, dtype=torch.int64) dir_res_targets = points.new_zeros(self.num_candidates) mask_targets = points.new_zeros( self.num_candidates, dtype=torch.int64) centerness_targets = points.new_zeros(self.num_candidates, self.num_classes) corner3d_targets = points.new_zeros(self.num_candidates, 8, 3) vote_mask = points.new_zeros(self.num_candidates, dtype=torch.bool) positive_mask = points.new_zeros( self.num_candidates, dtype=torch.bool) negative_mask = points.new_ones( self.num_candidates, dtype=torch.bool) return (vote_targets, center_targets, size_res_targets, dir_class_targets, dir_res_targets, mask_targets, centerness_targets, corner3d_targets, vote_mask, positive_mask, negative_mask) gt_corner3d = gt_bboxes_3d.corners (center_targets, size_targets, dir_class_targets, dir_res_targets) = self.bbox_coder.encode(gt_bboxes_3d, gt_labels_3d) points_mask, assignment = self._assign_targets_by_points_inside( gt_bboxes_3d, aggregated_points) center_targets = center_targets[assignment] size_res_targets = size_targets[assignment] mask_targets = gt_labels_3d[assignment] dir_class_targets = dir_class_targets[assignment] dir_res_targets = dir_res_targets[assignment] corner3d_targets = gt_corner3d[assignment] top_center_targets = center_targets.clone() top_center_targets[:, 2] += size_res_targets[:, 2] dist = torch.norm(aggregated_points - top_center_targets, dim=1) dist_mask = dist < self.train_cfg.pos_distance_thr positive_mask = (points_mask.max(1)[0] > 0) * dist_mask negative_mask = (points_mask.max(1)[0] == 0) # Centerness loss targets canonical_xyz = aggregated_points - center_targets if self.bbox_coder.with_rot: # TODO: Align points rotation implementation of # LiDARInstance3DBoxes and DepthInstance3DBoxes canonical_xyz = rotation_3d_in_axis( canonical_xyz.unsqueeze(0).transpose(0, 1), -gt_bboxes_3d.yaw[assignment], axis=2).squeeze(1) distance_front = torch.clamp( size_res_targets[:, 0] - canonical_xyz[:, 0], min=0) distance_back = torch.clamp( size_res_targets[:, 0] + canonical_xyz[:, 0], min=0) distance_left = torch.clamp( size_res_targets[:, 1] - canonical_xyz[:, 1], min=0) distance_right = torch.clamp( size_res_targets[:, 1] + canonical_xyz[:, 1], min=0) distance_top = torch.clamp( size_res_targets[:, 2] - canonical_xyz[:, 2], min=0) distance_bottom = torch.clamp( size_res_targets[:, 2] + canonical_xyz[:, 2], min=0) centerness_l = torch.min(distance_front, distance_back) / torch.max( distance_front, distance_back) centerness_w = torch.min(distance_left, distance_right) / torch.max( distance_left, distance_right) centerness_h = torch.min(distance_bottom, distance_top) / torch.max( distance_bottom, distance_top) centerness_targets = torch.clamp( centerness_l * centerness_w * centerness_h, min=0) centerness_targets = centerness_targets.pow(1 / 3.0) centerness_targets = torch.clamp(centerness_targets, min=0, max=1) proposal_num = centerness_targets.shape[0] one_hot_centerness_targets = centerness_targets.new_zeros( (proposal_num, self.num_classes)) one_hot_centerness_targets.scatter_(1, 
mask_targets.unsqueeze(-1), 1) centerness_targets = centerness_targets.unsqueeze( 1) * one_hot_centerness_targets # Vote loss targets enlarged_gt_bboxes_3d = gt_bboxes_3d.enlarged_box( self.train_cfg.expand_dims_length) enlarged_gt_bboxes_3d.tensor[:, 2] -= self.train_cfg.expand_dims_length vote_mask, vote_assignment = self._assign_targets_by_points_inside( enlarged_gt_bboxes_3d, seed_points) vote_targets = gt_bboxes_3d.gravity_center vote_targets = vote_targets[vote_assignment] - seed_points vote_mask = vote_mask.max(1)[0] > 0 return (vote_targets, center_targets, size_res_targets, dir_class_targets, dir_res_targets, mask_targets, centerness_targets, corner3d_targets, vote_mask, positive_mask, negative_mask) def get_bboxes(self, points, bbox_preds, input_metas, rescale=False): """Generate bboxes from 3DSSD head predictions. Args: points (torch.Tensor): Input points. bbox_preds (dict): Predictions from sdd3d head. input_metas (list[dict]): Point cloud and image's meta info. rescale (bool): Whether to rescale bboxes. Returns: list[tuple[torch.Tensor]]: Bounding boxes, scores and labels. """ # decode boxes sem_scores = F.sigmoid(bbox_preds['obj_scores']).transpose(1, 2) obj_scores = sem_scores.max(-1)[0] bbox3d = self.bbox_coder.decode(bbox_preds) batch_size = bbox3d.shape[0] results = list() for b in range(batch_size): bbox_selected, score_selected, labels = self.multiclass_nms_single( obj_scores[b], sem_scores[b], bbox3d[b], points[b, ..., :3], input_metas[b]) bbox = input_metas[b]['box_type_3d']( bbox_selected.clone(), box_dim=bbox_selected.shape[-1], with_yaw=self.bbox_coder.with_rot) results.append((bbox, score_selected, labels)) return results def multiclass_nms_single(self, obj_scores, sem_scores, bbox, points, input_meta): """Multi-class nms in single batch. Args: obj_scores (torch.Tensor): Objectness score of bounding boxes. sem_scores (torch.Tensor): Semantic class score of bounding boxes. bbox (torch.Tensor): Predicted bounding boxes. points (torch.Tensor): Input points. input_meta (dict): Point cloud and image's meta info. Returns: tuple[torch.Tensor]: Bounding boxes, scores and labels. 
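        Note:
            NMS is performed on axis-aligned boxes in the x-y plane: each
            predicted box is reduced to the min/max of its corners and
            ``batched_nms`` is applied per predicted class. The kept boxes
            are further limited to ``max_output_num`` and filtered by
            ``score_thr``; with ``per_class_proposal`` enabled, every
            selected box is duplicated once per semantic class with its
            objectness score.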
""" bbox = input_meta['box_type_3d']( bbox.clone(), box_dim=bbox.shape[-1], with_yaw=self.bbox_coder.with_rot, origin=(0.5, 0.5, 0.5)) if isinstance(bbox, (LiDARInstance3DBoxes, DepthInstance3DBoxes)): box_indices = bbox.points_in_boxes_all(points) nonempty_box_mask = box_indices.T.sum(1) >= 0 else: raise NotImplementedError('Unsupported bbox type!') corner3d = bbox.corners minmax_box3d = corner3d.new(torch.Size((corner3d.shape[0], 6))) minmax_box3d[:, :3] = torch.min(corner3d, dim=1)[0] minmax_box3d[:, 3:] = torch.max(corner3d, dim=1)[0] bbox_classes = torch.argmax(sem_scores, -1) nms_keep = batched_nms( minmax_box3d[nonempty_box_mask][:, [0, 1, 3, 4]], obj_scores[nonempty_box_mask], bbox_classes[nonempty_box_mask], self.test_cfg.nms_cfg)[1] if nms_keep.shape[0] > self.test_cfg.max_output_num: nms_keep = nms_keep[:self.test_cfg.max_output_num] # filter empty boxes and boxes with low score scores_mask = (obj_scores >= self.test_cfg.score_thr) nonempty_box_inds = torch.nonzero( nonempty_box_mask, as_tuple=False).flatten() nonempty_mask = torch.zeros_like(bbox_classes).scatter( 0, nonempty_box_inds[nms_keep], 1) selected = (nonempty_mask.bool() & scores_mask.bool()) if self.test_cfg.per_class_proposal: bbox_selected, score_selected, labels = [], [], [] for k in range(sem_scores.shape[-1]): bbox_selected.append(bbox[selected].tensor) score_selected.append(obj_scores[selected]) labels.append( torch.zeros_like(bbox_classes[selected]).fill_(k)) bbox_selected = torch.cat(bbox_selected, 0) score_selected = torch.cat(score_selected, 0) labels = torch.cat(labels, 0) else: bbox_selected = bbox[selected].tensor score_selected = obj_scores[selected] labels = bbox_classes[selected] return bbox_selected, score_selected, labels def _assign_targets_by_points_inside(self, bboxes_3d, points): """Compute assignment by checking whether point is inside bbox. Args: bboxes_3d (BaseInstance3DBoxes): Instance of bounding boxes. points (torch.Tensor): Points of a batch. Returns: tuple[torch.Tensor]: Flags indicating whether each point is inside bbox and the index of box where each point are in. """ if isinstance(bboxes_3d, (LiDARInstance3DBoxes, DepthInstance3DBoxes)): points_mask = bboxes_3d.points_in_boxes_all(points) assignment = points_mask.argmax(dim=-1) else: raise NotImplementedError('Unsupported bbox type!') return points_mask, assignment ================================================ FILE: mmdet3d/models/dense_heads/train_mixins.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import numpy as np import torch from mmdet3d.core import limit_period from mmdet.core import images_to_levels, multi_apply class AnchorTrainMixin(object): """Mixin class for target assigning of dense heads.""" def anchor_target_3d(self, anchor_list, gt_bboxes_list, input_metas, gt_bboxes_ignore_list=None, gt_labels_list=None, label_channels=1, num_classes=1, sampling=True): """Compute regression and classification targets for anchors. Args: anchor_list (list[list]): Multi level anchors of each image. gt_bboxes_list (list[:obj:`BaseInstance3DBoxes`]): Ground truth bboxes of each image. input_metas (list[dict]): Meta info of each image. gt_bboxes_ignore_list (list): Ignore list of gt bboxes. gt_labels_list (list[torch.Tensor]): Gt labels of batches. label_channels (int): The channel of labels. num_classes (int): The number of classes. sampling (bool): Whether to sample anchors. 
Returns: tuple (list, list, list, list, list, list, int, int): Anchor targets, including labels, label weights, bbox targets, bbox weights, direction targets, direction weights, number of positive anchors and number of negative anchors. """ num_imgs = len(input_metas) assert len(anchor_list) == num_imgs if isinstance(anchor_list[0][0], list): # sizes of anchors are different # anchor number of a single level num_level_anchors = [ sum([anchor.size(0) for anchor in anchors]) for anchors in anchor_list[0] ] for i in range(num_imgs): anchor_list[i] = anchor_list[i][0] else: # anchor number of multi levels num_level_anchors = [ anchors.view(-1, self.box_code_size).size(0) for anchors in anchor_list[0] ] # concat all level anchors and flags to a single tensor for i in range(num_imgs): anchor_list[i] = torch.cat(anchor_list[i]) # compute targets for each image if gt_bboxes_ignore_list is None: gt_bboxes_ignore_list = [None for _ in range(num_imgs)] if gt_labels_list is None: gt_labels_list = [None for _ in range(num_imgs)] (all_labels, all_label_weights, all_bbox_targets, all_bbox_weights, all_dir_targets, all_dir_weights, pos_inds_list, neg_inds_list) = multi_apply( self.anchor_target_3d_single, anchor_list, gt_bboxes_list, gt_bboxes_ignore_list, gt_labels_list, input_metas, label_channels=label_channels, num_classes=num_classes, sampling=sampling) # no valid anchors if any([labels is None for labels in all_labels]): return None # sampled anchors of all images num_total_pos = sum([max(inds.numel(), 1) for inds in pos_inds_list]) num_total_neg = sum([max(inds.numel(), 1) for inds in neg_inds_list]) # split targets to a list w.r.t. multiple levels labels_list = images_to_levels(all_labels, num_level_anchors) label_weights_list = images_to_levels(all_label_weights, num_level_anchors) bbox_targets_list = images_to_levels(all_bbox_targets, num_level_anchors) bbox_weights_list = images_to_levels(all_bbox_weights, num_level_anchors) dir_targets_list = images_to_levels(all_dir_targets, num_level_anchors) dir_weights_list = images_to_levels(all_dir_weights, num_level_anchors) return (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list, dir_targets_list, dir_weights_list, num_total_pos, num_total_neg) def anchor_target_3d_single(self, anchors, gt_bboxes, gt_bboxes_ignore, gt_labels, input_meta, label_channels=1, num_classes=1, sampling=True): """Compute targets of anchors in single batch. Args: anchors (torch.Tensor): Concatenated multi-level anchor. gt_bboxes (:obj:`BaseInstance3DBoxes`): Gt bboxes. gt_bboxes_ignore (torch.Tensor): Ignored gt bboxes. gt_labels (torch.Tensor): Gt class labels. input_meta (dict): Meta info of each image. label_channels (int): The channel of labels. num_classes (int): The number of classes. sampling (bool): Whether to sample anchors. Returns: tuple[torch.Tensor]: Anchor targets. 
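        Note:
            Three cases are handled below: a list of per-class assigners
            applied to slices of a single anchor tensor, a list of per-class
            assigners applied to class-aware anchors with different feature
            map sizes, and a single assigner applied to all anchors at once.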
""" if isinstance(self.bbox_assigner, list) and (not isinstance(anchors, list)): feat_size = anchors.size(0) * anchors.size(1) * anchors.size(2) rot_angles = anchors.size(-2) assert len(self.bbox_assigner) == anchors.size(-3) (total_labels, total_label_weights, total_bbox_targets, total_bbox_weights, total_dir_targets, total_dir_weights, total_pos_inds, total_neg_inds) = [], [], [], [], [], [], [], [] current_anchor_num = 0 for i, assigner in enumerate(self.bbox_assigner): current_anchors = anchors[..., i, :, :].reshape( -1, self.box_code_size) current_anchor_num += current_anchors.size(0) if self.assign_per_class: gt_per_cls = (gt_labels == i) anchor_targets = self.anchor_target_single_assigner( assigner, current_anchors, gt_bboxes[gt_per_cls, :], gt_bboxes_ignore, gt_labels[gt_per_cls], input_meta, num_classes, sampling) else: anchor_targets = self.anchor_target_single_assigner( assigner, current_anchors, gt_bboxes, gt_bboxes_ignore, gt_labels, input_meta, num_classes, sampling) (labels, label_weights, bbox_targets, bbox_weights, dir_targets, dir_weights, pos_inds, neg_inds) = anchor_targets total_labels.append(labels.reshape(feat_size, 1, rot_angles)) total_label_weights.append( label_weights.reshape(feat_size, 1, rot_angles)) total_bbox_targets.append( bbox_targets.reshape(feat_size, 1, rot_angles, anchors.size(-1))) total_bbox_weights.append( bbox_weights.reshape(feat_size, 1, rot_angles, anchors.size(-1))) total_dir_targets.append( dir_targets.reshape(feat_size, 1, rot_angles)) total_dir_weights.append( dir_weights.reshape(feat_size, 1, rot_angles)) total_pos_inds.append(pos_inds) total_neg_inds.append(neg_inds) total_labels = torch.cat(total_labels, dim=-2).reshape(-1) total_label_weights = torch.cat( total_label_weights, dim=-2).reshape(-1) total_bbox_targets = torch.cat( total_bbox_targets, dim=-3).reshape(-1, anchors.size(-1)) total_bbox_weights = torch.cat( total_bbox_weights, dim=-3).reshape(-1, anchors.size(-1)) total_dir_targets = torch.cat( total_dir_targets, dim=-2).reshape(-1) total_dir_weights = torch.cat( total_dir_weights, dim=-2).reshape(-1) total_pos_inds = torch.cat(total_pos_inds, dim=0).reshape(-1) total_neg_inds = torch.cat(total_neg_inds, dim=0).reshape(-1) return (total_labels, total_label_weights, total_bbox_targets, total_bbox_weights, total_dir_targets, total_dir_weights, total_pos_inds, total_neg_inds) elif isinstance(self.bbox_assigner, list) and isinstance( anchors, list): # class-aware anchors with different feature map sizes assert len(self.bbox_assigner) == len(anchors), \ 'The number of bbox assigners and anchors should be the same.' 
(total_labels, total_label_weights, total_bbox_targets, total_bbox_weights, total_dir_targets, total_dir_weights, total_pos_inds, total_neg_inds) = [], [], [], [], [], [], [], [] current_anchor_num = 0 for i, assigner in enumerate(self.bbox_assigner): current_anchors = anchors[i] current_anchor_num += current_anchors.size(0) if self.assign_per_class: gt_per_cls = (gt_labels == i) anchor_targets = self.anchor_target_single_assigner( assigner, current_anchors, gt_bboxes[gt_per_cls, :], gt_bboxes_ignore, gt_labels[gt_per_cls], input_meta, num_classes, sampling) else: anchor_targets = self.anchor_target_single_assigner( assigner, current_anchors, gt_bboxes, gt_bboxes_ignore, gt_labels, input_meta, num_classes, sampling) (labels, label_weights, bbox_targets, bbox_weights, dir_targets, dir_weights, pos_inds, neg_inds) = anchor_targets total_labels.append(labels) total_label_weights.append(label_weights) total_bbox_targets.append( bbox_targets.reshape(-1, anchors[i].size(-1))) total_bbox_weights.append( bbox_weights.reshape(-1, anchors[i].size(-1))) total_dir_targets.append(dir_targets) total_dir_weights.append(dir_weights) total_pos_inds.append(pos_inds) total_neg_inds.append(neg_inds) total_labels = torch.cat(total_labels, dim=0) total_label_weights = torch.cat(total_label_weights, dim=0) total_bbox_targets = torch.cat(total_bbox_targets, dim=0) total_bbox_weights = torch.cat(total_bbox_weights, dim=0) total_dir_targets = torch.cat(total_dir_targets, dim=0) total_dir_weights = torch.cat(total_dir_weights, dim=0) total_pos_inds = torch.cat(total_pos_inds, dim=0) total_neg_inds = torch.cat(total_neg_inds, dim=0) return (total_labels, total_label_weights, total_bbox_targets, total_bbox_weights, total_dir_targets, total_dir_weights, total_pos_inds, total_neg_inds) else: return self.anchor_target_single_assigner(self.bbox_assigner, anchors, gt_bboxes, gt_bboxes_ignore, gt_labels, input_meta, num_classes, sampling) def anchor_target_single_assigner(self, bbox_assigner, anchors, gt_bboxes, gt_bboxes_ignore, gt_labels, input_meta, num_classes=1, sampling=True): """Assign anchors and encode positive anchors. Args: bbox_assigner (BaseAssigner): assign positive and negative boxes. anchors (torch.Tensor): Concatenated multi-level anchor. gt_bboxes (:obj:`BaseInstance3DBoxes`): Gt bboxes. gt_bboxes_ignore (torch.Tensor): Ignored gt bboxes. gt_labels (torch.Tensor): Gt class labels. input_meta (dict): Meta info of each image. num_classes (int): The number of classes. sampling (bool): Whether to sample anchors. Returns: tuple[torch.Tensor]: Anchor targets. 
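        Note:
            When ``gt_labels`` is given, anchors are first labelled as
            background (``num_classes``); positive anchors then receive the
            label of their assigned ground-truth box (or ``1`` when
            ``gt_labels`` is ``None``) together with the encoded bbox and
            direction targets.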
""" anchors = anchors.reshape(-1, anchors.size(-1)) num_valid_anchors = anchors.shape[0] bbox_targets = torch.zeros_like(anchors) bbox_weights = torch.zeros_like(anchors) dir_targets = anchors.new_zeros((anchors.shape[0]), dtype=torch.long) dir_weights = anchors.new_zeros((anchors.shape[0]), dtype=torch.float) labels = anchors.new_zeros(num_valid_anchors, dtype=torch.long) label_weights = anchors.new_zeros(num_valid_anchors, dtype=torch.float) if len(gt_bboxes) > 0: if not isinstance(gt_bboxes, torch.Tensor): gt_bboxes = gt_bboxes.tensor.to(anchors.device) assign_result = bbox_assigner.assign(anchors, gt_bboxes, gt_bboxes_ignore, gt_labels) sampling_result = self.bbox_sampler.sample(assign_result, anchors, gt_bboxes) pos_inds = sampling_result.pos_inds neg_inds = sampling_result.neg_inds else: pos_inds = torch.nonzero( anchors.new_zeros((anchors.shape[0], ), dtype=torch.bool) > 0, as_tuple=False).squeeze(-1).unique() neg_inds = torch.nonzero( anchors.new_zeros((anchors.shape[0], ), dtype=torch.bool) == 0, as_tuple=False).squeeze(-1).unique() if gt_labels is not None: labels += num_classes if len(pos_inds) > 0: pos_bbox_targets = self.bbox_coder.encode( sampling_result.pos_bboxes, sampling_result.pos_gt_bboxes) pos_dir_targets = get_direction_target( sampling_result.pos_bboxes, pos_bbox_targets, self.dir_offset, self.dir_limit_offset, one_hot=False) bbox_targets[pos_inds, :] = pos_bbox_targets bbox_weights[pos_inds, :] = 1.0 dir_targets[pos_inds] = pos_dir_targets dir_weights[pos_inds] = 1.0 if gt_labels is None: labels[pos_inds] = 1 else: labels[pos_inds] = gt_labels[ sampling_result.pos_assigned_gt_inds] if self.train_cfg.pos_weight <= 0: label_weights[pos_inds] = 1.0 else: label_weights[pos_inds] = self.train_cfg.pos_weight if len(neg_inds) > 0: label_weights[neg_inds] = 1.0 return (labels, label_weights, bbox_targets, bbox_weights, dir_targets, dir_weights, pos_inds, neg_inds) def get_direction_target(anchors, reg_targets, dir_offset=0, dir_limit_offset=0, num_bins=2, one_hot=True): """Encode direction to 0 ~ num_bins-1. Args: anchors (torch.Tensor): Concatenated multi-level anchor. reg_targets (torch.Tensor): Bbox regression targets. dir_offset (int): Direction offset. num_bins (int): Number of bins to divide 2*PI. one_hot (bool): Whether to encode as one hot. Returns: torch.Tensor: Encoded direction targets. """ rot_gt = reg_targets[..., 6] + anchors[..., 6] offset_rot = limit_period(rot_gt - dir_offset, dir_limit_offset, 2 * np.pi) dir_cls_targets = torch.floor(offset_rot / (2 * np.pi / num_bins)).long() dir_cls_targets = torch.clamp(dir_cls_targets, min=0, max=num_bins - 1) if one_hot: dir_targets = torch.zeros( *list(dir_cls_targets.shape), num_bins, dtype=anchors.dtype, device=dir_cls_targets.device) dir_targets.scatter_(dir_cls_targets.unsqueeze(dim=-1).long(), 1.0) dir_cls_targets = dir_targets return dir_cls_targets ================================================ FILE: mmdet3d/models/dense_heads/vote_head.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. 
import numpy as np import torch from mmcv.ops import furthest_point_sample from mmcv.runner import BaseModule, force_fp32 from torch.nn import functional as F from mmdet3d.core.post_processing import aligned_3d_nms from mmdet3d.models.losses import chamfer_distance from mmdet3d.models.model_utils import VoteModule from mmdet3d.ops import build_sa_module from mmdet.core import build_bbox_coder, multi_apply from ..builder import HEADS, build_loss from .base_conv_bbox_head import BaseConvBboxHead @HEADS.register_module() class VoteHead(BaseModule): r"""Bbox head of `Votenet `_. Args: num_classes (int): The number of class. bbox_coder (:obj:`BaseBBoxCoder`): Bbox coder for encoding and decoding boxes. train_cfg (dict): Config for training. test_cfg (dict): Config for testing. vote_module_cfg (dict): Config of VoteModule for point-wise votes. vote_aggregation_cfg (dict): Config of vote aggregation layer. pred_layer_cfg (dict): Config of classfication and regression prediction layers. conv_cfg (dict): Config of convolution in prediction layer. norm_cfg (dict): Config of BN in prediction layer. objectness_loss (dict): Config of objectness loss. center_loss (dict): Config of center loss. dir_class_loss (dict): Config of direction classification loss. dir_res_loss (dict): Config of direction residual regression loss. size_class_loss (dict): Config of size classification loss. size_res_loss (dict): Config of size residual regression loss. semantic_loss (dict): Config of point-wise semantic segmentation loss. """ def __init__(self, num_classes, bbox_coder, train_cfg=None, test_cfg=None, vote_module_cfg=None, vote_aggregation_cfg=None, pred_layer_cfg=None, conv_cfg=dict(type='Conv1d'), norm_cfg=dict(type='BN1d'), objectness_loss=None, center_loss=None, dir_class_loss=None, dir_res_loss=None, size_class_loss=None, size_res_loss=None, semantic_loss=None, iou_loss=None, init_cfg=None): super(VoteHead, self).__init__(init_cfg=init_cfg) self.num_classes = num_classes self.train_cfg = train_cfg self.test_cfg = test_cfg self.gt_per_seed = vote_module_cfg['gt_per_seed'] self.num_proposal = vote_aggregation_cfg['num_point'] self.objectness_loss = build_loss(objectness_loss) self.center_loss = build_loss(center_loss) self.dir_res_loss = build_loss(dir_res_loss) self.dir_class_loss = build_loss(dir_class_loss) self.size_res_loss = build_loss(size_res_loss) if size_class_loss is not None: self.size_class_loss = build_loss(size_class_loss) if semantic_loss is not None: self.semantic_loss = build_loss(semantic_loss) if iou_loss is not None: self.iou_loss = build_loss(iou_loss) else: self.iou_loss = None self.bbox_coder = build_bbox_coder(bbox_coder) self.num_sizes = self.bbox_coder.num_sizes self.num_dir_bins = self.bbox_coder.num_dir_bins self.vote_module = VoteModule(**vote_module_cfg) self.vote_aggregation = build_sa_module(vote_aggregation_cfg) self.fp16_enabled = False # Bbox classification and regression self.conv_pred = BaseConvBboxHead( **pred_layer_cfg, num_cls_out_channels=self._get_cls_out_channels(), num_reg_out_channels=self._get_reg_out_channels()) def _get_cls_out_channels(self): """Return the channel number of classification outputs.""" # Class numbers (k) + objectness (2) return self.num_classes + 2 def _get_reg_out_channels(self): """Return the channel number of regression outputs.""" # Objectness scores (2), center residual (3), # heading class+residual (num_dir_bins*2), # size class+residual(num_sizes*4) return 3 + self.num_dir_bins * 2 + self.num_sizes * 4 def _extract_input(self, feat_dict): 
"""Extract inputs from features dictionary. Args: feat_dict (dict): Feature dict from backbone. Returns: torch.Tensor: Coordinates of input points. torch.Tensor: Features of input points. torch.Tensor: Indices of input points. """ # for imvotenet if 'seed_points' in feat_dict and \ 'seed_features' in feat_dict and \ 'seed_indices' in feat_dict: seed_points = feat_dict['seed_points'] seed_features = feat_dict['seed_features'] seed_indices = feat_dict['seed_indices'] # for votenet else: seed_points = feat_dict['fp_xyz'][-1] seed_features = feat_dict['fp_features'][-1] seed_indices = feat_dict['fp_indices'][-1] return seed_points, seed_features, seed_indices def forward(self, feat_dict, sample_mod): """Forward pass. Note: The forward of VoteHead is divided into 4 steps: 1. Generate vote_points from seed_points. 2. Aggregate vote_points. 3. Predict bbox and score. 4. Decode predictions. Args: feat_dict (dict): Feature dict from backbone. sample_mod (str): Sample mode for vote aggregation layer. valid modes are "vote", "seed", "random" and "spec". Returns: dict: Predictions of vote head. """ assert sample_mod in ['vote', 'seed', 'random', 'spec'] seed_points, seed_features, seed_indices = self._extract_input( feat_dict) # 1. generate vote_points from seed_points vote_points, vote_features, vote_offset = self.vote_module( seed_points, seed_features) results = dict( seed_points=seed_points, seed_indices=seed_indices, vote_points=vote_points, vote_features=vote_features, vote_offset=vote_offset) # 2. aggregate vote_points if sample_mod == 'vote': # use fps in vote_aggregation aggregation_inputs = dict( points_xyz=vote_points, features=vote_features) elif sample_mod == 'seed': # FPS on seed and choose the votes corresponding to the seeds sample_indices = furthest_point_sample(seed_points, self.num_proposal) aggregation_inputs = dict( points_xyz=vote_points, features=vote_features, indices=sample_indices) elif sample_mod == 'random': # Random sampling from the votes batch_size, num_seed = seed_points.shape[:2] sample_indices = seed_points.new_tensor( torch.randint(0, num_seed, (batch_size, self.num_proposal)), dtype=torch.int32) aggregation_inputs = dict( points_xyz=vote_points, features=vote_features, indices=sample_indices) elif sample_mod == 'spec': # Specify the new center in vote_aggregation aggregation_inputs = dict( points_xyz=seed_points, features=seed_features, target_xyz=vote_points) else: raise NotImplementedError( f'Sample mode {sample_mod} is not supported!') vote_aggregation_ret = self.vote_aggregation(**aggregation_inputs) aggregated_points, features, aggregated_indices = vote_aggregation_ret results['aggregated_points'] = aggregated_points results['aggregated_features'] = features results['aggregated_indices'] = aggregated_indices # 3. predict bbox and score cls_predictions, reg_predictions = self.conv_pred(features) # 4. decode predictions decode_res = self.bbox_coder.split_pred(cls_predictions, reg_predictions, aggregated_points) results.update(decode_res) return results @force_fp32(apply_to=('bbox_preds', )) def loss(self, bbox_preds, points, gt_bboxes_3d, gt_labels_3d, pts_semantic_mask=None, pts_instance_mask=None, img_metas=None, gt_bboxes_ignore=None, ret_target=False): """Compute loss. Args: bbox_preds (dict): Predictions from forward of vote head. points (list[torch.Tensor]): Input points. gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth bboxes of each sample. gt_labels_3d (list[torch.Tensor]): Labels of each sample. 
pts_semantic_mask (list[torch.Tensor]): Point-wise semantic mask. pts_instance_mask (list[torch.Tensor]): Point-wise instance mask. img_metas (list[dict]): Contain pcd and img's meta info. gt_bboxes_ignore (list[torch.Tensor]): Specify which bounding. ret_target (Bool): Return targets or not. Returns: dict: Losses of Votenet. """ targets = self.get_targets(points, gt_bboxes_3d, gt_labels_3d, pts_semantic_mask, pts_instance_mask, bbox_preds) (vote_targets, vote_target_masks, size_class_targets, size_res_targets, dir_class_targets, dir_res_targets, center_targets, assigned_center_targets, mask_targets, valid_gt_masks, objectness_targets, objectness_weights, box_loss_weights, valid_gt_weights) = targets # calculate vote loss vote_loss = self.vote_module.get_loss(bbox_preds['seed_points'], bbox_preds['vote_points'], bbox_preds['seed_indices'], vote_target_masks, vote_targets) # calculate objectness loss objectness_loss = self.objectness_loss( bbox_preds['obj_scores'].transpose(2, 1), objectness_targets, weight=objectness_weights) # calculate center loss source2target_loss, target2source_loss = self.center_loss( bbox_preds['center'], center_targets, src_weight=box_loss_weights, dst_weight=valid_gt_weights) center_loss = source2target_loss + target2source_loss # calculate direction class loss dir_class_loss = self.dir_class_loss( bbox_preds['dir_class'].transpose(2, 1), dir_class_targets, weight=box_loss_weights) # calculate direction residual loss batch_size, proposal_num = size_class_targets.shape[:2] heading_label_one_hot = vote_targets.new_zeros( (batch_size, proposal_num, self.num_dir_bins)) heading_label_one_hot.scatter_(2, dir_class_targets.unsqueeze(-1), 1) dir_res_norm = torch.sum( bbox_preds['dir_res_norm'] * heading_label_one_hot, -1) dir_res_loss = self.dir_res_loss( dir_res_norm, dir_res_targets, weight=box_loss_weights) # calculate size class loss size_class_loss = self.size_class_loss( bbox_preds['size_class'].transpose(2, 1), size_class_targets, weight=box_loss_weights) # calculate size residual loss one_hot_size_targets = vote_targets.new_zeros( (batch_size, proposal_num, self.num_sizes)) one_hot_size_targets.scatter_(2, size_class_targets.unsqueeze(-1), 1) one_hot_size_targets_expand = one_hot_size_targets.unsqueeze( -1).repeat(1, 1, 1, 3).contiguous() size_residual_norm = torch.sum( bbox_preds['size_res_norm'] * one_hot_size_targets_expand, 2) box_loss_weights_expand = box_loss_weights.unsqueeze(-1).repeat( 1, 1, 3) size_res_loss = self.size_res_loss( size_residual_norm, size_res_targets, weight=box_loss_weights_expand) # calculate semantic loss semantic_loss = self.semantic_loss( bbox_preds['sem_scores'].transpose(2, 1), mask_targets, weight=box_loss_weights) losses = dict( vote_loss=vote_loss, objectness_loss=objectness_loss, semantic_loss=semantic_loss, center_loss=center_loss, dir_class_loss=dir_class_loss, dir_res_loss=dir_res_loss, size_class_loss=size_class_loss, size_res_loss=size_res_loss) if self.iou_loss: corners_pred = self.bbox_coder.decode_corners( bbox_preds['center'], size_residual_norm, one_hot_size_targets_expand) corners_target = self.bbox_coder.decode_corners( assigned_center_targets, size_res_targets, one_hot_size_targets_expand) iou_loss = self.iou_loss( corners_pred, corners_target, weight=box_loss_weights) losses['iou_loss'] = iou_loss if ret_target: losses['targets'] = targets return losses def get_targets(self, points, gt_bboxes_3d, gt_labels_3d, pts_semantic_mask=None, pts_instance_mask=None, bbox_preds=None): """Generate targets of vote head. 
Args: points (list[torch.Tensor]): Points of each batch. gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth bboxes of each batch. gt_labels_3d (list[torch.Tensor]): Labels of each batch. pts_semantic_mask (list[torch.Tensor]): Point-wise semantic label of each batch. pts_instance_mask (list[torch.Tensor]): Point-wise instance label of each batch. bbox_preds (torch.Tensor): Bounding box predictions of vote head. Returns: tuple[torch.Tensor]: Targets of vote head. """ # find empty example valid_gt_masks = list() gt_num = list() for index in range(len(gt_labels_3d)): if len(gt_labels_3d[index]) == 0: fake_box = gt_bboxes_3d[index].tensor.new_zeros( 1, gt_bboxes_3d[index].tensor.shape[-1]) gt_bboxes_3d[index] = gt_bboxes_3d[index].new_box(fake_box) gt_labels_3d[index] = gt_labels_3d[index].new_zeros(1) valid_gt_masks.append(gt_labels_3d[index].new_zeros(1)) gt_num.append(1) else: valid_gt_masks.append(gt_labels_3d[index].new_ones( gt_labels_3d[index].shape)) gt_num.append(gt_labels_3d[index].shape[0]) max_gt_num = max(gt_num) if pts_semantic_mask is None: pts_semantic_mask = [None for i in range(len(gt_labels_3d))] pts_instance_mask = [None for i in range(len(gt_labels_3d))] aggregated_points = [ bbox_preds['aggregated_points'][i] for i in range(len(gt_labels_3d)) ] (vote_targets, vote_target_masks, size_class_targets, size_res_targets, dir_class_targets, dir_res_targets, center_targets, assigned_center_targets, mask_targets, objectness_targets, objectness_masks) = multi_apply(self.get_targets_single, points, gt_bboxes_3d, gt_labels_3d, pts_semantic_mask, pts_instance_mask, aggregated_points) # pad targets as original code of votenet. for index in range(len(gt_labels_3d)): pad_num = max_gt_num - gt_labels_3d[index].shape[0] center_targets[index] = F.pad(center_targets[index], (0, 0, 0, pad_num)) valid_gt_masks[index] = F.pad(valid_gt_masks[index], (0, pad_num)) vote_targets = torch.stack(vote_targets) vote_target_masks = torch.stack(vote_target_masks) center_targets = torch.stack(center_targets) valid_gt_masks = torch.stack(valid_gt_masks) assigned_center_targets = torch.stack(assigned_center_targets) objectness_targets = torch.stack(objectness_targets) objectness_weights = torch.stack(objectness_masks) objectness_weights /= (torch.sum(objectness_weights) + 1e-6) box_loss_weights = objectness_targets.float() / ( torch.sum(objectness_targets).float() + 1e-6) valid_gt_weights = valid_gt_masks.float() / ( torch.sum(valid_gt_masks.float()) + 1e-6) dir_class_targets = torch.stack(dir_class_targets) dir_res_targets = torch.stack(dir_res_targets) size_class_targets = torch.stack(size_class_targets) size_res_targets = torch.stack(size_res_targets) mask_targets = torch.stack(mask_targets) return (vote_targets, vote_target_masks, size_class_targets, size_res_targets, dir_class_targets, dir_res_targets, center_targets, assigned_center_targets, mask_targets, valid_gt_masks, objectness_targets, objectness_weights, box_loss_weights, valid_gt_weights) def get_targets_single(self, points, gt_bboxes_3d, gt_labels_3d, pts_semantic_mask=None, pts_instance_mask=None, aggregated_points=None): """Generate targets of vote head for single batch. Args: points (torch.Tensor): Points of each batch. gt_bboxes_3d (:obj:`BaseInstance3DBoxes`): Ground truth boxes of each batch. gt_labels_3d (torch.Tensor): Labels of each batch. pts_semantic_mask (torch.Tensor): Point-wise semantic label of each batch. pts_instance_mask (torch.Tensor): Point-wise instance label of each batch. 
aggregated_points (torch.Tensor): Aggregated points from vote aggregation layer. Returns: tuple[torch.Tensor]: Targets of vote head. """ assert self.bbox_coder.with_rot or pts_semantic_mask is not None gt_bboxes_3d = gt_bboxes_3d.to(points.device) # generate votes target num_points = points.shape[0] if self.bbox_coder.with_rot: vote_targets = points.new_zeros([num_points, 3 * self.gt_per_seed]) vote_target_masks = points.new_zeros([num_points], dtype=torch.long) vote_target_idx = points.new_zeros([num_points], dtype=torch.long) box_indices_all = gt_bboxes_3d.points_in_boxes_all(points) for i in range(gt_labels_3d.shape[0]): box_indices = box_indices_all[:, i] indices = torch.nonzero( box_indices, as_tuple=False).squeeze(-1) selected_points = points[indices] vote_target_masks[indices] = 1 vote_targets_tmp = vote_targets[indices] votes = gt_bboxes_3d.gravity_center[i].unsqueeze( 0) - selected_points[:, :3] for j in range(self.gt_per_seed): column_indices = torch.nonzero( vote_target_idx[indices] == j, as_tuple=False).squeeze(-1) vote_targets_tmp[column_indices, int(j * 3):int(j * 3 + 3)] = votes[column_indices] if j == 0: vote_targets_tmp[column_indices] = votes[ column_indices].repeat(1, self.gt_per_seed) vote_targets[indices] = vote_targets_tmp vote_target_idx[indices] = torch.clamp( vote_target_idx[indices] + 1, max=2) elif pts_semantic_mask is not None: vote_targets = points.new_zeros([num_points, 3]) vote_target_masks = points.new_zeros([num_points], dtype=torch.long) for i in torch.unique(pts_instance_mask): indices = torch.nonzero( pts_instance_mask == i, as_tuple=False).squeeze(-1) if pts_semantic_mask[indices[0]] < self.num_classes: selected_points = points[indices, :3] center = 0.5 * ( selected_points.min(0)[0] + selected_points.max(0)[0]) vote_targets[indices, :] = center - selected_points vote_target_masks[indices] = 1 vote_targets = vote_targets.repeat((1, self.gt_per_seed)) else: raise NotImplementedError (center_targets, size_class_targets, size_res_targets, dir_class_targets, dir_res_targets) = self.bbox_coder.encode(gt_bboxes_3d, gt_labels_3d) proposal_num = aggregated_points.shape[0] distance1, _, assignment, _ = chamfer_distance( aggregated_points.unsqueeze(0), center_targets.unsqueeze(0), reduction='none') assignment = assignment.squeeze(0) euclidean_distance1 = torch.sqrt(distance1.squeeze(0) + 1e-6) objectness_targets = points.new_zeros((proposal_num), dtype=torch.long) objectness_targets[ euclidean_distance1 < self.train_cfg['pos_distance_thr']] = 1 objectness_masks = points.new_zeros((proposal_num)) objectness_masks[ euclidean_distance1 < self.train_cfg['pos_distance_thr']] = 1.0 objectness_masks[ euclidean_distance1 > self.train_cfg['neg_distance_thr']] = 1.0 dir_class_targets = dir_class_targets[assignment] dir_res_targets = dir_res_targets[assignment] dir_res_targets /= (np.pi / self.num_dir_bins) size_class_targets = size_class_targets[assignment] size_res_targets = size_res_targets[assignment] one_hot_size_targets = gt_bboxes_3d.tensor.new_zeros( (proposal_num, self.num_sizes)) one_hot_size_targets.scatter_(1, size_class_targets.unsqueeze(-1), 1) one_hot_size_targets = one_hot_size_targets.unsqueeze(-1).repeat( 1, 1, 3) mean_sizes = size_res_targets.new_tensor( self.bbox_coder.mean_sizes).unsqueeze(0) pos_mean_sizes = torch.sum(one_hot_size_targets * mean_sizes, 1) size_res_targets /= pos_mean_sizes mask_targets = gt_labels_3d[assignment] assigned_center_targets = center_targets[assignment] return (vote_targets, vote_target_masks, size_class_targets, 
size_res_targets, dir_class_targets, dir_res_targets, center_targets, assigned_center_targets, mask_targets.long(), objectness_targets, objectness_masks) def get_bboxes(self, points, bbox_preds, input_metas, rescale=False, use_nms=True): """Generate bboxes from vote head predictions. Args: points (torch.Tensor): Input points. bbox_preds (dict): Predictions from vote head. input_metas (list[dict]): Point cloud and image's meta info. rescale (bool): Whether to rescale bboxes. use_nms (bool): Whether to apply NMS, skip nms postprocessing while using vote head in rpn stage. Returns: list[tuple[torch.Tensor]]: Bounding boxes, scores and labels. """ # decode boxes obj_scores = F.softmax(bbox_preds['obj_scores'], dim=-1)[..., -1] sem_scores = F.softmax(bbox_preds['sem_scores'], dim=-1) bbox3d = self.bbox_coder.decode(bbox_preds) if use_nms: batch_size = bbox3d.shape[0] results = list() for b in range(batch_size): bbox_selected, score_selected, labels = \ self.multiclass_nms_single(obj_scores[b], sem_scores[b], bbox3d[b], points[b, ..., :3], input_metas[b]) bbox = input_metas[b]['box_type_3d']( bbox_selected, box_dim=bbox_selected.shape[-1], with_yaw=self.bbox_coder.with_rot) results.append((bbox, score_selected, labels)) return results else: return bbox3d def multiclass_nms_single(self, obj_scores, sem_scores, bbox, points, input_meta): """Multi-class nms in single batch. Args: obj_scores (torch.Tensor): Objectness score of bounding boxes. sem_scores (torch.Tensor): semantic class score of bounding boxes. bbox (torch.Tensor): Predicted bounding boxes. points (torch.Tensor): Input points. input_meta (dict): Point cloud and image's meta info. Returns: tuple[torch.Tensor]: Bounding boxes, scores and labels. """ bbox = input_meta['box_type_3d']( bbox, box_dim=bbox.shape[-1], with_yaw=self.bbox_coder.with_rot, origin=(0.5, 0.5, 0.5)) box_indices = bbox.points_in_boxes_all(points) corner3d = bbox.corners minmax_box3d = corner3d.new(torch.Size((corner3d.shape[0], 6))) minmax_box3d[:, :3] = torch.min(corner3d, dim=1)[0] minmax_box3d[:, 3:] = torch.max(corner3d, dim=1)[0] nonempty_box_mask = box_indices.T.sum(1) > 5 bbox_classes = torch.argmax(sem_scores, -1) nms_selected = aligned_3d_nms(minmax_box3d[nonempty_box_mask], obj_scores[nonempty_box_mask], bbox_classes[nonempty_box_mask], self.test_cfg.nms_thr) # filter empty boxes and boxes with low score scores_mask = (obj_scores > self.test_cfg.score_thr) nonempty_box_inds = torch.nonzero( nonempty_box_mask, as_tuple=False).flatten() nonempty_mask = torch.zeros_like(bbox_classes).scatter( 0, nonempty_box_inds[nms_selected], 1) selected = (nonempty_mask.bool() & scores_mask.bool()) if self.test_cfg.per_class_proposal: bbox_selected, score_selected, labels = [], [], [] for k in range(sem_scores.shape[-1]): bbox_selected.append(bbox[selected].tensor) score_selected.append(obj_scores[selected] * sem_scores[selected][:, k]) labels.append( torch.zeros_like(bbox_classes[selected]).fill_(k)) bbox_selected = torch.cat(bbox_selected, 0) score_selected = torch.cat(score_selected, 0) labels = torch.cat(labels, 0) else: bbox_selected = bbox[selected].tensor score_selected = obj_scores[selected] labels = bbox_classes[selected] return bbox_selected, score_selected, labels ================================================ FILE: mmdet3d/models/detectors/__init__.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. 
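# --------------------------------------------------------------------------
# Note (added for clarity, not part of the original repository): the imports
# below matter even when a class is never referenced directly in this package,
# because importing a module executes its @DETECTORS.register_module()
# decorator and makes the class buildable from the string `type` field of a
# config. A toy, self-contained version of that registry mechanism (all names
# here are made up) looks roughly like this:
#
#   _TOY_REGISTRY = {}
#
#   def register(cls):
#       _TOY_REGISTRY[cls.__name__] = cls
#       return cls
#
#   @register
#   class ToyDetector:
#       def __init__(self, num_classes):
#           self.num_classes = num_classes
#
#   def build_from_cfg(cfg):
#       cfg = dict(cfg)                      # copy so pop() does not mutate input
#       return _TOY_REGISTRY[cfg.pop('type')](**cfg)
#
#   assert build_from_cfg(dict(type='ToyDetector', num_classes=10)).num_classes == 10
# --------------------------------------------------------------------------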
from .base import Base3DDetector from .bevdet import BEVDepth4D, BEVDet, BEVDet4D, BEVDetTRT from .centerpoint import CenterPoint from .dynamic_voxelnet import DynamicVoxelNet from .fcos_mono3d import FCOSMono3D from .groupfree3dnet import GroupFree3DNet from .h3dnet import H3DNet from .imvotenet import ImVoteNet from .imvoxelnet import ImVoxelNet from .mink_single_stage import MinkSingleStage3DDetector from .mvx_faster_rcnn import DynamicMVXFasterRCNN, MVXFasterRCNN from .mvx_two_stage import MVXTwoStageDetector from .parta2 import PartA2 from .point_rcnn import PointRCNN from .sassd import SASSD from .single_stage_mono3d import SingleStageMono3DDetector from .smoke_mono3d import SMOKEMono3D from .ssd3dnet import SSD3DNet from .votenet import VoteNet from .voxelnet import VoxelNet __all__ = [ 'Base3DDetector', 'VoxelNet', 'DynamicVoxelNet', 'MVXTwoStageDetector', 'DynamicMVXFasterRCNN', 'MVXFasterRCNN', 'PartA2', 'VoteNet', 'H3DNet', 'CenterPoint', 'SSD3DNet', 'ImVoteNet', 'SingleStageMono3DDetector', 'FCOSMono3D', 'ImVoxelNet', 'GroupFree3DNet', 'PointRCNN', 'SMOKEMono3D', 'MinkSingleStage3DDetector', 'SASSD', 'BEVDet', 'BEVDet4D', 'BEVDepth4D', 'BEVDetTRT' ] ================================================ FILE: mmdet3d/models/detectors/base.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. from os import path as osp import mmcv import torch from mmcv.parallel import DataContainer as DC from mmcv.runner import auto_fp16 from mmdet3d.core import Box3DMode, Coord3DMode, show_result from mmdet.models.detectors import BaseDetector class Base3DDetector(BaseDetector): """Base class for detectors.""" def forward_test(self, points, img_metas, img=None, **kwargs): """ Args: points (list[torch.Tensor]): the outer list indicates test-time augmentations and inner torch.Tensor should have a shape NxC, which contains all points in the batch. img_metas (list[list[dict]]): the outer list indicates test-time augs (multiscale, flip, etc.) and the inner list indicates images in a batch img (list[torch.Tensor], optional): the outer list indicates test-time augmentations and inner torch.Tensor should have a shape NxCxHxW, which contains all images in the batch. Defaults to None. """ for var, name in [(points, 'points'), (img_metas, 'img_metas')]: if not isinstance(var, list): raise TypeError('{} must be a list, but got {}'.format( name, type(var))) num_augs = len(points) if num_augs != len(img_metas): raise ValueError( 'num of augmentations ({}) != num of image meta ({})'.format( len(points), len(img_metas))) if num_augs == 1: img = [img] if img is None else img return self.simple_test(points[0], img_metas[0], img[0], **kwargs) else: return self.aug_test(points, img_metas, img, **kwargs) @auto_fp16(apply_to=('img', 'points')) def forward(self, return_loss=True, **kwargs): """Calls either forward_train or forward_test depending on whether return_loss=True. Note this setting will change the expected inputs. When `return_loss=True`, img and img_metas are single-nested (i.e. torch.Tensor and list[dict]), and when `resturn_loss=False`, img and img_metas should be double nested (i.e. list[torch.Tensor], list[list[dict]]), with the outer list indicating test time augmentations. """ if return_loss: return self.forward_train(**kwargs) else: return self.forward_test(**kwargs) def show_results(self, data, result, out_dir, show=False, score_thr=None): """Results visualization. Args: data (list[dict]): Input points and the information of the sample. 
result (list[dict]): Prediction results. out_dir (str): Output directory of visualization result. show (bool, optional): Determines whether you are going to show result by open3d. Defaults to False. score_thr (float, optional): Score threshold of bounding boxes. Default to None. """ for batch_id in range(len(result)): if isinstance(data['points'][0], DC): points = data['points'][0]._data[0][batch_id].numpy() elif mmcv.is_list_of(data['points'][0], torch.Tensor): points = data['points'][0][batch_id] else: ValueError(f"Unsupported data type {type(data['points'][0])} " f'for visualization!') if isinstance(data['img_metas'][0], DC): pts_filename = data['img_metas'][0]._data[0][batch_id][ 'pts_filename'] box_mode_3d = data['img_metas'][0]._data[0][batch_id][ 'box_mode_3d'] elif mmcv.is_list_of(data['img_metas'][0], dict): pts_filename = data['img_metas'][0][batch_id]['pts_filename'] box_mode_3d = data['img_metas'][0][batch_id]['box_mode_3d'] else: ValueError( f"Unsupported data type {type(data['img_metas'][0])} " f'for visualization!') file_name = osp.split(pts_filename)[-1].split('.')[0] assert out_dir is not None, 'Expect out_dir, got none.' pred_bboxes = result[batch_id]['boxes_3d'] pred_labels = result[batch_id]['labels_3d'] if score_thr is not None: mask = result[batch_id]['scores_3d'] > score_thr pred_bboxes = pred_bboxes[mask] pred_labels = pred_labels[mask] # for now we convert points and bbox into depth mode if (box_mode_3d == Box3DMode.CAM) or (box_mode_3d == Box3DMode.LIDAR): points = Coord3DMode.convert_point(points, Coord3DMode.LIDAR, Coord3DMode.DEPTH) pred_bboxes = Box3DMode.convert(pred_bboxes, box_mode_3d, Box3DMode.DEPTH) elif box_mode_3d != Box3DMode.DEPTH: ValueError( f'Unsupported box_mode_3d {box_mode_3d} for conversion!') pred_bboxes = pred_bboxes.tensor.cpu().numpy() show_result( points, None, pred_bboxes, out_dir, file_name, show=show, pred_labels=pred_labels) ================================================ FILE: mmdet3d/models/detectors/bevdet.py ================================================ # Copyright (c) Phigent Robotics. All rights reserved. import torch import torch.nn.functional as F from mmcv.runner import force_fp32 from mmdet3d.ops.bev_pool_v2.bev_pool import TRTBEVPoolv2 from mmdet.models import DETECTORS from .. 
import builder from .centerpoint import CenterPoint from mmdet3d.core.bbox import (CameraInstance3DBoxes, DepthInstance3DBoxes, LiDARInstance3DBoxes, box_np_ops) import torch from torchvision.utils import make_grid import torchvision import matplotlib.pyplot as plt import cv2 def convert_color(img_path): plt.figure() img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE) plt.imsave(img_path, img, cmap=plt.get_cmap('viridis')) plt.close() def save_tensor(tensor, path, pad_value=254.0,normalize=False): print('save_tensor', path) tensor = tensor.to(torch.float).detach().cpu() max_ = tensor.flatten(1).max(-1).values[:, None, None] min_ = tensor.flatten(1).min(-1).values[:, None, None] tensor = (tensor-min_)/(max_-min_) if tensor.type() == 'torch.BoolTensor': tensor = tensor*255 if len(tensor.shape) == 3: tensor = tensor.unsqueeze(1) tensor = make_grid(tensor, pad_value=pad_value, normalize=normalize).permute(1, 2, 0).numpy().copy() torchvision.utils.save_image(torch.tensor(tensor).permute(2, 0, 1), path) convert_color(path) @DETECTORS.register_module() class BEVDet(CenterPoint): def __init__(self, img_view_transformer, img_bev_encoder_backbone, img_bev_encoder_neck, **kwargs): super(BEVDet, self).__init__(**kwargs) self.img_view_transformer = builder.build_neck(img_view_transformer) self.img_bev_encoder_backbone = \ builder.build_backbone(img_bev_encoder_backbone) self.img_bev_encoder_neck = builder.build_neck(img_bev_encoder_neck) def image_encoder(self, img): imgs = img B, N, C, imH, imW = imgs.shape imgs = imgs.view(B * N, C, imH, imW) x = self.img_backbone(imgs) if self.with_img_neck: x = self.img_neck(x) if type(x) in [list, tuple]: x = x[0] _, output_dim, ouput_H, output_W = x.shape x = x.view(B, N, output_dim, ouput_H, output_W) return x @force_fp32() def bev_encoder(self, x): x = self.img_bev_encoder_backbone(x) x = self.img_bev_encoder_neck(x) if type(x) in [list, tuple]: x = x[0] return x def extract_img_feat(self, img, img_metas, **kwargs): """Extract features of images.""" x = self.image_encoder(img[0]) x, depth = self.img_view_transformer([x] + img[1:7]) # from IPython import embed # embed() # exit() x = self.bev_encoder(x) return [x], depth def extract_feat(self, points, img, img_metas, **kwargs): """Extract features from images and points.""" img_feats, depth = self.extract_img_feat(img, img_metas, **kwargs) pts_feats = None return (img_feats, pts_feats, depth) def forward_train(self, points=None, img_metas=None, gt_bboxes_3d=None, gt_labels_3d=None, gt_labels=None, gt_bboxes=None, img_inputs=None, proposals=None, gt_bboxes_ignore=None, **kwargs): """Forward training function. Args: points (list[torch.Tensor], optional): Points of each sample. Defaults to None. img_metas (list[dict], optional): Meta information of each sample. Defaults to None. gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`], optional): Ground truth 3D boxes. Defaults to None. gt_labels_3d (list[torch.Tensor], optional): Ground truth labels of 3D boxes. Defaults to None. gt_labels (list[torch.Tensor], optional): Ground truth labels of 2D boxes in images. Defaults to None. gt_bboxes (list[torch.Tensor], optional): Ground truth 2D boxes in images. Defaults to None. img (torch.Tensor optional): Images of each sample with shape (N, C, H, W). Defaults to None. proposals ([list[torch.Tensor], optional): Predicted proposals used for training Fast RCNN. Defaults to None. gt_bboxes_ignore (list[torch.Tensor], optional): Ground truth 2D boxes in images to be ignored. Defaults to None. Returns: dict: Losses of different branches. 
""" img_feats, pts_feats, _ = self.extract_feat( points, img=img_inputs, img_metas=img_metas, **kwargs) losses = dict() losses_pts = self.forward_pts_train(img_feats, gt_bboxes_3d, gt_labels_3d, img_metas, gt_bboxes_ignore) losses.update(losses_pts) return losses def forward_test(self, points=None, img_metas=None, img_inputs=None, **kwargs): """ Args: points (list[torch.Tensor]): the outer list indicates test-time augmentations and inner torch.Tensor should have a shape NxC, which contains all points in the batch. img_metas (list[list[dict]]): the outer list indicates test-time augs (multiscale, flip, etc.) and the inner list indicates images in a batch img (list[torch.Tensor], optional): the outer list indicates test-time augmentations and inner torch.Tensor should have a shape NxCxHxW, which contains all images in the batch. Defaults to None. """ for var, name in [(img_inputs, 'img_inputs'), (img_metas, 'img_metas')]: if not isinstance(var, list): raise TypeError('{} must be a list, but got {}'.format( name, type(var))) num_augs = len(img_inputs) if num_augs != len(img_metas): raise ValueError( 'num of augmentations ({}) != num of image meta ({})'.format( len(img_inputs), len(img_metas))) if not isinstance(img_inputs[0][0], list): img_inputs = [img_inputs] if img_inputs is None else img_inputs points = [points] if points is None else points return self.simple_test(points[0], img_metas[0], img_inputs[0], **kwargs) else: return self.aug_test(None, img_metas[0], img_inputs[0], **kwargs) def aug_test(self, points, img_metas, img=None, rescale=False): """Test function without augmentaiton.""" assert False def simple_test(self, points, img_metas, img=None, rescale=False, **kwargs): """Test function without augmentaiton.""" img_feats, _, _ = self.extract_feat( points, img=img, img_metas=img_metas, **kwargs) bbox_list = [dict() for _ in range(len(img_metas))] bbox_pts = self.simple_test_pts(img_feats, img_metas, rescale=rescale) # from IPython import embed # embed() # exit() # x = torch.arange(0, 200, 1) * 0.4 -39.8 # y = torch.arange(0, 200, 1) * 0.4 - 39.8 # z = torch.arange(0, 16, 1) * 0.4 - 0.8 # xx, yy, zz = torch.meshgrid(x, y, z) # points = torch.stack([xx,yy,zz], -1) # points = points.reshape(-1, 3) # import numpy as np # car_index = (bbox_pts[0]['labels_3d'] == 7) & (bbox_pts[0]['scores_3d']>0.4) # mask = box_np_ops.points_in_rbbox(points.numpy(), bbox_pts[0]['boxes_3d'].tensor[car_index].cpu().numpy(), origin=[0.5,0.5,0.]) # points = points[mask.sum(-1)>0] # points[:, 0] = torch.tensor((points[:, 0]+39.8)//0.4) # points[:, 1] = torch.tensor((points[:, 1]+39.8)//0.4) # points[:, 2] = torch.tensor((points[:, 2]+0.8)//0.4) # pred_occupancy = torch.zeros([200, 200, 16]) # points = points.to(torch.long) # pred_occupancy[points[:, 0], points[:, 1], points[:, 2]] = 7 # pred_occupancy= pred_occupancy.cpu().numpy() for result_dict, pts_bbox in zip(bbox_list, bbox_pts): pts_bbox['index'] = img_metas[0]['index'] result_dict['pts_bbox'] = pts_bbox # result_dict['pred_occupancy'] = pred_occupancy # result_dict['index'] = img_metas[0]['index'] return bbox_list def forward_dummy(self, points=None, img_metas=None, img_inputs=None, **kwargs): img_feats, _, _ = self.extract_feat( points, img=img_inputs, img_metas=img_metas, **kwargs) assert self.with_pts_bbox outs = self.pts_bbox_head(img_feats) return outs @DETECTORS.register_module() class BEVDetTRT(BEVDet): def result_serialize(self, outs): outs_ = [] for out in outs: for key in ['reg', 'height', 'dim', 'rot', 'vel', 'heatmap']: 
outs_.append(out[0][key]) return outs_ def result_deserialize(self, outs): outs_ = [] keys = ['reg', 'height', 'dim', 'rot', 'vel', 'heatmap'] for head_id in range(len(outs) // 6): outs_head = [dict()] for kid, key in enumerate(keys): outs_head[0][key] = outs[head_id * 6 + kid] outs_.append(outs_head) return outs_ def forward( self, img, ranks_depth, ranks_feat, ranks_bev, interval_starts, interval_lengths, ): x = self.img_backbone(img) x = self.img_neck(x) x = self.img_view_transformer.depth_net(x) depth = x[:, :self.img_view_transformer.D].softmax(dim=1) tran_feat = x[:, self.img_view_transformer.D:( self.img_view_transformer.D + self.img_view_transformer.out_channels)] tran_feat = tran_feat.permute(0, 2, 3, 1) x = TRTBEVPoolv2.apply(depth.contiguous(), tran_feat.contiguous(), ranks_depth, ranks_feat, ranks_bev, interval_starts, interval_lengths) x = x.permute(0, 3, 1, 2).contiguous() bev_feat = self.bev_encoder(x) outs = self.pts_bbox_head([bev_feat]) outs = self.result_serialize(outs) return outs def get_bev_pool_input(self, input): coor = self.img_view_transformer.get_lidar_coor(*input[1:7]) return self.img_view_transformer.voxel_pooling_prepare_v2(coor) @DETECTORS.register_module() class BEVDet4D(BEVDet): def __init__(self, pre_process=None, align_after_view_transfromation=False, num_adj=1, with_prev=True, use_depth_supervision = True, **kwargs): super(BEVDet4D, self).__init__(**kwargs) self.pre_process = pre_process is not None if self.pre_process: self.pre_process_net = builder.build_backbone(pre_process) self.align_after_view_transfromation = align_after_view_transfromation self.num_frame = num_adj + 1 self.with_prev = with_prev self.use_depth_supervision = use_depth_supervision @force_fp32() def shift_feature(self, input, trans, rots, bda, bda_adj=None): n, c, h, w = input.shape _, v, _ = trans[0].shape # generate grid xs = torch.linspace( 0, w - 1, w, dtype=input.dtype, device=input.device).view(1, w).expand(h, w) ys = torch.linspace( 0, h - 1, h, dtype=input.dtype, device=input.device).view(h, 1).expand(h, w) grid = torch.stack((xs, ys, torch.ones_like(xs)), -1) grid = grid.view(1, h, w, 3).expand(n, h, w, 3).view(n, h, w, 3, 1) # get transformation from current ego frame to adjacent ego frame # transformation from current camera frame to current ego frame c02l0 = torch.zeros((n, 1, 4, 4), dtype=grid.dtype).to(grid) c02l0[:, :, :3, :3] = rots[0][:, 0:1, :, :] c02l0[:, :, :3, 3] = trans[0][:, 0:1, :] c02l0[:, :, 3, 3] = 1 # transformation from adjacent camera frame to current ego frame c12l0 = torch.zeros((n, 1, 4, 4), dtype=grid.dtype).to(grid) c12l0[:, :, :3, :3] = rots[1][:, 0:1, :, :] c12l0[:, :, :3, 3] = trans[1][:, 0:1, :] c12l0[:, :, 3, 3] = 1 # add bev data augmentation bda_ = torch.zeros((n, 1, 4, 4), dtype=grid.dtype).to(grid) bda_[:, :, :3, :3] = bda.unsqueeze(1) bda_[:, :, 3, 3] = 1 c02l0 = bda_.matmul(c02l0) if bda_adj is not None: bda_ = torch.zeros((n, 1, 4, 4), dtype=grid.dtype).to(grid) bda_[:, :, :3, :3] = bda_adj.unsqueeze(1) bda_[:, :, 3, 3] = 1 c12l0 = bda_.matmul(c12l0) # transformation from current ego frame to adjacent ego frame l02l1 = c02l0.matmul(torch.inverse(c12l0))[:, 0, :, :].view( n, 1, 1, 4, 4) ''' c02l0 * inv(c12l0) = c02l0 * inv(l12l0 * c12l1) = c02l0 * inv(c12l1) * inv(l12l0) = l02l1 # c02l0==c12l1 ''' l02l1 = l02l1[:, :, :, [True, True, False, True], :][:, :, :, :, [True, True, False, True]] feat2bev = torch.zeros((3, 3), dtype=grid.dtype).to(grid) feat2bev[0, 0] = self.img_view_transformer.grid_interval[0] feat2bev[1, 1] = 
self.img_view_transformer.grid_interval[1] feat2bev[0, 2] = self.img_view_transformer.grid_lower_bound[0] feat2bev[1, 2] = self.img_view_transformer.grid_lower_bound[1] feat2bev[2, 2] = 1 feat2bev = feat2bev.view(1, 3, 3) tf = torch.inverse(feat2bev).matmul(l02l1).matmul(feat2bev) # transform and normalize grid = tf.matmul(grid) normalize_factor = torch.tensor([w - 1.0, h - 1.0], dtype=input.dtype, device=input.device) grid = grid[:, :, :, :2, 0] / normalize_factor.view(1, 1, 1, 2) * 2.0 - 1.0 output = F.grid_sample(input, grid.to(input.dtype), align_corners=True) return output def prepare_bev_feat(self, img, rot, tran, intrin, post_rot, post_tran, bda, mlp_input): x = self.image_encoder(img) bev_feat, depth = self.img_view_transformer( [x, rot, tran, intrin, post_rot, post_tran, bda, mlp_input]) if self.pre_process: bev_feat = self.pre_process_net(bev_feat)[0] return bev_feat, depth def extract_img_feat_sequential(self, inputs, feat_prev): imgs, rots_curr, trans_curr, intrins = inputs[:4] rots_prev, trans_prev, post_rots, post_trans, bda = inputs[4:] bev_feat_list = [] mlp_input = self.img_view_transformer.get_mlp_input( rots_curr[0:1, ...], trans_curr[0:1, ...], intrins, post_rots, post_trans, bda[0:1, ...]) inputs_curr = (imgs, rots_curr[0:1, ...], trans_curr[0:1, ...], intrins, post_rots, post_trans, bda[0:1, ...], mlp_input) bev_feat, depth = self.prepare_bev_feat(*inputs_curr) bev_feat_list.append(bev_feat) # align the feat_prev _, C, H, W = feat_prev.shape feat_prev = \ self.shift_feature(feat_prev, [trans_curr, trans_prev], [rots_curr, rots_prev], bda) bev_feat_list.append(feat_prev.view(1, (self.num_frame - 1) * C, H, W)) bev_feat = torch.cat(bev_feat_list, dim=1) x = self.bev_encoder(bev_feat) return [x], depth def prepare_inputs(self, inputs): # split the inputs into each frame B, N, _, H, W = inputs[0].shape N = N // self.num_frame imgs = inputs[0].view(B, N, self.num_frame, 3, H, W) imgs = torch.split(imgs, 1, 2) imgs = [t.squeeze(2) for t in imgs] rots, trans, intrins, post_rots, post_trans, bda = inputs[1:7] extra = [ rots.view(B, self.num_frame, N, 3, 3), trans.view(B, self.num_frame, N, 3), intrins.view(B, self.num_frame, N, 3, 3), post_rots.view(B, self.num_frame, N, 3, 3), post_trans.view(B, self.num_frame, N, 3) ] extra = [torch.split(t, 1, 1) for t in extra] extra = [[p.squeeze(1) for p in t] for t in extra] rots, trans, intrins, post_rots, post_trans = extra return imgs, rots, trans, intrins, post_rots, post_trans, bda def extract_img_feat(self, img, img_metas, pred_prev=False, sequential=False, **kwargs): if sequential: return self.extract_img_feat_sequential(img, kwargs['feat_prev']) imgs, rots, trans, intrins, post_rots, post_trans, bda = \ self.prepare_inputs(img) """Extract features of images.""" bev_feat_list = [] depth_list = [] key_frame = True # back propagation for key frame only for img, rot, tran, intrin, post_rot, post_tran in zip( imgs, rots, trans, intrins, post_rots, post_trans): if key_frame or self.with_prev: if self.align_after_view_transfromation: rot, tran = rots[0], trans[0] mlp_input = self.img_view_transformer.get_mlp_input( rots[0], trans[0], intrin, post_rot, post_tran, bda) inputs_curr = (img, rot, tran, intrin, post_rot, post_tran, bda, mlp_input) if key_frame: bev_feat, depth = self.prepare_bev_feat(*inputs_curr) else: with torch.no_grad(): bev_feat, depth = self.prepare_bev_feat(*inputs_curr) else: bev_feat = torch.zeros_like(bev_feat_list[0]) depth = None bev_feat_list.append(bev_feat) depth_list.append(depth) key_frame = False if 
pred_prev: assert self.align_after_view_transfromation assert rots[0].shape[0] == 1 feat_prev = torch.cat(bev_feat_list[1:], dim=0) trans_curr = trans[0].repeat(self.num_frame - 1, 1, 1) rots_curr = rots[0].repeat(self.num_frame - 1, 1, 1, 1) trans_prev = torch.cat(trans[1:], dim=0) rots_prev = torch.cat(rots[1:], dim=0) bda_curr = bda.repeat(self.num_frame - 1, 1, 1) return feat_prev, [ imgs[0], rots_curr, trans_curr, intrins[0], rots_prev, trans_prev, post_rots[0], post_trans[0], bda_curr ] if self.align_after_view_transfromation: for adj_id in range(1, self.num_frame): bev_feat_list[adj_id] = \ self.shift_feature(bev_feat_list[adj_id], [trans[0], trans[adj_id]], [rots[0], rots[adj_id]], bda) bev_feat = torch.cat(bev_feat_list, dim=1) x = self.bev_encoder(bev_feat) return [x], depth_list[0] @DETECTORS.register_module() class BEVDepth4D(BEVDet4D): def forward_train(self, points=None, img_metas=None, gt_bboxes_3d=None, gt_labels_3d=None, gt_labels=None, gt_bboxes=None, img_inputs=None, proposals=None, gt_bboxes_ignore=None, **kwargs): """Forward training function. Args: points (list[torch.Tensor], optional): Points of each sample. Defaults to None. img_metas (list[dict], optional): Meta information of each sample. Defaults to None. gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`], optional): Ground truth 3D boxes. Defaults to None. gt_labels_3d (list[torch.Tensor], optional): Ground truth labels of 3D boxes. Defaults to None. gt_labels (list[torch.Tensor], optional): Ground truth labels of 2D boxes in images. Defaults to None. gt_bboxes (list[torch.Tensor], optional): Ground truth 2D boxes in images. Defaults to None. img (torch.Tensor optional): Images of each sample with shape (N, C, H, W). Defaults to None. proposals ([list[torch.Tensor], optional): Predicted proposals used for training Fast RCNN. Defaults to None. gt_bboxes_ignore (list[torch.Tensor], optional): Ground truth 2D boxes in images to be ignored. Defaults to None. Returns: dict: Losses of different branches. """ img_feats, pts_feats, depth = self.extract_feat( points, img=img_inputs, img_metas=img_metas, **kwargs) gt_depth = kwargs['gt_depth'] loss_depth = self.img_view_transformer.get_depth_loss(gt_depth, depth) if not self.use_depth_supervision: loss_depth = loss_depth * 0 losses = dict(loss_depth=loss_depth) losses_pts = self.forward_pts_train(img_feats, gt_bboxes_3d, gt_labels_3d, img_metas, gt_bboxes_ignore) losses.update(losses_pts) return losses ================================================ FILE: mmdet3d/models/detectors/centerpoint.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. 
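# --------------------------------------------------------------------------
# Illustrative aside (added, not part of the original repository): the
# `feat2bev` matrix built inside `BEVDet4D.shift_feature` in bevdet.py above
# maps BEV feature-grid indices to metric ego-frame coordinates, so that the
# current-to-adjacent ego transform can be applied to grid coordinates. A
# self-contained sketch with made-up grid parameters:
import torch

_interval = torch.tensor([0.8, 0.8])          # hypothetical BEV cell size (m)
_lower_bound = torch.tensor([-51.2, -51.2])   # hypothetical grid origin (m)
_feat2bev = torch.eye(3)
_feat2bev[0, 0], _feat2bev[1, 1] = _interval            # scale: index -> metres
_feat2bev[0, 2], _feat2bev[1, 2] = _lower_bound         # shift by the grid origin
_idx = torch.tensor([64.0, 64.0, 1.0])        # a grid index in homogeneous form
_xy = _feat2bev @ _idx                        # -> metric x, y in the ego frame
assert torch.allclose(_xy[:2], torch.tensor([0.0, 0.0]), atol=1e-4)
# --------------------------------------------------------------------------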
import torch from mmdet3d.core import bbox3d2result, merge_aug_bboxes_3d from ..builder import DETECTORS from .mvx_two_stage import MVXTwoStageDetector @DETECTORS.register_module() class CenterPoint(MVXTwoStageDetector): """Base class of Multi-modality VoxelNet.""" def __init__(self, pts_voxel_layer=None, pts_voxel_encoder=None, pts_middle_encoder=None, pts_fusion_layer=None, img_backbone=None, pts_backbone=None, img_neck=None, pts_neck=None, pts_bbox_head=None, img_roi_head=None, img_rpn_head=None, train_cfg=None, test_cfg=None, pretrained=None, init_cfg=None): super(CenterPoint, self).__init__(pts_voxel_layer, pts_voxel_encoder, pts_middle_encoder, pts_fusion_layer, img_backbone, pts_backbone, img_neck, pts_neck, pts_bbox_head, img_roi_head, img_rpn_head, train_cfg, test_cfg, pretrained, init_cfg) @property def with_velocity(self): """bool: Whether the head predicts velocity""" return self.pts_bbox_head is not None and \ self.pts_bbox_head.with_velocity def extract_pts_feat(self, pts, img_feats, img_metas): """Extract features of points.""" if not self.with_pts_bbox: return None voxels, num_points, coors = self.voxelize(pts) voxel_features = self.pts_voxel_encoder(voxels, num_points, coors) batch_size = coors[-1, 0] + 1 x = self.pts_middle_encoder(voxel_features, coors, batch_size) x = self.pts_backbone(x) if self.with_pts_neck: x = self.pts_neck(x) return x def forward_pts_train(self, pts_feats, gt_bboxes_3d, gt_labels_3d, img_metas, gt_bboxes_ignore=None): """Forward function for point cloud branch. Args: pts_feats (list[torch.Tensor]): Features of point cloud branch gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth boxes for each sample. gt_labels_3d (list[torch.Tensor]): Ground truth labels for boxes of each sampole img_metas (list[dict]): Meta information of samples. gt_bboxes_ignore (list[torch.Tensor], optional): Ground truth boxes to be ignored. Defaults to None. Returns: dict: Losses of each branch. """ outs = self.pts_bbox_head(pts_feats) loss_inputs = [gt_bboxes_3d, gt_labels_3d, outs] losses = self.pts_bbox_head.loss(*loss_inputs) return losses def simple_test_pts(self, x, img_metas, rescale=False): """Test function of point cloud branch.""" outs = self.pts_bbox_head(x) bbox_list = self.pts_bbox_head.get_bboxes( outs, img_metas, rescale=rescale) return bbox_list # bbox_results = [ # bbox3d2result(bboxes, scores, labels) # for bboxes, scores, labels in bbox_list # ] # return bbox_results def aug_test_pts(self, feats, img_metas, rescale=False): """Test function of point cloud branch with augmentaiton. The function implementation process is as follows: - step 1: map features back for double-flip augmentation. - step 2: merge all features and generate boxes. - step 3: map boxes back for scale augmentation. - step 4: merge results. Args: feats (list[torch.Tensor]): Feature of point cloud. img_metas (list[dict]): Meta information of samples. rescale (bool, optional): Whether to rescale bboxes. Default: False. Returns: dict: Returned bboxes consists of the following keys: - boxes_3d (:obj:`LiDARInstance3DBoxes`): Predicted bboxes. - scores_3d (torch.Tensor): Scores of predicted boxes. - labels_3d (torch.Tensor): Labels of predicted boxes. 
""" # only support aug_test for one sample outs_list = [] for x, img_meta in zip(feats, img_metas): outs = self.pts_bbox_head(x) # merge augmented outputs before decoding bboxes for task_id, out in enumerate(outs): for key in out[0].keys(): if img_meta[0]['pcd_horizontal_flip']: outs[task_id][0][key] = torch.flip( outs[task_id][0][key], dims=[2]) if key == 'reg': outs[task_id][0][key][:, 1, ...] = 1 - outs[ task_id][0][key][:, 1, ...] elif key == 'rot': outs[task_id][0][ key][:, 0, ...] = -outs[task_id][0][key][:, 0, ...] elif key == 'vel': outs[task_id][0][ key][:, 1, ...] = -outs[task_id][0][key][:, 1, ...] if img_meta[0]['pcd_vertical_flip']: outs[task_id][0][key] = torch.flip( outs[task_id][0][key], dims=[3]) if key == 'reg': outs[task_id][0][key][:, 0, ...] = 1 - outs[ task_id][0][key][:, 0, ...] elif key == 'rot': outs[task_id][0][ key][:, 1, ...] = -outs[task_id][0][key][:, 1, ...] elif key == 'vel': outs[task_id][0][ key][:, 0, ...] = -outs[task_id][0][key][:, 0, ...] outs_list.append(outs) preds_dicts = dict() scale_img_metas = [] # concat outputs sharing the same pcd_scale_factor for i, (img_meta, outs) in enumerate(zip(img_metas, outs_list)): pcd_scale_factor = img_meta[0]['pcd_scale_factor'] if pcd_scale_factor not in preds_dicts.keys(): preds_dicts[pcd_scale_factor] = outs scale_img_metas.append(img_meta) else: for task_id, out in enumerate(outs): for key in out[0].keys(): preds_dicts[pcd_scale_factor][task_id][0][key] += out[ 0][key] aug_bboxes = [] for pcd_scale_factor, preds_dict in preds_dicts.items(): for task_id, pred_dict in enumerate(preds_dict): # merge outputs with different flips before decoding bboxes for key in pred_dict[0].keys(): preds_dict[task_id][0][key] /= len(outs_list) / len( preds_dicts.keys()) bbox_list = self.pts_bbox_head.get_bboxes( preds_dict, img_metas[0], rescale=rescale) bbox_list = [ dict(boxes_3d=bboxes, scores_3d=scores, labels_3d=labels) for bboxes, scores, labels in bbox_list ] aug_bboxes.append(bbox_list[0]) if len(preds_dicts.keys()) > 1: # merge outputs with different scales after decoding bboxes merged_bboxes = merge_aug_bboxes_3d(aug_bboxes, scale_img_metas, self.pts_bbox_head.test_cfg) return merged_bboxes else: for key in bbox_list[0].keys(): bbox_list[0][key] = bbox_list[0][key].to('cpu') return bbox_list[0] def aug_test(self, points, img_metas, imgs=None, rescale=False): """Test function with augmentaiton.""" img_feats, pts_feats = self.extract_feats(points, img_metas, imgs) bbox_list = dict() if pts_feats and self.with_pts_bbox: pts_bbox = self.aug_test_pts(pts_feats, img_metas, rescale) bbox_list.update(pts_bbox=pts_bbox) return [bbox_list] ================================================ FILE: mmdet3d/models/detectors/dynamic_voxelnet.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import torch from mmcv.runner import force_fp32 from torch.nn import functional as F from ..builder import DETECTORS from .voxelnet import VoxelNet @DETECTORS.register_module() class DynamicVoxelNet(VoxelNet): r"""VoxelNet using `dynamic voxelization `_. 
""" def __init__(self, voxel_layer, voxel_encoder, middle_encoder, backbone, neck=None, bbox_head=None, train_cfg=None, test_cfg=None, pretrained=None, init_cfg=None): super(DynamicVoxelNet, self).__init__( voxel_layer=voxel_layer, voxel_encoder=voxel_encoder, middle_encoder=middle_encoder, backbone=backbone, neck=neck, bbox_head=bbox_head, train_cfg=train_cfg, test_cfg=test_cfg, pretrained=pretrained, init_cfg=init_cfg) def extract_feat(self, points, img_metas): """Extract features from points.""" voxels, coors = self.voxelize(points) voxel_features, feature_coors = self.voxel_encoder(voxels, coors) batch_size = coors[-1, 0].item() + 1 x = self.middle_encoder(voxel_features, feature_coors, batch_size) x = self.backbone(x) if self.with_neck: x = self.neck(x) return x @torch.no_grad() @force_fp32() def voxelize(self, points): """Apply dynamic voxelization to points. Args: points (list[torch.Tensor]): Points of each sample. Returns: tuple[torch.Tensor]: Concatenated points and coordinates. """ coors = [] # dynamic voxelization only provide a coors mapping for res in points: res_coors = self.voxel_layer(res) coors.append(res_coors) points = torch.cat(points, dim=0) coors_batch = [] for i, coor in enumerate(coors): coor_pad = F.pad(coor, (1, 0), mode='constant', value=i) coors_batch.append(coor_pad) coors_batch = torch.cat(coors_batch, dim=0) return points, coors_batch ================================================ FILE: mmdet3d/models/detectors/fcos_mono3d.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. from ..builder import DETECTORS from .single_stage_mono3d import SingleStageMono3DDetector @DETECTORS.register_module() class FCOSMono3D(SingleStageMono3DDetector): r"""`FCOS3D `_ for monocular 3D object detection. Currently please refer to our entry on the `leaderboard `_. """ # noqa: E501 def __init__(self, backbone, neck, bbox_head, train_cfg=None, test_cfg=None, pretrained=None): super(FCOSMono3D, self).__init__(backbone, neck, bbox_head, train_cfg, test_cfg, pretrained) ================================================ FILE: mmdet3d/models/detectors/groupfree3dnet.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import torch from mmdet3d.core import bbox3d2result, merge_aug_bboxes_3d from ..builder import DETECTORS from .single_stage import SingleStage3DDetector @DETECTORS.register_module() class GroupFree3DNet(SingleStage3DDetector): """`Group-Free 3D `_.""" def __init__(self, backbone, bbox_head=None, train_cfg=None, test_cfg=None, pretrained=None): super(GroupFree3DNet, self).__init__( backbone=backbone, bbox_head=bbox_head, train_cfg=train_cfg, test_cfg=test_cfg, pretrained=pretrained) def forward_train(self, points, img_metas, gt_bboxes_3d, gt_labels_3d, pts_semantic_mask=None, pts_instance_mask=None, gt_bboxes_ignore=None): """Forward of training. Args: points (list[torch.Tensor]): Points of each batch. img_metas (list): Image metas. gt_bboxes_3d (:obj:`BaseInstance3DBoxes`): gt bboxes of each batch. gt_labels_3d (list[torch.Tensor]): gt class labels of each batch. pts_semantic_mask (list[torch.Tensor]): point-wise semantic label of each batch. pts_instance_mask (list[torch.Tensor]): point-wise instance label of each batch. gt_bboxes_ignore (list[torch.Tensor]): Specify which bounding. Returns: dict[str: torch.Tensor]: Losses. """ # TODO: refactor votenet series to reduce redundant codes. 
points_cat = torch.stack(points) x = self.extract_feat(points_cat) bbox_preds = self.bbox_head(x, self.train_cfg.sample_mod) loss_inputs = (points, gt_bboxes_3d, gt_labels_3d, pts_semantic_mask, pts_instance_mask, img_metas) losses = self.bbox_head.loss( bbox_preds, *loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore) return losses def simple_test(self, points, img_metas, imgs=None, rescale=False): """Forward of testing. Args: points (list[torch.Tensor]): Points of each sample. img_metas (list): Image metas. rescale (bool): Whether to rescale results. Returns: list: Predicted 3d boxes. """ points_cat = torch.stack(points) x = self.extract_feat(points_cat) bbox_preds = self.bbox_head(x, self.test_cfg.sample_mod) bbox_list = self.bbox_head.get_bboxes( points_cat, bbox_preds, img_metas, rescale=rescale) bbox_results = [ bbox3d2result(bboxes, scores, labels) for bboxes, scores, labels in bbox_list ] return bbox_results def aug_test(self, points, img_metas, imgs=None, rescale=False): """Test with augmentation.""" points_cat = [torch.stack(pts) for pts in points] feats = self.extract_feats(points_cat, img_metas) # only support aug_test for one sample aug_bboxes = [] for x, pts_cat, img_meta in zip(feats, points_cat, img_metas): bbox_preds = self.bbox_head(x, self.test_cfg.sample_mod) bbox_list = self.bbox_head.get_bboxes( pts_cat, bbox_preds, img_meta, rescale=rescale) bbox_list = [ dict(boxes_3d=bboxes, scores_3d=scores, labels_3d=labels) for bboxes, scores, labels in bbox_list ] aug_bboxes.append(bbox_list[0]) # after merging, bboxes will be rescaled to the original image size merged_bboxes = merge_aug_bboxes_3d(aug_bboxes, img_metas, self.bbox_head.test_cfg) return [merged_bboxes] ================================================ FILE: mmdet3d/models/detectors/h3dnet.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import torch from mmdet3d.core import merge_aug_bboxes_3d from ..builder import DETECTORS from .two_stage import TwoStage3DDetector @DETECTORS.register_module() class H3DNet(TwoStage3DDetector): r"""H3DNet model. Please refer to the `paper `_ """ def __init__(self, backbone, neck=None, rpn_head=None, roi_head=None, train_cfg=None, test_cfg=None, pretrained=None, init_cfg=None): super(H3DNet, self).__init__( backbone=backbone, neck=neck, rpn_head=rpn_head, roi_head=roi_head, train_cfg=train_cfg, test_cfg=test_cfg, pretrained=pretrained, init_cfg=init_cfg) def forward_train(self, points, img_metas, gt_bboxes_3d, gt_labels_3d, pts_semantic_mask=None, pts_instance_mask=None, gt_bboxes_ignore=None): """Forward of training. Args: points (list[torch.Tensor]): Points of each batch. img_metas (list): Image metas. gt_bboxes_3d (:obj:`BaseInstance3DBoxes`): gt bboxes of each batch. gt_labels_3d (list[torch.Tensor]): gt class labels of each batch. pts_semantic_mask (list[torch.Tensor]): point-wise semantic label of each batch. pts_instance_mask (list[torch.Tensor]): point-wise instance label of each batch. gt_bboxes_ignore (list[torch.Tensor]): Specify which bounding. Returns: dict: Losses. 
""" points_cat = torch.stack(points) feats_dict = self.extract_feat(points_cat) feats_dict['fp_xyz'] = [feats_dict['fp_xyz_net0'][-1]] feats_dict['fp_features'] = [feats_dict['hd_feature']] feats_dict['fp_indices'] = [feats_dict['fp_indices_net0'][-1]] losses = dict() if self.with_rpn: rpn_outs = self.rpn_head(feats_dict, self.train_cfg.rpn.sample_mod) feats_dict.update(rpn_outs) rpn_loss_inputs = (points, gt_bboxes_3d, gt_labels_3d, pts_semantic_mask, pts_instance_mask, img_metas) rpn_losses = self.rpn_head.loss( rpn_outs, *rpn_loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore, ret_target=True) feats_dict['targets'] = rpn_losses.pop('targets') losses.update(rpn_losses) # Generate rpn proposals proposal_cfg = self.train_cfg.get('rpn_proposal', self.test_cfg.rpn) proposal_inputs = (points, rpn_outs, img_metas) proposal_list = self.rpn_head.get_bboxes( *proposal_inputs, use_nms=proposal_cfg.use_nms) feats_dict['proposal_list'] = proposal_list else: raise NotImplementedError roi_losses = self.roi_head.forward_train(feats_dict, img_metas, points, gt_bboxes_3d, gt_labels_3d, pts_semantic_mask, pts_instance_mask, gt_bboxes_ignore) losses.update(roi_losses) return losses def simple_test(self, points, img_metas, imgs=None, rescale=False): """Forward of testing. Args: points (list[torch.Tensor]): Points of each sample. img_metas (list): Image metas. rescale (bool): Whether to rescale results. Returns: list: Predicted 3d boxes. """ points_cat = torch.stack(points) feats_dict = self.extract_feat(points_cat) feats_dict['fp_xyz'] = [feats_dict['fp_xyz_net0'][-1]] feats_dict['fp_features'] = [feats_dict['hd_feature']] feats_dict['fp_indices'] = [feats_dict['fp_indices_net0'][-1]] if self.with_rpn: proposal_cfg = self.test_cfg.rpn rpn_outs = self.rpn_head(feats_dict, proposal_cfg.sample_mod) feats_dict.update(rpn_outs) # Generate rpn proposals proposal_list = self.rpn_head.get_bboxes( points, rpn_outs, img_metas, use_nms=proposal_cfg.use_nms) feats_dict['proposal_list'] = proposal_list else: raise NotImplementedError return self.roi_head.simple_test( feats_dict, img_metas, points_cat, rescale=rescale) def aug_test(self, points, img_metas, imgs=None, rescale=False): """Test with augmentation.""" points_cat = [torch.stack(pts) for pts in points] feats_dict = self.extract_feats(points_cat, img_metas) for feat_dict in feats_dict: feat_dict['fp_xyz'] = [feat_dict['fp_xyz_net0'][-1]] feat_dict['fp_features'] = [feat_dict['hd_feature']] feat_dict['fp_indices'] = [feat_dict['fp_indices_net0'][-1]] # only support aug_test for one sample aug_bboxes = [] for feat_dict, pts_cat, img_meta in zip(feats_dict, points_cat, img_metas): if self.with_rpn: proposal_cfg = self.test_cfg.rpn rpn_outs = self.rpn_head(feat_dict, proposal_cfg.sample_mod) feat_dict.update(rpn_outs) # Generate rpn proposals proposal_list = self.rpn_head.get_bboxes( points, rpn_outs, img_metas, use_nms=proposal_cfg.use_nms) feat_dict['proposal_list'] = proposal_list else: raise NotImplementedError bbox_results = self.roi_head.simple_test( feat_dict, self.test_cfg.rcnn.sample_mod, img_meta, pts_cat, rescale=rescale) aug_bboxes.append(bbox_results) # after merging, bboxes will be rescaled to the original image size merged_bboxes = merge_aug_bboxes_3d(aug_bboxes, img_metas, self.bbox_head.test_cfg) return [merged_bboxes] def extract_feats(self, points, img_metas): """Extract features of multiple samples.""" return [ self.extract_feat(pts, img_meta) for pts, img_meta in zip(points, img_metas) ] ================================================ FILE: 
mmdet3d/models/detectors/imvotenet.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import warnings import numpy as np import torch from mmdet3d.core import bbox3d2result, merge_aug_bboxes_3d from mmdet3d.models.utils import MLP from .. import builder from ..builder import DETECTORS from .base import Base3DDetector def sample_valid_seeds(mask, num_sampled_seed=1024): r"""Randomly sample seeds from all imvotes. Modified from ``_ Args: mask (torch.Tensor): Bool tensor in shape ( seed_num*max_imvote_per_pixel), indicates whether this imvote corresponds to a 2D bbox. num_sampled_seed (int): How many to sample from all imvotes. Returns: torch.Tensor: Indices with shape (num_sampled_seed). """ # noqa: E501 device = mask.device batch_size = mask.shape[0] sample_inds = mask.new_zeros((batch_size, num_sampled_seed), dtype=torch.int64) for bidx in range(batch_size): # return index of non zero elements valid_inds = torch.nonzero(mask[bidx, :]).squeeze(-1) if len(valid_inds) < num_sampled_seed: # compute set t1 - t2 t1 = torch.arange(num_sampled_seed, device=device) t2 = valid_inds % num_sampled_seed combined = torch.cat((t1, t2)) uniques, counts = combined.unique(return_counts=True) difference = uniques[counts == 1] rand_inds = torch.randperm( len(difference), device=device)[:num_sampled_seed - len(valid_inds)] cur_sample_inds = difference[rand_inds] cur_sample_inds = torch.cat((valid_inds, cur_sample_inds)) else: rand_inds = torch.randperm( len(valid_inds), device=device)[:num_sampled_seed] cur_sample_inds = valid_inds[rand_inds] sample_inds[bidx, :] = cur_sample_inds return sample_inds @DETECTORS.register_module() class ImVoteNet(Base3DDetector): r"""`ImVoteNet `_ for 3D detection.""" def __init__(self, pts_backbone=None, pts_bbox_heads=None, pts_neck=None, img_backbone=None, img_neck=None, img_roi_head=None, img_rpn_head=None, img_mlp=None, freeze_img_branch=False, fusion_layer=None, num_sampled_seed=None, train_cfg=None, test_cfg=None, pretrained=None, init_cfg=None): super(ImVoteNet, self).__init__(init_cfg=init_cfg) # point branch if pts_backbone is not None: self.pts_backbone = builder.build_backbone(pts_backbone) if pts_neck is not None: self.pts_neck = builder.build_neck(pts_neck) if pts_bbox_heads is not None: pts_bbox_head_common = pts_bbox_heads.common pts_bbox_head_common.update( train_cfg=train_cfg.pts if train_cfg is not None else None) pts_bbox_head_common.update(test_cfg=test_cfg.pts) pts_bbox_head_joint = pts_bbox_head_common.copy() pts_bbox_head_joint.update(pts_bbox_heads.joint) pts_bbox_head_pts = pts_bbox_head_common.copy() pts_bbox_head_pts.update(pts_bbox_heads.pts) pts_bbox_head_img = pts_bbox_head_common.copy() pts_bbox_head_img.update(pts_bbox_heads.img) self.pts_bbox_head_joint = builder.build_head(pts_bbox_head_joint) self.pts_bbox_head_pts = builder.build_head(pts_bbox_head_pts) self.pts_bbox_head_img = builder.build_head(pts_bbox_head_img) self.pts_bbox_heads = [ self.pts_bbox_head_joint, self.pts_bbox_head_pts, self.pts_bbox_head_img ] self.loss_weights = pts_bbox_heads.loss_weights # image branch if img_backbone: self.img_backbone = builder.build_backbone(img_backbone) if img_neck is not None: self.img_neck = builder.build_neck(img_neck) if img_rpn_head is not None: rpn_train_cfg = train_cfg.img_rpn if train_cfg \ is not None else None img_rpn_head_ = img_rpn_head.copy() img_rpn_head_.update( train_cfg=rpn_train_cfg, test_cfg=test_cfg.img_rpn) self.img_rpn_head = builder.build_head(img_rpn_head_) if img_roi_head is 
not None: rcnn_train_cfg = train_cfg.img_rcnn if train_cfg \ is not None else None img_roi_head.update( train_cfg=rcnn_train_cfg, test_cfg=test_cfg.img_rcnn) self.img_roi_head = builder.build_head(img_roi_head) # fusion if fusion_layer is not None: self.fusion_layer = builder.build_fusion_layer(fusion_layer) self.max_imvote_per_pixel = fusion_layer.max_imvote_per_pixel self.freeze_img_branch = freeze_img_branch if freeze_img_branch: self.freeze_img_branch_params() if img_mlp is not None: self.img_mlp = MLP(**img_mlp) self.num_sampled_seed = num_sampled_seed self.train_cfg = train_cfg self.test_cfg = test_cfg if pretrained is None: img_pretrained = None pts_pretrained = None elif isinstance(pretrained, dict): img_pretrained = pretrained.get('img', None) pts_pretrained = pretrained.get('pts', None) else: raise ValueError( f'pretrained should be a dict, got {type(pretrained)}') if self.with_img_backbone: if img_pretrained is not None: warnings.warn('DeprecationWarning: pretrained is a deprecated ' 'key, please consider using init_cfg.') self.img_backbone.init_cfg = dict( type='Pretrained', checkpoint=img_pretrained) if self.with_img_roi_head: if img_pretrained is not None: warnings.warn('DeprecationWarning: pretrained is a deprecated ' 'key, please consider using init_cfg.') self.img_roi_head.init_cfg = dict( type='Pretrained', checkpoint=img_pretrained) if self.with_pts_backbone: if img_pretrained is not None: warnings.warn('DeprecationWarning: pretrained is a deprecated ' 'key, please consider using init_cfg.') self.pts_backbone.init_cfg = dict( type='Pretrained', checkpoint=pts_pretrained) def freeze_img_branch_params(self): """Freeze all image branch parameters.""" if self.with_img_bbox_head: for param in self.img_bbox_head.parameters(): param.requires_grad = False if self.with_img_backbone: for param in self.img_backbone.parameters(): param.requires_grad = False if self.with_img_neck: for param in self.img_neck.parameters(): param.requires_grad = False if self.with_img_rpn: for param in self.img_rpn_head.parameters(): param.requires_grad = False if self.with_img_roi_head: for param in self.img_roi_head.parameters(): param.requires_grad = False def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs): """Overload in order to load img network ckpts into img branch.""" module_names = ['backbone', 'neck', 'roi_head', 'rpn_head'] for key in list(state_dict): for module_name in module_names: if key.startswith(module_name) and ('img_' + key) not in state_dict: state_dict['img_' + key] = state_dict.pop(key) super()._load_from_state_dict(state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs) def train(self, mode=True): """Overload in order to keep image branch modules in eval mode.""" super(ImVoteNet, self).train(mode) if self.freeze_img_branch: if self.with_img_bbox_head: self.img_bbox_head.eval() if self.with_img_backbone: self.img_backbone.eval() if self.with_img_neck: self.img_neck.eval() if self.with_img_rpn: self.img_rpn_head.eval() if self.with_img_roi_head: self.img_roi_head.eval() @property def with_img_bbox(self): """bool: Whether the detector has a 2D image box head.""" return ((hasattr(self, 'img_roi_head') and self.img_roi_head.with_bbox) or (hasattr(self, 'img_bbox_head') and self.img_bbox_head is not None)) @property def with_img_bbox_head(self): """bool: Whether the detector has a 2D image box head (not roi).""" return hasattr(self, 'img_bbox_head') and self.img_bbox_head is not None 
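    # The with_* properties below only report which optional sub-modules were
    # built in __init__; forward_train/forward_test use them to decide whether
    # the image branch, the point branch, or both should run.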
@property def with_img_backbone(self): """bool: Whether the detector has a 2D image backbone.""" return hasattr(self, 'img_backbone') and self.img_backbone is not None @property def with_img_neck(self): """bool: Whether the detector has a neck in image branch.""" return hasattr(self, 'img_neck') and self.img_neck is not None @property def with_img_rpn(self): """bool: Whether the detector has a 2D RPN in image detector branch.""" return hasattr(self, 'img_rpn_head') and self.img_rpn_head is not None @property def with_img_roi_head(self): """bool: Whether the detector has a RoI Head in image branch.""" return hasattr(self, 'img_roi_head') and self.img_roi_head is not None @property def with_pts_bbox(self): """bool: Whether the detector has a 3D box head.""" return hasattr(self, 'pts_bbox_head') and self.pts_bbox_head is not None @property def with_pts_backbone(self): """bool: Whether the detector has a 3D backbone.""" return hasattr(self, 'pts_backbone') and self.pts_backbone is not None @property def with_pts_neck(self): """bool: Whether the detector has a neck in 3D detector branch.""" return hasattr(self, 'pts_neck') and self.pts_neck is not None def extract_feat(self, imgs): """Just to inherit from abstract method.""" pass def extract_img_feat(self, img): """Directly extract features from the img backbone+neck.""" x = self.img_backbone(img) if self.with_img_neck: x = self.img_neck(x) return x def extract_img_feats(self, imgs): """Extract features from multiple images. Args: imgs (list[torch.Tensor]): A list of images. The images are augmented from the same image but in different ways. Returns: list[torch.Tensor]: Features of different images """ assert isinstance(imgs, list) return [self.extract_img_feat(img) for img in imgs] def extract_pts_feat(self, pts): """Extract features of points.""" x = self.pts_backbone(pts) if self.with_pts_neck: x = self.pts_neck(x) seed_points = x['fp_xyz'][-1] seed_features = x['fp_features'][-1] seed_indices = x['fp_indices'][-1] return (seed_points, seed_features, seed_indices) def extract_pts_feats(self, pts): """Extract features of points from multiple samples.""" assert isinstance(pts, list) return [self.extract_pts_feat(pt) for pt in pts] @torch.no_grad() def extract_bboxes_2d(self, img, img_metas, train=True, bboxes_2d=None, **kwargs): """Extract bounding boxes from 2d detector. Args: img (torch.Tensor): of shape (N, C, H, W) encoding input images. Typically these should be mean centered and std scaled. img_metas (list[dict]): Image meta info. train (bool): train-time or not. bboxes_2d (list[torch.Tensor]): provided 2d bboxes, not supported yet. Return: list[torch.Tensor]: a list of processed 2d bounding boxes. 
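        Each returned tensor packs all classes' detections into shape
        (num_boxes, 6), i.e. ``[x1, y1, x2, y2, score, class_index]``, sorted
        by descending score (and randomly halved during training). A minimal
        sketch of that packing with dummy per-class outputs (shapes are
        illustrative only)::

            import numpy as np
            ret = [np.random.rand(2, 5), np.random.rand(1, 5)]  # two classes
            tmp = np.concatenate(ret, axis=0)                   # (3, 5): x1, y1, x2, y2, score
            sem_class = np.array([0., 0., 1.])                  # class index of every box
            boxes_2d = np.concatenate([tmp, sem_class[:, None]], axis=1)  # (3, 6)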
""" if bboxes_2d is None: x = self.extract_img_feat(img) proposal_list = self.img_rpn_head.simple_test_rpn(x, img_metas) rets = self.img_roi_head.simple_test( x, proposal_list, img_metas, rescale=False) rets_processed = [] for ret in rets: tmp = np.concatenate(ret, axis=0) sem_class = img.new_zeros((len(tmp))) start = 0 for i, bboxes in enumerate(ret): sem_class[start:start + len(bboxes)] = i start += len(bboxes) ret = img.new_tensor(tmp) # append class index ret = torch.cat([ret, sem_class[:, None]], dim=-1) inds = torch.argsort(ret[:, 4], descending=True) ret = ret.index_select(0, inds) # drop half bboxes during training for better generalization if train: rand_drop = torch.randperm(len(ret))[:(len(ret) + 1) // 2] rand_drop = torch.sort(rand_drop)[0] ret = ret[rand_drop] rets_processed.append(ret.float()) return rets_processed else: rets_processed = [] for ret in bboxes_2d: if len(ret) > 0 and train: rand_drop = torch.randperm(len(ret))[:(len(ret) + 1) // 2] rand_drop = torch.sort(rand_drop)[0] ret = ret[rand_drop] rets_processed.append(ret.float()) return rets_processed def forward_train(self, points=None, img=None, img_metas=None, gt_bboxes=None, gt_labels=None, gt_bboxes_ignore=None, gt_masks=None, proposals=None, bboxes_2d=None, gt_bboxes_3d=None, gt_labels_3d=None, pts_semantic_mask=None, pts_instance_mask=None, **kwargs): """Forwarding of train for image branch pretrain or stage 2 train. Args: points (list[torch.Tensor]): Points of each batch. img (torch.Tensor): of shape (N, C, H, W) encoding input images. Typically these should be mean centered and std scaled. img_metas (list[dict]): list of image and point cloud meta info dict. For example, keys include 'ori_shape', 'img_norm_cfg', and 'transformation_3d_flow'. For details on the values of the keys see `mmdet/datasets/pipelines/formatting.py:Collect`. gt_bboxes (list[torch.Tensor]): Ground truth bboxes for each image with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. gt_labels (list[torch.Tensor]): class indices for each 2d bounding box. gt_bboxes_ignore (list[torch.Tensor]): specify which 2d bounding boxes can be ignored when computing the loss. gt_masks (torch.Tensor): true segmentation masks for each 2d bbox, used if the architecture supports a segmentation task. proposals: override rpn proposals (2d) with custom proposals. Use when `with_rpn` is False. bboxes_2d (list[torch.Tensor]): provided 2d bboxes, not supported yet. gt_bboxes_3d (:obj:`BaseInstance3DBoxes`): 3d gt bboxes. gt_labels_3d (list[torch.Tensor]): gt class labels for 3d bboxes. pts_semantic_mask (list[torch.Tensor]): point-wise semantic label of each batch. pts_instance_mask (list[torch.Tensor]): point-wise instance label of each batch. Returns: dict[str, torch.Tensor]: a dictionary of loss components. 
""" if points is None: x = self.extract_img_feat(img) losses = dict() # RPN forward and loss if self.with_img_rpn: proposal_cfg = self.train_cfg.get('img_rpn_proposal', self.test_cfg.img_rpn) rpn_losses, proposal_list = self.img_rpn_head.forward_train( x, img_metas, gt_bboxes, gt_labels=None, gt_bboxes_ignore=gt_bboxes_ignore, proposal_cfg=proposal_cfg) losses.update(rpn_losses) else: proposal_list = proposals roi_losses = self.img_roi_head.forward_train( x, img_metas, proposal_list, gt_bboxes, gt_labels, gt_bboxes_ignore, gt_masks, **kwargs) losses.update(roi_losses) return losses else: bboxes_2d = self.extract_bboxes_2d( img, img_metas, bboxes_2d=bboxes_2d, **kwargs) points = torch.stack(points) seeds_3d, seed_3d_features, seed_indices = \ self.extract_pts_feat(points) img_features, masks = self.fusion_layer(img, bboxes_2d, seeds_3d, img_metas) inds = sample_valid_seeds(masks, self.num_sampled_seed) batch_size, img_feat_size = img_features.shape[:2] pts_feat_size = seed_3d_features.shape[1] inds_img = inds.view(batch_size, 1, -1).expand(-1, img_feat_size, -1) img_features = img_features.gather(-1, inds_img) inds = inds % inds.shape[1] inds_seed_xyz = inds.view(batch_size, -1, 1).expand(-1, -1, 3) seeds_3d = seeds_3d.gather(1, inds_seed_xyz) inds_seed_feats = inds.view(batch_size, 1, -1).expand(-1, pts_feat_size, -1) seed_3d_features = seed_3d_features.gather(-1, inds_seed_feats) seed_indices = seed_indices.gather(1, inds) img_features = self.img_mlp(img_features) fused_features = torch.cat([seed_3d_features, img_features], dim=1) feat_dict_joint = dict( seed_points=seeds_3d, seed_features=fused_features, seed_indices=seed_indices) feat_dict_pts = dict( seed_points=seeds_3d, seed_features=seed_3d_features, seed_indices=seed_indices) feat_dict_img = dict( seed_points=seeds_3d, seed_features=img_features, seed_indices=seed_indices) loss_inputs = (points, gt_bboxes_3d, gt_labels_3d, pts_semantic_mask, pts_instance_mask, img_metas) bbox_preds_joints = self.pts_bbox_head_joint( feat_dict_joint, self.train_cfg.pts.sample_mod) bbox_preds_pts = self.pts_bbox_head_pts( feat_dict_pts, self.train_cfg.pts.sample_mod) bbox_preds_img = self.pts_bbox_head_img( feat_dict_img, self.train_cfg.pts.sample_mod) losses_towers = [] losses_joint = self.pts_bbox_head_joint.loss( bbox_preds_joints, *loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore) losses_pts = self.pts_bbox_head_pts.loss( bbox_preds_pts, *loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore) losses_img = self.pts_bbox_head_img.loss( bbox_preds_img, *loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore) losses_towers.append(losses_joint) losses_towers.append(losses_pts) losses_towers.append(losses_img) combined_losses = dict() for loss_term in losses_joint: if 'loss' in loss_term: combined_losses[loss_term] = 0 for i in range(len(losses_towers)): combined_losses[loss_term] += \ losses_towers[i][loss_term] * \ self.loss_weights[i] else: # only save the metric of the joint head # if it is not a loss combined_losses[loss_term] = \ losses_towers[0][loss_term] return combined_losses def forward_test(self, points=None, img_metas=None, img=None, bboxes_2d=None, **kwargs): """Forwarding of test for image branch pretrain or stage 2 train. Args: points (list[list[torch.Tensor]], optional): the outer list indicates test-time augmentations and the inner list contains all points in the batch, where each Tensor should have a shape NxC. Defaults to None. img_metas (list[list[dict]], optional): the outer list indicates test-time augs (multiscale, flip, etc.) 
and the inner list indicates images in a batch. Defaults to None. img (list[list[torch.Tensor]], optional): the outer list indicates test-time augmentations and inner Tensor should have a shape NxCxHxW, which contains all images in the batch. Defaults to None. Defaults to None. bboxes_2d (list[list[torch.Tensor]], optional): Provided 2d bboxes, not supported yet. Defaults to None. Returns: list[list[torch.Tensor]]|list[dict]: Predicted 2d or 3d boxes. """ if points is None: for var, name in [(img, 'img'), (img_metas, 'img_metas')]: if not isinstance(var, list): raise TypeError( f'{name} must be a list, but got {type(var)}') num_augs = len(img) if num_augs != len(img_metas): raise ValueError(f'num of augmentations ({len(img)}) ' f'!= num of image meta ({len(img_metas)})') if num_augs == 1: # proposals (List[List[Tensor]]): the outer list indicates # test-time augs (multiscale, flip, etc.) and the inner list # indicates images in a batch. # The Tensor should have a shape Px4, where P is the number of # proposals. if 'proposals' in kwargs: kwargs['proposals'] = kwargs['proposals'][0] return self.simple_test_img_only( img=img[0], img_metas=img_metas[0], **kwargs) else: assert img[0].size(0) == 1, 'aug test does not support ' \ 'inference with batch size ' \ f'{img[0].size(0)}' # TODO: support test augmentation for predefined proposals assert 'proposals' not in kwargs return self.aug_test_img_only( img=img, img_metas=img_metas, **kwargs) else: for var, name in [(points, 'points'), (img_metas, 'img_metas')]: if not isinstance(var, list): raise TypeError('{} must be a list, but got {}'.format( name, type(var))) num_augs = len(points) if num_augs != len(img_metas): raise ValueError( 'num of augmentations ({}) != num of image meta ({})'. format(len(points), len(img_metas))) if num_augs == 1: return self.simple_test( points[0], img_metas[0], img[0], bboxes_2d=bboxes_2d[0] if bboxes_2d is not None else None, **kwargs) else: return self.aug_test(points, img_metas, img, bboxes_2d, **kwargs) def simple_test_img_only(self, img, img_metas, proposals=None, rescale=False): r"""Test without augmentation, image network pretrain. May refer to ``_. Args: img (torch.Tensor): Should have a shape NxCxHxW, which contains all images in the batch. img_metas (list[dict]): proposals (list[Tensor], optional): override rpn proposals with custom proposals. Defaults to None. rescale (bool, optional): Whether or not rescale bboxes to the original shape of input image. Defaults to False. Returns: list[list[torch.Tensor]]: Predicted 2d boxes. """ # noqa: E501 assert self.with_img_bbox, 'Img bbox head must be implemented.' assert self.with_img_backbone, 'Img backbone must be implemented.' assert self.with_img_rpn, 'Img rpn must be implemented.' assert self.with_img_roi_head, 'Img roi head must be implemented.' x = self.extract_img_feat(img) if proposals is None: proposal_list = self.img_rpn_head.simple_test_rpn(x, img_metas) else: proposal_list = proposals ret = self.img_roi_head.simple_test( x, proposal_list, img_metas, rescale=rescale) return ret def simple_test(self, points=None, img_metas=None, img=None, bboxes_2d=None, rescale=False, **kwargs): """Test without augmentation, stage 2. Args: points (list[torch.Tensor], optional): Elements in the list should have a shape NxC, the list indicates all point-clouds in the batch. Defaults to None. img_metas (list[dict], optional): List indicates images in a batch. Defaults to None. img (torch.Tensor, optional): Should have a shape NxCxHxW, which contains all images in the batch. 
Defaults to None. bboxes_2d (list[torch.Tensor], optional): Provided 2d bboxes, not supported yet. Defaults to None. rescale (bool, optional): Whether or not rescale bboxes. Defaults to False. Returns: list[dict]: Predicted 3d boxes. """ bboxes_2d = self.extract_bboxes_2d( img, img_metas, train=False, bboxes_2d=bboxes_2d, **kwargs) points = torch.stack(points) seeds_3d, seed_3d_features, seed_indices = \ self.extract_pts_feat(points) img_features, masks = self.fusion_layer(img, bboxes_2d, seeds_3d, img_metas) inds = sample_valid_seeds(masks, self.num_sampled_seed) batch_size, img_feat_size = img_features.shape[:2] pts_feat_size = seed_3d_features.shape[1] inds_img = inds.view(batch_size, 1, -1).expand(-1, img_feat_size, -1) img_features = img_features.gather(-1, inds_img) inds = inds % inds.shape[1] inds_seed_xyz = inds.view(batch_size, -1, 1).expand(-1, -1, 3) seeds_3d = seeds_3d.gather(1, inds_seed_xyz) inds_seed_feats = inds.view(batch_size, 1, -1).expand(-1, pts_feat_size, -1) seed_3d_features = seed_3d_features.gather(-1, inds_seed_feats) seed_indices = seed_indices.gather(1, inds) img_features = self.img_mlp(img_features) fused_features = torch.cat([seed_3d_features, img_features], dim=1) feat_dict = dict( seed_points=seeds_3d, seed_features=fused_features, seed_indices=seed_indices) bbox_preds = self.pts_bbox_head_joint(feat_dict, self.test_cfg.pts.sample_mod) bbox_list = self.pts_bbox_head_joint.get_bboxes( points, bbox_preds, img_metas, rescale=rescale) bbox_results = [ bbox3d2result(bboxes, scores, labels) for bboxes, scores, labels in bbox_list ] return bbox_results def aug_test_img_only(self, img, img_metas, rescale=False): r"""Test function with augmentation, image network pretrain. May refer to ``_. Args: img (list[list[torch.Tensor]], optional): the outer list indicates test-time augmentations and inner Tensor should have a shape NxCxHxW, which contains all images in the batch. Defaults to None. Defaults to None. img_metas (list[list[dict]], optional): the outer list indicates test-time augs (multiscale, flip, etc.) and the inner list indicates images in a batch. Defaults to None. rescale (bool, optional): Whether or not rescale bboxes to the original shape of input image. If rescale is False, then returned bboxes and masks will fit the scale of imgs[0]. Defaults to None. Returns: list[list[torch.Tensor]]: Predicted 2d boxes. """ # noqa: E501 assert self.with_img_bbox, 'Img bbox head must be implemented.' assert self.with_img_backbone, 'Img backbone must be implemented.' assert self.with_img_rpn, 'Img rpn must be implemented.' assert self.with_img_roi_head, 'Img roi head must be implemented.' x = self.extract_img_feats(img) proposal_list = self.img_rpn_head.aug_test_rpn(x, img_metas) return self.img_roi_head.aug_test( x, proposal_list, img_metas, rescale=rescale) def aug_test(self, points=None, img_metas=None, imgs=None, bboxes_2d=None, rescale=False, **kwargs): """Test function with augmentation, stage 2. Args: points (list[list[torch.Tensor]], optional): the outer list indicates test-time augmentations and the inner list contains all points in the batch, where each Tensor should have a shape NxC. Defaults to None. img_metas (list[list[dict]], optional): the outer list indicates test-time augs (multiscale, flip, etc.) and the inner list indicates images in a batch. Defaults to None. imgs (list[list[torch.Tensor]], optional): the outer list indicates test-time augmentations and inner Tensor should have a shape NxCxHxW, which contains all images in the batch. 
Defaults to None. Defaults to None. bboxes_2d (list[list[torch.Tensor]], optional): Provided 2d bboxes, not supported yet. Defaults to None. rescale (bool, optional): Whether or not rescale bboxes. Defaults to False. Returns: list[dict]: Predicted 3d boxes. """ points_cat = [torch.stack(pts) for pts in points] feats = self.extract_pts_feats(points_cat, img_metas) # only support aug_test for one sample aug_bboxes = [] for x, pts_cat, img_meta, bbox_2d, img in zip(feats, points_cat, img_metas, bboxes_2d, imgs): bbox_2d = self.extract_bboxes_2d( img, img_metas, train=False, bboxes_2d=bbox_2d, **kwargs) seeds_3d, seed_3d_features, seed_indices = x img_features, masks = self.fusion_layer(img, bbox_2d, seeds_3d, img_metas) inds = sample_valid_seeds(masks, self.num_sampled_seed) batch_size, img_feat_size = img_features.shape[:2] pts_feat_size = seed_3d_features.shape[1] inds_img = inds.view(batch_size, 1, -1).expand(-1, img_feat_size, -1) img_features = img_features.gather(-1, inds_img) inds = inds % inds.shape[1] inds_seed_xyz = inds.view(batch_size, -1, 1).expand(-1, -1, 3) seeds_3d = seeds_3d.gather(1, inds_seed_xyz) inds_seed_feats = inds.view(batch_size, 1, -1).expand(-1, pts_feat_size, -1) seed_3d_features = seed_3d_features.gather(-1, inds_seed_feats) seed_indices = seed_indices.gather(1, inds) img_features = self.img_mlp(img_features) fused_features = torch.cat([seed_3d_features, img_features], dim=1) feat_dict = dict( seed_points=seeds_3d, seed_features=fused_features, seed_indices=seed_indices) bbox_preds = self.pts_bbox_head_joint(feat_dict, self.test_cfg.pts.sample_mod) bbox_list = self.pts_bbox_head_joint.get_bboxes( pts_cat, bbox_preds, img_metas, rescale=rescale) bbox_list = [ dict(boxes_3d=bboxes, scores_3d=scores, labels_3d=labels) for bboxes, scores, labels in bbox_list ] aug_bboxes.append(bbox_list[0]) # after merging, bboxes will be rescaled to the original image size merged_bboxes = merge_aug_bboxes_3d(aug_bboxes, img_metas, self.bbox_head.test_cfg) return [merged_bboxes] ================================================ FILE: mmdet3d/models/detectors/imvoxelnet.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import torch from mmdet3d.core import bbox3d2result, build_prior_generator from mmdet3d.models.fusion_layers.point_fusion import point_sample from mmdet.models.detectors import BaseDetector from ..builder import DETECTORS, build_backbone, build_head, build_neck @DETECTORS.register_module() class ImVoxelNet(BaseDetector): r"""`ImVoxelNet `_.""" def __init__(self, backbone, neck, neck_3d, bbox_head, n_voxels, anchor_generator, train_cfg=None, test_cfg=None, pretrained=None, init_cfg=None): super().__init__(init_cfg=init_cfg) self.backbone = build_backbone(backbone) self.neck = build_neck(neck) self.neck_3d = build_neck(neck_3d) bbox_head.update(train_cfg=train_cfg) bbox_head.update(test_cfg=test_cfg) self.bbox_head = build_head(bbox_head) self.n_voxels = n_voxels self.anchor_generator = build_prior_generator(anchor_generator) self.train_cfg = train_cfg self.test_cfg = test_cfg def extract_feat(self, img, img_metas): """Extract 3d features from the backbone -> fpn -> 3d projection. Args: img (torch.Tensor): Input images of shape (N, C_in, H, W). img_metas (list): Image metas. 
Returns: torch.Tensor: of shape (N, C_out, N_x, N_y, N_z) """ x = self.backbone(img) x = self.neck(x)[0] points = self.anchor_generator.grid_anchors( [self.n_voxels[::-1]], device=img.device)[0][:, :3] volumes = [] for feature, img_meta in zip(x, img_metas): img_scale_factor = ( points.new_tensor(img_meta['scale_factor'][:2]) if 'scale_factor' in img_meta.keys() else 1) img_flip = img_meta['flip'] if 'flip' in img_meta.keys() else False img_crop_offset = ( points.new_tensor(img_meta['img_crop_offset']) if 'img_crop_offset' in img_meta.keys() else 0) volume = point_sample( img_meta, img_features=feature[None, ...], points=points, proj_mat=points.new_tensor(img_meta['lidar2img']), coord_type='LIDAR', img_scale_factor=img_scale_factor, img_crop_offset=img_crop_offset, img_flip=img_flip, img_pad_shape=img.shape[-2:], img_shape=img_meta['img_shape'][:2], aligned=False) volumes.append( volume.reshape(self.n_voxels[::-1] + [-1]).permute(3, 2, 1, 0)) x = torch.stack(volumes) x = self.neck_3d(x) return x def forward_train(self, img, img_metas, gt_bboxes_3d, gt_labels_3d, **kwargs): """Forward of training. Args: img (torch.Tensor): Input images of shape (N, C_in, H, W). img_metas (list): Image metas. gt_bboxes_3d (:obj:`BaseInstance3DBoxes`): gt bboxes of each batch. gt_labels_3d (list[torch.Tensor]): gt class labels of each batch. Returns: dict[str, torch.Tensor]: A dictionary of loss components. """ x = self.extract_feat(img, img_metas) x = self.bbox_head(x) losses = self.bbox_head.loss(*x, gt_bboxes_3d, gt_labels_3d, img_metas) return losses def forward_test(self, img, img_metas, **kwargs): """Forward of testing. Args: img (torch.Tensor): Input images of shape (N, C_in, H, W). img_metas (list): Image metas. Returns: list[dict]: Predicted 3d boxes. """ # not supporting aug_test for now return self.simple_test(img, img_metas) def simple_test(self, img, img_metas): """Test without augmentations. Args: img (torch.Tensor): Input images of shape (N, C_in, H, W). img_metas (list): Image metas. Returns: list[dict]: Predicted 3d boxes. """ x = self.extract_feat(img, img_metas) x = self.bbox_head(x) bbox_list = self.bbox_head.get_bboxes(*x, img_metas) bbox_results = [ bbox3d2result(det_bboxes, det_scores, det_labels) for det_bboxes, det_scores, det_labels in bbox_list ] return bbox_results def aug_test(self, imgs, img_metas, **kwargs): """Test with augmentations. Args: imgs (list[torch.Tensor]): Input images of shape (N, C_in, H, W). img_metas (list): Image metas. Returns: list[dict]: Predicted 3d boxes. """ raise NotImplementedError ================================================ FILE: mmdet3d/models/detectors/mink_single_stage.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. # Adapted from https://github.com/SamsungLabs/fcaf3d/blob/master/mmdet3d/models/detectors/single_stage_sparse.py # noqa try: import MinkowskiEngine as ME except ImportError: # Please follow getting_started.md to install MinkowskiEngine. pass from mmdet3d.core import bbox3d2result from mmdet3d.models import DETECTORS, build_backbone, build_head from .base import Base3DDetector @DETECTORS.register_module() class MinkSingleStage3DDetector(Base3DDetector): r"""Single stage detector based on MinkowskiEngine `GSDN `_. Args: backbone (dict): Config of the backbone. head (dict): Config of the head. voxel_size (float): Voxel size in meters. train_cfg (dict, optional): Config for train stage. Defaults to None. test_cfg (dict, optional): Config for test stage. Defaults to None. 
init_cfg (dict, optional): Config for weight initialization. Defaults to None. pretrained (str, optional): Deprecated initialization parameter. Defaults to None. """ def __init__(self, backbone, head, voxel_size, train_cfg=None, test_cfg=None, init_cfg=None, pretrained=None): super(MinkSingleStage3DDetector, self).__init__(init_cfg) self.backbone = build_backbone(backbone) head.update(train_cfg=train_cfg) head.update(test_cfg=test_cfg) self.head = build_head(head) self.voxel_size = voxel_size self.init_weights() def extract_feat(self, points): """Extract features from points. Args: points (list[Tensor]): Raw point clouds. Returns: SparseTensor: Voxelized point clouds. """ coordinates, features = ME.utils.batch_sparse_collate( [(p[:, :3] / self.voxel_size, p[:, 3:]) for p in points], device=points[0].device) x = ME.SparseTensor(coordinates=coordinates, features=features) x = self.backbone(x) return x def forward_train(self, points, gt_bboxes_3d, gt_labels_3d, img_metas): """Forward of training. Args: points (list[Tensor]): Raw point clouds. gt_bboxes (list[BaseInstance3DBoxes]): Ground truth bboxes of each sample. gt_labels(list[torch.Tensor]): Labels of each sample. img_metas (list[dict]): Contains scene meta infos. Returns: dict: Centerness, bbox and classification loss values. """ x = self.extract_feat(points) losses = self.head.forward_train(x, gt_bboxes_3d, gt_labels_3d, img_metas) return losses def simple_test(self, points, img_metas, *args, **kwargs): """Test without augmentations. Args: points (list[torch.Tensor]): Points of each sample. img_metas (list[dict]): Contains scene meta infos. Returns: list[dict]: Predicted 3d boxes. """ x = self.extract_feat(points) bbox_list = self.head.forward_test(x, img_metas) bbox_results = [ bbox3d2result(bboxes, scores, labels) for bboxes, scores, labels in bbox_list ] return bbox_results def aug_test(self, points, img_metas, **kwargs): """Test with augmentations. Args: points (list[list[torch.Tensor]]): Points of each sample. img_metas (list[dict]): Contains scene meta infos. Returns: list[dict]: Predicted 3d boxes. """ raise NotImplementedError ================================================ FILE: mmdet3d/models/detectors/mvx_faster_rcnn.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import torch from mmcv.runner import force_fp32 from torch.nn import functional as F from ..builder import DETECTORS from .mvx_two_stage import MVXTwoStageDetector @DETECTORS.register_module() class MVXFasterRCNN(MVXTwoStageDetector): """Multi-modality VoxelNet using Faster R-CNN.""" def __init__(self, **kwargs): super(MVXFasterRCNN, self).__init__(**kwargs) @DETECTORS.register_module() class DynamicMVXFasterRCNN(MVXTwoStageDetector): """Multi-modality VoxelNet using Faster R-CNN and dynamic voxelization.""" def __init__(self, **kwargs): super(DynamicMVXFasterRCNN, self).__init__(**kwargs) @torch.no_grad() @force_fp32() def voxelize(self, points): """Apply dynamic voxelization to points. Args: points (list[torch.Tensor]): Points of each sample. Returns: tuple[torch.Tensor]: Concatenated points and coordinates. 
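        Each sample's voxel coordinates get the batch index prepended with
        ``F.pad`` so that all samples can share one coordinate tensor. A
        minimal sketch (coordinates are made up)::

            import torch
            import torch.nn.functional as F
            coor = torch.tensor([[4, 10, 20], [5, 11, 21]])           # (num_voxels, 3)
            coor_pad = F.pad(coor, (1, 0), mode='constant', value=1)  # sample index 1
            # -> tensor([[ 1,  4, 10, 20],
            #            [ 1,  5, 11, 21]])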
""" coors = [] # dynamic voxelization only provide a coors mapping for res in points: res_coors = self.pts_voxel_layer(res) coors.append(res_coors) points = torch.cat(points, dim=0) coors_batch = [] for i, coor in enumerate(coors): coor_pad = F.pad(coor, (1, 0), mode='constant', value=i) coors_batch.append(coor_pad) coors_batch = torch.cat(coors_batch, dim=0) return points, coors_batch def extract_pts_feat(self, points, img_feats, img_metas): """Extract point features.""" if not self.with_pts_bbox: return None voxels, coors = self.voxelize(points) voxel_features, feature_coors = self.pts_voxel_encoder( voxels, coors, points, img_feats, img_metas) batch_size = coors[-1, 0] + 1 x = self.pts_middle_encoder(voxel_features, feature_coors, batch_size) x = self.pts_backbone(x) if self.with_pts_neck: x = self.pts_neck(x) return x ================================================ FILE: mmdet3d/models/detectors/mvx_two_stage.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import warnings from os import path as osp import mmcv import torch from mmcv.ops import Voxelization from mmcv.parallel import DataContainer as DC from mmcv.runner import force_fp32 from torch.nn import functional as F from mmdet3d.core import (Box3DMode, Coord3DMode, bbox3d2result, merge_aug_bboxes_3d, show_result) from mmdet.core import multi_apply from .. import builder from ..builder import DETECTORS from .base import Base3DDetector @DETECTORS.register_module() class MVXTwoStageDetector(Base3DDetector): """Base class of Multi-modality VoxelNet.""" def __init__(self, pts_voxel_layer=None, pts_voxel_encoder=None, pts_middle_encoder=None, pts_fusion_layer=None, img_backbone=None, pts_backbone=None, img_neck=None, pts_neck=None, pts_bbox_head=None, img_roi_head=None, img_rpn_head=None, train_cfg=None, test_cfg=None, pretrained=None, init_cfg=None): super(MVXTwoStageDetector, self).__init__(init_cfg=init_cfg) if pts_voxel_layer: self.pts_voxel_layer = Voxelization(**pts_voxel_layer) if pts_voxel_encoder: self.pts_voxel_encoder = builder.build_voxel_encoder( pts_voxel_encoder) if pts_middle_encoder: self.pts_middle_encoder = builder.build_middle_encoder( pts_middle_encoder) if pts_backbone: self.pts_backbone = builder.build_backbone(pts_backbone) if pts_fusion_layer: self.pts_fusion_layer = builder.build_fusion_layer( pts_fusion_layer) if pts_neck is not None: self.pts_neck = builder.build_neck(pts_neck) if pts_bbox_head: pts_train_cfg = train_cfg.pts if train_cfg else None pts_bbox_head.update(train_cfg=pts_train_cfg) pts_test_cfg = test_cfg.pts if test_cfg else None pts_bbox_head.update(test_cfg=pts_test_cfg) self.pts_bbox_head = builder.build_head(pts_bbox_head) if img_backbone: self.img_backbone = builder.build_backbone(img_backbone) if img_neck is not None: self.img_neck = builder.build_neck(img_neck) if img_rpn_head is not None: self.img_rpn_head = builder.build_head(img_rpn_head) if img_roi_head is not None: self.img_roi_head = builder.build_head(img_roi_head) self.train_cfg = train_cfg self.test_cfg = test_cfg if pretrained is None: img_pretrained = None pts_pretrained = None elif isinstance(pretrained, dict): img_pretrained = pretrained.get('img', None) pts_pretrained = pretrained.get('pts', None) else: raise ValueError( f'pretrained should be a dict, got {type(pretrained)}') if self.with_img_backbone: if img_pretrained is not None: warnings.warn('DeprecationWarning: pretrained is a deprecated ' 'key, please consider using init_cfg.') self.img_backbone.init_cfg = dict( 
type='Pretrained', checkpoint=img_pretrained) if self.with_img_roi_head: if img_pretrained is not None: warnings.warn('DeprecationWarning: pretrained is a deprecated ' 'key, please consider using init_cfg.') self.img_roi_head.init_cfg = dict( type='Pretrained', checkpoint=img_pretrained) if self.with_pts_backbone: if pts_pretrained is not None: warnings.warn('DeprecationWarning: pretrained is a deprecated ' 'key, please consider using init_cfg') self.pts_backbone.init_cfg = dict( type='Pretrained', checkpoint=pts_pretrained) @property def with_img_shared_head(self): """bool: Whether the detector has a shared head in image branch.""" return hasattr(self, 'img_shared_head') and self.img_shared_head is not None @property def with_pts_bbox(self): """bool: Whether the detector has a 3D box head.""" return hasattr(self, 'pts_bbox_head') and self.pts_bbox_head is not None @property def with_img_bbox(self): """bool: Whether the detector has a 2D image box head.""" return hasattr(self, 'img_bbox_head') and self.img_bbox_head is not None @property def with_img_backbone(self): """bool: Whether the detector has a 2D image backbone.""" return hasattr(self, 'img_backbone') and self.img_backbone is not None @property def with_pts_backbone(self): """bool: Whether the detector has a 3D backbone.""" return hasattr(self, 'pts_backbone') and self.pts_backbone is not None @property def with_fusion(self): """bool: Whether the detector has a fusion layer.""" return hasattr(self, 'pts_fusion_layer') and self.fusion_layer is not None @property def with_img_neck(self): """bool: Whether the detector has a neck in image branch.""" return hasattr(self, 'img_neck') and self.img_neck is not None @property def with_pts_neck(self): """bool: Whether the detector has a neck in 3D detector branch.""" return hasattr(self, 'pts_neck') and self.pts_neck is not None @property def with_img_rpn(self): """bool: Whether the detector has a 2D RPN in image detector branch.""" return hasattr(self, 'img_rpn_head') and self.img_rpn_head is not None @property def with_img_roi_head(self): """bool: Whether the detector has a RoI Head in image branch.""" return hasattr(self, 'img_roi_head') and self.img_roi_head is not None @property def with_voxel_encoder(self): """bool: Whether the detector has a voxel encoder.""" return hasattr(self, 'voxel_encoder') and self.voxel_encoder is not None @property def with_middle_encoder(self): """bool: Whether the detector has a middle encoder.""" return hasattr(self, 'middle_encoder') and self.middle_encoder is not None def extract_img_feat(self, img, img_metas): """Extract features of images.""" if self.with_img_backbone and img is not None: input_shape = img.shape[-2:] # update real input shape of each single img for img_meta in img_metas: img_meta.update(input_shape=input_shape) if img.dim() == 5 and img.size(0) == 1: img.squeeze_() elif img.dim() == 5 and img.size(0) > 1: B, N, C, H, W = img.size() img = img.view(B * N, C, H, W) img_feats = self.img_backbone(img) else: return None if self.with_img_neck: img_feats = self.img_neck(img_feats) return img_feats def extract_pts_feat(self, pts, img_feats, img_metas): """Extract features of points.""" if not self.with_pts_bbox: return None voxels, num_points, coors = self.voxelize(pts) voxel_features = self.pts_voxel_encoder(voxels, num_points, coors, img_feats, img_metas) batch_size = coors[-1, 0] + 1 x = self.pts_middle_encoder(voxel_features, coors, batch_size) x = self.pts_backbone(x) if self.with_pts_neck: x = self.pts_neck(x) return x def 
extract_feat(self, points, img, img_metas): """Extract features from images and points.""" img_feats = self.extract_img_feat(img, img_metas) pts_feats = self.extract_pts_feat(points, img_feats, img_metas) return (img_feats, pts_feats) @torch.no_grad() @force_fp32() def voxelize(self, points): """Apply dynamic voxelization to points. Args: points (list[torch.Tensor]): Points of each sample. Returns: tuple[torch.Tensor]: Concatenated points, number of points per voxel, and coordinates. """ voxels, coors, num_points = [], [], [] for res in points: res_voxels, res_coors, res_num_points = self.pts_voxel_layer(res) voxels.append(res_voxels) coors.append(res_coors) num_points.append(res_num_points) voxels = torch.cat(voxels, dim=0) num_points = torch.cat(num_points, dim=0) coors_batch = [] for i, coor in enumerate(coors): coor_pad = F.pad(coor, (1, 0), mode='constant', value=i) coors_batch.append(coor_pad) coors_batch = torch.cat(coors_batch, dim=0) return voxels, num_points, coors_batch def forward_train(self, points=None, img_metas=None, gt_bboxes_3d=None, gt_labels_3d=None, gt_labels=None, gt_bboxes=None, img=None, proposals=None, gt_bboxes_ignore=None): """Forward training function. Args: points (list[torch.Tensor], optional): Points of each sample. Defaults to None. img_metas (list[dict], optional): Meta information of each sample. Defaults to None. gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`], optional): Ground truth 3D boxes. Defaults to None. gt_labels_3d (list[torch.Tensor], optional): Ground truth labels of 3D boxes. Defaults to None. gt_labels (list[torch.Tensor], optional): Ground truth labels of 2D boxes in images. Defaults to None. gt_bboxes (list[torch.Tensor], optional): Ground truth 2D boxes in images. Defaults to None. img (torch.Tensor, optional): Images of each sample with shape (N, C, H, W). Defaults to None. proposals ([list[torch.Tensor], optional): Predicted proposals used for training Fast RCNN. Defaults to None. gt_bboxes_ignore (list[torch.Tensor], optional): Ground truth 2D boxes in images to be ignored. Defaults to None. Returns: dict: Losses of different branches. """ img_feats, pts_feats = self.extract_feat( points, img=img, img_metas=img_metas) losses = dict() if pts_feats: losses_pts = self.forward_pts_train(pts_feats, gt_bboxes_3d, gt_labels_3d, img_metas, gt_bboxes_ignore) losses.update(losses_pts) if img_feats: losses_img = self.forward_img_train( img_feats, img_metas=img_metas, gt_bboxes=gt_bboxes, gt_labels=gt_labels, gt_bboxes_ignore=gt_bboxes_ignore, proposals=proposals) losses.update(losses_img) return losses def forward_pts_train(self, pts_feats, gt_bboxes_3d, gt_labels_3d, img_metas, gt_bboxes_ignore=None): """Forward function for point cloud branch. Args: pts_feats (list[torch.Tensor]): Features of point cloud branch gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth boxes for each sample. gt_labels_3d (list[torch.Tensor]): Ground truth labels for boxes of each sampole img_metas (list[dict]): Meta information of samples. gt_bboxes_ignore (list[torch.Tensor], optional): Ground truth boxes to be ignored. Defaults to None. Returns: dict: Losses of each branch. """ outs = self.pts_bbox_head(pts_feats) loss_inputs = outs + (gt_bboxes_3d, gt_labels_3d, img_metas) losses = self.pts_bbox_head.loss( *loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore) return losses def forward_img_train(self, x, img_metas, gt_bboxes, gt_labels, gt_bboxes_ignore=None, proposals=None, **kwargs): """Forward function for image branch. 
This function works similar to the forward function of Faster R-CNN. Args: x (list[torch.Tensor]): Image features of shape (B, C, H, W) of multiple levels. img_metas (list[dict]): Meta information of images. gt_bboxes (list[torch.Tensor]): Ground truth boxes of each image sample. gt_labels (list[torch.Tensor]): Ground truth labels of boxes. gt_bboxes_ignore (list[torch.Tensor], optional): Ground truth boxes to be ignored. Defaults to None. proposals (list[torch.Tensor], optional): Proposals of each sample. Defaults to None. Returns: dict: Losses of each branch. """ losses = dict() # RPN forward and loss if self.with_img_rpn: rpn_outs = self.img_rpn_head(x) rpn_loss_inputs = rpn_outs + (gt_bboxes, img_metas, self.train_cfg.img_rpn) rpn_losses = self.img_rpn_head.loss( *rpn_loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore) losses.update(rpn_losses) proposal_cfg = self.train_cfg.get('img_rpn_proposal', self.test_cfg.img_rpn) proposal_inputs = rpn_outs + (img_metas, proposal_cfg) proposal_list = self.img_rpn_head.get_bboxes(*proposal_inputs) else: proposal_list = proposals # bbox head forward and loss if self.with_img_bbox: # bbox head forward and loss img_roi_losses = self.img_roi_head.forward_train( x, img_metas, proposal_list, gt_bboxes, gt_labels, gt_bboxes_ignore, **kwargs) losses.update(img_roi_losses) return losses def simple_test_img(self, x, img_metas, proposals=None, rescale=False): """Test without augmentation.""" if proposals is None: proposal_list = self.simple_test_rpn(x, img_metas, self.test_cfg.img_rpn) else: proposal_list = proposals return self.img_roi_head.simple_test( x, proposal_list, img_metas, rescale=rescale) def simple_test_rpn(self, x, img_metas, rpn_test_cfg): """RPN test function.""" rpn_outs = self.img_rpn_head(x) proposal_inputs = rpn_outs + (img_metas, rpn_test_cfg) proposal_list = self.img_rpn_head.get_bboxes(*proposal_inputs) return proposal_list def simple_test_pts(self, x, img_metas, rescale=False): """Test function of point cloud branch.""" outs = self.pts_bbox_head(x) bbox_list = self.pts_bbox_head.get_bboxes( *outs, img_metas, rescale=rescale) bbox_results = [ bbox3d2result(bboxes, scores, labels) for bboxes, scores, labels in bbox_list ] return bbox_results def simple_test(self, points, img_metas, img=None, rescale=False): """Test function without augmentaiton.""" img_feats, pts_feats = self.extract_feat( points, img=img, img_metas=img_metas) bbox_list = [dict() for i in range(len(img_metas))] if pts_feats and self.with_pts_bbox: bbox_pts = self.simple_test_pts( pts_feats, img_metas, rescale=rescale) for result_dict, pts_bbox in zip(bbox_list, bbox_pts): result_dict['pts_bbox'] = pts_bbox if img_feats and self.with_img_bbox: bbox_img = self.simple_test_img( img_feats, img_metas, rescale=rescale) for result_dict, img_bbox in zip(bbox_list, bbox_img): result_dict['img_bbox'] = img_bbox return bbox_list def aug_test(self, points, img_metas, imgs=None, rescale=False): """Test function with augmentaiton.""" img_feats, pts_feats = self.extract_feats(points, img_metas, imgs) bbox_list = dict() if pts_feats and self.with_pts_bbox: bbox_pts = self.aug_test_pts(pts_feats, img_metas, rescale) bbox_list.update(pts_bbox=bbox_pts) return [bbox_list] def extract_feats(self, points, img_metas, imgs=None): """Extract point and image features of multiple samples.""" if imgs is None: imgs = [None] * len(img_metas) img_feats, pts_feats = multi_apply(self.extract_feat, points, imgs, img_metas) return img_feats, pts_feats def aug_test_pts(self, feats, img_metas, rescale=False): 
"""Test function of point cloud branch with augmentaiton.""" # only support aug_test for one sample aug_bboxes = [] for x, img_meta in zip(feats, img_metas): outs = self.pts_bbox_head(x) bbox_list = self.pts_bbox_head.get_bboxes( *outs, img_meta, rescale=rescale) bbox_list = [ dict(boxes_3d=bboxes, scores_3d=scores, labels_3d=labels) for bboxes, scores, labels in bbox_list ] aug_bboxes.append(bbox_list[0]) # after merging, bboxes will be rescaled to the original image size merged_bboxes = merge_aug_bboxes_3d(aug_bboxes, img_metas, self.pts_bbox_head.test_cfg) return merged_bboxes def show_results(self, data, result, out_dir): """Results visualization. Args: data (dict): Input points and the information of the sample. result (dict): Prediction results. out_dir (str): Output directory of visualization result. """ for batch_id in range(len(result)): if isinstance(data['points'][0], DC): points = data['points'][0]._data[0][batch_id].numpy() elif mmcv.is_list_of(data['points'][0], torch.Tensor): points = data['points'][0][batch_id] else: ValueError(f"Unsupported data type {type(data['points'][0])} " f'for visualization!') if isinstance(data['img_metas'][0], DC): pts_filename = data['img_metas'][0]._data[0][batch_id][ 'pts_filename'] box_mode_3d = data['img_metas'][0]._data[0][batch_id][ 'box_mode_3d'] elif mmcv.is_list_of(data['img_metas'][0], dict): pts_filename = data['img_metas'][0][batch_id]['pts_filename'] box_mode_3d = data['img_metas'][0][batch_id]['box_mode_3d'] else: ValueError( f"Unsupported data type {type(data['img_metas'][0])} " f'for visualization!') file_name = osp.split(pts_filename)[-1].split('.')[0] assert out_dir is not None, 'Expect out_dir, got none.' inds = result[batch_id]['pts_bbox']['scores_3d'] > 0.1 pred_bboxes = result[batch_id]['pts_bbox']['boxes_3d'][inds] # for now we convert points and bbox into depth mode if (box_mode_3d == Box3DMode.CAM) or (box_mode_3d == Box3DMode.LIDAR): points = Coord3DMode.convert_point(points, Coord3DMode.LIDAR, Coord3DMode.DEPTH) pred_bboxes = Box3DMode.convert(pred_bboxes, box_mode_3d, Box3DMode.DEPTH) elif box_mode_3d != Box3DMode.DEPTH: ValueError( f'Unsupported box_mode_3d {box_mode_3d} for conversion!') pred_bboxes = pred_bboxes.tensor.cpu().numpy() show_result(points, None, pred_bboxes, out_dir, file_name) ================================================ FILE: mmdet3d/models/detectors/parta2.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import torch from mmcv.ops import Voxelization from torch.nn import functional as F from .. import builder from ..builder import DETECTORS from .two_stage import TwoStage3DDetector @DETECTORS.register_module() class PartA2(TwoStage3DDetector): r"""Part-A2 detector. 
Please refer to the `paper `_ """ def __init__(self, voxel_layer, voxel_encoder, middle_encoder, backbone, neck=None, rpn_head=None, roi_head=None, train_cfg=None, test_cfg=None, pretrained=None, init_cfg=None): super(PartA2, self).__init__( backbone=backbone, neck=neck, rpn_head=rpn_head, roi_head=roi_head, train_cfg=train_cfg, test_cfg=test_cfg, pretrained=pretrained, init_cfg=init_cfg) self.voxel_layer = Voxelization(**voxel_layer) self.voxel_encoder = builder.build_voxel_encoder(voxel_encoder) self.middle_encoder = builder.build_middle_encoder(middle_encoder) def extract_feat(self, points, img_metas): """Extract features from points.""" voxel_dict = self.voxelize(points) voxel_features = self.voxel_encoder(voxel_dict['voxels'], voxel_dict['num_points'], voxel_dict['coors']) batch_size = voxel_dict['coors'][-1, 0].item() + 1 feats_dict = self.middle_encoder(voxel_features, voxel_dict['coors'], batch_size) x = self.backbone(feats_dict['spatial_features']) if self.with_neck: neck_feats = self.neck(x) feats_dict.update({'neck_feats': neck_feats}) return feats_dict, voxel_dict @torch.no_grad() def voxelize(self, points): """Apply hard voxelization to points.""" voxels, coors, num_points, voxel_centers = [], [], [], [] for res in points: res_voxels, res_coors, res_num_points = self.voxel_layer(res) res_voxel_centers = ( res_coors[:, [2, 1, 0]] + 0.5) * res_voxels.new_tensor( self.voxel_layer.voxel_size) + res_voxels.new_tensor( self.voxel_layer.point_cloud_range[0:3]) voxels.append(res_voxels) coors.append(res_coors) num_points.append(res_num_points) voxel_centers.append(res_voxel_centers) voxels = torch.cat(voxels, dim=0) num_points = torch.cat(num_points, dim=0) voxel_centers = torch.cat(voxel_centers, dim=0) coors_batch = [] for i, coor in enumerate(coors): coor_pad = F.pad(coor, (1, 0), mode='constant', value=i) coors_batch.append(coor_pad) coors_batch = torch.cat(coors_batch, dim=0) voxel_dict = dict( voxels=voxels, num_points=num_points, coors=coors_batch, voxel_centers=voxel_centers) return voxel_dict def forward_train(self, points, img_metas, gt_bboxes_3d, gt_labels_3d, gt_bboxes_ignore=None, proposals=None): """Training forward function. Args: points (list[torch.Tensor]): Point cloud of each sample. img_metas (list[dict]): Meta information of each sample gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth boxes for each sample. gt_labels_3d (list[torch.Tensor]): Ground truth labels for boxes of each sampole gt_bboxes_ignore (list[torch.Tensor], optional): Ground truth boxes to be ignored. Defaults to None. Returns: dict: Losses of each branch. 
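        The RPN and RoI losses are returned in one flat dict; the training
        loop typically sums every entry whose key contains ``loss``. A sketch
        of that reduction (keys and values are placeholders)::

            import torch
            losses = dict(loss_rpn_cls=torch.tensor(0.5),
                          loss_rpn_bbox=torch.tensor(0.2),
                          loss_cls=torch.tensor(0.7))
            total = sum(v for k, v in losses.items() if 'loss' in k)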
""" feats_dict, voxels_dict = self.extract_feat(points, img_metas) losses = dict() if self.with_rpn: rpn_outs = self.rpn_head(feats_dict['neck_feats']) rpn_loss_inputs = rpn_outs + (gt_bboxes_3d, gt_labels_3d, img_metas) rpn_losses = self.rpn_head.loss( *rpn_loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore) losses.update(rpn_losses) proposal_cfg = self.train_cfg.get('rpn_proposal', self.test_cfg.rpn) proposal_inputs = rpn_outs + (img_metas, proposal_cfg) proposal_list = self.rpn_head.get_bboxes(*proposal_inputs) else: proposal_list = proposals roi_losses = self.roi_head.forward_train(feats_dict, voxels_dict, img_metas, proposal_list, gt_bboxes_3d, gt_labels_3d) losses.update(roi_losses) return losses def simple_test(self, points, img_metas, proposals=None, rescale=False): """Test function without augmentaiton.""" feats_dict, voxels_dict = self.extract_feat(points, img_metas) if self.with_rpn: rpn_outs = self.rpn_head(feats_dict['neck_feats']) proposal_cfg = self.test_cfg.rpn bbox_inputs = rpn_outs + (img_metas, proposal_cfg) proposal_list = self.rpn_head.get_bboxes(*bbox_inputs) else: proposal_list = proposals return self.roi_head.simple_test(feats_dict, voxels_dict, img_metas, proposal_list) ================================================ FILE: mmdet3d/models/detectors/point_rcnn.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import torch from ..builder import DETECTORS from .two_stage import TwoStage3DDetector @DETECTORS.register_module() class PointRCNN(TwoStage3DDetector): r"""PointRCNN detector. Please refer to the `PointRCNN `_ Args: backbone (dict): Config dict of detector's backbone. neck (dict, optional): Config dict of neck. Defaults to None. rpn_head (dict, optional): Config of RPN head. Defaults to None. roi_head (dict, optional): Config of ROI head. Defaults to None. train_cfg (dict, optional): Train configs. Defaults to None. test_cfg (dict, optional): Test configs. Defaults to None. pretrained (str, optional): Model pretrained path. Defaults to None. init_cfg (dict, optional): Config of initialization. Defaults to None. """ def __init__(self, backbone, neck=None, rpn_head=None, roi_head=None, train_cfg=None, test_cfg=None, pretrained=None, init_cfg=None): super(PointRCNN, self).__init__( backbone=backbone, neck=neck, rpn_head=rpn_head, roi_head=roi_head, train_cfg=train_cfg, test_cfg=test_cfg, pretrained=pretrained, init_cfg=init_cfg) def extract_feat(self, points): """Directly extract features from the backbone+neck. Args: points (torch.Tensor): Input points. Returns: dict: Features from the backbone+neck """ x = self.backbone(points) if self.with_neck: x = self.neck(x) return x def forward_train(self, points, img_metas, gt_bboxes_3d, gt_labels_3d): """Forward of training. Args: points (list[torch.Tensor]): Points of each batch. img_metas (list[dict]): Meta information of each sample. gt_bboxes_3d (:obj:`BaseInstance3DBoxes`): gt bboxes of each batch. gt_labels_3d (list[torch.Tensor]): gt class labels of each batch. Returns: dict: Losses. 
""" losses = dict() points_cat = torch.stack(points) x = self.extract_feat(points_cat) # features for rcnn backbone_feats = x['fp_features'].clone() backbone_xyz = x['fp_xyz'].clone() rcnn_feats = {'features': backbone_feats, 'points': backbone_xyz} bbox_preds, cls_preds = self.rpn_head(x) rpn_loss = self.rpn_head.loss( bbox_preds=bbox_preds, cls_preds=cls_preds, points=points, gt_bboxes_3d=gt_bboxes_3d, gt_labels_3d=gt_labels_3d, img_metas=img_metas) losses.update(rpn_loss) bbox_list = self.rpn_head.get_bboxes(points_cat, bbox_preds, cls_preds, img_metas) proposal_list = [ dict( boxes_3d=bboxes, scores_3d=scores, labels_3d=labels, cls_preds=preds_cls) for bboxes, scores, labels, preds_cls in bbox_list ] rcnn_feats.update({'points_cls_preds': cls_preds}) roi_losses = self.roi_head.forward_train(rcnn_feats, img_metas, proposal_list, gt_bboxes_3d, gt_labels_3d) losses.update(roi_losses) return losses def simple_test(self, points, img_metas, imgs=None, rescale=False): """Forward of testing. Args: points (list[torch.Tensor]): Points of each sample. img_metas (list[dict]): Image metas. imgs (list[torch.Tensor], optional): Images of each sample. Defaults to None. rescale (bool, optional): Whether to rescale results. Defaults to False. Returns: list: Predicted 3d boxes. """ points_cat = torch.stack(points) x = self.extract_feat(points_cat) # features for rcnn backbone_feats = x['fp_features'].clone() backbone_xyz = x['fp_xyz'].clone() rcnn_feats = {'features': backbone_feats, 'points': backbone_xyz} bbox_preds, cls_preds = self.rpn_head(x) rcnn_feats.update({'points_cls_preds': cls_preds}) bbox_list = self.rpn_head.get_bboxes( points_cat, bbox_preds, cls_preds, img_metas, rescale=rescale) proposal_list = [ dict( boxes_3d=bboxes, scores_3d=scores, labels_3d=labels, cls_preds=preds_cls) for bboxes, scores, labels, preds_cls in bbox_list ] bbox_results = self.roi_head.simple_test(rcnn_feats, img_metas, proposal_list) return bbox_results ================================================ FILE: mmdet3d/models/detectors/sassd.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import torch from mmcv.ops import Voxelization from mmcv.runner import force_fp32 from torch.nn import functional as F from mmdet3d.core import bbox3d2result, merge_aug_bboxes_3d from mmdet.models.builder import DETECTORS from .. 
import builder from .single_stage import SingleStage3DDetector @DETECTORS.register_module() class SASSD(SingleStage3DDetector): r"""`SASSD ` _ for 3D detection.""" def __init__(self, voxel_layer, voxel_encoder, middle_encoder, backbone, neck=None, bbox_head=None, train_cfg=None, test_cfg=None, init_cfg=None, pretrained=None): super(SASSD, self).__init__( backbone=backbone, neck=neck, bbox_head=bbox_head, train_cfg=train_cfg, test_cfg=test_cfg, init_cfg=init_cfg, pretrained=pretrained) self.voxel_layer = Voxelization(**voxel_layer) self.voxel_encoder = builder.build_voxel_encoder(voxel_encoder) self.middle_encoder = builder.build_middle_encoder(middle_encoder) def extract_feat(self, points, img_metas=None, test_mode=False): """Extract features from points.""" voxels, num_points, coors = self.voxelize(points) voxel_features = self.voxel_encoder(voxels, num_points, coors) batch_size = coors[-1, 0].item() + 1 x, point_misc = self.middle_encoder(voxel_features, coors, batch_size, test_mode) x = self.backbone(x) if self.with_neck: x = self.neck(x) return x, point_misc @torch.no_grad() @force_fp32() def voxelize(self, points): """Apply hard voxelization to points.""" voxels, coors, num_points = [], [], [] for res in points: res_voxels, res_coors, res_num_points = self.voxel_layer(res) voxels.append(res_voxels) coors.append(res_coors) num_points.append(res_num_points) voxels = torch.cat(voxels, dim=0) num_points = torch.cat(num_points, dim=0) coors_batch = [] for i, coor in enumerate(coors): coor_pad = F.pad(coor, (1, 0), mode='constant', value=i) coors_batch.append(coor_pad) coors_batch = torch.cat(coors_batch, dim=0) return voxels, num_points, coors_batch def forward_train(self, points, img_metas, gt_bboxes_3d, gt_labels_3d, gt_bboxes_ignore=None): """Training forward function. Args: points (list[torch.Tensor]): Point cloud of each sample. img_metas (list[dict]): Meta information of each sample. gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth boxes for each sample. gt_labels_3d (list[torch.Tensor]): Ground truth labels for boxes of each sample. gt_bboxes_ignore (list[torch.Tensor], optional): Ground truth boxes to be ignored. Defaults to None. Returns: dict: Losses of each branch.
""" x, point_misc = self.extract_feat(points, img_metas, test_mode=False) aux_loss = self.middle_encoder.aux_loss(*point_misc, gt_bboxes_3d) outs = self.bbox_head(x) loss_inputs = outs + (gt_bboxes_3d, gt_labels_3d, img_metas) losses = self.bbox_head.loss( *loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore) losses.update(aux_loss) return losses def simple_test(self, points, img_metas, imgs=None, rescale=False): """Test function without augmentaiton.""" x, _ = self.extract_feat(points, img_metas, test_mode=True) outs = self.bbox_head(x) bbox_list = self.bbox_head.get_bboxes( *outs, img_metas, rescale=rescale) bbox_results = [ bbox3d2result(bboxes, scores, labels) for bboxes, scores, labels in bbox_list ] return bbox_results def aug_test(self, points, img_metas, imgs=None, rescale=False): """Test function with augmentaiton.""" feats = self.extract_feats(points, img_metas, test_mode=True) # only support aug_test for one sample aug_bboxes = [] for x, img_meta in zip(feats, img_metas): outs = self.bbox_head(x) bbox_list = self.bbox_head.get_bboxes( *outs, img_meta, rescale=rescale) bbox_list = [ dict(boxes_3d=bboxes, scores_3d=scores, labels_3d=labels) for bboxes, scores, labels in bbox_list ] aug_bboxes.append(bbox_list[0]) # after merging, bboxes will be rescaled to the original image size merged_bboxes = merge_aug_bboxes_3d(aug_bboxes, img_metas, self.bbox_head.test_cfg) return [merged_bboxes] ================================================ FILE: mmdet3d/models/detectors/single_stage.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. from ..builder import DETECTORS, build_backbone, build_head, build_neck from .base import Base3DDetector @DETECTORS.register_module() class SingleStage3DDetector(Base3DDetector): """SingleStage3DDetector. This class serves as a base class for single-stage 3D detectors. Args: backbone (dict): Config dict of detector's backbone. neck (dict, optional): Config dict of neck. Defaults to None. bbox_head (dict, optional): Config dict of box head. Defaults to None. train_cfg (dict, optional): Config dict of training hyper-parameters. Defaults to None. test_cfg (dict, optional): Config dict of test hyper-parameters. Defaults to None. pretrained (str, optional): Path of pretrained models. Defaults to None. """ def __init__(self, backbone, neck=None, bbox_head=None, train_cfg=None, test_cfg=None, init_cfg=None, pretrained=None): super(SingleStage3DDetector, self).__init__(init_cfg) self.backbone = build_backbone(backbone) if neck is not None: self.neck = build_neck(neck) bbox_head.update(train_cfg=train_cfg) bbox_head.update(test_cfg=test_cfg) self.bbox_head = build_head(bbox_head) self.train_cfg = train_cfg self.test_cfg = test_cfg def forward_dummy(self, points): """Used for computing network flops. See `mmdetection/tools/analysis_tools/get_flops.py` """ x = self.extract_feat(points) try: sample_mod = self.train_cfg.sample_mod outs = self.bbox_head(x, sample_mod) except AttributeError: outs = self.bbox_head(x) return outs def extract_feat(self, points, img_metas=None): """Directly extract features from the backbone+neck. Args: points (torch.Tensor): Input points. 
""" x = self.backbone(points) if self.with_neck: x = self.neck(x) return x def extract_feats(self, points, img_metas): """Extract features of multiple samples.""" return [ self.extract_feat(pts, img_meta) for pts, img_meta in zip(points, img_metas) ] ================================================ FILE: mmdet3d/models/detectors/single_stage_mono3d.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import warnings from os import path as osp import mmcv import numpy as np import torch from mmcv.parallel import DataContainer as DC from mmdet3d.core import (CameraInstance3DBoxes, bbox3d2result, show_multi_modality_result) from mmdet.models.detectors import SingleStageDetector from ..builder import DETECTORS, build_backbone, build_head, build_neck @DETECTORS.register_module() class SingleStageMono3DDetector(SingleStageDetector): """Base class for monocular 3D single-stage detectors. Single-stage detectors directly and densely predict bounding boxes on the output features of the backbone+neck. """ def __init__(self, backbone, neck=None, bbox_head=None, train_cfg=None, test_cfg=None, pretrained=None, init_cfg=None): super(SingleStageDetector, self).__init__(init_cfg) if pretrained: warnings.warn('DeprecationWarning: pretrained is deprecated, ' 'please use "init_cfg" instead') backbone.pretrained = pretrained self.backbone = build_backbone(backbone) if neck is not None: self.neck = build_neck(neck) bbox_head.update(train_cfg=train_cfg) bbox_head.update(test_cfg=test_cfg) self.bbox_head = build_head(bbox_head) self.train_cfg = train_cfg self.test_cfg = test_cfg def extract_feats(self, imgs): """Directly extract features from the backbone+neck.""" assert isinstance(imgs, list) return [self.extract_feat(img) for img in imgs] def forward_train(self, img, img_metas, gt_bboxes, gt_labels, gt_bboxes_3d, gt_labels_3d, centers2d, depths, attr_labels=None, gt_bboxes_ignore=None): """ Args: img (Tensor): Input images of shape (N, C, H, W). Typically these should be mean centered and std scaled. img_metas (list[dict]): A List of image info dict where each dict has: 'img_shape', 'scale_factor', 'flip', and may also contain 'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'. For details on the values of these keys see :class:`mmdet.datasets.pipelines.Collect`. gt_bboxes (list[Tensor]): Each item are the truth boxes for each image in [tl_x, tl_y, br_x, br_y] format. gt_labels (list[Tensor]): Class indices corresponding to each box gt_bboxes_3d (list[Tensor]): Each item are the 3D truth boxes for each image in [x, y, z, x_size, y_size, z_size, yaw, vx, vy] format. gt_labels_3d (list[Tensor]): 3D class indices corresponding to each box. centers2d (list[Tensor]): Projected 3D centers onto 2D images. depths (list[Tensor]): Depth of projected centers on 2D images. attr_labels (list[Tensor], optional): Attribute indices corresponding to each box gt_bboxes_ignore (list[Tensor]): Specify which bounding boxes can be ignored when computing the loss. Returns: dict[str, Tensor]: A dictionary of loss components. """ x = self.extract_feat(img) losses = self.bbox_head.forward_train(x, img_metas, gt_bboxes, gt_labels, gt_bboxes_3d, gt_labels_3d, centers2d, depths, attr_labels, gt_bboxes_ignore) return losses def simple_test(self, img, img_metas, rescale=False): """Test function without test time augmentation. Args: imgs (list[torch.Tensor]): List of multiple images img_metas (list[dict]): List of image information. rescale (bool, optional): Whether to rescale the results. 
Defaults to False. Returns: list[list[np.ndarray]]: BBox results of each image and classes. The outer list corresponds to each image. The inner list corresponds to each class. """ x = self.extract_feat(img) outs = self.bbox_head(x) bbox_outputs = self.bbox_head.get_bboxes( *outs, img_metas, rescale=rescale) if self.bbox_head.pred_bbox2d: from mmdet.core import bbox2result bbox2d_img = [ bbox2result(bboxes2d, labels, self.bbox_head.num_classes) for bboxes, scores, labels, attrs, bboxes2d in bbox_outputs ] bbox_outputs = [bbox_outputs[0][:-1]] bbox_img = [ bbox3d2result(bboxes, scores, labels, attrs) for bboxes, scores, labels, attrs in bbox_outputs ] bbox_list = [dict() for i in range(len(img_metas))] for result_dict, img_bbox in zip(bbox_list, bbox_img): result_dict['img_bbox'] = img_bbox if self.bbox_head.pred_bbox2d: for result_dict, img_bbox2d in zip(bbox_list, bbox2d_img): result_dict['img_bbox2d'] = img_bbox2d return bbox_list def aug_test(self, imgs, img_metas, rescale=False): """Test function with test time augmentation.""" feats = self.extract_feats(imgs) # only support aug_test for one sample outs_list = [self.bbox_head(x) for x in feats] for i, img_meta in enumerate(img_metas): if img_meta[0]['pcd_horizontal_flip']: for j in range(len(outs_list[i])): # for each prediction if outs_list[i][j][0] is None: continue for k in range(len(outs_list[i][j])): # every stride of featmap outs_list[i][j][k] = torch.flip( outs_list[i][j][k], dims=[3]) reg = outs_list[i][1] for reg_feat in reg: # offset_x reg_feat[:, 0, :, :] = 1 - reg_feat[:, 0, :, :] # velo_x if self.bbox_head.pred_velo: reg_feat[:, 7, :, :] = -reg_feat[:, 7, :, :] # rotation reg_feat[:, 6, :, :] = -reg_feat[:, 6, :, :] + np.pi merged_outs = [] for i in range(len(outs_list[0])): # for each prediction merged_feats = [] for j in range(len(outs_list[0][i])): if outs_list[0][i][0] is None: merged_feats.append(None) continue # for each stride of featmap avg_feats = torch.mean( torch.cat([x[i][j] for x in outs_list]), dim=0, keepdim=True) if i == 1: # regression predictions # rot/velo/2d det keeps the original avg_feats[:, 6:, :, :] = \ outs_list[0][i][j][:, 6:, :, :] if i == 2: # dir_cls keeps the original avg_feats = outs_list[0][i][j] merged_feats.append(avg_feats) merged_outs.append(merged_feats) merged_outs = tuple(merged_outs) bbox_outputs = self.bbox_head.get_bboxes( *merged_outs, img_metas[0], rescale=rescale) if self.bbox_head.pred_bbox2d: from mmdet.core import bbox2result bbox2d_img = [ bbox2result(bboxes2d, labels, self.bbox_head.num_classes) for bboxes, scores, labels, attrs, bboxes2d in bbox_outputs ] bbox_outputs = [bbox_outputs[0][:-1]] bbox_img = [ bbox3d2result(bboxes, scores, labels, attrs) for bboxes, scores, labels, attrs in bbox_outputs ] bbox_list = dict() bbox_list.update(img_bbox=bbox_img[0]) if self.bbox_head.pred_bbox2d: bbox_list.update(img_bbox2d=bbox2d_img[0]) return [bbox_list] def show_results(self, data, result, out_dir, show=False, score_thr=None): """Results visualization. Args: data (list[dict]): Input images and the information of the sample. result (list[dict]): Prediction results. out_dir (str): Output directory of visualization result. show (bool, optional): Determines whether you are going to show result by open3d. Defaults to False. TODO: implement score_thr of single_stage_mono3d. score_thr (float, optional): Score threshold of bounding boxes. Default to None. Not implemented yet, but it is here for unification. 
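        A minimal sketch of the horizontal-flip handling in ``aug_test`` above,
        applied to a dummy regression map (channel layout is assumed for
        illustration: channel 0 is the sub-pixel x offset, channel 6 is yaw)::

            import numpy as np
            import torch

            reg_feat = torch.rand(1, 8, 4, 4)              # (N, C, H, W) dummy prediction

            # Undo a horizontal flip of the input image.
            reg_feat = torch.flip(reg_feat, dims=[3])      # mirror along the width axis
            reg_feat[:, 0, :, :] = 1 - reg_feat[:, 0, :, :]        # mirror x offset
            reg_feat[:, 6, :, :] = -reg_feat[:, 6, :, :] + np.pi   # mirror yaw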
""" for batch_id in range(len(result)): if isinstance(data['img_metas'][0], DC): img_filename = data['img_metas'][0]._data[0][batch_id][ 'filename'] cam2img = data['img_metas'][0]._data[0][batch_id]['cam2img'] elif mmcv.is_list_of(data['img_metas'][0], dict): img_filename = data['img_metas'][0][batch_id]['filename'] cam2img = data['img_metas'][0][batch_id]['cam2img'] else: ValueError( f"Unsupported data type {type(data['img_metas'][0])} " f'for visualization!') img = mmcv.imread(img_filename) file_name = osp.split(img_filename)[-1].split('.')[0] assert out_dir is not None, 'Expect out_dir, got none.' pred_bboxes = result[batch_id]['img_bbox']['boxes_3d'] assert isinstance(pred_bboxes, CameraInstance3DBoxes), \ f'unsupported predicted bbox type {type(pred_bboxes)}' show_multi_modality_result( img, None, pred_bboxes, cam2img, out_dir, file_name, 'camera', show=show) ================================================ FILE: mmdet3d/models/detectors/smoke_mono3d.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. from ..builder import DETECTORS from .single_stage_mono3d import SingleStageMono3DDetector @DETECTORS.register_module() class SMOKEMono3D(SingleStageMono3DDetector): r"""SMOKE `_ for monocular 3D object detection. """ def __init__(self, backbone, neck, bbox_head, train_cfg=None, test_cfg=None, pretrained=None): super(SMOKEMono3D, self).__init__(backbone, neck, bbox_head, train_cfg, test_cfg, pretrained) ================================================ FILE: mmdet3d/models/detectors/ssd3dnet.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. from ..builder import DETECTORS from .votenet import VoteNet @DETECTORS.register_module() class SSD3DNet(VoteNet): """3DSSDNet model. https://arxiv.org/abs/2002.10187.pdf """ def __init__(self, backbone, bbox_head=None, train_cfg=None, test_cfg=None, init_cfg=None, pretrained=None): super(SSD3DNet, self).__init__( backbone=backbone, bbox_head=bbox_head, train_cfg=train_cfg, test_cfg=test_cfg, init_cfg=init_cfg, pretrained=pretrained) ================================================ FILE: mmdet3d/models/detectors/two_stage.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import warnings from mmdet.models import TwoStageDetector from ..builder import DETECTORS, build_backbone, build_head, build_neck from .base import Base3DDetector @DETECTORS.register_module() class TwoStage3DDetector(Base3DDetector, TwoStageDetector): """Base class of two-stage 3D detector. It inherits original ``:class:TwoStageDetector`` and ``:class:Base3DDetector``. This class could serve as a base class for all two-stage 3D detectors. 
""" def __init__(self, backbone, neck=None, rpn_head=None, roi_head=None, train_cfg=None, test_cfg=None, pretrained=None, init_cfg=None): super(TwoStageDetector, self).__init__(init_cfg) if pretrained: warnings.warn('DeprecationWarning: pretrained is deprecated, ' 'please use "init_cfg" instead') backbone.pretrained = pretrained self.backbone = build_backbone(backbone) self.train_cfg = train_cfg self.test_cfg = test_cfg if neck is not None: self.neck = build_neck(neck) if rpn_head is not None: rpn_train_cfg = train_cfg.rpn if train_cfg is not None else None rpn_head_ = rpn_head.copy() rpn_head_.update(train_cfg=rpn_train_cfg, test_cfg=test_cfg.rpn) self.rpn_head = build_head(rpn_head_) if roi_head is not None: # update train and test cfg here for now # TODO: refactor assigner & sampler rcnn_train_cfg = train_cfg.rcnn if train_cfg is not None else None roi_head.update(train_cfg=rcnn_train_cfg) roi_head.update(test_cfg=test_cfg.rcnn) roi_head.pretrained = pretrained self.roi_head = build_head(roi_head) ================================================ FILE: mmdet3d/models/detectors/votenet.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import torch from mmdet3d.core import bbox3d2result, merge_aug_bboxes_3d from ..builder import DETECTORS from .single_stage import SingleStage3DDetector @DETECTORS.register_module() class VoteNet(SingleStage3DDetector): r"""`VoteNet `_ for 3D detection.""" def __init__(self, backbone, bbox_head=None, train_cfg=None, test_cfg=None, init_cfg=None, pretrained=None): super(VoteNet, self).__init__( backbone=backbone, bbox_head=bbox_head, train_cfg=train_cfg, test_cfg=test_cfg, init_cfg=None, pretrained=pretrained) def forward_train(self, points, img_metas, gt_bboxes_3d, gt_labels_3d, pts_semantic_mask=None, pts_instance_mask=None, gt_bboxes_ignore=None): """Forward of training. Args: points (list[torch.Tensor]): Points of each batch. img_metas (list): Image metas. gt_bboxes_3d (:obj:`BaseInstance3DBoxes`): gt bboxes of each batch. gt_labels_3d (list[torch.Tensor]): gt class labels of each batch. pts_semantic_mask (list[torch.Tensor]): point-wise semantic label of each batch. pts_instance_mask (list[torch.Tensor]): point-wise instance label of each batch. gt_bboxes_ignore (list[torch.Tensor]): Specify which bounding. Returns: dict: Losses. """ points_cat = torch.stack(points) x = self.extract_feat(points_cat) bbox_preds = self.bbox_head(x, self.train_cfg.sample_mod) loss_inputs = (points, gt_bboxes_3d, gt_labels_3d, pts_semantic_mask, pts_instance_mask, img_metas) losses = self.bbox_head.loss( bbox_preds, *loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore) return losses def simple_test(self, points, img_metas, imgs=None, rescale=False): """Forward of testing. Args: points (list[torch.Tensor]): Points of each sample. img_metas (list): Image metas. rescale (bool): Whether to rescale results. Returns: list: Predicted 3d boxes. 
""" points_cat = torch.stack(points) x = self.extract_feat(points_cat) bbox_preds = self.bbox_head(x, self.test_cfg.sample_mod) bbox_list = self.bbox_head.get_bboxes( points_cat, bbox_preds, img_metas, rescale=rescale) bbox_results = [ bbox3d2result(bboxes, scores, labels) for bboxes, scores, labels in bbox_list ] return bbox_results def aug_test(self, points, img_metas, imgs=None, rescale=False): """Test with augmentation.""" points_cat = [torch.stack(pts) for pts in points] feats = self.extract_feats(points_cat, img_metas) # only support aug_test for one sample aug_bboxes = [] for x, pts_cat, img_meta in zip(feats, points_cat, img_metas): bbox_preds = self.bbox_head(x, self.test_cfg.sample_mod) bbox_list = self.bbox_head.get_bboxes( pts_cat, bbox_preds, img_meta, rescale=rescale) bbox_list = [ dict(boxes_3d=bboxes, scores_3d=scores, labels_3d=labels) for bboxes, scores, labels in bbox_list ] aug_bboxes.append(bbox_list[0]) # after merging, bboxes will be rescaled to the original image size merged_bboxes = merge_aug_bboxes_3d(aug_bboxes, img_metas, self.bbox_head.test_cfg) return [merged_bboxes] ================================================ FILE: mmdet3d/models/detectors/voxelnet.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import torch from mmcv.ops import Voxelization from mmcv.runner import force_fp32 from torch.nn import functional as F from mmdet3d.core import bbox3d2result, merge_aug_bboxes_3d from .. import builder from ..builder import DETECTORS from .single_stage import SingleStage3DDetector @DETECTORS.register_module() class VoxelNet(SingleStage3DDetector): r"""`VoxelNet `_ for 3D detection.""" def __init__(self, voxel_layer, voxel_encoder, middle_encoder, backbone, neck=None, bbox_head=None, train_cfg=None, test_cfg=None, init_cfg=None, pretrained=None): super(VoxelNet, self).__init__( backbone=backbone, neck=neck, bbox_head=bbox_head, train_cfg=train_cfg, test_cfg=test_cfg, init_cfg=init_cfg, pretrained=pretrained) self.voxel_layer = Voxelization(**voxel_layer) self.voxel_encoder = builder.build_voxel_encoder(voxel_encoder) self.middle_encoder = builder.build_middle_encoder(middle_encoder) def extract_feat(self, points, img_metas=None): """Extract features from points.""" voxels, num_points, coors = self.voxelize(points) voxel_features = self.voxel_encoder(voxels, num_points, coors) batch_size = coors[-1, 0].item() + 1 x = self.middle_encoder(voxel_features, coors, batch_size) x = self.backbone(x) if self.with_neck: x = self.neck(x) return x @torch.no_grad() @force_fp32() def voxelize(self, points): """Apply hard voxelization to points.""" voxels, coors, num_points = [], [], [] for res in points: res_voxels, res_coors, res_num_points = self.voxel_layer(res) voxels.append(res_voxels) coors.append(res_coors) num_points.append(res_num_points) voxels = torch.cat(voxels, dim=0) num_points = torch.cat(num_points, dim=0) coors_batch = [] for i, coor in enumerate(coors): coor_pad = F.pad(coor, (1, 0), mode='constant', value=i) coors_batch.append(coor_pad) coors_batch = torch.cat(coors_batch, dim=0) return voxels, num_points, coors_batch def forward_train(self, points, img_metas, gt_bboxes_3d, gt_labels_3d, gt_bboxes_ignore=None): """Training forward function. Args: points (list[torch.Tensor]): Point cloud of each sample. img_metas (list[dict]): Meta information of each sample gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth boxes for each sample. 
gt_labels_3d (list[torch.Tensor]): Ground truth labels for boxes of each sampole gt_bboxes_ignore (list[torch.Tensor], optional): Ground truth boxes to be ignored. Defaults to None. Returns: dict: Losses of each branch. """ x = self.extract_feat(points, img_metas) outs = self.bbox_head(x) loss_inputs = outs + (gt_bboxes_3d, gt_labels_3d, img_metas) losses = self.bbox_head.loss( *loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore) return losses def simple_test(self, points, img_metas, imgs=None, rescale=False): """Test function without augmentaiton.""" x = self.extract_feat(points, img_metas) outs = self.bbox_head(x) bbox_list = self.bbox_head.get_bboxes( *outs, img_metas, rescale=rescale) bbox_results = [ bbox3d2result(bboxes, scores, labels) for bboxes, scores, labels in bbox_list ] return bbox_results def aug_test(self, points, img_metas, imgs=None, rescale=False): """Test function with augmentaiton.""" feats = self.extract_feats(points, img_metas) # only support aug_test for one sample aug_bboxes = [] for x, img_meta in zip(feats, img_metas): outs = self.bbox_head(x) bbox_list = self.bbox_head.get_bboxes( *outs, img_meta, rescale=rescale) bbox_list = [ dict(boxes_3d=bboxes, scores_3d=scores, labels_3d=labels) for bboxes, scores, labels in bbox_list ] aug_bboxes.append(bbox_list[0]) # after merging, bboxes will be rescaled to the original image size merged_bboxes = merge_aug_bboxes_3d(aug_bboxes, img_metas, self.bbox_head.test_cfg) return [merged_bboxes] ================================================ FILE: mmdet3d/models/fbbev/__init__.py ================================================ from .detectors import * from .modules import * from .utils import * from .view_transformation import * from .heads import * from .streampetr import * from .track_head import * from .streammapnet import * from .motion_head import * from .planner_head import * ================================================ FILE: mmdet3d/models/fbbev/detectors/__init__.py ================================================ # Copyright (c) 2023-2024, NVIDIA Corporation & Affiliates. All rights reserved. # # This work is made available under the Nvidia Source Code License-NC. # To view a copy of this license, visit # TODO: add license here from .bev_planner import BEVPlanner ================================================ FILE: mmdet3d/models/fbbev/detectors/bev_planner.py ================================================ # Copyright (c) 2023-2024, NVIDIA Corporation & Affiliates. All rights reserved. # # This work is made available under the Nvidia Source Code License-NC. 
# To view a copy of this license, visit # TODO: add license here import torch import torch.nn.functional as F import torch.nn as nn from mmcv.runner import force_fp32 import os from mmdet.models import DETECTORS from mmdet3d.models import builder from mmdet3d.models.detectors import CenterPoint from mmdet3d.models.builder import build_head, build_neck import numpy as np import torch import torchvision import matplotlib import cv2 import mmcv from ..utils.grid_mask import GridMask from ..utils.bricks import save_tensor def generate_forward_transformation_matrix(bda, img_meta_dict=None): b = bda.size(0) hom_res = torch.eye(4)[None].repeat(b, 1, 1).to(bda.device) for i in range(b): hom_res[i, :3, :3] = bda[i] return hom_res @DETECTORS.register_module() class BEVPlanner(CenterPoint): def __init__(self, # BEVDet components img_bev_encoder_backbone=None, img_bev_encoder_neck=None, forward_projection=None, # BEVFormer components backward_projection=None, # FB-BEV components frpn=None, # other modules depth_net=None, occupancy_head=None, img_det_2d_head=None, map_head=None, motion_head=None, planner_head=None, # other settings. use_depth_supervision=False, add_forward_backbward_feats=False, fix_void=False, occupancy_save_path=None, do_history=True, interpolation_mode='bilinear', fuse_history_bev=True, history_cat_num=16, history_cat_conv_out_channels=None, embed_dims=80, single_bev_num_channels=80, use_grid_mask=False, yolox_use_ml_feats=False, with_ego_status=False, align_prev_bev=True, **kwargs): """ Parameters: img_bev_encoder_backbone - img_bev_encoder_neck - forward_projection - backward_projection - frpn - foreground region proposal network, used in FB-BEV depth_net - occupancy_head - img_det_2d_head - map_head - motion_head - planner_head - use_depth_supervision - add_forward_backbward_feats - fix_void - Used to fix legacy bugs in Occupancy occupancy_save_path - do_history - A Flag to start the temporal traning at i-th epoch interpolation_mode - fuse_history_bev - Weather to use history bev, which is different from `do_hisitory` history_cat_num - history_cat_conv_out_channels - single_bev_num_channels - use_grid_mask - yolox_use_ml_feats - with_ego_status - """ super(BEVPlanner, self).__init__(**kwargs) self.fix_void = fix_void # BEVDet init self.forward_projection = builder.build_neck(forward_projection) if forward_projection else None self.img_bev_encoder_backbone = builder.build_backbone(img_bev_encoder_backbone) if img_bev_encoder_backbone else None self.img_bev_encoder_neck = builder.build_neck(img_bev_encoder_neck) if img_bev_encoder_neck else None # BEVFormer init self.backward_projection = builder.build_head(backward_projection) if backward_projection else None # FB-BEV init if not self.forward_projection: assert not frpn, 'frpn relies on LSS' self.frpn = builder.build_head(frpn) if frpn else None # Depth Net self.depth_net = builder.build_head(depth_net) if depth_net else None # Occupancy Head self.occupancy_head = builder.build_head(occupancy_head) if occupancy_head else None # 2D det head self.img_det_2d_head = builder.build_head(img_det_2d_head) if img_det_2d_head else None # map head if map_head: map_head['train_cfg'] = kwargs.get('train_cfg', None) self.map_head = builder.build_head(map_head) else: self.map_head = None # motion self.motion_head = builder.build_head(motion_head) if motion_head else None # planner self.planner_head = builder.build_head(planner_head) if planner_head else None self.embed_dims = embed_dims self.use_grid_mask = use_grid_mask if self.use_grid_mask: 
self.grid_mask = GridMask(True, True, rotate=1, offset=False, ratio=0.5, mode=1, prob=0.7) self.add_forward_backbward_feats = add_forward_backbward_feats # fuse voxel features and bev features self.use_depth_supervision = use_depth_supervision self.yolox_use_ml_feats = yolox_use_ml_feats self.occupancy_save_path = occupancy_save_path # for saving data\for submitting to test server self.with_ego_status = with_ego_status if self.with_ego_status: self.can_bus_mlp = nn.Sequential( nn.Linear(9, self.embed_dims // 2), nn.ReLU(inplace=True), nn.Linear(self.embed_dims // 2, self.embed_dims), nn.ReLU(inplace=True), nn.LayerNorm(self.embed_dims) ) # Deal with history self.single_bev_num_channels = single_bev_num_channels self.do_history = do_history self.interpolation_mode = interpolation_mode self.history_cat_num = history_cat_num self.history_cam_sweep_freq = 0.5 # seconds between each frame self.history_cat_conv_out_channels = history_cat_conv_out_channels self.align_prev_bev=align_prev_bev self.fuse_history_bev = fuse_history_bev if self.fuse_history_bev: self._init_fuse_layers() self.history_sweep_time = None self.history_bev = None self.history_bev_before_encoder = None self.history_seq_ids = None self.history_forward_augs = None def _init_fuse_layers(self): history_cat_conv_out_channels = (self.history_cat_conv_out_channels if self.history_cat_conv_out_channels is not None else self.single_bev_num_channels) ## Embed each sample with its relative temporal offset with current timestep conv = nn.Conv2d if self.forward_projection.nx[-1] == 1 else nn.Conv3d self.history_keyframe_time_conv = nn.Sequential( conv(self.single_bev_num_channels + 1, self.single_bev_num_channels, kernel_size=1, padding=0, stride=1), nn.SyncBatchNorm(self.single_bev_num_channels), nn.ReLU(inplace=True)) ## Then concatenate and send them through an MLP. 
self.history_keyframe_cat_conv = nn.Sequential( conv(self.single_bev_num_channels * (self.history_cat_num + 1), history_cat_conv_out_channels, kernel_size=1, padding=0, stride=1), nn.SyncBatchNorm(history_cat_conv_out_channels), nn.ReLU(inplace=True)) def with_specific_component(self, component_name): """Whether the model owns a specific component""" return getattr(self, component_name, None) is not None def image_encoder(self, img): """ Return (single_scale_context, multi_scale_context:[List]) single scale_context are counsumed by forward projection multi_scale_context are consumed by some perception heads like yolox """ imgs = img B, N, C, imH, imW = imgs.shape imgs = imgs.view(B * N, C, imH, imW) if self.use_grid_mask: imgs = self.grid_mask(imgs) x = self.img_backbone(imgs) if self.with_img_neck: x_list = self.img_neck(x) if type(x_list) in [list, tuple]: x_list = list(x_list) for i, x in enumerate(x_list): _, output_dim, ouput_H, output_W = x.shape x_list[i] = x.view(B, N, output_dim, ouput_H, output_W) return x_list[1], x_list else: _, output_dim, ouput_H, output_W = x_list.shape return x_list.view(B, N, output_dim, ouput_H, output_W), [x_list.view(B, N, output_dim, ouput_H, output_W)] @force_fp32() def bev_encoder(self, x): if self.with_specific_component('img_bev_encoder_backbone'): x = self.img_bev_encoder_backbone(x) if self.with_specific_component('img_bev_encoder_neck'): x = self.img_bev_encoder_neck(x) if type(x) not in [list, tuple]: x = [x] return x @force_fp32() def fuse_history(self, curr_bev, img_metas, bda): # align features with 3d shift if curr_bev is None: return None voxel_feat = True if len(curr_bev.shape) == 5 else False if voxel_feat: curr_bev = curr_bev.permute(0, 1, 4, 2, 3) # n, c, z, h, w seq_ids = torch.LongTensor([ single_img_metas['sequence_group_idx'] for single_img_metas in img_metas]).to(curr_bev.device) start_of_sequence = torch.BoolTensor([ single_img_metas['start_of_sequence'] for single_img_metas in img_metas]).to(curr_bev.device) forward_augs = generate_forward_transformation_matrix(bda) # print('sqe_ids', seq_ids, ' start_of_sequence ', start_of_sequence.tolist(), ' index ', img_metas[0]['index'], img_metas[0]['scene_name']) curr_to_prev_ego_rt = torch.stack([ single_img_metas['curr_to_prev_ego_rt'] for single_img_metas in img_metas]).to(curr_bev) if not self.align_prev_bev: curr_to_prev_ego_rt= torch.eye(4).repeat(curr_to_prev_ego_rt.size(0), 1, 1).to(curr_bev) ## Deal with first batch if self.history_bev is None: self.history_bev = curr_bev.clone() self.history_seq_ids = seq_ids.clone() self.history_forward_augs = forward_augs.clone() # Repeat the first frame feature to be history if voxel_feat: self.history_bev = curr_bev.repeat(1, self.history_cat_num, 1, 1, 1) else: self.history_bev = curr_bev.repeat(1, self.history_cat_num, 1, 1) # All 0s, representing current timestep. self.history_sweep_time = curr_bev.new_zeros(curr_bev.shape[0], self.history_cat_num) self.history_bev = self.history_bev.detach() assert self.history_bev.dtype == torch.float32 ## Deal with the new sequences # First, sanity check. For every non-start of sequence, history id and seq id should be same. assert (self.history_seq_ids != seq_ids)[~start_of_sequence].sum() == 0, \ "{}, {}, {}".format(self.history_seq_ids, seq_ids, start_of_sequence) ## Replace all the new sequences' positions in history with the curr_bev information self.history_sweep_time += 1 # new timestep, everything in history gets pushed back one. 
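        # Illustrative aside (standalone nested sketch, never called by this
        # method): the history-queue bookkeeping below, with made-up shapes
        # and history length.
        def _history_queue_sketch():
            import torch
            B, C, H, W, T = 2, 80, 4, 4, 16
            history_bev = torch.zeros(B, C * T, H, W)     # T cached BEV frames per sample
            history_sweep_time = torch.zeros(B, T)        # age (in frames) of each cached entry
            start_of_sequence = torch.tensor([True, False])
            curr_bev = torch.rand(B, C, H, W)

            history_sweep_time += 1                       # every cached frame gets one step older
            # Samples that start a new sequence have no usable past: overwrite
            # their whole history with the current frame and reset its age.
            history_bev[start_of_sequence] = curr_bev[start_of_sequence].repeat(1, T, 1, 1)
            history_sweep_time[start_of_sequence] = 0
            return history_bev, history_sweep_time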
if start_of_sequence.sum()>0: if voxel_feat: self.history_bev[start_of_sequence] = curr_bev[start_of_sequence].repeat(1, self.history_cat_num, 1, 1, 1) else: self.history_bev[start_of_sequence] = curr_bev[start_of_sequence].repeat(1, self.history_cat_num, 1, 1) self.history_sweep_time[start_of_sequence] = 0 # zero the new sequence timestep starts self.history_seq_ids[start_of_sequence] = seq_ids[start_of_sequence] self.history_forward_augs[start_of_sequence] = forward_augs[start_of_sequence] ## Get grid idxs & grid2bev first. if voxel_feat: n, c_, z, h, w = curr_bev.shape else: n, c_, h, w = curr_bev.shape z = 1 # Generate grid xs = torch.linspace(0, w - 1, w, dtype=curr_bev.dtype, device=curr_bev.device).view(1, w, 1).expand(h, w, z) ys = torch.linspace(0, h - 1, h, dtype=curr_bev.dtype, device=curr_bev.device).view(h, 1, 1).expand(h, w, z) zs = torch.linspace(0, z - 1, z, dtype=curr_bev.dtype, device=curr_bev.device).view(1, 1, z).expand(h, w, z) grid = torch.stack( (xs, ys, zs, torch.ones_like(xs)), -1).view(1, h, w, z, 4).expand(n, h, w, z, 4).view(n, h, w, z, 4, 1) # This converts BEV indices to meters # IMPORTANT: the feat2bev[0, 3] is changed from feat2bev[0, 2] because previous was 2D rotation # which has 2-th index as the hom index. Now, with 3D hom, 3-th is hom feat2bev = torch.zeros((4,4),dtype=grid.dtype).to(grid) feat2bev[0, 0] = self.forward_projection.dx[0] feat2bev[1, 1] = self.forward_projection.dx[1] feat2bev[2, 2] = self.forward_projection.dx[2] feat2bev[0, 3] = self.forward_projection.bx[0] - self.forward_projection.dx[0] / 2. feat2bev[1, 3] = self.forward_projection.bx[1] - self.forward_projection.dx[1] / 2. feat2bev[2, 3] = self.forward_projection.bx[2] - self.forward_projection.dx[2] / 2. feat2bev[3, 3] = 1 feat2bev = feat2bev.view(1,4,4) ## Get flow for grid sampling. # The flow is as follows. Starting from grid locations in curr bev, transform to BEV XY11, # backward of current augmentations, curr lidar to prev lidar, forward of previous augmentations, # transform to previous grid locations. 
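        # Illustrative aside (standalone nested sketch, never called by this
        # method): composing the index-to-index flow described above with dummy
        # 4x4 matrices and warping a cached BEV feature map via grid_sample.
        def _warp_history_sketch():
            import torch
            import torch.nn.functional as F
            n, c, h, w = 1, 8, 16, 16
            feat2bev = torch.eye(4)              # BEV index -> metric coordinates (toy value)
            forward_augs = torch.eye(4)          # BDA augmentation of the current frame
            history_forward_augs = torch.eye(4)  # BDA augmentation of the cached frame
            curr_to_prev_ego_rt = torch.eye(4)   # ego motion between the two frames

            # index -> meters -> undo current aug -> current ego to previous ego
            # -> re-apply previous aug -> back to BEV indices
            rt_flow = (torch.inverse(feat2bev) @ history_forward_augs @
                       curr_to_prev_ego_rt @ torch.inverse(forward_augs) @ feat2bev)

            xs, ys = torch.meshgrid(torch.arange(w), torch.arange(h), indexing='xy')
            grid = torch.stack(
                [xs, ys, torch.zeros_like(xs), torch.ones_like(xs)], -1).float()
            grid = (rt_flow @ grid.view(-1, 4, 1)).view(1, h, w, 4)[..., :2]
            grid = grid / torch.tensor([w - 1.0, h - 1.0]) * 2.0 - 1.0  # to [-1, 1]
            history_bev = torch.rand(n, c, h, w)
            return F.grid_sample(history_bev, grid, align_corners=True)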
rt_flow = (torch.inverse(feat2bev) @ self.history_forward_augs @ curr_to_prev_ego_rt @ torch.inverse(forward_augs) @ feat2bev) grid = rt_flow.view(n, 1, 1, 1, 4, 4) @ grid # normalize and sample if voxel_feat: normalize_factor = torch.tensor([w - 1.0, h - 1.0, z - 1.0], dtype=curr_bev.dtype, device=curr_bev.device) grid = grid[:,:,:,:, :3,0] / normalize_factor.view(1, 1, 1, 1, 3) * 2.0 - 1.0 else: normalize_factor = torch.tensor([w - 1.0, h - 1.0], dtype=curr_bev.dtype, device=curr_bev.device) grid = grid[:,:,:,:, :2,0] / normalize_factor.view(1, 1, 1, 1, 2) * 2.0 - 1.0 tmp_bev = self.history_bev if voxel_feat: n, mc, z, h, w = tmp_bev.shape tmp_bev = tmp_bev.reshape(n, mc, z, h, w) grid = grid.to(curr_bev.dtype).permute(0, 3, 1, 2, 4) else: grid = grid.to(curr_bev.dtype).squeeze(-2) # save_tensor(tmp_bev[0].clamp(min=-1, max=1).reshape(4, 80, 128, 128).abs().mean(1), f'curr_{self.count}_pre.png') sampled_history_bev = F.grid_sample(tmp_bev, grid, align_corners=True, mode=self.interpolation_mode) # save_tensor(sampled_history_bev[0].clamp(min=-1, max=1).reshape(4, 80, 128, 128).abs().mean(1), f'curr_{self.count}_after.png') # save_tensor(curr_bev.clamp(min=-1, max=1).abs().mean(1), f'curr_{self.count}.png') # self.count += 1 # if self.count == 10: ## Update history # Add in current frame to features & timestep self.history_sweep_time = torch.cat( [self.history_sweep_time.new_zeros(self.history_sweep_time.shape[0], 1), self.history_sweep_time], dim=1) # B x (1 + T) if voxel_feat: sampled_history_bev = sampled_history_bev.reshape(n, mc, z, h, w) curr_bev = curr_bev.reshape(n, c_, z, h, w) feats_cat = torch.cat([curr_bev, sampled_history_bev], dim=1) # B x (1 + T) * 80 x H x W or B x (1 + T) * 80 xZ x H x W # Reshape and concatenate features and timestep feats_to_return = feats_cat.reshape( feats_cat.shape[0], self.history_cat_num + 1, self.single_bev_num_channels, *feats_cat.shape[2:]) # B x (1 + T) x 80 x H x W if voxel_feat: feats_to_return = torch.cat( [feats_to_return, self.history_sweep_time[:, :, None, None, None, None].repeat( 1, 1, 1, *feats_to_return.shape[3:]) * self.history_cam_sweep_freq ], dim=2) # B x (1 + T) x 81 x Z x H x W else: feats_to_return = torch.cat( [feats_to_return, self.history_sweep_time[:, :, None, None, None].repeat( 1, 1, 1, feats_to_return.shape[3], feats_to_return.shape[4]) * self.history_cam_sweep_freq ], dim=2) # B x (1 + T) x 81 x H x W # Time conv feats_to_return = self.history_keyframe_time_conv( feats_to_return.reshape(-1, *feats_to_return.shape[2:])).reshape( feats_to_return.shape[0], feats_to_return.shape[1], -1, *feats_to_return.shape[3:]) # B x (1 + T) x 80 xZ x H x W # Cat keyframes & conv feats_to_return = self.history_keyframe_cat_conv( feats_to_return.reshape( feats_to_return.shape[0], -1, *feats_to_return.shape[3:])) # B x C x H x W or B x C x Z x H x W self.history_bev = feats_cat[:, :-self.single_bev_num_channels, ...].detach().clone() self.history_sweep_time = self.history_sweep_time[:, :-1] self.history_forward_augs = forward_augs.clone() if voxel_feat: feats_to_return = feats_to_return.permute(0, 1, 3, 4, 2) if not self.do_history: self.history_bev = None return feats_to_return.clone() def extract_img_bev_feat(self, img, img_metas, **kwargs): """Extract features of images.""" return_map = {} context, mlvl_context = self.image_encoder(img[0]) cam_params = img[1:7] if self.with_specific_component('depth_net'): mlp_input = self.depth_net.get_mlp_input(*cam_params) context, depth = self.depth_net(context, mlp_input) else: depth=None if 
self.with_specific_component('forward_projection'): bev_feat = self.forward_projection(cam_params, context, depth, **kwargs) else: bev_feat = None if self.with_specific_component('frpn'): # not used in FB-OCC assert bev_feat is not None bev_mask_logit = self.frpn(bev_feat) bev_mask = bev_mask_logit.sigmoid() > self.frpn.mask_thre if bev_mask.requires_grad: # during training phase gt_bev_mask = kwargs['gt_bev_mask'].to(torch.bool) bev_mask = gt_bev_mask | bev_mask return_map['bev_mask_logit'] = bev_mask_logit else: bev_mask = None if self.with_specific_component('backward_projection'): bev_feat_refined = self.backward_projection([context], img_metas, lss_bev=bev_feat.mean(-1), cam_params=cam_params, bev_mask=bev_mask, gt_bboxes_3d=None, # debug pred_img_depth=depth) if self.add_forward_backbward_feats: bev_feat = bev_feat_refined[..., None] + bev_feat else: bev_feat = bev_feat_refined # Fuse History if self.fuse_history_bev: bev_feat = self.fuse_history(bev_feat, img_metas, img[6]) if self.with_ego_status: can_bus_info = torch.cat(kwargs['can_bus_info']) bev_feat = bev_feat + self.can_bus_mlp(can_bus_info)[:, :, None, None] bev_feat = self.bev_encoder(bev_feat) return_map['context'] = mlvl_context if self.yolox_use_ml_feats else context return_map['depth'] = depth return_map['cam_params'] = cam_params return_map['img_bev_feat'] = bev_feat return return_map def extract_lidar_bev_feat(self, pts, img_feats, img_metas): """Extract features of points.""" voxels, num_points, coors = self.voxelize(pts) voxel_features = self.pts_voxel_encoder(voxels, num_points, coors) batch_size = coors[-1, 0] + 1 bev_feat = self.pts_middle_encoder(voxel_features, coors, batch_size) bev_feat = self.pts_backbone(bev_feat) if self.with_pts_neck: bev_feat = self.pts_neck(bev_feat) bev_feat = self.bev_encoder(bev_feat) return dict(lidar_bev_feat=bev_feat) def extract_feat(self, points, img, img_metas, **kwargs): """Extract features from images and points.""" results={} if img is not None and self.with_specific_component('image_encoder'): results.update(self.extract_img_bev_feat(img, img_metas, **kwargs)) if points is not None and self.with_specific_component('pts_voxel_encoder'): results.update(self.extract_lidar_bev_feat(points, img, img_metas)) return results def forward_train(self, points=None, img_metas=None, gt_bboxes_3d=None, gt_labels_3d=None, gt_labels=None, gt_bboxes=None, img_inputs=None, proposals=None, gt_bboxes_ignore=None, **kwargs): """Forward training function. Args: points (list[torch.Tensor], optional): Points of each sample. Defaults to None. img_metas (list[dict], optional): Meta information of each sample. Defaults to None. gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`], optional): Ground truth 3D boxes. Defaults to None. gt_labels_3d (list[torch.Tensor], optional): Ground truth labels of 3D boxes. Defaults to None. gt_labels (list[torch.Tensor], optional): Ground truth labels of 2D boxes in images. Defaults to None. gt_bboxes (list[torch.Tensor], optional): Ground truth 2D boxes in images. Defaults to None. img (torch.Tensor optional): Images of each sample with shape (N, C, H, W). Defaults to None. proposals ([list[torch.Tensor], optional): Predicted proposals used for training Fast RCNN. Defaults to None. gt_bboxes_ignore (list[torch.Tensor], optional): Ground truth 2D boxes in images to be ignored. Defaults to None. Returns: dict: Losses of different branches. 
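        A minimal sketch of how the per-task loss dicts returned by the heads
        below are merged into one dict (the loss names here are placeholders;
        mmdet's loss parser later sums every value whose key contains
        ``loss``)::

            losses = dict()
            det_losses = dict(loss_cls=0.5, loss_bbox=1.2)    # e.g. pts_bbox_head.loss(...)
            map_losses = dict(loss_map_pts=0.8)               # e.g. map_head.forward(..., return_loss=True)
            plan_losses = dict(loss_plan_reg=0.3)             # e.g. planner_head.loss(...)
            for partial in (det_losses, map_losses, plan_losses):
                losses.update(partial)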
""" results= self.extract_feat( points, img=img_inputs, img_metas=img_metas, **kwargs) losses = dict() if self.with_pts_bbox: preds_agent_dicts = self.pts_bbox_head(results, img_metas, gt_bboxes_3d, gt_labels_3d) losses_pts, agent_instances = self.pts_bbox_head.loss(gt_bboxes_3d, gt_labels_3d, preds_agent_dicts, img_metas) losses.update(losses_pts) if self.with_specific_component('img_det_2d_head'): if type(results['context']) not in [list, tuple]: context = [results['context']] else: context = results['context'] preds_2ddet_dicts = self.img_det_2d_head(context) losses.update( self.img_det_2d_head.loss( kwargs['gt_bboxes_2d'], kwargs['gt_labels_2d'], kwargs['centers2d'], preds_2ddet_dicts, kwargs['depths2d'], img_metas, #len=B ) ) if self.with_specific_component('occupancy_head'): losses_occupancy = self.occupancy_head.forward_train(results['img_bev_feat'], results=results, gt_occupancy=kwargs['gt_occupancy'], gt_occupancy_flow=kwargs['gt_occupancy_flow']) losses.update(losses_occupancy) if self.with_specific_component('map_head'): loss_map_dict, preds_map_dicts = self.map_head.forward(results, img_metas, kwargs['map_gt_bboxes_3d'], kwargs['map_gt_labels_3d'], return_loss=True) losses.update(loss_map_dict) else: preds_map_dicts = [None] # dummy if self.with_specific_component('frpn'): losses_mask = self.frpn.get_bev_mask_loss(kwargs['gt_bev_mask'], results['bev_mask_logit']) losses.update(losses_mask) if self.use_depth_supervision and self.with_specific_component('depth_net'): loss_depth = self.depth_net.get_depth_loss(kwargs['gt_depth'], results['depth']) losses.update(loss_depth) if self.with_specific_component('motion_head'): preds_motion_dicts = self.motion_head( agent_instances, preds_map_dicts[-1], gt_ego_lcf_feat = kwargs['gt_ego_lcf_feat'], gt_ego_fut_cmd = kwargs['gt_ego_fut_cmd'], gt_ego_his_traj = kwargs['gt_ego_his_trajs'], gt_ego_fut_trajs = kwargs['gt_ego_fut_trajs'], img_metas=img_metas, ) losses.update( self.motion_head.loss( gt_agent_fut_traj = kwargs['gt_agent_fut_traj'], gt_agent_fut_traj_mask = kwargs['gt_agent_fut_traj_mask'], gt_ego_fut_cmd = kwargs['gt_ego_fut_cmd'], gt_ego_fut_trajs = kwargs['gt_ego_fut_trajs'], gt_ego_fut_masks = kwargs['gt_ego_fut_masks'], preds_dicts = preds_motion_dicts, preds_map_dicts = preds_map_dicts[-1], matched_gt_idxes = agent_instances.matched_gt_idxes, img_metas = img_metas, ) ) if self.with_specific_component('planner_head'): preds_plan_dicts = self.planner_head( results, kwargs['gt_ego_lcf_feat'], kwargs['gt_ego_fut_cmd'], kwargs['gt_ego_his_trajs'], kwargs['gt_ego_fut_trajs'], img_metas=img_metas, map_results=preds_map_dicts[-1] ) losses.update( self.planner_head.loss( kwargs['gt_ego_fut_trajs'], kwargs['gt_ego_fut_cmd'], kwargs['gt_ego_fut_masks'], preds_plan_dicts, img_metas, ) ) return losses def forward_test(self, points=None, img_metas=None, img_inputs=None, **kwargs): """ Args: points (list[torch.Tensor]): the outer list indicates test-time augmentations and inner torch.Tensor should have a shape NxC, which contains all points in the batch. img_metas (list[list[dict]]): the outer list indicates test-time augs (multiscale, flip, etc.) and the inner list indicates images in a batch img (list[torch.Tensor], optional): the outer list indicates test-time augmentations and inner torch.Tensor should have a shape NxCxHxW, which contains all images in the batch. Defaults to None. 
""" self.do_history = True kwargs['can_bus_info'] = kwargs.get('can_bus_info', [None])[0] if img_inputs is not None: for var, name in [(img_inputs, 'img_inputs'), (img_metas, 'img_metas')]: if not isinstance(var, list) : raise TypeError('{} must be a list, but got {}'.format( name, type(var))) num_augs = len(img_inputs) if num_augs != len(img_metas): raise ValueError( 'num of augmentations ({}) != num of image meta ({})'.format( len(img_inputs), len(img_metas))) if num_augs==1 and not img_metas[0][0].get('tta_config', dict(dist_tta=False))['dist_tta']: return self.simple_test(points[0], img_metas[0], img_inputs[0], **kwargs) else: return self.aug_test(points, img_metas, img_inputs, **kwargs) elif points is not None: img_inputs = [img_inputs] if img_inputs is None else img_inputs points = [points] if points is None else points return self.simple_test(points[0], img_metas[0], img_inputs[0], **kwargs) def aug_test(self,points, img_metas, img_inputs=None, visible_mask=[None], **kwargs): """Test function without augmentaiton.""" assert False return None def simple_test(self, points, img_metas, img=None, rescale=False, visible_mask=[None], return_raw_occ=False, **kwargs): """Test function without augmentaiton.""" results = self.extract_feat( points, img=img, img_metas=img_metas, **kwargs) output_list = [dict() for _ in range(len(img_metas))] if self.with_pts_bbox: if getattr(self.pts_bbox_head, 'tracking', False): preds_det_dicts, agent_instances = self.pts_bbox_head.forward_tracking(results, img_metas) else: preds_det_dicts = self.pts_bbox_head(results, img_metas) pred_bbox = self.pts_bbox_head.get_bboxes(preds_det_dicts, img_metas, rescale=rescale) pred_bbox[0]['index'] = img_metas[0]['index'] else: pred_bbox = [None for _ in range(len(img_metas))] if self.with_specific_component('map_head'): preds_map_dicts = self.map_head(results, img_metas, return_loss=False, map_gt_bboxes_3d = kwargs.get('map_gt_bboxes_3d', None), map_gt_labels_3d = kwargs.get('map_gt_labels_3d', None), ) pred_map = self.map_head.get_bboxes(preds_map_dicts, img_metas) pred_map[0]['index'] = img_metas[0]['index'] else: preds_map_dicts = [None] # dummy pred_map = [None for _ in range(len(img_metas))] if self.with_specific_component('motion_head'): preds_motion_dicts = self.motion_head( agent_instances, preds_map_dicts[-1], gt_ego_lcf_feat = kwargs['gt_ego_lcf_feat'][0], gt_ego_fut_cmd = kwargs['gt_ego_fut_cmd'][0], gt_ego_his_traj = kwargs['gt_ego_his_trajs'][0], gt_ego_fut_trajs = kwargs['gt_ego_fut_trajs'][0], img_metas=img_metas, ) pred_motion = self.motion_head.get_motion(preds_motion_dicts, img_metas) pred_motion[0]['index'] = img_metas[0]['index'] pred_traj = self.motion_head.get_traj( preds_motion_dicts, img_metas, gt_ego_fut_trajs=kwargs['gt_ego_fut_trajs'][0], gt_ego_fut_cmd=kwargs['gt_ego_fut_cmd'][0], gt_ego_fut_masks=kwargs['gt_ego_fut_masks'][0], gt_fut_segmentations=kwargs['gt_fut_segmentations'][0], gt_fut_segmentations_plus=kwargs['gt_fut_segmentations_plus'][0], # vad_ego_fut_trajs=kwargs['vad_ego_fut_trajs'][0], ) pred_traj[0]['index'] = img_metas[0]['index'] # add motion traj to tracking results num_bbox = pred_bbox[0]['track_scores'].size(0) motion_info = np.zeros([num_bbox, 6, 8, 2]) motion_cls = np.zeros([num_bbox, 6]) for i, obj_idx in enumerate(pred_motion[0]['obj_idxes']): try: bbox_ind = (pred_bbox[0]['obj_idxes']==obj_idx).nonzero().item() except: continue motion_info[bbox_ind] = pred_motion[0]['fut_trajs_in_global'][i] motion_cls[bbox_ind] = pred_motion[0]['pred_traj_cls'][i] 
pred_bbox[0]['motion_traj'] = motion_info pred_bbox[0]['motion_cls'] = motion_cls else: pred_motion = [None for _ in range(len(img_metas))] pred_traj = [None for _ in range(len(img_metas))] if self.with_specific_component('occupancy_head'): pred_occupancy = self.occupancy_head(results['img_bev_feat'], results=results, **kwargs)['output_voxels'][0] pred_occupancy = pred_occupancy.permute(0, 2, 3, 4, 1)[0] if self.fix_void: pred_occupancy = pred_occupancy[..., 1:] pred_occupancy = pred_occupancy.softmax(-1) # convert to CVPR2023 Format pred_occupancy = pred_occupancy.permute(3, 2, 0, 1) pred_occupancy = torch.flip(pred_occupancy, [2]) pred_occupancy = torch.rot90(pred_occupancy, -1, [2, 3]) pred_occupancy = pred_occupancy.permute(2, 3, 1, 0) if return_raw_occ: pred_occupancy_category = pred_occupancy else: pred_occupancy_category = pred_occupancy.argmax(-1) # # do not change the order # if self.occupancy_save_path is not None: # scene_name = img_metas[0]['scene_name'] # sample_token = img_metas[0]['sample_idx'] # mask_camera = visible_mask[0][0] # masked_pred_occupancy = pred_occupancy[mask_camera].cpu().numpy() # save_path = os.path.join(self.occupancy_save_path, 'occupancy_pred', scene_name+'_'+sample_token) # np.savez_compressed(save_path, pred=masked_pred_occupancy, sample_token=sample_token) # For test server if self.occupancy_save_path is not None: scene_name = img_metas[0]['scene_name'] sample_token = img_metas[0]['sample_idx'] # mask_camera = visible_mask[0][0] # masked_pred_occupancy = pred_occupancy[mask_camera].cpu().numpy() save_pred_occupancy = pred_occupancy.argmax(-1).cpu().numpy() save_path = os.path.join(self.occupancy_save_path, 'occupancy_pred', f'{sample_token}.npz') np.savez_compressed(save_path, save_pred_occupancy.astype(np.uint8)) pred_occupancy_category= pred_occupancy_category.cpu().numpy() else: pred_occupancy_category = None if self.with_specific_component('planner_head'): preds_dicts = self.planner_head( results, kwargs['gt_ego_lcf_feat'][0], kwargs['gt_ego_fut_cmd'][0], kwargs['gt_ego_his_trajs'][0], kwargs['gt_ego_fut_trajs'][0], img_metas=img_metas, map_results=preds_map_dicts[-1] ) pred_traj = self.planner_head.get_bboxes(preds_dicts, img_metas, gt_ego_fut_trajs=kwargs['gt_ego_fut_trajs'][0], gt_ego_fut_cmd=kwargs['gt_ego_fut_cmd'][0], gt_ego_fut_masks=kwargs['gt_ego_fut_masks'][0], gt_fut_segmentations=kwargs['gt_fut_segmentations'][0], gt_fut_segmentations_plus=kwargs['gt_fut_segmentations_plus'][0], # vad_ego_fut_trajs=kwargs['vad_ego_fut_trajs'][0], ) pred_traj[0]['index'] = img_metas[0]['index'] else: pred_traj = [None for _ in range(len(img_metas))] # if results.get('bev_mask_logit', None) is not None: # pred_bev_mask = results['bev_mask_logit'].sigmoid() > 0.5 assert len(img_metas) == 1 for i, result_dict in enumerate(output_list): result_dict['pts_bbox'] = pred_bbox[i] result_dict['pred_map'] = pred_map[i] result_dict['pred_motion'] = pred_motion[i] result_dict['pred_ego_traj'] = pred_traj[i] result_dict['pred_occupancy'] = pred_occupancy_category result_dict['index'] = img_metas[i]['index'] # if not self.training: # self.visual_sample(output_list, **kwargs) return output_list def forward_dummy(self, points=None, img_metas=None, img_inputs=None, **kwargs): results = self.extract_feat( points, img=img_inputs, img_metas=img_metas, **kwargs) assert self.with_pts_bbox outs = self.pts_bbox_head(results['img_bev_feat']) return outs def world2bev_vis(self, x, y): return int((x + 51.2) * 5), int((y + 51.2) * 5) def visual_sample(self, results, **kwargs): 
import cv2 # upper image is gt bev_img = np.ones([1024, 512, 3], dtype=np.float32) * 255 bev_img = bev_img.astype(np.float32) bev_img = cv2.circle(bev_img, self.world2bev_vis(0, 0), 5, (0, 255, 0), thickness=-1) bev_img = cv2.circle(bev_img, self.world2bev_vis(0, 51.2 * 2), 5, (0, 255, 0), thickness=-1) if results[0].get('pts_bbox') is not None: bbox = results[0]['pts_bbox']['boxes_3d'] track_scores = results[0]['pts_bbox']['track_scores'] for i, corners in enumerate(bbox.corners[:, [4, 7, 3, 0], :2]): if track_scores[i]<0.4: continue corners = np.array([self.world2bev_vis(*corner) for corner in corners]) corners2 = np.array([(x, y+512) for (x, y) in corners]) bev_img = cv2.circle(bev_img, corners[0], 1, (61, 102, 255)) bev_img = cv2.polylines(bev_img, pts=[corners], isClosed=True, color=(61, 102, 255), thickness=1) bev_img = cv2.circle(bev_img, corners2[0], 1, (61, 102, 255)) bev_img = cv2.polylines(bev_img, pts=[corners2], isClosed=True, color=(61, 102, 255), thickness=1) if kwargs.get('gt_bboxes_3d', False): gt_bboxes_3d = kwargs['gt_bboxes_3d'][0][0] for i, corners in enumerate(gt_bboxes_3d.corners[:, [4, 7, 3, 0], :2]): corners = np.array([self.world2bev_vis(*corner) for corner in corners]) bev_img = cv2.circle(bev_img, corners[0], 1, (61, 102, 255)) # bev_img = cv2.fillPoly(bev_img, [corners], (61, 102, 255)) bev_img = cv2.polylines(bev_img, pts=[corners], isClosed=True, color=(255, 102, 61), thickness=1) if results[0].get('pred_ego_traj') is not None: pred_ego_fut_trajs = results[0]['pred_ego_traj']['pred_ego_fut_trajs'] gt_ego_fut_trajs = results[0]['pred_ego_traj']['gt_ego_fut_trajs'] gt_ego_fut_trajs, colors = self._render_traj(gt_ego_fut_trajs.numpy()) points = np.array([self.world2bev_vis(*point) for point in gt_ego_fut_trajs]) for point, color in zip(points, colors): bev_img = cv2.circle(bev_img, point, 1, color) pred_ego_fut_trajs, colors = self._render_traj(pred_ego_fut_trajs.numpy(), colormap='autumn') points = np.array([self.world2bev_vis(*point) for point in pred_ego_fut_trajs]) for point, color in zip(points, colors): x,y = point bev_img = cv2.circle(bev_img, (x, y+512), 1, color) if kwargs.get('map_gt_bboxes_3d', False): map_gt_bboxes_3d = kwargs['map_gt_bboxes_3d'][0][0] map_gt_labels_3d = kwargs['map_gt_labels_3d'][0][0] for k, line in enumerate(map_gt_bboxes_3d.fixed_num_sampled_points): label = map_gt_labels_3d[k] # line = (line[..., :2] - self.map_head.origin.cpu()) / self.map_head.roi_size.cpu() line = line.cpu().numpy() corners = np.array([self.world2bev_vis(*corner) for corner in line]) corners = [each for each in corners if ((each>=0).all() & (each<512).all())] colors = [(255, 255, 0), (255, 0, 0), (0, 255, 0)] for i, corner in enumerate(corners[:-1]): bev_img = cv2.circle(bev_img, corners[i], 2, (61, 102, 255)) bev_img = cv2.line(bev_img, corners[i], corners[i+1], color=colors[label], thickness=1) if results[0].get('pred_map') is not None: for k, line in enumerate(results[0]['pred_map']['map_pts_3d']): label = results[0]['pred_map']['map_labels_3d'][k] # if label !=0: continue score = results[0]['pred_map']['map_scores_3d'][k] if score < 0.4: continue line = line.cpu().numpy() corners = np.array([self.world2bev_vis(*corner) for corner in line]) corners = [each for each in corners if ((each>=0).all() & (each<512).all())] corners = [(x, y+512) for (x, y) in corners ] colors = [(255, 255, 0), (255, 0, 0), (0, 255, 0)] for i, corner in enumerate(corners[:-1]): bev_img = cv2.circle(bev_img, corners[i], 2, (61, 102, 255)) bev_img = cv2.line(bev_img, corners[i], 
corners[i+1], color=colors[label], thickness=1) if kwargs.get('gt_agent_fut_traj', False): gt_agent_fut_traj = kwargs['gt_agent_fut_traj'][0][0].cpu() gt_agent_fut_traj_mask = kwargs['gt_agent_fut_traj_mask'][0][0].cpu() centers = kwargs['gt_bboxes_3d'][0][0].center[..., :2].cpu() tmp = torch.cat([centers[:, None], gt_agent_fut_traj], 1) trajs = torch.cumsum(tmp, 1)[:, 1:] for k, traj in enumerate(trajs): traj = traj.cpu().numpy() corners = np.array([self.world2bev_vis(*corner) for corner in traj]) center = np.array(self.world2bev_vis(*centers[k])) corners = [each for each in corners if ((each>=0).all() & (each<1536).all())] colors = [(255, 255, 0), (255, 0, 0), (0, 255, 0)] for i, corner in enumerate(corners[:-1]): if gt_agent_fut_traj_mask[k, i+1].sum()<2 or gt_agent_fut_traj_mask[k, i].sum()<2: continue if i == 0: bev_img = cv2.line(bev_img, center, corners[i], color=(123, 22, 187), thickness=1) # bev_img = cv2.circle(bev_img, corners[i], 2, (61, 102, 32)) bev_img = cv2.line(bev_img, corners[i], corners[i+1], color=(123, 22, 187), thickness=1) if results[0].get('pred_motion') is not None: obj_idxes_list = results[0]['pts_bbox']['obj_idxes'] centers = results[0]['pts_bbox']['boxes_3d'].center[..., :2].cpu().numpy() # pred_agent_fut_trajs = results[0]['pred_motion']['pred_agent_fut_trajs'] pred_agent_fut_trajs2 = results[0]['pred_motion']['pred_agent_fut_trajs2'] motion_obj_idxes = results[0]['pred_motion']['obj_idxes'] for k, trajs in enumerate(pred_agent_fut_trajs2): try: track_k = (obj_idxes_list==motion_obj_idxes[k]).nonzero()[0][0] except: continue if track_scores[track_k]<0.4: continue traj_ind = results[0]['pred_motion']['pred_traj_cls'][k].argmax() # for traj in trajs: traj = trajs[traj_ind] corners = np.array([self.world2bev_vis(*corner) for corner in traj]) corners = np.array([(x, y+512) for (x, y) in corners]) center = np.array(self.world2bev_vis(*centers[track_k])) center[-1] +=512 corners = [each for each in corners if ((each>=0).all() & (each<1536).all())] colors = [(255, 255, 0), (255, 0, 0), (0, 255, 0)] for i, corner in enumerate(corners[:-1]): if i == 0: bev_img = cv2.line(bev_img, center, corners[i], color=(123, 22, 187), thickness=1) # bev_img = cv2.circle(bev_img, corners[i], 2, (61, 102, 32)) bev_img = cv2.line(bev_img, corners[i], corners[i+1], color=(22, 122, 187), thickness=1) mmcv.imwrite(bev_img, f'bev_{results[0]["index"]}.png') def _render_traj(self, future_traj, traj_score=1, colormap='winter', points_per_step=5, line_color=None, dot_color=None, dot_size=25): total_steps = (len(future_traj)-1) * points_per_step + 1 dot_colors = matplotlib.colormaps[colormap]( np.linspace(0, 1, total_steps))[:, :3] * 255 dot_colors = dot_colors*traj_score + \ (1-traj_score)*np.ones_like(dot_colors) total_xy = np.zeros((total_steps, 2)) for i in range(total_steps-1): unit_vec = future_traj[i//points_per_step + 1] - future_traj[i//points_per_step] total_xy[i] = (i/points_per_step - i//points_per_step) * \ unit_vec + future_traj[i//points_per_step] total_xy[-1] = future_traj[-1] return total_xy, dot_colors ================================================ FILE: mmdet3d/models/fbbev/heads/__init__.py ================================================ from .occupancy_head import OccHead from .yolox import YOLOXHeadCustom ================================================ FILE: mmdet3d/models/fbbev/heads/occupancy_head.py ================================================ # Copyright (c) 2022-2023, NVIDIA Corporation & Affiliates. All rights reserved. 
# # This work is made available under the Nvidia Source Code License-NC. # To view a copy of this license, visit # https://github.com/NVlabs/FB-BEV/blob/main/LICENSE import copy import numpy as np import torch import torch.nn as nn import torch.nn.functional as F from mmdet.core import reduce_mean from mmdet.models import HEADS from mmcv.cnn import build_conv_layer, build_norm_layer, build_upsample_layer from mmdet3d.models.fbbev.modules.occ_loss_utils import lovasz_softmax, CustomFocalLoss from mmdet3d.models.fbbev.modules.occ_loss_utils import nusc_class_frequencies, nusc_class_names from mmdet3d.models.fbbev.modules.occ_loss_utils import geo_scal_loss, sem_scal_loss, CE_ssc_loss from torch.utils.checkpoint import checkpoint as cp from mmcv.runner import BaseModule, force_fp32 from torch.cuda.amp import autocast from mmdet3d.models import builder @HEADS.register_module() class OccHead(BaseModule): def __init__( self, in_channels, out_channel, num_level=1, soft_weights=False, loss_weight_cfg=None, conv_cfg=dict(type='Conv3d', bias=False), norm_cfg=dict(type='GN', num_groups=32, requires_grad=True), point_cloud_range=[-51.2, -51.2, -5.0, 51.2, 51.2, 3.0], final_occ_size=[256, 256, 20], empty_idx=0, balance_cls_weight=True, train_cfg=None, test_cfg=None, with_cp=False, use_focal_loss=False, use_dice_loss= False, use_deblock=True, ): super(OccHead, self).__init__() self.fp16_enabled=False if type(in_channels) is not list: in_channels = [in_channels] self.with_cp = with_cp self.use_deblock = use_deblock self.use_focal_loss = use_focal_loss if self.use_focal_loss: self.focal_loss = builder.build_loss(dict(type='CustomFocalLoss')) self.in_channels = in_channels self.out_channel = out_channel self.num_level = num_level self.point_cloud_range = torch.tensor(np.array(point_cloud_range)).float() if loss_weight_cfg is None: self.loss_weight_cfg = { "loss_voxel_ce_weight": 1.0, "loss_voxel_sem_scal_weight": 1.0, "loss_voxel_geo_scal_weight": 1.0, "loss_voxel_lovasz_weight": 1.0, } else: self.loss_weight_cfg = loss_weight_cfg # voxel losses self.loss_voxel_ce_weight = self.loss_weight_cfg.get('loss_voxel_ce_weight', 1.0) self.loss_voxel_sem_scal_weight = self.loss_weight_cfg.get('loss_voxel_sem_scal_weight', 1.0) self.loss_voxel_geo_scal_weight = self.loss_weight_cfg.get('loss_voxel_geo_scal_weight', 1.0) self.loss_voxel_lovasz_weight = self.loss_weight_cfg.get('loss_voxel_lovasz_weight', 1.0) # voxel-level prediction self.occ_convs = nn.ModuleList() for i in range(self.num_level): mid_channel = self.in_channels[i] // 2 occ_conv = nn.Sequential( build_conv_layer(conv_cfg, in_channels=self.in_channels[i], out_channels=mid_channel, kernel_size=3, stride=1, padding=1), build_norm_layer(norm_cfg, mid_channel)[1], nn.ReLU(inplace=True)) self.occ_convs.append(occ_conv) self.occ_pred_conv = nn.Sequential( build_conv_layer(conv_cfg, in_channels=mid_channel, out_channels=mid_channel//2, kernel_size=1, stride=1, padding=0), build_norm_layer(norm_cfg, mid_channel//2)[1], nn.ReLU(inplace=True), build_conv_layer(conv_cfg, in_channels=mid_channel//2, out_channels=out_channel, kernel_size=1, stride=1, padding=0)) self.soft_weights = soft_weights self.num_point_sampling_feat = self.num_level + 1 * self.use_deblock if self.soft_weights: soft_in_channel = mid_channel self.voxel_soft_weights = nn.Sequential( build_conv_layer(conv_cfg, in_channels=soft_in_channel, out_channels=soft_in_channel//2, kernel_size=1, stride=1, padding=0), build_norm_layer(norm_cfg, soft_in_channel//2)[1], nn.ReLU(inplace=True), 
build_conv_layer(conv_cfg, in_channels=soft_in_channel//2, out_channels=self.num_point_sampling_feat, kernel_size=1, stride=1, padding=0)) # loss functions self.use_dice_loss = use_dice_loss if self.use_dice_loss: self.dice_loss = builder.build_loss(dict(type='DiceLoss', loss_weight=2)) if balance_cls_weight: if out_channel == 19: self.class_weights = torch.from_numpy(1 / np.log(nusc_class_frequencies[:out_channel] + 0.001)) self.class_weights = torch.cat([torch.tensor([0]), self.class_weights]) else: if out_channel == 17: nusc_class_frequencies[0] += nusc_class_frequencies[-1] self.class_weights = torch.from_numpy(1 / np.log(nusc_class_frequencies[:out_channel] + 0.001)) else: self.class_weights = torch.ones(out_channel)/out_channel # FIXME hardcode 17 if self.use_deblock: upsample_cfg=dict(type='deconv3d', bias=False) upsample_layer = build_conv_layer( upsample_cfg, in_channels=self.in_channels[0], out_channels=self.in_channels[0]//2, kernel_size=2, stride=2, padding=0) self.deblock = nn.Sequential(upsample_layer, build_norm_layer(norm_cfg, self.in_channels[0]//2)[1], nn.ReLU(inplace=True)) self.class_names = nusc_class_names self.empty_idx = empty_idx @force_fp32(apply_to=('voxel_feats')) def forward_coarse_voxel(self, voxel_feats): output_occs = [] output = {} if self.use_deblock: if self.with_cp and voxel_feats[0].requires_grad: x0 = cp(self.deblock, voxel_feats[0]) else: x0 = self.deblock(voxel_feats[0]) output_occs.append(x0) for feats, occ_conv in zip(voxel_feats, self.occ_convs): if self.with_cp and feats.requires_grad: x = cp(occ_conv, feats) else: x = occ_conv(feats) output_occs.append(x) if self.soft_weights: voxel_soft_weights = self.voxel_soft_weights(output_occs[0]) voxel_soft_weights = torch.softmax(voxel_soft_weights, dim=1) else: voxel_soft_weights = torch.ones([output_occs[0].shape[0], self.num_point_sampling_feat, 1, 1, 1],).to(output_occs[0].device) / self.num_point_sampling_feat out_voxel_feats = 0 _, _, H, W, D= output_occs[0].shape for feats, weights in zip(output_occs, torch.unbind(voxel_soft_weights, dim=1)): feats = F.interpolate(feats, size=[H, W, D], mode='trilinear', align_corners=False).contiguous() out_voxel_feats += feats * weights.unsqueeze(1) output['out_voxel_feats'] = [out_voxel_feats] if self.with_cp and out_voxel_feats.requires_grad: out_voxel = cp(self.occ_pred_conv, out_voxel_feats) else: out_voxel = self.occ_pred_conv(out_voxel_feats) output['occ'] = [out_voxel] return output @force_fp32() def forward(self, voxel_feats, img_feats=None, pts_feats=None, transform=None, **kwargs): assert type(voxel_feats) is list and len(voxel_feats) == self.num_level output = self.forward_coarse_voxel(voxel_feats) out_voxel_feats = output['out_voxel_feats'][0] coarse_occ = output['occ'][0] res = { 'output_voxels': output['occ'], 'output_voxels_fine': output.get('fine_output', None), 'output_coords_fine': output.get('fine_coord', None), } return res @force_fp32() def forward_train(self, voxel_feats, img_feats=None, pts_feats=None, transform=None, gt_occupancy=None, gt_occupancy_flow=None, **kwargs): res = self.forward(voxel_feats, img_feats=img_feats, pts_feats=pts_feats, transform=transform, **kwargs) loss = self.loss(target_voxels=gt_occupancy, output_voxels = res['output_voxels'], output_coords_fine=res['output_coords_fine'], output_voxels_fine=res['output_voxels_fine']) return loss @force_fp32() def loss_voxel(self, output_voxels, target_voxels, tag): # resize gt B, C, H, W, D = output_voxels.shape ratio = target_voxels.shape[2] // H if ratio != 1: target_voxels = 
target_voxels.reshape(B, H, ratio, W, ratio, D, ratio).permute(0,1,3,5,2,4,6).reshape(B, H, W, D, ratio**3) empty_mask = target_voxels.sum(-1) == self.empty_idx target_voxels = target_voxels.to(torch.int64) occ_space = target_voxels[~empty_mask] occ_space[occ_space==0] = -torch.arange(len(occ_space[occ_space==0])).to(occ_space.device) - 1 target_voxels[~empty_mask] = occ_space target_voxels = torch.mode(target_voxels, dim=-1)[0] target_voxels[target_voxels<0] = 255 target_voxels = target_voxels.long() # output_voxels = torch.log(output_voxels * 0) + output_voxels/0 # debug !!!!!!!! output_voxels[torch.isnan(output_voxels)] = 0 output_voxels[torch.isinf(output_voxels)] = 0 assert torch.isnan(output_voxels).sum().item() == 0 assert torch.isnan(target_voxels).sum().item() == 0 loss_dict = {} # igore 255 = ignore noise. we keep the loss bascward for the label=0 (free voxels) if self.use_focal_loss: loss_dict['loss_voxel_ce_{}'.format(tag)] = self.loss_voxel_ce_weight * self.focal_loss(output_voxels, target_voxels, self.class_weights.type_as(output_voxels), ignore_index=255) else: loss_dict['loss_voxel_ce_{}'.format(tag)] = self.loss_voxel_ce_weight * CE_ssc_loss(output_voxels, target_voxels, self.class_weights.type_as(output_voxels), ignore_index=255) loss_dict['loss_voxel_sem_scal_{}'.format(tag)] = self.loss_voxel_sem_scal_weight * sem_scal_loss(output_voxels, target_voxels, ignore_index=255) loss_dict['loss_voxel_geo_scal_{}'.format(tag)] = self.loss_voxel_geo_scal_weight * geo_scal_loss(output_voxels, target_voxels, ignore_index=255, non_empty_idx=self.empty_idx) loss_dict['loss_voxel_lovasz_{}'.format(tag)] = self.loss_voxel_lovasz_weight * lovasz_softmax(torch.softmax(output_voxels, dim=1), target_voxels, ignore=255) if self.use_dice_loss: visible_mask = target_voxels!=255 visible_pred_voxels = output_voxels.permute(0, 2, 3, 4, 1)[visible_mask] visible_target_voxels = target_voxels[visible_mask] visible_target_voxels = F.one_hot(visible_target_voxels.to(torch.long), 19) loss_dict['loss_voxel_dice_{}'.format(tag)] = self.dice_loss(visible_pred_voxels, visible_target_voxels) return loss_dict @force_fp32() def loss(self, output_voxels=None, output_coords_fine=None, output_voxels_fine=None, target_voxels=None, visible_mask=None, **kwargs): loss_dict = {} for index, output_voxel in enumerate(output_voxels): loss_dict.update(self.loss_voxel(output_voxel, target_voxels, tag='c_{}'.format(index))) return loss_dict ================================================ FILE: mmdet3d/models/fbbev/heads/yolox.py ================================================ import math import numpy as np import torch import torch.nn as nn import torch.nn.functional as F from mmcv.cnn import (ConvModule, DepthwiseSeparableConvModule, bias_init_with_prob) from mmcv.ops.nms import batched_nms from mmcv.runner import force_fp32 from mmdet.core import (MlvlPointGenerator, bbox_xyxy_to_cxcywh, build_assigner, build_sampler, multi_apply, reduce_mean) from mmdet.models.builder import HEADS, build_loss from mmdet.models.dense_heads.base_dense_head import BaseDenseHead from mmdet.models.dense_heads.dense_test_mixins import BBoxTestMixin @HEADS.register_module() class YOLOXHeadCustom(BaseDenseHead, BBoxTestMixin): """YOLOXHead head used in `YOLOX `_. Args: num_classes (int): Number of categories excluding the background category. in_channels (int): Number of channels in the input feature map. feat_channels (int): Number of hidden channels in stacking convs. Default: 256 stacked_convs (int): Number of stacking convs of the head. 
Default: 2. strides (tuple): Downsample factor of each feature map. use_depthwise (bool): Whether to depthwise separable convolution in blocks. Default: False dcn_on_last_conv (bool): If true, use dcn in the last layer of towers. Default: False. conv_bias (bool | str): If specified as `auto`, it will be decided by the norm_cfg. Bias of conv will be set as True if `norm_cfg` is None, otherwise False. Default: "auto". conv_cfg (dict): Config dict for convolution layer. Default: None. norm_cfg (dict): Config dict for normalization layer. Default: None. act_cfg (dict): Config dict for activation layer. Default: None. loss_cls (dict): Config of classification loss. loss_bbox (dict): Config of localization loss. loss_obj (dict): Config of objectness loss. loss_l1 (dict): Config of L1 loss. train_cfg (dict): Training config of anchor head. test_cfg (dict): Testing config of anchor head. init_cfg (dict or list[dict], optional): Initialization config dict. """ def __init__(self, num_classes, in_channels, feat_channels=256, stacked_convs=2, strides=[8, 16, 32], use_depthwise=False, dcn_on_last_conv=False, conv_bias='auto', conv_cfg=None, norm_cfg=dict(type='BN', momentum=0.03, eps=0.001), act_cfg=dict(type='Swish'), loss_cls=dict( type='CrossEntropyLoss', use_sigmoid=True, reduction='sum', loss_weight=1.0), loss_bbox=dict( type='IoULoss', mode='square', eps=1e-16, reduction='sum', loss_weight=5.0), loss_obj=dict( type='CrossEntropyLoss', use_sigmoid=True, reduction='sum', loss_weight=1.0), loss_l1=dict(type='L1Loss', reduction='sum', loss_weight=1.0), loss_centers2d=dict(type='L1Loss', reduction='sum', loss_weight=1.0), train_cfg=None, test_cfg=None, init_cfg=dict( type='Kaiming', layer='Conv2d', a=math.sqrt(5), distribution='uniform', mode='fan_in', nonlinearity='leaky_relu')): super().__init__(init_cfg=init_cfg) self.num_classes = num_classes self.cls_out_channels = num_classes self.in_channels = in_channels self.feat_channels = feat_channels self.stacked_convs = stacked_convs self.strides = strides self.use_depthwise = use_depthwise self.dcn_on_last_conv = dcn_on_last_conv assert conv_bias == 'auto' or isinstance(conv_bias, bool) self.conv_bias = conv_bias self.use_sigmoid_cls = True self.conv_cfg = conv_cfg self.norm_cfg = norm_cfg self.act_cfg = act_cfg self.loss_cls = build_loss(loss_cls) self.loss_bbox = build_loss(loss_bbox) self.loss_obj = build_loss(loss_obj) self.loss_centers2d = build_loss(loss_centers2d) self.use_l1 = True # This flag will be modified by hooks. 
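        # NOTE (editor): in the reference mmdet YOLOXHead this flag starts as
        # False and is switched on by YOLOXModeSwitchHook for the last training
        # epochs; here it is initialised to True, so the L1 branch of the loss
        # (reported as ``enc_loss_bbox`` below) is active from the first
        # iteration unless a hook overrides it.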
self.loss_l1 = build_loss(loss_l1) self.prior_generator = MlvlPointGenerator(strides, offset=0) self.test_cfg = test_cfg self.train_cfg = train_cfg self.sampling = False if self.train_cfg: self.assigner = build_assigner(self.train_cfg.assigner) # sampling=False so use PseudoSampler sampler_cfg = dict(type='PseudoSampler') self.sampler = build_sampler(sampler_cfg, context=self) self.sampler_ = build_sampler(sampler_cfg, context=self) self.fp16_enabled = False self._init_layers() def _init_layers(self): self.multi_level_cls_convs = nn.ModuleList() self.multi_level_reg_convs = nn.ModuleList() self.multi_level_conv_cls = nn.ModuleList() self.multi_level_conv_reg = nn.ModuleList() self.multi_level_conv_obj = nn.ModuleList() self.multi_level_conv_centers2d = nn.ModuleList() for _ in self.strides: self.multi_level_cls_convs.append(self._build_stacked_convs()) self.multi_level_reg_convs.append(self._build_stacked_convs()) conv_cls, conv_reg, conv_obj, conv_centers2d = self._build_predictor() self.multi_level_conv_cls.append(conv_cls) self.multi_level_conv_reg.append(conv_reg) self.multi_level_conv_obj.append(conv_obj) self.multi_level_conv_centers2d.append(conv_centers2d) def _build_stacked_convs(self): """Initialize conv layers of a single level head.""" conv = DepthwiseSeparableConvModule \ if self.use_depthwise else ConvModule stacked_convs = [] for i in range(self.stacked_convs): chn = self.in_channels if i == 0 else self.feat_channels if self.dcn_on_last_conv and i == self.stacked_convs - 1: conv_cfg = dict(type='DCNv2') else: conv_cfg = self.conv_cfg stacked_convs.append( conv( chn, self.feat_channels, 3, stride=1, padding=1, conv_cfg=conv_cfg, norm_cfg=self.norm_cfg, act_cfg=self.act_cfg, bias=self.conv_bias)) return nn.Sequential(*stacked_convs) def _build_predictor(self): """Initialize predictor layers of a single level head.""" conv_cls = nn.Conv2d(self.feat_channels, self.cls_out_channels, 1) conv_reg = nn.Conv2d(self.feat_channels, 4, 1) conv_obj = nn.Conv2d(self.feat_channels, 1, 1) conv_centers2d = nn.Conv2d(self.feat_channels, 2, 1) return conv_cls, conv_reg, conv_obj, conv_centers2d def init_weights(self): super(YOLOXHeadCustom, self).init_weights() # Use prior in model initialization to improve stability bias_init = bias_init_with_prob(0.01) for conv_cls, conv_obj in zip(self.multi_level_conv_cls, self.multi_level_conv_obj): conv_cls.bias.data.fill_(bias_init) conv_obj.bias.data.fill_(bias_init) @force_fp32(apply_to=('x')) def forward_single(self, x, cls_convs, reg_convs, conv_cls, conv_reg, conv_obj, conv_centers2d): """Forward feature of a single scale level.""" if x.dim() == 5: bs, n, c, h, w= x.shape x = x.reshape(bs*n, c, h, w) cls_feat = cls_convs(x) reg_feat = reg_convs(x) cls_score = conv_cls(cls_feat) bbox_pred = conv_reg(reg_feat) objectness = conv_obj(reg_feat) centers2d_offset = conv_centers2d(reg_feat) return cls_score, bbox_pred, objectness, centers2d_offset @force_fp32(apply_to=('feats')) def forward(self, feats): """Forward features from the upstream network. Args: feats (tuple[Tensor]): Features from the upstream network, each is a 4D-tensor. Returns: tuple[Tensor]: A tuple of multi-level predication map, each is a 4D-tensor of shape (batch_size, 5+num_classes, height, width). 
""" # feats = data['img_feats'] cls_scores, bbox_preds, objectnesses, centers2d_offsets= multi_apply(self.forward_single, feats, self.multi_level_cls_convs, self.multi_level_reg_convs, self.multi_level_conv_cls, self.multi_level_conv_reg, self.multi_level_conv_obj, self.multi_level_conv_centers2d, ) out = { 'enc_cls_scores': cls_scores, 'enc_bbox_preds': bbox_preds, 'pred_centers2d_offset': centers2d_offsets, 'objectnesses':objectnesses, 'topk_indexes':None } return out def _bbox_decode(self, priors, bbox_preds): xys = (bbox_preds[..., :2] * priors[:, 2:]) + priors[:, :2] whs = bbox_preds[..., 2:].exp() * priors[:, 2:] tl_x = (xys[..., 0] - whs[..., 0] / 2) tl_y = (xys[..., 1] - whs[..., 1] / 2) br_x = (xys[..., 0] + whs[..., 0] / 2) br_y = (xys[..., 1] + whs[..., 1] / 2) decoded_bboxes = torch.stack([tl_x, tl_y, br_x, br_y], -1) return decoded_bboxes def _centers2d_decode(self, priors, centers2d): centers2d = (centers2d[..., :2] * priors[:, 2:]) + priors[:, :2] return centers2d def _bboxes_nms(self, cls_scores, bboxes, score_factor, cfg): max_scores, labels = torch.max(cls_scores, 1) valid_mask = score_factor * max_scores >= cfg.score_thr bboxes = bboxes[valid_mask] scores = max_scores[valid_mask] * score_factor[valid_mask] labels = labels[valid_mask] if labels.numel() == 0: return bboxes, labels else: dets, keep = batched_nms(bboxes, scores, labels, cfg.nms) return dets, labels[keep] @force_fp32(apply_to=('cls_scores', 'bbox_preds', 'objectnesses', 'centers2d')) def loss(self, gt_bboxes2d_list, gt_labels2d_list, centers2d, preds_dicts, depths, img_metas, #len=B gt_bboxes_ignore=None): """Compute loss of the head.` Args: cls_scores (list[Tensor]): Box scores for each scale level, each is a 4D-tensor, the channel number is num_priors * num_classes. bbox_preds (list[Tensor]): Box energies / deltas for each scale level, each is a 4D-tensor, the channel number is num_priors * 4. objectnesses (list[Tensor], Optional): Score factor for all scale level, each is a 4D-tensor, has shape (batch_size, 1, H, W). gt_bboxes (list[Tensor]): Ground truth bboxes for each image with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. gt_labels (list[Tensor]): class indices corresponding to each box img_metas (list[dict]): Meta information of each image, e.g., image size, scaling factor, etc. gt_bboxes_ignore (None | list[Tensor]): specify which bounding boxes can be ignored when computing the loss. 
""" cls_scores = preds_dicts['enc_cls_scores'] bbox_preds = preds_dicts['enc_bbox_preds'] objectnesses = preds_dicts['objectnesses'] centers2d_offset = preds_dicts['pred_centers2d_offset'] num_imgs = cls_scores[0].shape[0] featmap_sizes = [cls_score.shape[2:] for cls_score in cls_scores] mlvl_priors = self.prior_generator.grid_priors( featmap_sizes, dtype=cls_scores[0].dtype, device=cls_scores[0].device, with_stride=True) flatten_cls_preds = [ cls_pred.permute(0, 2, 3, 1).reshape(num_imgs, -1, self.cls_out_channels) for cls_pred in cls_scores ] flatten_bbox_preds = [ bbox_pred.permute(0, 2, 3, 1).reshape(num_imgs, -1, 4) for bbox_pred in bbox_preds ] flatten_objectness = [ objectness.permute(0, 2, 3, 1).reshape(num_imgs, -1) for objectness in objectnesses ] flatten_centers2d_offset = [ center2d_offset.permute(0, 2, 3, 1).reshape(num_imgs, -1, 2) for center2d_offset in centers2d_offset ] flatten_cls_preds = torch.cat(flatten_cls_preds, dim=1) flatten_bbox_preds = torch.cat(flatten_bbox_preds, dim=1) flatten_objectness = torch.cat(flatten_objectness, dim=1) flatten_centers2d_offset = torch.cat(flatten_centers2d_offset, dim=1) flatten_priors = torch.cat(mlvl_priors) flatten_bboxes = self._bbox_decode(flatten_priors, flatten_bbox_preds) device = cls_scores[0].device gt_bboxes = [bboxes2d.to(device) for i in gt_bboxes2d_list for bboxes2d in i] gt_labels = [labels2d.to(device) for i in gt_labels2d_list for labels2d in i] centers2d = [center2d.to(device) for i in centers2d for center2d in i] (pos_masks, cls_targets, obj_targets, bbox_targets, l1_targets, centers2d_target, num_fg_imgs) = multi_apply( self._get_target_single, flatten_cls_preds.detach(), flatten_objectness.detach(), flatten_priors.unsqueeze(0).repeat(num_imgs, 1, 1), flatten_bboxes.detach(), gt_bboxes, gt_labels, centers2d) # The experimental results show that ‘reduce_mean’ can improve # performance on the COCO dataset. num_pos = torch.tensor( sum(num_fg_imgs), dtype=torch.float, device=flatten_cls_preds.device) num_total_samples = max(reduce_mean(num_pos), 1.0) pos_masks = torch.cat(pos_masks, 0) cls_targets = torch.cat(cls_targets, 0) obj_targets = torch.cat(obj_targets, 0) bbox_targets = torch.cat(bbox_targets, 0) if self.use_l1: l1_targets = torch.cat(l1_targets, 0) centers2d_target = torch.cat(centers2d_target, 0) loss_bbox = self.loss_bbox( flatten_bboxes.view(-1, 4)[pos_masks], bbox_targets) / num_total_samples loss_obj = self.loss_obj(flatten_objectness.view(-1, 1), obj_targets) / num_total_samples loss_cls = self.loss_cls( flatten_cls_preds.view(-1, self.num_classes)[pos_masks], cls_targets) / num_total_samples loss_centers2d = self.loss_centers2d( flatten_centers2d_offset.view(-1, 2)[pos_masks], centers2d_target) / num_total_samples loss_dict = dict( enc_loss_cls=loss_cls, enc_loss_iou=loss_bbox, enc_loss_obj=loss_obj, enc_loss_centers2d=loss_centers2d) if self.use_l1: loss_l1 = self.loss_l1( flatten_bbox_preds.view(-1, 4)[pos_masks], l1_targets) / num_total_samples loss_dict.update(enc_loss_bbox=loss_l1) return loss_dict @torch.no_grad() def _get_target_single(self, cls_preds, objectness, priors, decoded_bboxes, gt_bboxes, gt_labels, centers2d): """Compute classification, regression, and objectness targets for priors in a single image. 
Args: cls_preds (Tensor): Classification predictions of one image, a 2D-Tensor with shape [num_priors, num_classes] objectness (Tensor): Objectness predictions of one image, a 1D-Tensor with shape [num_priors] priors (Tensor): All priors of one image, a 2D-Tensor with shape [num_priors, 4] in [cx, xy, stride_w, stride_y] format. decoded_bboxes (Tensor): Decoded bboxes predictions of one image, a 2D-Tensor with shape [num_priors, 4] in [tl_x, tl_y, br_x, br_y] format. gt_bboxes (Tensor): Ground truth bboxes of one image, a 2D-Tensor with shape [num_gts, 4] in [tl_x, tl_y, br_x, br_y] format. gt_labels (Tensor): Ground truth labels of one image, a Tensor with shape [num_gts]. """ num_priors = priors.size(0) num_gts = gt_labels.size(0) gt_bboxes = gt_bboxes.to(decoded_bboxes.dtype) centers2d = centers2d.to(decoded_bboxes.dtype) # No target if num_gts == 0: cls_target = cls_preds.new_zeros((0, self.num_classes)) bbox_target = cls_preds.new_zeros((0, 4)) l1_target = cls_preds.new_zeros((0, 4)) obj_target = cls_preds.new_zeros((num_priors, 1)) foreground_mask = cls_preds.new_zeros(num_priors).bool() centers2d_target = cls_preds.new_zeros((0, 2)) return (foreground_mask, cls_target, obj_target, bbox_target, l1_target, centers2d_target, 0) # YOLOX uses center priors with 0.5 offset to assign targets, # but use center priors without offset to regress bboxes. offset_priors = torch.cat( [priors[:, :2] + priors[:, 2:] * 0.5, priors[:, 2:]], dim=-1) assign_result = self.assigner.assign( cls_preds.sigmoid() * objectness.unsqueeze(1).sigmoid(), offset_priors, decoded_bboxes, gt_bboxes, gt_labels) sampling_result = self.sampler.sample(assign_result, priors, gt_bboxes) sampling_result_centers2d = self.sampler_.sample(assign_result, priors, centers2d) pos_inds = sampling_result.pos_inds num_pos_per_img = pos_inds.size(0) pos_ious = assign_result.max_overlaps[pos_inds] # IOU aware classification score cls_target = F.one_hot(sampling_result.pos_gt_labels, self.num_classes) * pos_ious.unsqueeze(-1) obj_target = torch.zeros_like(objectness).unsqueeze(-1) obj_target[pos_inds] = 1 bbox_target = sampling_result.pos_gt_bboxes l1_target = cls_preds.new_zeros((num_pos_per_img, 4)) if self.use_l1: l1_target = self._get_l1_target(l1_target, bbox_target, priors[pos_inds]) foreground_mask = torch.zeros_like(objectness).to(torch.bool) foreground_mask[pos_inds] = 1 #centers2d target centers2d_labels = sampling_result_centers2d.pos_gt_bboxes centers2d_target = cls_preds.new_zeros((num_pos_per_img, 2)) centers2d_target = self._get_centers2d_target(centers2d_target, centers2d_labels, priors[pos_inds]) return (foreground_mask, cls_target, obj_target, bbox_target, l1_target, centers2d_target, num_pos_per_img) def _get_l1_target(self, l1_target, gt_bboxes, priors, eps=1e-8): """Convert gt bboxes to center offset and log width height.""" gt_cxcywh = bbox_xyxy_to_cxcywh(gt_bboxes) l1_target[:, :2] = (gt_cxcywh[:, :2] - priors[:, :2]) / priors[:, 2:] l1_target[:, 2:] = torch.log(gt_cxcywh[:, 2:] / priors[:, 2:] + eps) return l1_target def _get_centers2d_target(self, centers2d_target, centers2d_labels, priors): centers2d_target = (centers2d_labels - priors[:, :2]) / priors[:, 2:] return centers2d_target ================================================ FILE: mmdet3d/models/fbbev/modules/__init__.py ================================================ from .depth_net import NaiveDepthNet, CM_DepthNet from .frpn import FRPN from .fpn3d import FPN3D from .resnet3d import CustomResNet3D from .occ_loss_utils import * 
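For orientation, the depth_net.py file that follows defines the LSS-style depth heads re-exported by this __init__. A minimal shape-check sketch for NaiveDepthNet (illustrative only; it assumes PyTorch and this repository are importable, and the camera count and feature-map sizes below are made up):

    import torch
    from mmdet3d.models.fbbev.modules.depth_net import NaiveDepthNet

    # 6 surround cameras, 512-channel image features (arbitrary spatial size)
    net = NaiveDepthNet(in_channels=512, context_channels=64, depth_channels=118)
    img_feats = torch.randn(1, 6, 512, 16, 44)    # (B, N_cams, C, H, W)
    context, depth = net(img_feats)
    # context: (1, 6, 64, 16, 44)  per-camera context features
    # depth:   (1, 6, 118, 16, 44) categorical depth, softmaxed over 118 bins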
================================================ FILE: mmdet3d/models/fbbev/modules/depth_net.py ================================================ # Copyright (c) 2022-2023, NVIDIA Corporation & Affiliates. All rights reserved. # # This work is made available under the Nvidia Source Code License-NC. # To view a copy of this license, visit # https://github.com/NVlabs/FB-BEV/blob/main/LICENSE import torch import torch.nn as nn import torch.nn.functional as F from mmcv.cnn import build_conv_layer from mmcv.runner import BaseModule, force_fp32 from torch.cuda.amp.autocast_mode import autocast from torch.utils.checkpoint import checkpoint from mmdet.models.backbones.resnet import BasicBlock from mmdet.models import HEADS import torch.utils.checkpoint as cp from mmdet3d.models import builder from mmcv.runner import force_fp32, auto_fp16 import torch from torchvision.utils import make_grid import torchvision import matplotlib.pyplot as plt import cv2 def convert_color(img_path): plt.figure() img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE) plt.imsave(img_path, img, cmap=plt.get_cmap('viridis')) plt.close() def save_tensor(tensor, path, pad_value=254.0,normalize=False): print('save_tensor', path) tensor = tensor.to(torch.float).detach().cpu() max_ = tensor.flatten(1).max(-1).values[:, None, None] min_ = tensor.flatten(1).min(-1).values[:, None, None] tensor = (tensor-min_)/(max_-min_) if tensor.type() == 'torch.BoolTensor': tensor = tensor*255 if len(tensor.shape) == 3: tensor = tensor.unsqueeze(1) tensor = make_grid(tensor, pad_value=pad_value, normalize=normalize).permute(1, 2, 0).numpy().copy() torchvision.utils.save_image(torch.tensor(tensor).permute(2, 0, 1), path) convert_color(path) @HEADS.register_module() class NaiveDepthNet(BaseModule): r"""Naive depthnet used in Lift-Splat-Shoot Please refer to the `paper `_ Args: in_channels (int): Channels of input feature. context_channels (int): Channels of transformed feature. """ def __init__( self, in_channels=512, context_channels=64, depth_channels=118, downsample=16, uniform=False, with_cp=False ): super(NaiveDepthNet, self).__init__() self.uniform = uniform self.with_cp = with_cp self.context_channels = context_channels self.in_channels = in_channels self.D =depth_channels self.downsample=downsample, self.depth_net = nn.Conv2d( in_channels, self.D + self.context_channels, kernel_size=1, padding=0) @force_fp32() def forward(self, x, mlp_input=None): """ """ B, N, C, H, W = x.shape x = x.view(B * N, C, H, W) if self.with_cp and x.requires_grad: x = cp.checkpoint(self.depth_net, x) else: x = self.depth_net(x) depth_digit = x[:, :self.D, ...] context = x[:, self.D:self.D + self.context_channels, ...] 
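        # NOTE (editor): the 1x1 conv output is split along the channel axis:
        # the first self.D channels are per-pixel depth-bin logits and the next
        # self.context_channels channels are the image context features (e.g.
        # 118 + 64 = 182 output channels with the default arguments).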
if self.uniform: depth_digit = depth_digit * 0 depth = depth_digit.softmax(dim=1) else: depth = depth_digit.softmax(dim=1) context = context.view(B, N, self.context_channels, H, W) depth = depth.view(B, N, self.D, H, W) return context, depth def get_mlp_input(self, rot, tran, intrin, post_rot, post_tran, bda): return None class _ASPPModule(nn.Module): def __init__(self, inplanes, planes, kernel_size, padding, dilation, BatchNorm): super(_ASPPModule, self).__init__() self.atrous_conv = nn.Conv2d( inplanes, planes, kernel_size=kernel_size, stride=1, padding=padding, dilation=dilation, bias=False) self.bn = BatchNorm(planes) self.relu = nn.ReLU() self._init_weight() @force_fp32() def forward(self, x): x = self.atrous_conv(x) x = self.bn(x) return self.relu(x) def _init_weight(self): for m in self.modules(): if isinstance(m, nn.Conv2d): torch.nn.init.kaiming_normal_(m.weight) elif isinstance(m, nn.BatchNorm2d): m.weight.data.fill_(1) m.bias.data.zero_() class ASPP(nn.Module): def __init__(self, inplanes, mid_channels=256, BatchNorm=nn.BatchNorm2d): super(ASPP, self).__init__() dilations = [1, 6, 12, 18] self.aspp1 = _ASPPModule( inplanes, mid_channels, 1, padding=0, dilation=dilations[0], BatchNorm=BatchNorm) self.aspp2 = _ASPPModule( inplanes, mid_channels, 3, padding=dilations[1], dilation=dilations[1], BatchNorm=BatchNorm) self.aspp3 = _ASPPModule( inplanes, mid_channels, 3, padding=dilations[2], dilation=dilations[2], BatchNorm=BatchNorm) self.aspp4 = _ASPPModule( inplanes, mid_channels, 3, padding=dilations[3], dilation=dilations[3], BatchNorm=BatchNorm) self.global_avg_pool = nn.Sequential( nn.AdaptiveAvgPool2d((1, 1)), nn.Conv2d(inplanes, mid_channels, 1, stride=1, bias=False), BatchNorm(mid_channels), nn.ReLU(), ) self.conv1 = nn.Conv2d( int(mid_channels * 5), inplanes, 1, bias=False) self.bn1 = BatchNorm(inplanes) self.relu = nn.ReLU() self.dropout = nn.Dropout(0.5) self._init_weight() @force_fp32() def forward(self, x): x1 = self.aspp1(x) x2 = self.aspp2(x) x3 = self.aspp3(x) x4 = self.aspp4(x) x5 = self.global_avg_pool(x) x5 = F.interpolate( x5, size=x4.size()[2:], mode='bilinear', align_corners=True) x = torch.cat((x1, x2, x3, x4, x5), dim=1) x = self.conv1(x) x = self.bn1(x) x = self.relu(x) return self.dropout(x) def _init_weight(self): for m in self.modules(): if isinstance(m, nn.Conv2d): torch.nn.init.kaiming_normal_(m.weight) elif isinstance(m, nn.BatchNorm2d): m.weight.data.fill_(1) m.bias.data.zero_() class Mlp(nn.Module): def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.ReLU, drop=0.0): super().__init__() out_features = out_features or in_features hidden_features = hidden_features or in_features self.fc1 = nn.Linear(in_features, hidden_features) self.act = act_layer() self.drop1 = nn.Dropout(drop) self.fc2 = nn.Linear(hidden_features, out_features) self.drop2 = nn.Dropout(drop) @force_fp32() def forward(self, x): x = self.fc1(x) x = self.act(x) x = self.drop1(x) x = self.fc2(x) x = self.drop2(x) return x class SELayer(nn.Module): def __init__(self, channels, act_layer=nn.ReLU, gate_layer=nn.Sigmoid): super().__init__() self.conv_reduce = nn.Conv2d(channels, channels, 1, bias=True) self.act1 = act_layer() self.conv_expand = nn.Conv2d(channels, channels, 1, bias=True) self.gate = gate_layer() @force_fp32() def forward(self, x, x_se): x_se = self.conv_reduce(x_se) x_se = self.act1(x_se) x_se = self.conv_expand(x_se) return x * self.gate(x_se) @HEADS.register_module() class CM_DepthNet(BaseModule): """ Camera parameters aware depth net """ 
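    # NOTE (editor): "camera parameters aware" refers to the 27-dim mlp_input
    # built by get_mlp_input below: 15 scalars taken from the intrinsics
    # (fx, fy, cx, cy), the image augmentation (post_rot / post_tran) and the
    # BEV augmentation (bda) matrices, plus the flattened 3x4 sensor-to-ego
    # transform (12 values). It is normalised by nn.BatchNorm1d(27) and injected
    # into the depth and context branches through the SELayer blocks.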
def __init__(self, in_channels=512, context_channels=64, depth_channels=118, mid_channels=512, use_dcn=True, downsample=16, grid_config=None, loss_depth_weight=3.0, with_cp=False, se_depth_map=False, sid=False, bias=0.0, input_size=None, aspp_mid_channels=-1, use_aspp=True): super(CM_DepthNet, self).__init__() self.fp16_enable=False self.sid=sid self.with_cp = with_cp self.downsample = downsample self.grid_config = grid_config self.loss_depth_weight = loss_depth_weight self.reduce_conv = nn.Sequential( nn.Conv2d( in_channels, mid_channels, kernel_size=3, stride=1, padding=1), nn.BatchNorm2d(mid_channels), nn.ReLU(inplace=True), ) self.context_channels = context_channels self.depth_channels = depth_channels self.se_depth_map = se_depth_map self.context_conv = nn.Conv2d( mid_channels, context_channels, kernel_size=1, stride=1, padding=0) self.bn = nn.BatchNorm1d(27) self.depth_mlp = Mlp(27, mid_channels, mid_channels) self.depth_se = SELayer(mid_channels) # NOTE: add camera-aware self.context_mlp = Mlp(27, mid_channels, mid_channels) self.context_se = SELayer(mid_channels) # NOTE: add camera-aware depth_conv_input_channels = mid_channels downsample = None depth_conv_list = [ BasicBlock(depth_conv_input_channels, mid_channels, downsample=downsample), BasicBlock(mid_channels, mid_channels), BasicBlock(mid_channels, mid_channels), ] if use_aspp: if aspp_mid_channels < 0: aspp_mid_channels = mid_channels depth_conv_list.append(ASPP(mid_channels, aspp_mid_channels)) if use_dcn: depth_conv_list.append( build_conv_layer( cfg=dict( type='DCN', in_channels=mid_channels, out_channels=mid_channels, kernel_size=3, padding=1, groups=4, im2col_step=128, ))) depth_conv_list.append( nn.Conv2d( mid_channels, depth_channels, kernel_size=1, stride=1, padding=0)) self.depth_conv = nn.Sequential(*depth_conv_list) @force_fp32() def forward(self, x, mlp_input): # if not x.requires_grad: x = x.to(torch.float32) # FIX distill type error mlp_input = self.bn(mlp_input.reshape(-1, mlp_input.shape[-1])) B, N, C, H, W = x.shape x = x.view(B * N, C, H, W) if self.with_cp and x.requires_grad: x = cp.checkpoint(self.reduce_conv, x) else: x = self.reduce_conv(x) context_se = self.context_mlp(mlp_input)[..., None, None] if self.with_cp and x.requires_grad: context = cp.checkpoint(self.context_se, x, context_se) else: context = self.context_se(x, context_se) context = self.context_conv(context) depth_se = self.depth_mlp(mlp_input)[..., None, None] depth = self.depth_se(x, depth_se) if self.with_cp and depth.requires_grad: depth = cp.checkpoint(self.depth_conv, depth) else: depth = self.depth_conv(depth) depth = depth.softmax(dim=1) context = context.view(B, N, self.context_channels, H, W) depth = depth.view(B, N, self.depth_channels, H, W) return context, depth def get_mlp_input(self, rot, tran, intrin, post_rot, post_tran, bda): B, N, _, _ = rot.shape bda = bda.view(B, 1, 3, 3).repeat(1, N, 1, 1) mlp_input = torch.stack([ intrin[:, :, 0, 0], intrin[:, :, 1, 1], intrin[:, :, 0, 2], intrin[:, :, 1, 2], post_rot[:, :, 0, 0], post_rot[:, :, 0, 1], post_tran[:, :, 0], post_rot[:, :, 1, 0], post_rot[:, :, 1, 1], post_tran[:, :, 1], bda[:, :, 0, 0], bda[:, :, 0, 1], bda[:, :, 1, 0], bda[:, :, 1, 1], bda[:, :, 2, 2], ], dim=-1) sensor2ego = torch.cat([rot, tran.reshape(B, N, 3, 1)], dim=-1).reshape(B, N, -1) mlp_input = torch.cat([mlp_input, sensor2ego], dim=-1) return mlp_input def get_downsampled_gt_depth(self, gt_depths): """ Input: gt_depths: [B, N, H, W] Output: gt_depths: [B*N*h*w, d] """ downsample = self.downsample # if 
self.downsample == 8 and self.se_depth_map: # downsample = 16 B, N, H, W = gt_depths.shape gt_depths = gt_depths.view(B * N, H // downsample, downsample, W // downsample, downsample, 1) gt_depths = gt_depths.permute(0, 1, 3, 5, 2, 4).contiguous() gt_depths = gt_depths.view(-1, downsample * downsample) gt_depths_tmp = torch.where(gt_depths == 0.0, 1e5 * torch.ones_like(gt_depths), gt_depths) gt_depths = torch.min(gt_depths_tmp, dim=-1).values gt_depths = gt_depths.view(B * N, H // downsample, W // downsample) if not self.sid: gt_depths = (gt_depths - (self.grid_config['depth'][0] - self.grid_config['depth'][2])) / \ self.grid_config['depth'][2] else: gt_depths = torch.log(gt_depths) - torch.log( torch.tensor(self.grid_config['depth'][0]).float()) gt_depths = gt_depths * (self.D - 1) / torch.log( torch.tensor(self.grid_config['depth'][1] - 1.).float() / self.grid_config['depth'][0]) gt_depths = gt_depths + 1. gt_depths = torch.where((gt_depths < self.depth_channels + 1) & (gt_depths >= 0.0), gt_depths, torch.zeros_like(gt_depths)) gt_depths = F.one_hot( gt_depths.long(), num_classes=self.depth_channels + 1).view(-1, self.depth_channels + 1)[:, 1:] return gt_depths.float() @force_fp32() def get_depth_loss(self, depth_labels, depth_preds): depth_labels = self.get_downsampled_gt_depth(depth_labels) depth_preds = depth_preds.permute(0, 1, 3, 4, 2).contiguous().view(-1, self.depth_channels) fg_mask = torch.max(depth_labels, dim=1).values > 0.0 depth_labels = depth_labels[fg_mask] depth_preds = depth_preds[fg_mask] with autocast(enabled=False): depth_loss = F.binary_cross_entropy( depth_preds, depth_labels, reduction='none', ).sum() / max(1.0, fg_mask.sum()) return dict(loss_depth=self.loss_depth_weight * depth_loss) @HEADS.register_module() class CM_ContextNet(nn.Module): """ Camera parameters aware depth net """ def __init__(self, in_channels=512, context_channels=64, mid_channels=512, with_cp=False, ): super(CM_ContextNet, self).__init__() self.with_cp = with_cp self.reduce_conv = nn.Sequential( nn.Conv2d( in_channels, mid_channels, kernel_size=3, stride=1, padding=1), nn.BatchNorm2d(mid_channels), nn.ReLU(inplace=True), ) self.context_channels = context_channels self.context_conv = nn.Conv2d( mid_channels, context_channels, kernel_size=1, stride=1, padding=0) self.bn = nn.BatchNorm1d(27) self.context_mlp = Mlp(27, mid_channels, mid_channels) self.context_se = SELayer(mid_channels) # NOTE: add camera-aware @force_fp32() def forward(self, x, mlp_input): mlp_input = self.bn(mlp_input.reshape(-1, mlp_input.shape[-1])) B, N, C, H, W = x.shape x = x.view(B * N, C, H, W) if self.with_cp and x.requires_grad: x = cp.checkpoint(self.reduce_conv, x) else: x = self.reduce_conv(x) context_se = self.context_mlp(mlp_input)[..., None, None] if self.with_cp and x.requires_grad: context = cp.checkpoint(self.context_se, x, context_se) else: context = self.context_se(x, context_se) context = self.context_conv(context) context = context.view(B, N, self.context_channels, H, W) return context ================================================ FILE: mmdet3d/models/fbbev/modules/fpn3d.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. 
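# NOTE (editor): FPN3D below is a volumetric FPN: 1x1x1 Conv3d lateral layers
# project every input level to out_channels, coarser levels are upsampled with
# trilinear interpolation and added to the next finer level (top-down pathway),
# and a 3x3x3 Conv3d smooths each fused level, so one feature volume is
# returned per input scale.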
import numpy as np import torch from mmcv.cnn import build_conv_layer, build_norm_layer, build_upsample_layer from mmcv.runner import BaseModule, auto_fp16 from torch import nn as nn from mmcv.cnn import ConvModule from mmdet.models import NECKS import torch.nn.functional as F import pdb from mmcv.runner import BaseModule, force_fp32 @NECKS.register_module() class FPN3D(BaseModule): """FPN used in SECOND/PointPillars/PartA2/MVXNet. Args: in_channels (list[int]): Input channels of multi-scale feature maps. out_channels (list[int]): Output channels of feature maps. upsample_strides (list[int]): Strides used to upsample the feature maps. norm_cfg (dict): Config dict of normalization layers. upsample_cfg (dict): Config dict of upsample layers. conv_cfg (dict): Config dict of conv layers. use_conv_for_no_stride (bool): Whether to use conv when stride is 1. """ def __init__(self, in_channels=[80, 160, 320, 640], out_channels=256, norm_cfg=dict(type='GN', num_groups=32, requires_grad=True), conv_cfg=dict(type='Conv3d'), act_cfg=dict(type='ReLU'), with_cp=False, upsample_cfg=dict(mode='trilinear'), init_cfg=None): super(FPN3D, self).__init__(init_cfg=init_cfg) self.in_channels = in_channels self.out_channels = out_channels self.fp16_enabled = False self.upsample_cfg = upsample_cfg self.with_cp = with_cp self.num_out = len(self.in_channels) self.lateral_convs = nn.ModuleList() self.fpn_convs = nn.ModuleList() for i in range(self.num_out): l_conv = nn.Sequential( ConvModule(in_channels[i], out_channels, kernel_size=1, padding=0, conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=act_cfg, bias=False, inplace=True), ) fpn_conv = nn.Sequential( ConvModule(out_channels, out_channels, kernel_size=3, padding=1, conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=act_cfg, bias=False, inplace=True), ) self.lateral_convs.append(l_conv) self.fpn_convs.append(fpn_conv) @force_fp32() def forward(self, inputs): """Forward function. Args: x (torch.Tensor): 4D Tensor in (N, C, H, W) shape. Returns: list[torch.Tensor]: Multi-level feature maps. """ assert len(inputs) == len(self.in_channels) # build laterals laterals = [] for i, lateral_conv in enumerate(self.lateral_convs): if self.with_cp: lateral_i = torch.utils.checkpoint.checkpoint(lateral_conv, inputs[i]) else: lateral_i = lateral_conv(inputs[i]) laterals.append(lateral_i) # build down-top path for i in range(self.num_out - 1, 0, -1): prev_shape = laterals[i - 1].shape[2:] laterals[i - 1] = laterals[i - 1] + F.interpolate(laterals[i], size=prev_shape, align_corners=False, **self.upsample_cfg) # outs = [ # self.fpn_convs[i](laterals[i]) for i in range(self.num_out) # ] outs = [] for i, fpn_conv in enumerate(self.fpn_convs): if self.with_cp: out_i = torch.utils.checkpoint.checkpoint(fpn_conv, laterals[i]) else: out_i = fpn_conv(laterals[i]) outs.append(out_i) return outs ================================================ FILE: mmdet3d/models/fbbev/modules/frpn.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import torch import torch.nn as nn import torch.nn.functional as F from mmcv.cnn import build_conv_layer from mmcv.runner import BaseModule, force_fp32 from torch.cuda.amp.autocast_mode import autocast from torch.utils.checkpoint import checkpoint from mmdet.models.backbones.resnet import BasicBlock from mmdet.models import HEADS import torch.utils.checkpoint as cp from mmdet3d.models.builder import build_loss @HEADS.register_module() class FRPN(BaseModule): r""" Args: in_channels (int): Channels of input feature. 
context_channels (int): Channels of transformed feature. """ def __init__( self, in_channels=512, scale_factor=1, mask_thre = 0.4, ): super(FRPN, self).__init__() self.mask_net = nn.Sequential( nn.Conv2d(in_channels, in_channels//2, kernel_size=3, padding=1, stride=1), nn.BatchNorm2d(in_channels//2), nn.ReLU(), nn.Conv2d(in_channels//2, 1, kernel_size=3, padding=1, stride=1), ) self.upsample = nn.Upsample(scale_factor = scale_factor , mode ='bilinear',align_corners = True) self.dice_loss = build_loss(dict(type='CustomDiceLoss', use_sigmoid=True, loss_weight=1.)) self.ce_loss = torch.nn.BCEWithLogitsLoss(pos_weight=torch.tensor([2.13])) # From lss self.mask_thre = mask_thre def forward(self, input): """ """ bev_mask = self.mask_net(input) bev_mask = self.upsample(bev_mask) return bev_mask def get_bev_mask_loss(self, gt_bev_mask, pred_bev_mask): bs, bev_h, bev_w = gt_bev_mask.shape b = gt_bev_mask.reshape(bs , bev_w * bev_h).permute(1, 0).to(torch.float) a = pred_bev_mask.reshape(bs, bev_w * bev_h).permute(1, 0) mask_ce_loss = self.ce_loss(a, b) mask_dice_loss = self.dice_loss(pred_bev_mask.reshape(bs, -1), gt_bev_mask.reshape(bs, -1)) return dict(mask_ce_loss=mask_ce_loss, mask_dice_loss=mask_dice_loss) ================================================ FILE: mmdet3d/models/fbbev/modules/occ_loss_utils/__init__.py ================================================ from .lovasz_softmax import * from .nusc_param import * from .semkitti import * from .focal_loss import CustomFocalLoss ================================================ FILE: mmdet3d/models/fbbev/modules/occ_loss_utils/focal_loss.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import torch import torch.nn as nn import torch.nn.functional as F from mmcv.ops import sigmoid_focal_loss as _sigmoid_focal_loss from mmdet.models.builder import LOSSES from mmdet.models.losses.utils import weight_reduce_loss import numpy as np # This method is only for debugging def py_sigmoid_focal_loss(pred, target, weight=None, gamma=2.0, alpha=0.25, reduction='mean', avg_factor=None): """PyTorch version of `Focal Loss `_. Args: pred (torch.Tensor): The prediction with shape (N, C), C is the number of classes target (torch.Tensor): The learning label of the prediction. weight (torch.Tensor, optional): Sample-wise loss weight. gamma (float, optional): The gamma for calculating the modulating factor. Defaults to 2.0. alpha (float, optional): A balanced form for Focal Loss. Defaults to 0.25. reduction (str, optional): The method used to reduce the loss into a scalar. Defaults to 'mean'. avg_factor (int, optional): Average factor that is used to average the loss. Defaults to None. """ pred_sigmoid = pred.sigmoid() target = target.type_as(pred) pt = (1 - pred_sigmoid) * target + pred_sigmoid * (1 - target) focal_weight = (alpha * target + (1 - alpha) * (1 - target)) * pt.pow(gamma) loss = F.binary_cross_entropy_with_logits( pred, target, reduction='none') * focal_weight if weight is not None: if weight.shape != loss.shape: if weight.size(0) == loss.size(0): # For most cases, weight is of shape (num_priors, ), # which means it does not have the second axis num_class weight = weight.view(-1, 1) else: # Sometimes, weight per anchor per class is also needed. e.g. # in FSAF. But it may be flattened of shape # (num_priors x num_class, ), while loss is still of shape # (num_priors, num_class). 
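# NOTE (editor): besides debugging, this pure-PyTorch version serves as the CPU
# fallback in CustomFocalLoss.forward below, which dispatches to the fused CUDA
# kernel (mmcv.ops.sigmoid_focal_loss) whenever the prediction tensor is on GPU.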
assert weight.numel() == loss.numel() weight = weight.view(loss.size(0), -1) assert weight.ndim == loss.ndim loss = loss * weight loss = loss.sum(-1).mean() # loss = weight_reduce_loss(loss, weight, reduction, avg_factor) return loss def py_focal_loss_with_prob(pred, target, weight=None, gamma=2.0, alpha=0.25, reduction='mean', avg_factor=None): """PyTorch version of `Focal Loss `_. Different from `py_sigmoid_focal_loss`, this function accepts probability as input. Args: pred (torch.Tensor): The prediction probability with shape (N, C), C is the number of classes. target (torch.Tensor): The learning label of the prediction. weight (torch.Tensor, optional): Sample-wise loss weight. gamma (float, optional): The gamma for calculating the modulating factor. Defaults to 2.0. alpha (float, optional): A balanced form for Focal Loss. Defaults to 0.25. reduction (str, optional): The method used to reduce the loss into a scalar. Defaults to 'mean'. avg_factor (int, optional): Average factor that is used to average the loss. Defaults to None. """ num_classes = pred.size(1) target = F.one_hot(target, num_classes=num_classes + 1) target = target[:, :num_classes] target = target.type_as(pred) pt = (1 - pred) * target + pred * (1 - target) focal_weight = (alpha * target + (1 - alpha) * (1 - target)) * pt.pow(gamma) loss = F.binary_cross_entropy( pred, target, reduction='none') * focal_weight if weight is not None: if weight.shape != loss.shape: if weight.size(0) == loss.size(0): # For most cases, weight is of shape (num_priors, ), # which means it does not have the second axis num_class weight = weight.view(-1, 1) else: # Sometimes, weight per anchor per class is also needed. e.g. # in FSAF. But it may be flattened of shape # (num_priors x num_class, ), while loss is still of shape # (num_priors, num_class). assert weight.numel() == loss.numel() weight = weight.view(loss.size(0), -1) assert weight.ndim == loss.ndim loss = weight_reduce_loss(loss, weight, reduction, avg_factor) return loss def sigmoid_focal_loss(pred, target, weight=None, gamma=2.0, alpha=0.25, reduction='mean', avg_factor=None): r"""A wrapper of cuda version `Focal Loss `_. Args: pred (torch.Tensor): The prediction with shape (N, C), C is the number of classes. target (torch.Tensor): The learning label of the prediction. weight (torch.Tensor, optional): Sample-wise loss weight. gamma (float, optional): The gamma for calculating the modulating factor. Defaults to 2.0. alpha (float, optional): A balanced form for Focal Loss. Defaults to 0.25. reduction (str, optional): The method used to reduce the loss into a scalar. Defaults to 'mean'. Options are "none", "mean" and "sum". avg_factor (int, optional): Average factor that is used to average the loss. Defaults to None. """ # Function.apply does not accept keyword arguments, so the decorator # "weighted_loss" is not applicable loss = _sigmoid_focal_loss(pred.contiguous(), target.contiguous(), gamma, alpha, None, 'none') if weight is not None: if weight.shape != loss.shape: if weight.size(0) == loss.size(0): # For most cases, weight is of shape (num_priors, ), # which means it does not have the second axis num_class weight = weight.view(-1, 1) else: # Sometimes, weight per anchor per class is also needed. e.g. # in FSAF. But it may be flattened of shape # (num_priors x num_class, ), while loss is still of shape # (num_priors, num_class). 
assert weight.numel() == loss.numel() weight = weight.view(loss.size(0), -1) assert weight.ndim == loss.ndim loss = loss * weight loss = loss.sum(-1).mean() # loss = weight_reduce_loss(loss, weight, reduction, avg_factor) return loss @LOSSES.register_module() class CustomFocalLoss(nn.Module): def __init__(self, use_sigmoid=True, gamma=2.0, alpha=0.25, reduction='mean', loss_weight=100.0, activated=False): """`Focal Loss `_ Args: use_sigmoid (bool, optional): Whether to the prediction is used for sigmoid or softmax. Defaults to True. gamma (float, optional): The gamma for calculating the modulating factor. Defaults to 2.0. alpha (float, optional): A balanced form for Focal Loss. Defaults to 0.25. reduction (str, optional): The method used to reduce the loss into a scalar. Defaults to 'mean'. Options are "none", "mean" and "sum". loss_weight (float, optional): Weight of loss. Defaults to 1.0. activated (bool, optional): Whether the input is activated. If True, it means the input has been activated and can be treated as probabilities. Else, it should be treated as logits. Defaults to False. """ super(CustomFocalLoss, self).__init__() assert use_sigmoid is True, 'Only sigmoid focal loss supported now.' self.use_sigmoid = use_sigmoid self.gamma = gamma self.alpha = alpha self.reduction = reduction self.loss_weight = loss_weight self.activated = activated H, W = 200, 200 xy, yx = torch.meshgrid([torch.arange(H)-H/2, torch.arange(W)-W/2]) c = torch.stack([xy,yx], 2) c = torch.norm(c, 2, -1) c_max = c.max() self.c = (c/c_max + 1).cuda() def forward(self, pred, target, weight=None, avg_factor=None, ignore_index=255, reduction_override=None): """Forward function. Args: pred (torch.Tensor): The prediction. target (torch.Tensor): The learning label of the prediction. weight (torch.Tensor, optional): The weight of loss for each prediction. Defaults to None. avg_factor (int, optional): Average factor that is used to average the loss. Defaults to None. reduction_override (str, optional): The reduction method used to override the original reduction method of the loss. Options are "none", "mean" and "sum". Returns: torch.Tensor: The calculated loss """ B, H, W, D = target.shape c = self.c[None, :, :, None].repeat(B, 1, 1, D).reshape(-1) visible_mask = (target!=ignore_index).reshape(-1).nonzero().squeeze(-1) weight_mask = weight[None,:] * c[visible_mask, None] # visible_mask[:, None] num_classes = pred.size(1) pred = pred.permute(0, 2, 3, 4, 1).reshape(-1, num_classes)[visible_mask] target = target.reshape(-1)[visible_mask] assert reduction_override in (None, 'none', 'mean', 'sum') reduction = ( reduction_override if reduction_override else self.reduction) if self.use_sigmoid: if self.activated: calculate_loss_func = py_focal_loss_with_prob else: if torch.cuda.is_available() and pred.is_cuda: calculate_loss_func = sigmoid_focal_loss else: num_classes = pred.size(1) target = F.one_hot(target, num_classes=num_classes + 1) target = target[:, :num_classes] calculate_loss_func = py_sigmoid_focal_loss loss_cls = self.loss_weight * calculate_loss_func( pred, target.to(torch.long), weight_mask, gamma=self.gamma, alpha=self.alpha, reduction=reduction, avg_factor=avg_factor) else: raise NotImplementedError return loss_cls @LOSSES.register_module() class CustomMSELoss(nn.Module): """MSELoss. Args: reduction (str, optional): The method that reduces the loss to a scalar. Options are "none", "mean" and "sum". loss_weight (float, optional): The weight of the loss. 
            Defaults to 1.0
    """

    def __init__(self, reduction='mean', loss_weight=1.0):
        super().__init__()
        self.reduction = reduction
        self.loss_weight = loss_weight

    def forward(self,
                pred,
                target,
                mask=None,
                weight=None,
                avg_factor=None,
                reduction_override=None):
        """Forward function of loss.

        Args:
            pred (torch.Tensor): The prediction.
            target (torch.Tensor): The learning target of the prediction.
            weight (torch.Tensor, optional): Weight of the loss for each
                prediction. Defaults to None.
            avg_factor (int, optional): Average factor that is used to average
                the loss. Defaults to None.
            reduction_override (str, optional): The reduction method used to
                override the original reduction method of the loss.
                Defaults to None.

        Returns:
            torch.Tensor: The calculated loss
        """
        assert reduction_override in (None, 'none', 'mean', 'sum')
        reduction = (
            reduction_override if reduction_override else self.reduction)
        # N, C, H, W = pred.shape
        # mask = mask[:, None, :, :].repeat(1, C, 1, 1).to(torch.float)
        loss = self.loss_weight * (F.mse_loss(pred, target, reduction='mean'))
        return loss

================================================ FILE: mmdet3d/models/fbbev/modules/occ_loss_utils/lovasz_softmax.py ================================================
# -*- coding:utf-8 -*-
# author: Xinge
"""
Lovasz-Softmax and Jaccard hinge loss in PyTorch
Maxim Berman 2018 ESAT-PSI KU Leuven (MIT License)
"""

from __future__ import print_function, division

import torch
from torch.autograd import Variable
import torch.nn.functional as F
import numpy as np
try:
    from itertools import ifilterfalse
except ImportError:  # py3k
    from itertools import filterfalse as ifilterfalse
from torch.cuda.amp import autocast


def lovasz_grad(gt_sorted):
    """
    Computes gradient of the Lovasz extension w.r.t sorted errors
    See Alg. 1 in paper
    """
    p = len(gt_sorted)
    gts = gt_sorted.sum()
    intersection = gts - gt_sorted.float().cumsum(0)
    union = gts + (1 - gt_sorted).float().cumsum(0)
    jaccard = 1.
- intersection / union if p > 1: # cover 1-pixel case jaccard[1:p] = jaccard[1:p] - jaccard[0:-1] return jaccard def iou_binary(preds, labels, EMPTY=1., ignore=None, per_image=True): """ IoU for foreground class binary: 1 foreground, 0 background """ if not per_image: preds, labels = (preds,), (labels,) ious = [] for pred, label in zip(preds, labels): intersection = ((label == 1) & (pred == 1)).sum() union = ((label == 1) | ((pred == 1) & (label != ignore))).sum() if not union: iou = EMPTY else: iou = float(intersection) / float(union) ious.append(iou) iou = mean(ious) # mean accross images if per_image return 100 * iou def iou(preds, labels, C, EMPTY=1., ignore=None, per_image=False): """ Array of IoU for each (non ignored) class """ if not per_image: preds, labels = (preds,), (labels,) ious = [] for pred, label in zip(preds, labels): iou = [] for i in range(C): if i != ignore: # The ignored label is sometimes among predicted classes (ENet - CityScapes) intersection = ((label == i) & (pred == i)).sum() union = ((label == i) | ((pred == i) & (label != ignore))).sum() if not union: iou.append(EMPTY) else: iou.append(float(intersection) / float(union)) ious.append(iou) ious = [mean(iou) for iou in zip(*ious)] # mean accross images if per_image return 100 * np.array(ious) # --------------------------- BINARY LOSSES --------------------------- def lovasz_hinge(logits, labels, per_image=True, ignore=None): """ Binary Lovasz hinge loss logits: [B, H, W] Variable, logits at each pixel (between -\infty and +\infty) labels: [B, H, W] Tensor, binary ground truth masks (0 or 1) per_image: compute the loss per image instead of per batch ignore: void class id """ if per_image: loss = mean(lovasz_hinge_flat(*flatten_binary_scores(log.unsqueeze(0), lab.unsqueeze(0), ignore)) for log, lab in zip(logits, labels)) else: loss = lovasz_hinge_flat(*flatten_binary_scores(logits, labels, ignore)) return loss def lovasz_hinge_flat(logits, labels): """ Binary Lovasz hinge loss logits: [P] Variable, logits at each prediction (between -\infty and +\infty) labels: [P] Tensor, binary ground truth labels (0 or 1) ignore: label to ignore """ if len(labels) == 0: # only void pixels, the gradients should be 0 return logits.sum() * 0. signs = 2. * labels.float() - 1. errors = (1. 
- logits * Variable(signs)) errors_sorted, perm = torch.sort(errors, dim=0, descending=True) perm = perm.data gt_sorted = labels[perm] grad = lovasz_grad(gt_sorted) loss = torch.dot(F.relu(errors_sorted), Variable(grad)) return loss def flatten_binary_scores(scores, labels, ignore=None): """ Flattens predictions in the batch (binary case) Remove labels equal to 'ignore' """ scores = scores.view(-1) labels = labels.view(-1) if ignore is None: return scores, labels valid = (labels != ignore) vscores = scores[valid] vlabels = labels[valid] return vscores, vlabels class StableBCELoss(torch.nn.modules.Module): def __init__(self): super(StableBCELoss, self).__init__() def forward(self, input, target): neg_abs = - input.abs() loss = input.clamp(min=0) - input * target + (1 + neg_abs.exp()).log() return loss.mean() def binary_xloss(logits, labels, ignore=None): """ Binary Cross entropy loss logits: [B, H, W] Variable, logits at each pixel (between -\infty and +\infty) labels: [B, H, W] Tensor, binary ground truth masks (0 or 1) ignore: void class id """ logits, labels = flatten_binary_scores(logits, labels, ignore) loss = StableBCELoss()(logits, Variable(labels.float())) return loss # --------------------------- MULTICLASS LOSSES --------------------------- def lovasz_softmax(probas, labels, classes='present', per_image=False, ignore=None): """ Multi-class Lovasz-Softmax loss probas: [B, C, H, W] Variable, class probabilities at each prediction (between 0 and 1). Interpreted as binary (sigmoid) output with outputs of size [B, H, W]. labels: [B, H, W] Tensor, ground truth labels (between 0 and C - 1) classes: 'all' for all, 'present' for classes present in labels, or a list of classes to average. per_image: compute the loss per image instead of per batch ignore: void class labels """ if per_image: loss = mean(lovasz_softmax_flat(*flatten_probas(prob.unsqueeze(0), lab.unsqueeze(0), ignore), classes=classes) for prob, lab in zip(probas, labels)) else: with autocast(False): loss = lovasz_softmax_flat(*flatten_probas(probas, labels, ignore), classes=classes) return loss def lovasz_softmax_flat(probas, labels, classes='present'): """ Multi-class Lovasz-Softmax loss probas: [P, C] Variable, class probabilities at each prediction (between 0 and 1) labels: [P] Tensor, ground truth labels (between 0 and C - 1) classes: 'all' for all, 'present' for classes present in labels, or a list of classes to average. """ if probas.numel() == 0: # only void pixels, the gradients should be 0 return probas * 0. 
C = probas.size(1) losses = [] class_to_sum = list(range(C)) if classes in ['all', 'present'] else classes for c in class_to_sum: fg = (labels == c).float() # foreground for class c if (classes is 'present' and fg.sum() == 0): continue if C == 1: if len(classes) > 1: raise ValueError('Sigmoid output possible only with 1 class') class_pred = probas[:, 0] else: class_pred = probas[:, c] errors = (Variable(fg) - class_pred).abs() errors_sorted, perm = torch.sort(errors, 0, descending=True) perm = perm.data fg_sorted = fg[perm] losses.append(torch.dot(errors_sorted, Variable(lovasz_grad(fg_sorted)))) return mean(losses) def flatten_probas(probas, labels, ignore=None): """ Flattens predictions in the batch """ if probas.dim() == 2: if ignore is not None: valid = (labels != ignore) probas = probas[valid] labels = labels[valid] return probas, labels elif probas.dim() == 3: # assumes output of a sigmoid layer B, H, W = probas.size() probas = probas.view(B, 1, H, W) elif probas.dim() == 5: #3D segmentation B, C, L, H, W = probas.size() probas = probas.contiguous().view(B, C, L, H*W) B, C, H, W = probas.size() probas = probas.permute(0, 2, 3, 1).contiguous().view(-1, C) # B * H * W, C = P, C labels = labels.view(-1) if ignore is None: return probas, labels valid = (labels != ignore) vprobas = probas[valid.nonzero().squeeze()] vlabels = labels[valid] return vprobas, vlabels def xloss(logits, labels, ignore=None): """ Cross entropy loss """ return F.cross_entropy(logits, Variable(labels), ignore_index=255) def jaccard_loss(probas, labels,ignore=None, smooth = 100, bk_class = None): """ Something wrong with this loss Multi-class Lovasz-Softmax loss probas: [B, C, H, W] Variable, class probabilities at each prediction (between 0 and 1). Interpreted as binary (sigmoid) output with outputs of size [B, H, W]. labels: [B, H, W] Tensor, ground truth labels (between 0 and C - 1) classes: 'all' for all, 'present' for classes present in labels, or a list of classes to average. per_image: compute the loss per image instead of per batch ignore: void class labels """ vprobas, vlabels = flatten_probas(probas, labels, ignore) true_1_hot = torch.eye(vprobas.shape[1])[vlabels] if bk_class: one_hot_assignment = torch.ones_like(vlabels) one_hot_assignment[vlabels == bk_class] = 0 one_hot_assignment = one_hot_assignment.float().unsqueeze(1) true_1_hot = true_1_hot*one_hot_assignment true_1_hot = true_1_hot.to(vprobas.device) intersection = torch.sum(vprobas * true_1_hot) cardinality = torch.sum(vprobas + true_1_hot) loss = (intersection + smooth / (cardinality - intersection + smooth)).mean() return (1-loss)*smooth def hinge_jaccard_loss(probas, labels,ignore=None, classes = 'present', hinge = 0.1, smooth =100): """ Multi-class Hinge Jaccard loss probas: [B, C, H, W] Variable, class probabilities at each prediction (between 0 and 1). Interpreted as binary (sigmoid) output with outputs of size [B, H, W]. labels: [B, H, W] Tensor, ground truth labels (between 0 and C - 1) classes: 'all' for all, 'present' for classes present in labels, or a list of classes to average. 
ignore: void class labels """ vprobas, vlabels = flatten_probas(probas, labels, ignore) C = vprobas.size(1) losses = [] class_to_sum = list(range(C)) if classes in ['all', 'present'] else classes for c in class_to_sum: if c in vlabels: c_sample_ind = vlabels == c cprobas = vprobas[c_sample_ind,:] non_c_ind =np.array([a for a in class_to_sum if a != c]) class_pred = cprobas[:,c] max_non_class_pred = torch.max(cprobas[:,non_c_ind],dim = 1)[0] TP = torch.sum(torch.clamp(class_pred - max_non_class_pred, max = hinge)+1.) + smooth FN = torch.sum(torch.clamp(max_non_class_pred - class_pred, min = -hinge)+hinge) if (~c_sample_ind).sum() == 0: FP = 0 else: nonc_probas = vprobas[~c_sample_ind,:] class_pred = nonc_probas[:,c] max_non_class_pred = torch.max(nonc_probas[:,non_c_ind],dim = 1)[0] FP = torch.sum(torch.clamp(class_pred - max_non_class_pred, max = hinge)+1.) losses.append(1 - TP/(TP+FP+FN)) if len(losses) == 0: return 0 return mean(losses) # --------------------------- HELPER FUNCTIONS --------------------------- def isnan(x): return x != x def mean(l, ignore_nan=False, empty=0): """ nanmean compatible with generators. """ l = iter(l) if ignore_nan: l = ifilterfalse(isnan, l) try: n = 1 acc = next(l) except StopIteration: if empty == 'raise': raise ValueError('Empty mean') return empty for n, v in enumerate(l, 2): acc += v if n == 1: return acc return acc / n ================================================ FILE: mmdet3d/models/fbbev/modules/occ_loss_utils/nusc_param.py ================================================ import torch import torch.nn as nn import torch.nn.functional as F import numpy as np # nusc_class_frequencies = np.array([57330862, 25985376, 1561108, 28862014, 196106643, 15920504, # 2158753, 26539491, 4004729, 34838681, 75173306, 2255027978, 50959399, 646022466, 869055679, # 1446141335, 1724391378, 2242961742295]) # nusc_class_frequencies = np.array([2242961742295, 25985376, 1561108, 28862014, 196106643, 15920504, # 2158753, 26539491, 4004729, 34838681, 75173306, 2255027978, 50959399, 646022466, 869055679, # 1446141335, 1724391378]) nusc_class_frequencies = np.array([ 944004, 1897170, 152386, 2391677, 16957802, 724139, 189027, 2074468, 413451, 2384460, 5916653, 175883646, 4275424, 51393615, 61411620, 105975596, 116424404, 1892500630 ]) # nusc_class_names = [ # "noise", # "barrier", # "bicycle", # "bus", # "car", # "construction", # "motorcycle", # "pedestrian", # "trafficcone", # "trailer", # "truck", # "driveable_surface", # "other", # "sidewalk", # "terrain", # "mannade", # "vegetation", # "free", # ] nusc_class_names = [ "empty", # 0 "barrier", # 1 "bicycle", # 2 "bus", # 3 "car", # 4 "construction", # 5 "motorcycle", # 6 "pedestrian", # 7 "trafficcone", # 8 "trailer", # 9 "truck", # 10 "driveable_surface", # 11 "other", # 12 "sidewalk", # 13 "terrain", # 14 "mannade", # 15 "vegetation", # 16 ] # classname_to_color = { # RGB. # 0: (0, 0, 0), # Black. 
noise # 1: (112, 128, 144), # Slategrey barrier # 2: (220, 20, 60), # Crimson bicycle # 3: (255, 127, 80), # Orangered bus # 4: (255, 158, 0), # Orange car # 5: (233, 150, 70), # Darksalmon construction # 6: (255, 61, 99), # Red motorcycle # 7: (0, 0, 230), # Blue pedestrian # 8: (47, 79, 79), # Darkslategrey trafficcone # 9: (255, 140, 0), # Darkorange trailer # 10: (255, 99, 71), # Tomato truck # 11: (0, 207, 191), # nuTonomy green driveable_surface # 12: (175, 0, 75), # flat other # 13: (75, 0, 75), # sidewalk # 14: (112, 180, 60), # terrain # 15: (222, 184, 135), # Burlywood mannade # 16: (0, 175, 0), # Green vegetation # } classname_to_color = { # RGB. # 0: (0, 0, 0), # Black. noise 1: (112, 128, 144), # Slategrey barrier 2: (220, 20, 60), # Crimson bicycle 3: (255, 127, 80), # Orangered bus 4: (255, 158, 0), # Orange car 5: (233, 150, 70), # Darksalmon construction 6: (255, 61, 99), # Red motorcycle 7: (0, 0, 230), # Blue pedestrian 8: (47, 79, 79), # Darkslategrey trafficcone 9: (255, 140, 0), # Darkorange trailer 10: (255, 99, 71), # Tomato truck 11: (0, 207, 191), # nuTonomy green driveable_surface 12: (175, 0, 75), # flat other 13: (75, 0, 75), # sidewalk 14: (112, 180, 60), # terrain 15: (222, 184, 135), # Burlywood mannade 16: (0, 175, 0), # Green vegetation } def KL_sep(p, target): """ KL divergence on nonzeros classes """ nonzeros = target != 0 nonzero_p = p[nonzeros] kl_term = F.kl_div(torch.log(nonzero_p), target[nonzeros], reduction="sum") return kl_term def geo_scal_loss(pred, ssc_target): # Get softmax probabilities pred = F.softmax(pred, dim=1) # Compute empty and nonempty probabilities empty_probs = pred[:, 0, :, :, :] nonempty_probs = 1 - empty_probs # Remove unknown voxels mask = ssc_target != 255 nonempty_target = ssc_target != 0 nonempty_target = nonempty_target[mask].float() nonempty_probs = nonempty_probs[mask] empty_probs = empty_probs[mask] intersection = (nonempty_target * nonempty_probs).sum() precision = intersection / nonempty_probs.sum() recall = intersection / nonempty_target.sum() spec = ((1 - nonempty_target) * (empty_probs)).sum() / (1 - nonempty_target).sum() return ( F.binary_cross_entropy(precision, torch.ones_like(precision)) + F.binary_cross_entropy(recall, torch.ones_like(recall)) + F.binary_cross_entropy(spec, torch.ones_like(spec)) ) def sem_scal_loss(pred, ssc_target): # Get softmax probabilities pred = F.softmax(pred, dim=1) loss = 0 count = 0 mask = ssc_target != 255 n_classes = pred.shape[1] for i in range(0, n_classes): # Get probability of class i p = pred[:, i, :, :, :] # Remove unknown voxels target_ori = ssc_target p = p[mask] target = ssc_target[mask] completion_target = torch.ones_like(target) completion_target[target != i] = 0 completion_target_ori = torch.ones_like(target_ori).float() completion_target_ori[target_ori != i] = 0 if torch.sum(completion_target) > 0: count += 1.0 nominator = torch.sum(p * completion_target) loss_class = 0 if torch.sum(p) > 0: precision = nominator / (torch.sum(p)) loss_precision = F.binary_cross_entropy( precision, torch.ones_like(precision) ) loss_class += loss_precision if torch.sum(completion_target) > 0: recall = nominator / (torch.sum(completion_target)) loss_recall = F.binary_cross_entropy(recall, torch.ones_like(recall)) loss_class += loss_recall if torch.sum(1 - completion_target) > 0: specificity = torch.sum((1 - p) * (1 - completion_target)) / ( torch.sum(1 - completion_target) ) loss_specificity = F.binary_cross_entropy( specificity, torch.ones_like(specificity) ) loss_class += 
loss_specificity loss += loss_class return loss / count def CE_ssc_loss(pred, target, class_weights): """ :param: prediction: the predicted tensor, must be [BS, C, H, W, D] """ criterion = nn.CrossEntropyLoss( weight=class_weights, ignore_index=255, reduction="mean" ) loss = criterion(pred, target.long()) return loss ================================================ FILE: mmdet3d/models/fbbev/modules/occ_loss_utils/semkitti.py ================================================ import torch import torch.nn as nn import torch.nn.functional as F import numpy as np # from mmcv.runner import BaseModule, force_fp32 from torch.cuda.amp import autocast semantic_kitti_class_frequencies = np.array( [ 5.41773033e09, 1.57835390e07, 1.25136000e05, 1.18809000e05, 6.46799000e05, 8.21951000e05, 2.62978000e05, 2.83696000e05, 2.04750000e05, 6.16887030e07, 4.50296100e06, 4.48836500e07, 2.26992300e06, 5.68402180e07, 1.57196520e07, 1.58442623e08, 2.06162300e06, 3.69705220e07, 1.15198800e06, 3.34146000e05, ] ) kitti_class_names = [ "empty", "car", "bicycle", "motorcycle", "truck", "other-vehicle", "person", "bicyclist", "motorcyclist", "road", "parking", "sidewalk", "other-ground", "building", "fence", "vegetation", "trunk", "terrain", "pole", "traffic-sign", ] def inverse_sigmoid(x, sign='A'): x = x.to(torch.float32) while x >= 1-1e-5: x = x - 1e-5 while x< 1e-5: x = x + 1e-5 return -torch.log((1 / x) - 1) def KL_sep(p, target): """ KL divergence on nonzeros classes """ nonzeros = target != 0 nonzero_p = p[nonzeros] kl_term = F.kl_div(torch.log(nonzero_p), target[nonzeros], reduction="sum") return kl_term def geo_scal_loss(pred, ssc_target, ignore_index=255, non_empty_idx=0): # Get softmax probabilities pred = F.softmax(pred, dim=1) # Compute empty and nonempty probabilities empty_probs = pred[:, non_empty_idx] nonempty_probs = 1 - empty_probs # Remove unknown voxels mask = ssc_target != ignore_index nonempty_target = ssc_target != non_empty_idx nonempty_target = nonempty_target[mask].float() nonempty_probs = nonempty_probs[mask] empty_probs = empty_probs[mask] eps = 1e-5 intersection = (nonempty_target * nonempty_probs).sum() precision = intersection / (nonempty_probs.sum()+eps) recall = intersection / (nonempty_target.sum()+eps) spec = ((1 - nonempty_target) * (empty_probs)).sum() / ((1 - nonempty_target).sum()+eps) with autocast(False): return ( F.binary_cross_entropy_with_logits(inverse_sigmoid(precision, 'A'), torch.ones_like(precision)) + F.binary_cross_entropy_with_logits(inverse_sigmoid(recall, 'B'), torch.ones_like(recall)) + F.binary_cross_entropy_with_logits(inverse_sigmoid(spec, 'C'), torch.ones_like(spec)) ) def sem_scal_loss(pred_, ssc_target, ignore_index=255): # Get softmax probabilities with autocast(False): pred = F.softmax(pred_, dim=1) loss = 0 count = 0 mask = ssc_target != ignore_index n_classes = pred.shape[1] begin = 1 if n_classes == 19 else 0 for i in range(begin, n_classes-1): # Get probability of class i p = pred[:, i] # Remove unknown voxels target_ori = ssc_target p = p[mask] target = ssc_target[mask] completion_target = torch.ones_like(target) completion_target[target != i] = 0 completion_target_ori = torch.ones_like(target_ori).float() completion_target_ori[target_ori != i] = 0 if torch.sum(completion_target) > 0: count += 1.0 nominator = torch.sum(p * completion_target) loss_class = 0 if torch.sum(p) > 0: precision = nominator / (torch.sum(p)+ 1e-5) loss_precision = F.binary_cross_entropy_with_logits( inverse_sigmoid(precision, 'D'), torch.ones_like(precision) ) loss_class += 
loss_precision if torch.sum(completion_target) > 0: recall = nominator / (torch.sum(completion_target) +1e-5) # loss_recall = F.binary_cross_entropy(recall, torch.ones_like(recall)) loss_recall = F.binary_cross_entropy_with_logits(inverse_sigmoid(recall, 'E'), torch.ones_like(recall)) loss_class += loss_recall if torch.sum(1 - completion_target) > 0: specificity = torch.sum((1 - p) * (1 - completion_target)) / ( torch.sum(1 - completion_target) + 1e-5 ) loss_specificity = F.binary_cross_entropy_with_logits( inverse_sigmoid(specificity, 'F'), torch.ones_like(specificity) ) loss_class += loss_specificity loss += loss_class # print(i, loss_class, loss_recall, loss_specificity) l = loss/count if torch.isnan(l): from IPython import embed embed() exit() return l def CE_ssc_loss(pred, target, class_weights=None, ignore_index=255): """ :param: prediction: the predicted tensor, must be [BS, C, ...] """ criterion = nn.CrossEntropyLoss( weight=class_weights, ignore_index=ignore_index, reduction="mean" ) # from IPython import embed # embed() # exit() with autocast(False): loss = criterion(pred, target.long()) return loss def vel_loss(pred, gt): with autocast(False): return F.l1_loss(pred, gt) ================================================ FILE: mmdet3d/models/fbbev/modules/resnet3d.py ================================================ import math from functools import partial from mmdet3d.models.builder import BACKBONES from mmcv.cnn import build_conv_layer, build_norm_layer, build_upsample_layer from mmcv.runner import BaseModule import torch import torch.nn as nn import torch.nn.functional as F from torch.utils.checkpoint import checkpoint as cp import pdb from mmcv.runner import BaseModule import spconv.pytorch as spconv from spconv.pytorch import functional as Fsp from mmcv.runner import BaseModule, force_fp32 def get_inplanes(): return [64, 128, 256, 512] BIAS = True def conv3x3x3(in_planes, out_planes, stride=1, use_spase_3dtensor=False): if not use_spase_3dtensor: Conv3d = nn.Conv3d else: Conv3d = spconv.SparseConv3d if stride!=1 else spconv.SubMConv3d return Conv3d(in_planes, out_planes, kernel_size=3, stride=stride, padding=1, bias=BIAS) def conv1x1x1(in_planes, out_planes, stride=1, use_spase_3dtensor=False): if not use_spase_3dtensor: Conv3d = nn.Conv3d else: Conv3d = spconv.SparseConv3d if stride!=1 else spconv.SubMConv3d return Conv3d(in_planes, out_planes, kernel_size=1, stride=stride, bias=BIAS) class BasicBlock(BaseModule): expansion = 1 def __init__(self, in_planes, planes, stride=1, downsample=None, norm_cfg=None, use_spase_3dtensor=False): super().__init__() self.use_spase_3dtensor = use_spase_3dtensor self.relu = nn.ReLU(inplace=False) self.downsample = downsample if self.use_spase_3dtensor: Sequential = spconv.SparseSequential conv1 = conv3x3x3(in_planes, planes, stride, use_spase_3dtensor=self.use_spase_3dtensor) bn1 = build_norm_layer(norm_cfg, planes)[1] relu = nn.ReLU(inplace=True) conv2 = conv3x3x3(planes, planes, use_spase_3dtensor=self.use_spase_3dtensor) bn2 = build_norm_layer(norm_cfg, planes)[1] layer_list = [conv1, bn1, relu, conv2, bn2] self.layer_seq = Sequential(*layer_list) else: self.conv1 = conv3x3x3(in_planes, planes, stride, use_spase_3dtensor=self.use_spase_3dtensor) self.bn1 = build_norm_layer(norm_cfg, planes)[1] self.conv2 = conv3x3x3(planes, planes, use_spase_3dtensor=self.use_spase_3dtensor) self.bn2 = build_norm_layer(norm_cfg, planes)[1] self.stride = stride @force_fp32() def forward(self, x, debug=False): residual = x if self.use_spase_3dtensor: out = 
self.layer_seq(x) if self.downsample is not None: residual = self.downsample(x) out = Fsp.sparse_add(out, residual) out = out.replace_feature(self.relu(out.features)) return out else: out = self.conv1(x) out = self.bn1(out) out = self.relu(out) out = self.conv2(out) out = self.bn2(out) if self.downsample is not None: residual = self.downsample(x) out += residual out = self.relu(out) return out class Bottleneck(BaseModule): expansion = 4 def __init__(self, in_planes, planes, stride=1, downsample=None, norm_cfg=None): super().__init__() self.conv1 = conv1x1x1(in_planes, planes) self.bn1 = build_norm_layer(norm_cfg, planes)[1] self.conv2 = conv3x3x3(planes, planes, stride) self.bn2 = build_norm_layer(norm_cfg, planes)[1] self.conv3 = conv1x1x1(planes, planes * self.expansion) self.bn3 = build_norm_layer(norm_cfg, planes * self.expansion)[1] self.relu = nn.ReLU(inplace=True) self.downsample = downsample self.stride = stride @force_fp32() def forward(self, x): residual = x out = self.conv1(x) out = self.bn1(out) out = self.relu(out) out = self.conv2(out) out = self.bn2(out) out = self.relu(out) out = self.conv3(out) out = self.bn3(out) if self.downsample is not None: residual = self.downsample(x) out += residual out = self.relu(out) return out @BACKBONES.register_module() class CustomResNet3D(BaseModule): def __init__(self, depth, block_inplanes=[64, 128, 256, 512], block_strides=[1, 2, 2, 2], out_indices=(0, 1, 2, 3), n_input_channels=3, shortcut_type='B', with_cp=False, norm_cfg=dict(type='BN3d', requires_grad=True), use_spase_3dtensor=False, plane2voxel=None, widen_factor=1.0): super().__init__() layer_metas = { 10: [1, 1, 1, 1], 18: [2, 2, 2, 2], 34: [3, 4, 6, 3], 50: [3, 4, 6, 3], 101: [3, 4, 23, 3], } if depth in [10, 18, 34]: block = BasicBlock else: assert depth in [50, 101] block = Bottleneck self.with_cp = with_cp self.plane2voxel = plane2voxel layers = layer_metas[depth] self.use_spase_3dtensor = use_spase_3dtensor block_inplanes = [int(x * widen_factor) for x in block_inplanes] self.in_planes = block_inplanes[0] self.out_indices = out_indices # replace the first several downsampling layers with the channel-squeeze layers Conv3d = nn.Conv3d if not self.use_spase_3dtensor else spconv.SubMConv3d Sequential = nn.Sequential if not self.use_spase_3dtensor else spconv.SparseSequential if self.use_spase_3dtensor: norm_cfg['type'] = 'BN1d' self.input_proj = Sequential( Conv3d(n_input_channels, self.in_planes, kernel_size=(1, 1, 1), stride=(1, 1, 1), bias=False), build_norm_layer(norm_cfg, self.in_planes)[1], nn.ReLU(inplace=True), ) self.layers = nn.ModuleList() for i in range(len(block_inplanes)): self.layers.append(self._make_layer(block, block_inplanes[i], layers[i], shortcut_type, block_strides[i], norm_cfg=norm_cfg)) for m in self.modules(): if isinstance(m, nn.Conv3d): nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') elif isinstance(m, nn.BatchNorm3d): nn.init.constant_(m.weight, 1) nn.init.constant_(m.bias, 0) def _downsample_basic_block(self, x, planes, stride): out = F.avg_pool3d(x, kernel_size=1, stride=stride) zero_pads = torch.zeros(out.size(0), planes - out.size(1), out.size(2), out.size(3), out.size(4)) if isinstance(out.data, torch.cuda.FloatTensor): zero_pads = zero_pads.cuda() out = torch.cat([out.data, zero_pads], dim=1) return out def _make_layer(self, block, planes, blocks, shortcut_type, stride=1, norm_cfg=None): downsample = None Sequential = nn.Sequential if not self.use_spase_3dtensor else spconv.SparseSequential if stride != 1 or self.in_planes 
!= planes * block.expansion: if shortcut_type == 'A': downsample = partial(self._downsample_basic_block, planes=planes * block.expansion, stride=stride) else: downsample = Sequential( conv1x1x1(self.in_planes, planes * block.expansion, stride, self.use_spase_3dtensor), build_norm_layer(norm_cfg, planes * block.expansion)[1]) layers = [] layers.append( block(in_planes=self.in_planes, planes=planes, stride=stride, downsample=downsample, use_spase_3dtensor = self.use_spase_3dtensor, norm_cfg=norm_cfg)) self.in_planes = planes * block.expansion for i in range(1, blocks): layers.append(block(self.in_planes, planes, norm_cfg=norm_cfg, use_spase_3dtensor = self.use_spase_3dtensor)) return Sequential(*layers) @force_fp32() def forward(self, x): if self.plane2voxel is not None: x = x.unsqueeze(-1).repeat(1, 1, 1, 1, self.plane2voxel) x = self.input_proj(x) res = [] for index, layer in enumerate(self.layers): if self.use_spase_3dtensor: for block in layer: if self.with_cp: x = cp(block, x) else: x = block(x) else: if self.with_cp: x = cp(layer, x) else: x = layer(x) if index in self.out_indices: if self.use_spase_3dtensor: res.append(x.dense()) else: res.append(x) return res def generate_model(model_depth, **kwargs): assert model_depth in [10, 18, 34, 50, 101, 152, 200] if model_depth == 10: model = ResNet(BasicBlock, [1, 1, 1, 1], get_inplanes(), **kwargs) elif model_depth == 18: model = ResNet(BasicBlock, [2, 2, 2, 2], get_inplanes(), **kwargs) elif model_depth == 34: model = ResNet(BasicBlock, [3, 4, 6, 3], get_inplanes(), **kwargs) elif model_depth == 50: model = ResNet(Bottleneck, [3, 4, 6, 3], get_inplanes(), **kwargs) elif model_depth == 101: model = ResNet(Bottleneck, [3, 4, 23, 3], get_inplanes(), **kwargs) elif model_depth == 152: model = ResNet(Bottleneck, [3, 8, 36, 3], get_inplanes(), **kwargs) elif model_depth == 200: model = ResNet(Bottleneck, [3, 24, 36, 3], get_inplanes(), **kwargs) return model ================================================ FILE: mmdet3d/models/fbbev/motion_head/__init__.py ================================================ from .motion_head import MotionHead from .motion_planner_head import MotionPlannerHead from .traj_loss import TrajLoss ================================================ FILE: mmdet3d/models/fbbev/motion_head/motion_head.py ================================================ import torch import torch.nn as nn from mmcv.cnn import Linear, bias_init_with_prob, Scale from mmcv.runner import force_fp32 from mmdet.core import (build_assigner, build_sampler, multi_apply, reduce_mean) from mmdet.models.utils import build_transformer from mmdet.models import HEADS, build_loss from mmdet.models.dense_heads.anchor_free_head import AnchorFreeHead from mmdet.models.utils.transformer import inverse_sigmoid from mmdet3d.core.bbox.coders import build_bbox_coder # from .streampetr_utils import * import copy from mmdet.models.utils import NormedLinear from mmdet3d.core import bbox3d2result, merge_aug_bboxes_3d from mmdet3d.models.fbbev.utils import save_tensor from mmcv.runner.base_module import BaseModule from mmcv.cnn.bricks.transformer import build_transformer_layer_sequence def get_ego_pos(points, pc_range): if points.size(-1) == 3: points = points * (pc_range[3:6] - pc_range[0:3]) + pc_range[0:3] elif points.size(-1) == 2: points = points * (pc_range[3:5] - pc_range[0:2]) + pc_range[0:2] return points def get_rel_pos(points, pc_range): if points.size(-1) == 3: return (points - pc_range[0:3]) / (pc_range[3:6] - pc_range[0:3]) elif points.size(-1) == 2: return 
(points - pc_range[0:2]) / (pc_range[3:5] - pc_range[0:2]) @HEADS.register_module() class MotionHead(BaseModule): """Implements the DETR transformer head. See `paper: End-to-End Object Detection with Transformers `_ for details. Args: num_classes (int): Number of categories excluding the background. in_channels (int): Number of channels in the input feature map. num_query (int): Number of query in Transformer. num_reg_fcs (int, optional): Number of fully-connected layers used in `FFN`, which is then used for the regression head. Default 2. transformer (obj:`mmcv.ConfigDict`|dict): Config for transformer. Default: None. sync_cls_avg_factor (bool): Whether to sync the avg_factor of all ranks. Default to False. positional_encoding (obj:`mmcv.ConfigDict`|dict): Config for position encoding. loss_cls (obj:`mmcv.ConfigDict`|dict): Config of the classification loss. Default `CrossEntropyLoss`. loss_bbox (obj:`mmcv.ConfigDict`|dict): Config of the regression loss. Default `L1Loss`. loss_iou (obj:`mmcv.ConfigDict`|dict): Config of the regression iou loss. Default `GIoULoss`. tran_cfg (obj:`mmcv.ConfigDict`|dict): Training config of transformer head. test_cfg (obj:`mmcv.ConfigDict`|dict): Testing config of transformer head. init_cfg (dict or list[dict], optional): Initialization config dict. Default: None """ _version = 2 def __init__(self, # num_classes=1, in_channels=256, stride=[16], embed_dims=256, num_query=6, num_reg_fcs=2, memory_len=12, topk_proposals=4, num_propagated=0, with_dn=True, with_ego_pos=True, match_with_velo=True, match_costs=None, transformer=None, sync_cls_avg_factor=False, code_weights=None, bbox_coder=None, loss_traj=dict(type='L1Loss', loss_weight=0.25), init_cfg=None, normedlinear=False, point_cloud_range=None, agent_decoder=dict(), agent_map_decoder=dict(), map_layer_index = -1, **kwargs): if 'code_size' in kwargs: self.code_size = kwargs['code_size'] else: self.code_size = 2 # if code_weights is not None: # self.code_weights = code_weights # else: # self.code_weights = [1.0, 1.0] # x, y, v_x, v_y # self.code_weights = self.code_weights[:self.code_size] self.traj_num_cls = 1 self.num_query = num_query self.in_channels = in_channels self.num_reg_fcs = num_reg_fcs # self.train_cfg = train_cfg # self.test_cfg = test_cfg self.fp16_enabled = False self.embed_dims = embed_dims self.map_layer_index = map_layer_index super(MotionHead, self).__init__() self.loss_traj = build_loss(loss_traj) self.log_softmax = nn.LogSoftmax(dim=2) # self.code_weights = nn.Parameter(torch.tensor( # self.code_weights), requires_grad=False) self.pc_range = nn.Parameter(torch.tensor( point_cloud_range), requires_grad=False) self.fut_steps = 8 self.num_fut_mode = 6 self.agent_decoder = build_transformer_layer_sequence(agent_decoder) self.agent_map_decoder = build_transformer_layer_sequence(agent_map_decoder) self._init_layers() self.count = 0 def _init_layers(self): """Initialize layers of the transformer head.""" traj_branch = [] for _ in range(self.num_reg_fcs): traj_branch.append(Linear(self.embed_dims*2, self.embed_dims*2)) traj_branch.append(nn.ReLU()) traj_branch.append(Linear(self.embed_dims*2, self.fut_steps*self.code_size)) traj_branch = nn.Sequential(*traj_branch) traj_cls_branch = [] for _ in range(self.num_reg_fcs): traj_cls_branch.append(Linear(self.embed_dims*2, self.embed_dims*2)) traj_cls_branch.append(nn.LayerNorm(self.embed_dims*2)) traj_cls_branch.append(nn.ReLU(inplace=True)) traj_cls_branch.append(Linear(self.embed_dims*2, self.traj_num_cls)) traj_cls_branch = 
nn.Sequential(*traj_cls_branch) def _get_clones(module, N): return nn.ModuleList([copy.deepcopy(module) for i in range(N)]) motion_num_pred = 2 self.traj_branches = _get_clones(traj_branch, motion_num_pred) self.traj_cls_branches = _get_clones(traj_cls_branch, motion_num_pred) # self.reference_points = nn.Embedding(self.num_query, 3) self.agent_info = MLN(17) self.agent_info_embedding = nn.Sequential( nn.Linear(17, self.embed_dims), nn.ReLU(), nn.Linear(self.embed_dims, self.embed_dims), ) self.traj_mode_embedding = nn.Embedding(self.num_fut_mode, self.embed_dims) def forward(self, agent_instances, preds_map_dicts, img_metas=None): valid_length = [(matched_gt_idxes>=0).sum() for matched_gt_idxes in agent_instances.matched_gt_idxes] max_valid_query = max(valid_length) assert 0<=max_valid_query<=250 agent_instances = agent_instances[:, :max_valid_query] agent_queries = agent_instances.query_feats agent_reference_points = agent_instances.reference_points mode_embedding = self.traj_mode_embedding.weight hist_mask = agent_instances.hist_mask B = len(agent_instances) hist_xyz_delta = (agent_instances.hist_xyz[:, :, 1:] - agent_instances.hist_xyz[:, :, :-1]) * hist_mask[:,:, :-1, None] agent_hist_info = torch.cat([hist_xyz_delta.flatten(-2, -1), agent_instances.hist_velo.flatten(-2, -1)], -1).detach() # I do believe this agent history infomation can be helpfull, so I use it twice agent_queries = self.agent_info(agent_queries, agent_hist_info) extra_agent_infos = (self.agent_info_embedding(agent_hist_info)[:, :, None, :].repeat(1, 1, self.num_fut_mode, 1)).flatten(1, 2) agent_queries = (agent_queries[:, :, None, :] + mode_embedding[None, None, :, :]).flatten(1, 2) hist_traj_points = agent_instances.hist_xyz.unsqueeze(2).repeat(1, 1, self.num_fut_mode, 1, 1).flatten(1, 2) hist_agent_xy = agent_instances.reference_points[:, :, :2].unsqueeze(2).repeat(1, 1, self.num_fut_mode, 1).flatten(1, 2) agent_queries = self.agent_decoder(agent_queries, reference_points_q=hist_traj_points, reference_points_v=hist_traj_points, pc_range=self.pc_range) map_queries = preds_map_dicts[self.map_layer_index]['queries'].clone() map_lines = preds_map_dicts[self.map_layer_index]['lines'].clone() map_scores = preds_map_dicts[self.map_layer_index]['scores'].clone() B, NMQ, K2 = map_lines.shape map_lines = map_lines.reshape(B, NMQ, K2//2, 2) map_lines = get_ego_pos(map_lines, self.pc_range) co_agent_queries = torch.cat([agent_queries, extra_agent_infos], -1) pred_traj_cls = self.traj_cls_branches[0](co_agent_queries).view(B, max_valid_query, self.num_fut_mode) pred_traj_cls = self.log_softmax(pred_traj_cls) pred_traj = self.traj_branches[0](co_agent_queries) B, N, PK = pred_traj.shape pred_traj = pred_traj.view(B, N, PK//self.code_size, self.code_size) fut_traj_points = torch.cat([hist_agent_xy.unsqueeze(-2), pred_traj[..., :2]], -2) fut_traj_points = torch.cumsum(fut_traj_points, -2)[:, :, 1:] agent_queries = self.agent_map_decoder(agent_queries, map_queries, map_queries, reference_points_q=fut_traj_points, reference_points_v=map_lines, pc_range=self.pc_range, map_scores=map_scores) co_agent_queries = torch.cat([agent_queries, extra_agent_infos], -1) pred_opt_traj_cls = self.traj_cls_branches[1](co_agent_queries).view(B, max_valid_query, self.num_fut_mode) pred_opt_traj_cls = self.log_softmax(pred_opt_traj_cls) pred_opt_traj = self.traj_branches[1](co_agent_queries) pred_opt_traj = pred_opt_traj.view(B, N, PK//self.code_size, self.code_size) fut_opt_traj_points = torch.cat([hist_agent_xy.unsqueeze(-2), pred_opt_traj[..., 
:2]], -2) fut_opt_traj_points = torch.cumsum(fut_opt_traj_points, -2)[:, :, 1:] return dict( pred_trajs=[ dict( pred_traj=pred_traj.view(B, N//self.num_fut_mode, self.num_fut_mode, PK//self.code_size, self.code_size), pred_traj_cls=pred_traj_cls, valid_length=valid_length, ), dict( pred_traj=pred_opt_traj.view(B, N//self.num_fut_mode, self.num_fut_mode, PK//self.code_size, self.code_size), pred_traj_cls=pred_opt_traj_cls, valid_length=valid_length, )], pred_abs_trajs = fut_traj_points, pred_abs_trajs2 = fut_opt_traj_points, obj_idxes = agent_instances.obj_idxes.clone(), motion_queries = agent_queries, agent_logits = agent_instances.logits.clone() ) @force_fp32(apply_to=('preds_dicts')) def loss(self, gt_agent_fut_traj, gt_agent_fut_traj_mask, preds_dicts, matched_gt_idxes=None, img_metas=None, ): loss_dict = dict() gt_agent_fut_traj_list = [] gt_agent_fut_traj_mask_list = [] B = len(gt_agent_fut_traj) pred_trajs = preds_dicts['pred_trajs'] valid_length = pred_trajs[0]['valid_length'] for i in range(B): index = matched_gt_idxes[i][:valid_length[i]] if valid_length[i]>0: gt_agent_fut_traj_list.append(gt_agent_fut_traj[i][:valid_length[i]][index]) gt_agent_fut_traj_mask_list.append(gt_agent_fut_traj_mask[i][:valid_length[i]][index]) # from IPython import embed # embed() # exit() gt_agent_fut_traj = torch.cat(gt_agent_fut_traj_list) gt_agent_fut_traj_mask = torch.cat(gt_agent_fut_traj_mask_list).sum(-1) > 0 for lld, single_preds in enumerate(pred_trajs): pred_traj = single_preds['pred_traj'] pred_traj_cls = single_preds['pred_traj_cls'] pred_agent_fut_traj_list = [] pred_agent_fut_traj_cls_list = [] for i in range(B): if valid_length[i]>0: pred_agent_fut_traj_list.append(pred_traj[i][:valid_length[i]]) pred_agent_fut_traj_cls_list.append(pred_traj_cls[i][:valid_length[i]]) pred_traj = torch.cat(pred_agent_fut_traj_list) pred_traj_cls = torch.cat(pred_agent_fut_traj_cls_list) loss_traj, l_class, l_reg, l_minade, l_minfde, l_mr = self.loss_traj(pred_traj_cls, pred_traj, gt_agent_fut_traj, gt_agent_fut_traj_mask) loss_dict.update({ f'loss_traj.d{lld}': loss_traj, f'l_class.d{lld}': l_class, f'l_reg.d{lld}': l_reg, f'l_minade.d{lld}': l_minade, f'l_minfde.d{lld}': l_minfde, f'l_mr.d{lld}': l_mr, } ) return loss_dict @force_fp32(apply_to=('preds_dicts')) def get_bboxes(self, preds_dicts, img_metas, rescale=False): """Generate bboxes from bbox head predictions. Args: preds_dicts (tuple[list[dict]]): Prediction results. img_metas (list[dict]): Point cloud and image's meta info. Returns: list[dict]: Decoded bbox, scores and labels after nms. 
""" num_samples = len(img_metas) fut_traj_points = preds_dicts['pred_abs_trajs'].view(num_samples, -1, self.num_fut_mode, self.fut_steps, self.code_size) # fut_traj_index = preds_dicts['pred_trajs'][0]['pred_traj_cls'].softmax(-1).argmax(-1) scores, fut_traj_index = torch.max(preds_dicts['pred_trajs'][0]['pred_traj_cls'].softmax(-1), dim=-1) inds_rep = fut_traj_index.repeat( self.fut_steps, self.code_size, 1, 1).permute(2, 3, 0, 1) fut_traj_points = fut_traj_points.gather(2, inds_rep.unsqueeze(2)).squeeze(dim=2) fut_traj_points2 = preds_dicts['pred_abs_trajs2'].view(num_samples, -1, self.num_fut_mode, self.fut_steps, self.code_size) # fut_traj_index = preds_dicts['pred_trajs'][0]['pred_traj_cls'].softmax(-1).argmax(-1) scores, fut_traj_index = torch.max(preds_dicts['pred_trajs'][1]['pred_traj_cls'].softmax(-1), dim=-1) inds_rep = fut_traj_index.repeat( self.fut_steps, self.code_size, 1, 1).permute(2, 3, 0, 1) fut_traj_points2 = fut_traj_points2.gather(2, inds_rep.unsqueeze(2)).squeeze(dim=2) ret_list = [] for i in range(num_samples): ret_list.append( dict( pred_agent_fut_trajs = fut_traj_points[i].cpu().numpy(), pred_agent_fut_trajs2 = fut_traj_points2[i].cpu().numpy(), obj_idxes = preds_dicts['obj_idxes'][i].cpu().numpy() ) ) return ret_list class MLN(nn.Module): ''' Args: c_dim (int): dimension of latent code c f_dim (int): feature dimension ''' def __init__(self, c_dim, f_dim=256, use_ln=True): super().__init__() self.c_dim = c_dim self.f_dim = f_dim self.use_ln = use_ln self.reduce = nn.Sequential( nn.Linear(c_dim, f_dim), nn.ReLU(), ) self.gamma = nn.Linear(f_dim, f_dim) self.beta = nn.Linear(f_dim, f_dim) if self.use_ln: self.ln = nn.LayerNorm(f_dim, elementwise_affine=False) self.init_weight() def init_weight(self): nn.init.zeros_(self.gamma.weight) nn.init.zeros_(self.beta.weight) nn.init.ones_(self.gamma.bias) nn.init.zeros_(self.beta.bias) def forward(self, x, c): if self.use_ln: x = self.ln(x) c = self.reduce(c) gamma = self.gamma(c) beta = self.beta(c) out = gamma * x + beta return out ================================================ FILE: mmdet3d/models/fbbev/motion_head/motion_planner_head.py ================================================ import torch import torch.nn as nn from mmcv.cnn import Linear, bias_init_with_prob, Scale from mmcv.runner import force_fp32 from mmdet.core import (build_assigner, build_sampler, multi_apply, reduce_mean) from mmdet.models.utils import build_transformer from mmdet.models import HEADS, build_loss from mmdet.models.dense_heads.anchor_free_head import AnchorFreeHead from mmdet.models.utils.transformer import inverse_sigmoid from mmdet3d.core.bbox.coders import build_bbox_coder # from .streampetr_utils import * import copy from mmdet.models.utils import NormedLinear from mmdet3d.core import bbox3d2result, merge_aug_bboxes_3d from mmdet3d.models.fbbev.utils import save_tensor from mmcv.runner.base_module import BaseModule from mmcv.cnn.bricks.transformer import build_transformer_layer_sequence from ..streampetr.streampetr_utils import * from ..planner_head.metric_stp3 import PlanningMetric def get_ego_pos(points, pc_range): if points.size(-1) == 3: points = points * (pc_range[3:6] - pc_range[0:3]) + pc_range[0:3] elif points.size(-1) == 2: points = points * (pc_range[3:5] - pc_range[0:2]) + pc_range[0:2] return points def get_rel_pos(points, pc_range): if points.size(-1) == 3: return (points - pc_range[0:3]) / (pc_range[3:6] - pc_range[0:3]) elif points.size(-1) == 2: return (points - pc_range[0:2]) / (pc_range[3:5] - pc_range[0:2]) 
@HEADS.register_module() class MotionPlannerHead(BaseModule): """Implements the DETR transformer head. See `paper: End-to-End Object Detection with Transformers `_ for details. Args: num_classes (int): Number of categories excluding the background. in_channels (int): Number of channels in the input feature map. num_query (int): Number of query in Transformer. num_reg_fcs (int, optional): Number of fully-connected layers used in `FFN`, which is then used for the regression head. Default 2. transformer (obj:`mmcv.ConfigDict`|dict): Config for transformer. Default: None. sync_cls_avg_factor (bool): Whether to sync the avg_factor of all ranks. Default to False. positional_encoding (obj:`mmcv.ConfigDict`|dict): Config for position encoding. loss_cls (obj:`mmcv.ConfigDict`|dict): Config of the classification loss. Default `CrossEntropyLoss`. loss_bbox (obj:`mmcv.ConfigDict`|dict): Config of the regression loss. Default `L1Loss`. loss_iou (obj:`mmcv.ConfigDict`|dict): Config of the regression iou loss. Default `GIoULoss`. tran_cfg (obj:`mmcv.ConfigDict`|dict): Training config of transformer head. test_cfg (obj:`mmcv.ConfigDict`|dict): Testing config of transformer head. init_cfg (dict or list[dict], optional): Initialization config dict. Default: None """ _version = 2 def __init__(self, # num_classes=1, in_channels=256, stride=[16], embed_dims=256, num_query=6, num_reg_fcs=2, memory_len=12, topk_proposals=4, num_propagated=0, with_dn=True, with_ego_pos=True, match_with_velo=True, match_costs=None, transformer=None, sync_cls_avg_factor=False, code_weights=None, bbox_coder=None, loss_traj=dict(type='L1Loss', loss_weight=0.25), init_cfg=None, normedlinear=False, point_cloud_range=None, agent_decoder=dict(), agent_map_decoder=dict(), map_layer_index = -1, # planner loss_plan_reg=dict(type='L1Loss', loss_weight=5.0), loss_plan_bound=dict(type='PlanMapBoundLoss', loss_weight=1.0, dis_thresh=5.0), loss_plan_col=dict(type='PlanCollisionLoss', loss_weight=5.0), loss_plan_dir=dict(type='PlanMapDirectionLoss', loss_weight=2.5), ego_agent_decoder = dict( type='CustomTransformerDecoder', num_layers=1, return_intermediate=False, transformerlayers=dict( type='BaseTransformerLayer', batch_first=True, attn_cfgs=dict( type='MotionSelfAttention', embed_dims=256, num_heads=8, dropout=0.1, dist_func_type='MDE', pc_range=[-51.2, -51.2, -5.0, 51.2, 51.2, 3.0], consider_map_quality=False, ), feedforward_channels=2048, ffn_dropout=0.1, operation_order=('cross_attn', 'norm', 'ffn', 'norm'))), ego_map_decoder = dict( type='CustomTransformerDecoder', num_layers=1, return_intermediate=False, transformerlayers=dict( type='BaseTransformerLayer', batch_first=True, attn_cfgs=dict( type='MotionSelfAttention', embed_dims=256, num_heads=8, dropout=0.1, dist_func_type='MDE', pc_range=[-51.2, -51.2, -5.0, 51.2, 51.2, 3.0], consider_map_quality=True, ), feedforward_channels=2048, ffn_dropout=0.1, operation_order=('cross_attn', 'norm', 'ffn', 'norm'))), ego_ego_decoder = dict( type='CustomTransformerDecoder', num_layers=1, return_intermediate=False, transformerlayers=dict( type='BaseTransformerLayer', batch_first=True, attn_cfgs=dict( type='MultiheadAttention', embed_dims=256, num_heads=8, attn_drop=0.1, proj_drop=0.1, ), feedforward_channels=1024, ffn_dropout=0.1, operation_order=('cross_attn', 'norm', 'ffn', 'norm'))), **kwargs): if 'code_size' in kwargs: self.code_size = kwargs['code_size'] else: self.code_size = 2 # if code_weights is not None: # self.code_weights = code_weights # else: # self.code_weights = [1.0, 1.0] # x, y, v_x, 
v_y # self.code_weights = self.code_weights[:self.code_size] self.traj_num_cls = 1 self.num_query = num_query self.in_channels = in_channels self.num_reg_fcs = num_reg_fcs # self.train_cfg = train_cfg # self.test_cfg = test_cfg self.fp16_enabled = False self.embed_dims = embed_dims self.map_layer_index = map_layer_index super(MotionPlannerHead, self).__init__() self.loss_traj = build_loss(loss_traj) self.log_softmax = nn.LogSoftmax(dim=2) # self.code_weights = nn.Parameter(torch.tensor( # self.code_weights), requires_grad=False) self.pc_range = nn.Parameter(torch.tensor( point_cloud_range), requires_grad=False) self.fut_steps = 8 self.num_fut_mode = 6 self.agent_decoder = build_transformer_layer_sequence(agent_decoder) self.agent_map_decoder = build_transformer_layer_sequence(agent_map_decoder) self._init_layers() self.count = 0 # planner self.ego_ego_decoder = build_transformer_layer_sequence(ego_ego_decoder) self.ego_agent_decoder = build_transformer_layer_sequence(ego_agent_decoder) self.ego_map_decoder = build_transformer_layer_sequence(ego_map_decoder) self.ego_fut_steps = 6 self.ego_fut_mode = 3 self.memory_len = 4 self.loss_plan_reg = build_loss(loss_plan_reg) loss_plan_bound.update(point_cloud_range=point_cloud_range) loss_plan_col.update(point_cloud_range=point_cloud_range) loss_plan_dir.update(point_cloud_range=point_cloud_range) self.loss_plan_bound = build_loss(loss_plan_bound) self.loss_plan_col = build_loss(loss_plan_col) self.loss_plan_dir = build_loss(loss_plan_dir) self.ego_info = MLN(3) self._init_planer_layers() self.memory_traj = None self.planning_metric = PlanningMetric() self.count = 0 def _init_planer_layers(self): """Initialize layers of the transformer head.""" ego_fut_decoder = [] ego_fut_dec_in_dim = self.embed_dims*2 for _ in range(self.num_reg_fcs): ego_fut_decoder.append(Linear(ego_fut_dec_in_dim, ego_fut_dec_in_dim)) ego_fut_decoder.append(nn.ReLU()) ego_fut_decoder.append(Linear(ego_fut_dec_in_dim, self.ego_fut_mode*self.ego_fut_steps*2)) self.ego_fut_decoder = nn.Sequential(*ego_fut_decoder) self.query_embedding = nn.Sequential( nn.Linear(self.embed_dims, self.embed_dims), nn.ReLU(), nn.Linear(self.embed_dims, self.embed_dims), ) self.motion_query_mlp = nn.Sequential( nn.Linear(2 * self.embed_dims * self.num_fut_mode , self.embed_dims), nn.ReLU(), nn.Linear(self.embed_dims, self.embed_dims), ) self.query_feat_embedding = nn.Embedding(1, self.embed_dims) self.memory_ego_embed = None self.time_embedding = nn.Embedding(self.memory_len, self.embed_dims) self.hist_ego_mlp = nn.Sequential( nn.Linear(self.embed_dims * 2, self.embed_dims), nn.ReLU(), nn.Linear(self.embed_dims, self.embed_dims), ) def _init_layers(self): """Initialize layers of the transformer head.""" traj_branch = [] for _ in range(self.num_reg_fcs): traj_branch.append(Linear(self.embed_dims*2, self.embed_dims*2)) traj_branch.append(nn.ReLU()) traj_branch.append(Linear(self.embed_dims*2, self.fut_steps*self.code_size)) traj_branch = nn.Sequential(*traj_branch) traj_cls_branch = [] for _ in range(self.num_reg_fcs): traj_cls_branch.append(Linear(self.embed_dims*2, self.embed_dims*2)) traj_cls_branch.append(nn.LayerNorm(self.embed_dims*2)) traj_cls_branch.append(nn.ReLU(inplace=True)) traj_cls_branch.append(Linear(self.embed_dims*2, self.traj_num_cls)) traj_cls_branch = nn.Sequential(*traj_cls_branch) def _get_clones(module, N): return nn.ModuleList([copy.deepcopy(module) for i in range(N)]) motion_num_pred = 2 self.traj_branches = _get_clones(traj_branch, motion_num_pred) self.traj_cls_branches = 
_get_clones(traj_cls_branch, motion_num_pred) # self.reference_points = nn.Embedding(self.num_query, 3) self.agent_info = MLN(17) self.agent_info_embedding = nn.Sequential( nn.Linear(17, self.embed_dims), nn.ReLU(), nn.Linear(self.embed_dims, self.embed_dims), ) self.traj_mode_embedding = nn.Embedding(self.num_fut_mode, self.embed_dims) def pre_update_memory(self, data, fut_traj_from_velo): x = 1-data['start_of_sequence'] # original prev_exist, so we need do `not` B = x.size(0) # refresh the memory when the scene changes if self.memory_traj is None: self.memory_traj = fut_traj_from_velo.unsqueeze(1).repeat(1, self.memory_len, 1, 1) # * 0 self.memory_ego_embed = x.new_zeros(B, self.memory_len, self.embed_dims * 2) else: self.memory_traj = transform_reference_points(self.memory_traj, data['ego_pose_inv'], reverse=False)[..., :2] self.memory_traj = memory_refresh(self.memory_traj[:, :self.memory_len], x) for i in range(B): # do not leak velo info,init all zeros if not x[i]: self.memory_traj[i, 0] = fut_traj_from_velo[i] * 0 self.memory_ego_embed = memory_refresh(self.memory_ego_embed[:, :self.memory_len], x) def post_update_memory(self, data, ego_fut_trajs, ego_embeds): self.memory_traj = torch.cat([ego_fut_trajs, self.memory_traj], dim=1) self.memory_traj = torch.cat([self.memory_traj, torch.zeros_like(self.memory_traj[..., :1])], -1) self.memory_traj = transform_reference_points(self.memory_traj, data['ego_pose'], reverse=False) self.memory_ego_embed = torch.cat([ego_embeds, self.memory_ego_embed], dim=1).detach() def forward(self, agent_instances, preds_map_dicts, img_metas=None, gt_ego_lcf_feat=None, gt_ego_fut_cmd=None, gt_ego_his_traj=None, gt_ego_fut_trajs=None): valid_length = [(matched_gt_idxes>=0).sum() for matched_gt_idxes in agent_instances.matched_gt_idxes] max_valid_query = max(valid_length) assert 0<=max_valid_query<=250 agent_instances = agent_instances[:, :max_valid_query] agent_queries = agent_instances.query_feats agent_reference_points = agent_instances.reference_points mode_embedding = self.traj_mode_embedding.weight hist_mask = agent_instances.hist_mask B = len(agent_instances) hist_xyz_delta = (agent_instances.hist_xyz[:, :, 1:] - agent_instances.hist_xyz[:, :, :-1]) * hist_mask[:,:, :-1, None] agent_hist_info = torch.cat([hist_xyz_delta.flatten(-2, -1), agent_instances.hist_velo.flatten(-2, -1)], -1).detach() # I do believe this agent history infomation can be helpfull, so I use it twice agent_queries = self.agent_info(agent_queries, agent_hist_info) extra_agent_infos = (self.agent_info_embedding(agent_hist_info)[:, :, None, :].repeat(1, 1, self.num_fut_mode, 1)).flatten(1, 2) agent_queries = (agent_queries[:, :, None, :] + mode_embedding[None, None, :, :]).flatten(1, 2) hist_traj_points = agent_instances.hist_xyz.unsqueeze(2).repeat(1, 1, self.num_fut_mode, 1, 1).flatten(1, 2) hist_agent_xy = agent_instances.reference_points[:, :, :2].unsqueeze(2).repeat(1, 1, self.num_fut_mode, 1).flatten(1, 2) agent_queries = self.agent_decoder(agent_queries, reference_points_q=hist_traj_points, reference_points_v=hist_traj_points, pc_range=self.pc_range) map_queries = preds_map_dicts['queries'].clone() map_lines = preds_map_dicts['lines'].clone() map_scores = preds_map_dicts['scores'].clone() B, NMQ, K2 = map_lines.shape map_lines = map_lines.reshape(B, NMQ, K2//2, 2) map_pos = self.query_embedding(bevpos2posemb(map_lines.mean(-2))) map_lines = get_ego_pos(map_lines, self.pc_range) co_agent_queries = torch.cat([agent_queries, extra_agent_infos], -1) pred_traj_cls = 
self.traj_cls_branches[0](co_agent_queries).view(B, max_valid_query, self.num_fut_mode) pred_traj_cls = self.log_softmax(pred_traj_cls) pred_traj = self.traj_branches[0](co_agent_queries) B, N, PK = pred_traj.shape pred_traj = pred_traj.view(B, N, PK//self.code_size, self.code_size) fut_traj_points = torch.cat([hist_agent_xy.unsqueeze(-2), pred_traj[..., :2]], -2) fut_traj_points = torch.cumsum(fut_traj_points, -2)[:, :, 1:] agent_queries = self.agent_map_decoder(agent_queries, map_queries, map_queries, reference_points_q=fut_traj_points, reference_points_v=map_lines, pc_range=self.pc_range, map_scores=map_scores) co_agent_queries = torch.cat([agent_queries, extra_agent_infos], -1) pred_opt_traj_cls = self.traj_cls_branches[1](co_agent_queries).view(B, max_valid_query, self.num_fut_mode) pred_opt_traj_cls = self.log_softmax(pred_opt_traj_cls) pred_opt_traj = self.traj_branches[1](co_agent_queries) pred_opt_traj = pred_opt_traj.view(B, N, PK//self.code_size, self.code_size) fut_opt_traj_points = torch.cat([hist_agent_xy.unsqueeze(-2), pred_opt_traj[..., :2]], -2) fut_opt_traj_points = torch.cumsum(fut_opt_traj_points, -2)[:, :, 1:] # planner bs, num_agents = B, N//self.num_fut_mode agent_queries = self.motion_query_mlp(co_agent_queries.view(bs, num_agents, 2 * self.embed_dims * self.num_fut_mode)) agent_reference_points = fut_opt_traj_points.view(bs, num_agents, self.num_fut_mode, 8, 2).mean(2) agent_centers = get_rel_pos(agent_reference_points[:, :, 0], self.pc_range) agent_pos = self.query_embedding(bevpos2posemb(agent_centers)) gt_ego_lcf_feat = torch.stack(gt_ego_lcf_feat).to(agent_queries.device) gt_ego_fut_cmd = torch.stack(gt_ego_fut_cmd).to(agent_queries.device) start_of_sequence = torch.FloatTensor([ single_img_metas['start_of_sequence'] for single_img_metas in img_metas]).to(agent_queries.device) timestamp = torch.FloatTensor([ single_img_metas['timestamp'] for single_img_metas in img_metas]).to(agent_queries.device) ego_pose_inv = torch.stack([ single_img_metas['ego_pose_inv'] for single_img_metas in img_metas], 0).to(agent_queries.device) ego_pose = torch.stack([ single_img_metas['ego_pose'] for single_img_metas in img_metas], 0).to(agent_queries.device) data = dict( start_of_sequence = start_of_sequence, timestamp = timestamp, ego_pose_inv = ego_pose_inv, ego_pose = ego_pose, ) fut_traj_from_velo = gt_ego_lcf_feat[:, :2].unsqueeze(1).repeat(1, self.ego_fut_steps, 1) * torch.arange(1, self.ego_fut_steps+1)[None,:, None].to(agent_queries.device) * 0.5 self.pre_update_memory(data, fut_traj_from_velo) ego_query = self.query_feat_embedding.weight.repeat(bs, 1) ego_query = self.ego_info(ego_query, gt_ego_fut_cmd.to(ego_query.dtype)).unsqueeze(1) ego_pos = get_rel_pos(ego_query.new_zeros(bs, 2), self.pc_range) ego_pos = self.query_embedding(bevpos2posemb(ego_pos)).unsqueeze(1) init_ego_traj = self.memory_traj[:, 0:1] hist_ego_query = self.hist_ego_mlp(self.memory_ego_embed) + self.time_embedding.weight[None] ego_query = self.ego_ego_decoder( query=ego_query, key=hist_ego_query, value=hist_ego_query, ) ego_agent_query = self.ego_agent_decoder(query=ego_query, key=agent_queries, value=agent_queries, query_pos=ego_pos, key_pos=agent_pos, reference_points_q=init_ego_traj, reference_points_v=agent_reference_points) ego_map_query = self.ego_map_decoder(query=ego_query, key=map_queries, value=map_queries, query_pos=ego_pos, key_pos=map_pos, reference_points_q=init_ego_traj, reference_points_v=map_lines, map_scores=map_scores, ) co_agent_query = torch.cat([ego_agent_query, ego_map_query], -1) 
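        # The agent-conditioned and map-conditioned ego queries are concatenated
        # (2 * embed_dims) and decoded by ego_fut_decoder into per-mode offset
        # sequences of shape (B, ego_fut_mode, ego_fut_steps, 2); the ground-truth
        # driving command (gt_ego_fut_cmd) then selects the mode whose cumulative
        # sum forms the predicted ego trajectory.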
outputs_ego_trajs = self.ego_fut_decoder(co_agent_query) outputs_ego_trajs = outputs_ego_trajs.reshape(outputs_ego_trajs.shape[0], self.ego_fut_mode, self.ego_fut_steps, 2) self.post_update_memory(data, torch.cumsum(outputs_ego_trajs[gt_ego_fut_cmd==1], 1)[:, None], co_agent_query) ego_trajs = torch.cumsum(outputs_ego_trajs[gt_ego_fut_cmd==1], 1) ego_trajs = torch.cat([torch.zeros_like(ego_trajs[:,:1]), ego_trajs], 1) ego_trajs = torch.cat([ego_trajs, torch.zeros_like(ego_trajs[..., :1])], -1) ego_trajs_in_global = transform_reference_points(ego_trajs, data['ego_pose'], reverse=False)[..., :2] fut_trajs_in_global = torch.cat([fut_opt_traj_points, torch.zeros_like(fut_opt_traj_points[..., :1])], -1) fut_trajs_in_global = transform_reference_points(fut_trajs_in_global, data['ego_pose'], reverse=False)[..., :2] return dict( pred_trajs=[ dict( pred_traj=pred_traj.view(B, N//self.num_fut_mode, self.num_fut_mode, PK//self.code_size, self.code_size), pred_traj_cls=pred_traj_cls, valid_length=valid_length, ), dict( pred_traj=pred_opt_traj.view(B, N//self.num_fut_mode, self.num_fut_mode, PK//self.code_size, self.code_size), pred_traj_cls=pred_opt_traj_cls, valid_length=valid_length, )], fut_traj_from_velo = fut_traj_from_velo, fut_trajs_in_global = fut_trajs_in_global, pred_abs_trajs2 = fut_opt_traj_points, obj_idxes = agent_instances.obj_idxes.clone(), agent_scores = agent_instances.scores.clone(), ego_fut_preds=outputs_ego_trajs, ego_trajs_in_global = ego_trajs_in_global, ) @force_fp32(apply_to=('preds_dicts')) def loss(self, gt_agent_fut_traj=None, gt_agent_fut_traj_mask=None, gt_ego_fut_trajs=None, gt_ego_fut_cmd=None, gt_ego_fut_masks=None, preds_dicts=None, preds_map_dicts=None, matched_gt_idxes=None, img_metas=None, ): loss_dict = dict() gt_agent_fut_traj_list = [] gt_agent_fut_traj_mask_list = [] B = len(gt_agent_fut_traj) pred_trajs = preds_dicts['pred_trajs'] valid_length = pred_trajs[0]['valid_length'] for i in range(B): index = matched_gt_idxes[i][:valid_length[i]] if valid_length[i]>0: gt_agent_fut_traj_list.append(gt_agent_fut_traj[i][:valid_length[i]][index]) gt_agent_fut_traj_mask_list.append(gt_agent_fut_traj_mask[i][:valid_length[i]][index]) gt_agent_fut_traj = torch.cat(gt_agent_fut_traj_list) gt_agent_fut_traj_mask = torch.cat(gt_agent_fut_traj_mask_list).sum(-1) > 0 for lld, single_preds in enumerate(pred_trajs): pred_traj = single_preds['pred_traj'] pred_traj_cls = single_preds['pred_traj_cls'] pred_agent_fut_traj_list = [] pred_agent_fut_traj_cls_list = [] for i in range(B): if valid_length[i]>0: pred_agent_fut_traj_list.append(pred_traj[i][:valid_length[i]]) pred_agent_fut_traj_cls_list.append(pred_traj_cls[i][:valid_length[i]]) pred_traj = torch.cat(pred_agent_fut_traj_list) pred_traj_cls = torch.cat(pred_agent_fut_traj_cls_list) loss_traj, l_class, l_reg, l_minade, l_minfde, l_mr = self.loss_traj(pred_traj_cls, pred_traj, gt_agent_fut_traj, gt_agent_fut_traj_mask) loss_dict.update({ f'loss_traj.d{lld}': loss_traj, f'l_class.d{lld}': l_class, f'l_reg.d{lld}': l_reg, f'l_minade.d{lld}': l_minade, f'l_minfde.d{lld}': l_minfde, f'l_mr.d{lld}': l_mr, } ) ego_fut_preds = preds_dicts['ego_fut_preds'] map_lines = preds_map_dicts['lines'] B, NMQ, K2 = map_lines.shape map_lines = map_lines.reshape(B, NMQ, K2//2, 2) map_scores = preds_map_dicts['scores'] agent_fut_preds = preds_dicts['pred_abs_trajs2'].reshape(B, -1, self.num_fut_mode, 8, 2)[..., :self.ego_fut_steps, :2] agent_score_preds = preds_dicts['agent_scores'] agent_fut_cls_preds = 
preds_dicts['pred_trajs'][-1]['pred_traj_cls'] gt_ego_fut_trajs = torch.stack(gt_ego_fut_trajs) gt_ego_fut_cmd = torch.stack(gt_ego_fut_cmd) gt_ego_fut_masks = torch.stack(gt_ego_fut_masks) gt_ego_fut_trajs = torch.cat([gt_ego_fut_trajs[:,:1], (gt_ego_fut_trajs[:,1:] - gt_ego_fut_trajs[:,:-1])], 1) gt_ego_fut_trajs = gt_ego_fut_trajs.unsqueeze(1).repeat(1, self.ego_fut_mode, 1, 1) loss_plan_l1_weight = gt_ego_fut_cmd[..., None, None] * gt_ego_fut_masks[:, None, :, None] loss_plan_l1_weight = loss_plan_l1_weight.repeat(1, 1, 1, 2) loss_plan_l1 = self.loss_plan_reg( ego_fut_preds, gt_ego_fut_trajs, loss_plan_l1_weight ) loss_plan_bound = self.loss_plan_bound( ego_fut_preds[gt_ego_fut_cmd==1], map_lines, map_scores, weight=gt_ego_fut_masks ) loss_plan_col = self.loss_plan_col( ego_fut_preds[gt_ego_fut_cmd==1], agent_fut_preds, agent_score_preds.squeeze(-1), agent_fut_cls_preds, weight=gt_ego_fut_masks[:, :, None].repeat(1, 1, 2) ) loss_plan_dir = self.loss_plan_dir( ego_fut_preds[gt_ego_fut_cmd==1], map_lines, map_scores, weight=gt_ego_fut_masks ) loss_plan_l1 = torch.nan_to_num(loss_plan_l1) loss_plan_bound = torch.nan_to_num(loss_plan_bound) loss_plan_col = torch.nan_to_num(loss_plan_col) loss_plan_dir = torch.nan_to_num(loss_plan_dir) loss_dict['loss_plan_reg'] = loss_plan_l1 loss_dict['loss_plan_bound'] = loss_plan_bound loss_dict['loss_plan_col'] = loss_plan_col loss_dict['loss_plan_dir'] = loss_plan_dir return loss_dict @force_fp32(apply_to=('preds_dicts')) def get_motion(self, preds_dicts, img_metas, rescale=False): """Generate bboxes from bbox head predictions. Args: preds_dicts (tuple[list[dict]]): Prediction results. img_metas (list[dict]): Point cloud and image's meta info. Returns: list[dict]: Decoded bbox, scores and labels after nms. """ num_samples = len(img_metas) # fut_traj_points = preds_dicts['pred_abs_trajs'].view(num_samples, -1, self.num_fut_mode, self.fut_steps, self.code_size) # # fut_traj_index = preds_dicts['pred_trajs'][0]['pred_traj_cls'].softmax(-1).argmax(-1) # scores, fut_traj_index = torch.max(preds_dicts['pred_trajs'][0]['pred_traj_cls'].softmax(-1), dim=-1) # inds_rep = fut_traj_index.repeat( # self.fut_steps, self.code_size, 1, 1).permute(2, 3, 0, 1) # fut_traj_points = fut_traj_points.gather(2, inds_rep.unsqueeze(2)).squeeze(dim=2) fut_traj_points2 = preds_dicts['pred_abs_trajs2'].view(num_samples, -1, self.num_fut_mode, self.fut_steps, self.code_size) fut_trajs_in_global = preds_dicts['fut_trajs_in_global'].view(num_samples, -1, self.num_fut_mode, self.fut_steps, self.code_size) # fut_traj_index = preds_dicts['pred_trajs'][0]['pred_traj_cls'].softmax(-1).argmax(-1) # scores, fut_traj_index = torch.max(preds_dicts['pred_trajs'][1]['pred_traj_cls'].softmax(-1), dim=-1) # inds_rep = fut_traj_index.repeat( # self.fut_steps, self.code_size, 1, 1).permute(2, 3, 0, 1) # fut_traj_points2 = fut_traj_points2.gather(2, inds_rep.unsqueeze(2)).squeeze(dim=2) ret_list = [] for i in range(num_samples): ret_list.append( dict( # pred_agent_fut_trajs = fut_traj_points[i].cpu().numpy(), fut_trajs_in_global = fut_trajs_in_global[i].cpu().numpy(), pred_agent_fut_trajs2 = fut_traj_points2[i].cpu().numpy(), pred_traj_cls = preds_dicts['pred_trajs'][1]['pred_traj_cls'][i].softmax(-1).cpu().numpy(), pred_traj = preds_dicts['pred_trajs'][1]['pred_traj'][i].cpu().numpy(), obj_idxes = preds_dicts['obj_idxes'][i].cpu().numpy() ) ) return ret_list @force_fp32(apply_to=('preds_dicts')) def get_traj(self, preds_dicts, img_metas, rescale=False, gt_ego_fut_trajs=None, gt_ego_fut_cmd=None, 
gt_ego_fut_masks=None, gt_fut_segmentations=None, vad_ego_fut_trajs=None): """Generate bboxes from bbox head predictions. Args: preds_dicts (tuple[list[dict]]): Prediction results. img_metas (list[dict]): Point cloud and image's meta info. Returns: list[dict]: Decoded bbox, scores and labels after nms. """ pred_ego_fut_trajs = preds_dicts['ego_fut_preds'] gt_ego_fut_trajs = torch.stack(gt_ego_fut_trajs).to(pred_ego_fut_trajs.device) gt_ego_fut_cmd = torch.stack(gt_ego_fut_cmd).to(pred_ego_fut_trajs.device) gt_ego_fut_masks = torch.stack(gt_ego_fut_masks).to(pred_ego_fut_trajs.device) pred_ego_fut_trajs = torch.cumsum(pred_ego_fut_trajs[gt_ego_fut_cmd==1], 1) # pred_ego_fut_trajs = vad_ego_fut_trajs[0][None] metric_dict = { 'plan_L2_1s':0, 'plan_L2_2s':0, 'plan_L2_3s':0, 'plan_obj_col_1s':0, 'plan_obj_col_2s':0, 'plan_obj_col_3s':0, 'plan_obj_box_col_1s':0, 'plan_obj_box_col_2s':0, 'plan_obj_box_col_3s':0, 'l2_dist': 0, } fut_valid_flag = gt_ego_fut_masks.all() future_second = 3 metric_dict['fut_valid_flag'] = fut_valid_flag.cpu().item() for i in range(future_second): if fut_valid_flag: cur_time = (i+1)*2 traj_L2 = self.planning_metric.compute_L2( pred_ego_fut_trajs[0, :cur_time].detach().to(gt_ego_fut_trajs.device), gt_ego_fut_trajs[0, :cur_time] ) obj_coll, obj_box_coll = self.planning_metric.evaluate_coll( pred_ego_fut_trajs[:, :cur_time].detach().to(gt_ego_fut_trajs.device), gt_ego_fut_trajs[:, :cur_time], gt_fut_segmentations, index = [each['index'] for each in img_metas] ) metric_dict['plan_L2_{}s'.format(i+1)] = traj_L2 metric_dict['plan_obj_col_{}s'.format(i+1)] = obj_coll.mean().item() metric_dict['plan_obj_box_col_{}s'.format(i+1)] = obj_box_coll.max().item() l2_dist = (pred_ego_fut_trajs-gt_ego_fut_trajs).norm(dim=-1) * gt_ego_fut_masks[:, None] l2_dist[gt_ego_fut_masks[:, None]==0] = -1 metric_dict['l2_dist'] = l2_dist[0].cpu() ret_list = [] num_samples = len(pred_ego_fut_trajs) assert num_samples == 1 index_w_scene = img_metas[0]['scene_name'] + '-' + str(img_metas[0]['index']) for i in range(num_samples): ret_list.append( dict( pred_ego_fut_trajs = pred_ego_fut_trajs[i].cpu(), gt_ego_fut_trajs = gt_ego_fut_trajs[i].cpu(), metric_dict = metric_dict, l2_dist=l2_dist[i].cpu(), index_w_scene = index_w_scene, ego_trajs_in_global = preds_dicts['ego_trajs_in_global'][i].cpu(), gt_ego_fut_cmd = gt_ego_fut_cmd[i].cpu(), ) ) return ret_list class MLN(nn.Module): ''' Args: c_dim (int): dimension of latent code c f_dim (int): feature dimension ''' def __init__(self, c_dim, f_dim=256, use_ln=True): super().__init__() self.c_dim = c_dim self.f_dim = f_dim self.use_ln = use_ln self.reduce = nn.Sequential( nn.Linear(c_dim, f_dim), nn.ReLU(), ) self.gamma = nn.Linear(f_dim, f_dim) self.beta = nn.Linear(f_dim, f_dim) if self.use_ln: self.ln = nn.LayerNorm(f_dim, elementwise_affine=False) self.init_weight() def init_weight(self): nn.init.zeros_(self.gamma.weight) nn.init.zeros_(self.beta.weight) nn.init.ones_(self.gamma.bias) nn.init.zeros_(self.beta.bias) def forward(self, x, c): if self.use_ln: x = self.ln(x) c = self.reduce(c) gamma = self.gamma(c) beta = self.beta(c) out = gamma * x + beta return out ================================================ FILE: mmdet3d/models/fbbev/motion_head/traj_loss.py ================================================ #---------------------------------------------------------------------------------# # UniAD: Planning-oriented Autonomous Driving (https://arxiv.org/abs/2212.10156) # # Source code: https://github.com/OpenDriveLab/UniAD # # Copyright (c) 
OpenDriveLab. All rights reserved. # #---------------------------------------------------------------------------------# import torch import math import torch.nn as nn import torch.nn.functional as F from typing import Tuple from mmdet.models import LOSSES @LOSSES.register_module() class TrajLoss(nn.Module): """ MTP loss modified to include variances. Uses MSE for mode selection. Can also be used with Multipath outputs, with residuals added to anchors. """ def __init__(self, use_variance=False, cls_loss_weight=1., nll_loss_weight=1., loss_weight_minade=0., loss_weight_minfde=1., loss_weight_mr=1.): """ Initialize MTP loss :param args: Dictionary with the following (optional) keys use_variance: bool, whether or not to use variances for computing regression component of loss, default: False alpha: float, relative weight assigned to classification component, compared to regression component of loss, default: 1 """ super(TrajLoss, self).__init__() self.use_variance = use_variance self.cls_loss_weight = cls_loss_weight self.nll_loss_weight = nll_loss_weight self.loss_weight_minade = loss_weight_minade self.loss_weight_minfde = loss_weight_minfde def forward(self, traj_prob, traj_preds, gt_future_traj, gt_future_traj_valid_mask): """ Compute MTP loss :param predictions: Dictionary with 'traj': predicted trajectories and 'probs': mode (log) probabilities :param ground_truth: Either a tensor with ground truth trajectories or a dictionary :return: """ # Unpack arguments traj = traj_preds # (b, nmodes, seq, 5) log_probs = traj_prob traj_gt = gt_future_traj # Useful variables batch_size = traj.shape[0] sequence_length = traj.shape[2] pred_params = 5 if self.use_variance else 2 # Masks for variable length ground truth trajectories masks = 1 - gt_future_traj_valid_mask.to(traj.dtype) l_minfde, inds = min_fde(traj, traj_gt, masks) try: l_mr = miss_rate(traj, traj_gt, masks) except: l_mr = torch.zeros_like(l_minfde) l_minade, inds = min_ade(traj, traj_gt, masks) inds_rep = inds.repeat( sequence_length, pred_params, 1, 1).permute(3, 2, 0, 1) # Calculate MSE or NLL loss for trajectories corresponding to selected # outputs: traj_best = traj.gather(1, inds_rep).squeeze(dim=1) if self.use_variance: l_reg = traj_nll(traj_best, traj_gt, masks) else: l_reg = l_minade # Compute classification loss l_class = - torch.squeeze(log_probs.gather(1, inds.unsqueeze(1))) l_reg = torch.sum(l_reg)/(batch_size + 1e-5) l_class = torch.sum(l_class)/(batch_size + 1e-5) l_minade = torch.sum(l_minade)/(batch_size + 1e-5) l_minfde = torch.sum(l_minfde)/(batch_size + 1e-5) loss = l_class * self.cls_loss_weight + l_reg * self.nll_loss_weight + l_minade * self.loss_weight_minade + l_minfde * self.loss_weight_minfde return loss, l_class, l_reg, l_minade, l_minfde, l_mr def min_ade(traj: torch.Tensor, traj_gt: torch.Tensor, masks: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: """ Computes average displacement error for the best trajectory is a set, with respect to ground truth :param traj: predictions, shape [batch_size, num_modes, sequence_length, 2] :param traj_gt: ground truth trajectory, shape [batch_size, sequence_length, 2] :param masks: masks for varying length ground truth, shape [batch_size, sequence_length] :return errs, inds: errors and indices for modes with min error, shape [batch_size] """ num_modes = traj.shape[1] traj_gt_rpt = traj_gt.unsqueeze(1).repeat(1, num_modes, 1, 1) masks_rpt = masks.unsqueeze(1).repeat(1, num_modes, 1) err = traj_gt_rpt - traj[:, :, :, 0:2] err = torch.pow(err, exponent=2) err = 
torch.sum(err, dim=3) err = torch.pow(err, exponent=0.5) err = torch.sum(err * (1 - masks_rpt), dim=2) / \ torch.clip(torch.sum((1 - masks_rpt), dim=2), min=1) err, inds = torch.min(err, dim=1) return err, inds def traj_nll( pred_dist: torch.Tensor, traj_gt: torch.Tensor, masks: torch.Tensor): """ Computes negative log likelihood of ground truth trajectory under a predictive distribution with a single mode, with a bivariate Gaussian distribution predicted at each time in the prediction horizon :param pred_dist: parameters of a bivariate Gaussian distribution, shape [batch_size, sequence_length, 5] :param traj_gt: ground truth trajectory, shape [batch_size, sequence_length, 2] :param masks: masks for varying length ground truth, shape [batch_size, sequence_length] :return: """ mu_x = pred_dist[:, :, 0] mu_y = pred_dist[:, :, 1] x = traj_gt[:, :, 0] y = traj_gt[:, :, 1] sig_x = pred_dist[:, :, 2] sig_y = pred_dist[:, :, 3] rho = pred_dist[:, :, 4] ohr = torch.pow(1 - torch.pow(rho, 2), -0.5) nll = 0.5 * torch.pow(ohr, 2) * \ (torch.pow(sig_x, 2) * torch.pow(x - mu_x, 2) + torch.pow(sig_y, 2) * torch.pow(y - mu_y, 2) - 2 * rho * torch.pow(sig_x, 1) * torch.pow(sig_y, 1) * (x - mu_x) * (y - mu_y)) - \ torch.log(sig_x * sig_y * ohr) + 1.8379 nll[nll.isnan()] = 0 nll[nll.isinf()] = 0 nll = torch.sum(nll * (1 - masks), dim=1) / (torch.sum((1 - masks), dim=1) + 1e-5) # Note: Normalizing with torch.sum((1 - masks), dim=1) makes values # somewhat comparable for trajectories of # different lengths return nll def min_fde(traj: torch.Tensor, traj_gt: torch.Tensor, masks: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: """ Computes final displacement error for the best trajectory is a set, with respect to ground truth :param traj: predictions, shape [batch_size, num_modes, sequence_length, 2] :param traj_gt: ground truth trajectory, shape [batch_size, sequence_length, 2] :param masks: masks for varying length ground truth, shape [batch_size, sequence_length] :return errs, inds: errors and indices for modes with min error, shape [batch_size] """ num_modes = traj.shape[1] lengths = torch.sum(1 - masks, dim=1).long() valid_mask = lengths > 0 traj = traj[valid_mask] traj_gt = traj_gt[valid_mask] masks = masks[valid_mask] traj_gt_rpt = traj_gt.unsqueeze(1).repeat(1, num_modes, 1, 1) lengths = torch.sum(1 - masks, dim=1).long() inds = lengths.unsqueeze(1).unsqueeze( 2).unsqueeze(3).repeat(1, num_modes, 1, 2) - 1 traj_last = torch.gather(traj[..., :2], dim=2, index=inds).squeeze(2) traj_gt_last = torch.gather(traj_gt_rpt, dim=2, index=inds).squeeze(2) err = traj_gt_last - traj_last[..., 0:2] err = torch.pow(err, exponent=2) err = torch.sum(err, dim=2) err = torch.pow(err, exponent=0.5) err, inds = torch.min(err, dim=1) return err, inds def miss_rate( traj: torch.Tensor, traj_gt: torch.Tensor, masks: torch.Tensor, dist_thresh: float = 2) -> torch.Tensor: """ Computes miss rate for mini batch of trajectories, with respect to ground truth and given distance threshold :param traj: predictions, shape [batch_size, num_modes, sequence_length, 2] :param traj_gt: ground truth trajectory, shape [batch_size, sequence_length, 2] :param masks: masks for varying length ground truth, shape [batch_size, sequence_length] :param dist_thresh: distance threshold for computing miss rate. 
:return errs, inds: errors and indices for modes with min error, shape [batch_size] """ num_modes = traj.shape[1] traj_gt_rpt = traj_gt.unsqueeze(1).repeat(1, num_modes, 1, 1) masks_rpt = masks.unsqueeze(1).repeat(1, num_modes, 1) dist = traj_gt_rpt - traj[:, :, :, 0:2] dist = torch.pow(dist, exponent=2) dist = torch.sum(dist, dim=3) dist = torch.pow(dist, exponent=0.5) dist[masks_rpt.bool()] = -math.inf dist, _ = torch.max(dist, dim=2) dist, _ = torch.min(dist, dim=1) m_r = torch.sum(torch.as_tensor(dist > dist_thresh)) / len(dist) return m_r ================================================ FILE: mmdet3d/models/fbbev/planner_head/AD_mlp.py ================================================ # Copyright (c) 2023-2024, NVIDIA Corporation & Affiliates. All rights reserved. # # This work is made available under the Nvidia Source Code License-NC. # To view a copy of this license, visit # TODO: add license here import torch import torch.nn.functional as F import torch.nn as nn from mmcv.runner import force_fp32 import os from mmdet3d.ops.bev_pool_v2.bev_pool import TRTBEVPoolv2 from mmdet.models import DETECTORS from mmdet3d.models import builder from mmdet3d.models.detectors import CenterPoint from mmdet3d.models.builder import build_head, build_neck import numpy as np import copy import spconv.pytorch as spconv from tqdm import tqdm from mmdet3d.models.fbbev.utils import run_time import torch from torchvision.utils import make_grid import torchvision import matplotlib import matplotlib.pyplot as plt import cv2 from collections import defaultdict from mmcv.runner import get_dist_info from mmdet.core import reduce_mean import mmcv from mmdet3d.datasets.utils import nuscenes_get_rt_matrix from mmdet3d.core.bbox import box_np_ops # , corner_to_surfaces_3d, points_in_convex_polygon_3d_jit import gc from typing import Any, Dict, List, Optional, Tuple, Union import torch from torch import nn from torch.nn import functional as F import pickle import numpy as np import math import copy import math from mmcv.runner.base_module import BaseModule from mmdet3d.models.detectors.base import Base3DDetector import torch import torch.nn as nn from mmcv.cnn import Linear, bias_init_with_prob, Scale from mmcv.runner import force_fp32 from mmdet.core import (build_assigner, build_sampler, multi_apply, reduce_mean) from mmdet.models.utils import build_transformer from mmdet.models import HEADS, build_loss from mmdet.models.dense_heads.anchor_free_head import AnchorFreeHead from mmdet.models.utils.transformer import inverse_sigmoid from mmdet3d.core.bbox.coders import build_bbox_coder from ..streampetr.streampetr_utils import * import copy from mmdet.models.utils import NormedLinear from mmdet3d.core import bbox3d2result, merge_aug_bboxes_3d from mmdet3d.models.fbbev.utils import save_tensor from mmcv.runner.base_module import BaseModule from mmcv.cnn.bricks.transformer import build_transformer_layer_sequence from .metric_stp3 import PlanningMetric # from memory_profiler import profile from matplotlib.backends.backend_agg import FigureCanvasAgg import PIL.Image as Image def get_ego_pos(points, pc_range): if points.size(-1) == 3: points = points * (pc_range[3:6] - pc_range[0:3]) + pc_range[0:3] elif points.size(-1) == 2: points = points * (pc_range[3:5] - pc_range[0:2]) + pc_range[0:2] return points def get_rel_pos(points, pc_range): if points.size(-1) == 3: return (points - pc_range[0:3]) / (pc_range[3:6] - pc_range[0:3]) elif points.size(-1) == 2: return (points - pc_range[0:2]) / (pc_range[3:5] - pc_range[0:2]) 
@HEADS.register_module() class AD_MLP(Base3DDetector): """Implements the DETR transformer head. See `paper: End-to-End Object Detection with Transformers `_ for details. Args: num_classes (int): Number of categories excluding the background. in_channels (int): Number of channels in the input feature map. num_query (int): Number of query in Transformer. num_reg_fcs (int, optional): Number of fully-connected layers used in `FFN`, which is then used for the regression head. Default 2. transformer (obj:`mmcv.ConfigDict`|dict): Config for transformer. Default: None. sync_cls_avg_factor (bool): Whether to sync the avg_factor of all ranks. Default to False. positional_encoding (obj:`mmcv.ConfigDict`|dict): Config for position encoding. loss_cls (obj:`mmcv.ConfigDict`|dict): Config of the classification loss. Default `CrossEntropyLoss`. loss_bbox (obj:`mmcv.ConfigDict`|dict): Config of the regression loss. Default `L1Loss`. loss_iou (obj:`mmcv.ConfigDict`|dict): Config of the regression iou loss. Default `GIoULoss`. tran_cfg (obj:`mmcv.ConfigDict`|dict): Training config of transformer head. test_cfg (obj:`mmcv.ConfigDict`|dict): Testing config of transformer head. init_cfg (dict or list[dict], optional): Initialization config dict. Default: None """ _version = 2 def __init__(self, in_channels=256, embed_dims=256, num_query=1, num_reg_fcs=2, memory_len=12, transformer=None, sync_cls_avg_factor=False, code_weights=None, init_cfg=None, point_cloud_range=None, loss_plan_reg=dict(type='L1Loss', loss_weight=5.0), **kwargs): super().__init__() if 'code_size' in kwargs: self.code_size = kwargs['code_size'] else: self.code_size = 2 self.num_query = num_query self.in_channels = in_channels self.num_reg_fcs = num_reg_fcs # self.train_cfg = train_cfg # self.test_cfg = test_cfg self.fp16_enabled = False self.embed_dims = embed_dims self.num_motion_mode = 6 self.fut_steps = 6 self.memory_len = 6 self.ego_fut_mode = 3 # self.code_weights = nn.Parameter(torch.tensor( # self.code_weights), requires_grad=False) self.pc_range = nn.Parameter(torch.tensor( point_cloud_range), requires_grad=False) self.loss_plan_reg = build_loss(loss_plan_reg) # self.ego_map_decoder = build_transformer_layer_sequence(self.ego_map_decoder) # self.ego_decoder = build_transformer_layer_sequence(ego_agent_decoder) self._init_layers() self.planning_metric = PlanningMetric() self.count = 0 # dummy self.history_sweep_time = None self.history_bev = None self.history_bev_before_encoder = None self.history_seq_ids = None self.history_forward_augs = None def _init_layers(self): """Initialize layers of the transformer head.""" ego_fut_decoder = [] ego_fut_dec_in_dim = self.embed_dims*2 for i in range(self.num_reg_fcs): if i == 0: ego_fut_decoder.append(Linear(12, ego_fut_dec_in_dim)) else: ego_fut_decoder.append(Linear(ego_fut_dec_in_dim, ego_fut_dec_in_dim)) ego_fut_decoder.append(nn.ReLU()) ego_fut_decoder.append(Linear(ego_fut_dec_in_dim, self.ego_fut_mode*self.fut_steps*2)) self.ego_fut_decoder = nn.Sequential(*ego_fut_decoder) def forward_train(self, img_metas=None, **kwargs): """ NOTE: if I do not `detach` the tensor but use `clone`, there will be a CPU memory leak. I do not figure it out yet. """ preds_plan_dicts = self.inner_forward(img_metas, **kwargs) return self.loss( preds_plan_dicts=preds_plan_dicts, img_metas=img_metas, **kwargs ) def inner_forward(self, img_metas=None, **kwargs): """ NOTE: if I do not `detach` the tensor but use `clone`, there will be a CPU memory leak. I do not figure it out yet. 
""" gt_ego_lcf_feat = torch.stack(kwargs['gt_ego_lcf_feat'], 0) gt_ego_fut_cmd = torch.stack(kwargs['gt_ego_fut_cmd'], 0) # gt_ego_fut_trajs = torch.stack(kwargs['gt_ego_fut_trajs'], 0) self.ego_fut_steps = 6 vel = gt_ego_lcf_feat[:, :2].unsqueeze(1).repeat(1, self.ego_fut_steps, 1) # * torch.arange(1, self.ego_fut_steps+1) accelation = gt_ego_lcf_feat[:, 2:4].unsqueeze(1).repeat(1, self.ego_fut_steps, 1) * torch.arange(1, self.ego_fut_steps+1)[None, :, None].to(vel.device) * 0.5 vel = vel # + accelation fut_traj_from_velo = torch.cumsum(vel * 0.5, 1)# [0] gt_ego_fut_trajs = kwargs['gt_ego_fut_trajs']# [0] # np.corrco(fut_traj_from_velo.cpu().numpy(), gt_ego_fut_trajs.cpu().numpy()) input = torch.cat([gt_ego_lcf_feat, gt_ego_fut_cmd], -1) outputs_ego_trajs = self.ego_fut_decoder(input) # reference = inverse_sigmoid(reference_points.clone()) outputs_ego_trajs = outputs_ego_trajs.reshape(outputs_ego_trajs.shape[0], self.ego_fut_mode, self.fut_steps, 2) start_of_sequence = torch.FloatTensor([ single_img_metas['start_of_sequence'] for single_img_metas in img_metas]).to(gt_ego_lcf_feat.device) timestamp = torch.FloatTensor([ single_img_metas['timestamp'] for single_img_metas in img_metas]).to(gt_ego_lcf_feat.device) ego_pose_inv = torch.stack([ single_img_metas['ego_pose_inv'] for single_img_metas in img_metas], 0).to(gt_ego_lcf_feat.device) ego_pose = torch.stack([ single_img_metas['ego_pose'] for single_img_metas in img_metas], 0).to(gt_ego_lcf_feat.device) data = dict( start_of_sequence = start_of_sequence, timestamp = timestamp, ego_pose_inv = ego_pose_inv, ego_pose = ego_pose, ) preds_plan_dicts = dict( # init_traj=reference_points[..., :2], data= data, ego_fut_preds=outputs_ego_trajs, # ego_trajs_in_global = ego_trajs_in_global, fut_traj_from_velo=fut_traj_from_velo ) return preds_plan_dicts def forward_test(self, **kwargs): for key in ['img_metas', 'gt_ego_lcf_feat', 'gt_ego_fut_cmd', 'gt_ego_fut_trajs', 'gt_ego_fut_masks','gt_fut_segmentations', 'vad_ego_fut_trajs', 'gt_fut_segmentations_plus']: kwargs[key] = kwargs[key][0] # img_metas = img_metas[0] return self.simple_test(**kwargs) @force_fp32(apply_to=('preds_dicts')) def loss(self, gt_ego_fut_trajs=None, gt_ego_fut_cmd=None, gt_ego_fut_masks=None, preds_plan_dicts=None, img_metas=None, **kwargs, ): ego_fut_preds = preds_plan_dicts['ego_fut_preds'] gt_ego_fut_trajs = torch.stack(gt_ego_fut_trajs) gt_ego_fut_cmd = torch.stack(gt_ego_fut_cmd) gt_ego_fut_masks = torch.stack(gt_ego_fut_masks) gt_ego_fut_trajs = torch.cat([gt_ego_fut_trajs[:,:1], (gt_ego_fut_trajs[:,1:] - gt_ego_fut_trajs[:,:-1])], 1) gt_ego_fut_trajs = gt_ego_fut_trajs.unsqueeze(1).repeat(1, self.ego_fut_mode, 1, 1) loss_plan_l1_weight = gt_ego_fut_cmd[..., None, None] * gt_ego_fut_masks[:, None, :, None] loss_plan_l1_weight = loss_plan_l1_weight.repeat(1, 1, 1, 2) loss_plan_l1 = self.loss_plan_reg( ego_fut_preds, gt_ego_fut_trajs, loss_plan_l1_weight ) loss_plan_l1 = torch.nan_to_num(loss_plan_l1) loss_plan_dict = dict() loss_plan_dict['loss_plan_reg'] = loss_plan_l1 return loss_plan_dict def aug_test(self): pass @force_fp32(apply_to=('reference_points', 'cam_params')) def point_sampling(self, reference_points, cam_params=None): rots, trans, intrins, post_rots, post_trans, bda = cam_params B, N, _ = trans.shape eps = 1e-5 ogfH, ogfW = 900, 1600 reference_points = reference_points[None, None].repeat(B, N, 1, 1, 1, 1) reference_points = torch.inverse(bda).view(B, 1, 1, 1, 1, 3, 3).matmul(reference_points.unsqueeze(-1)).squeeze(-1) reference_points -= trans.view(B, N, 1, 
1, 1, 3) combine = rots.matmul(torch.inverse(intrins)).inverse() reference_points_cam = combine.view(B, N, 1, 1, 1, 3, 3).matmul(reference_points.unsqueeze(-1)).squeeze(-1) reference_points_cam = torch.cat([reference_points_cam[..., 0:2] / torch.maximum( reference_points_cam[..., 2:3], torch.ones_like(reference_points_cam[..., 2:3])*eps), reference_points_cam[..., 2:3]], 5 ) reference_points_cam = post_rots.view(B, N, 1, 1, 1, 3, 3).matmul(reference_points_cam.unsqueeze(-1)).squeeze(-1) reference_points_cam += post_trans.view(B, N, 1, 1, 1, 3) # reference_points_cam[..., 0] /= ogfW # reference_points_cam[..., 1] /= ogfH mask = (reference_points_cam[..., 2:3] > eps) mask = (mask & (reference_points_cam[..., 0:1] > eps) & (reference_points_cam[..., 0:1] < (1.0-eps) * ogfW) & (reference_points_cam[..., 1:2] > eps) & (reference_points_cam[..., 1:2] < (1.0-eps) * ogfH)) B, N, H, W, D, _ = reference_points_cam.shape reference_points_cam = reference_points_cam.permute(1, 0, 2, 3, 4, 5).reshape(N, B, H*W, D, 3) mask = mask.permute(1, 0, 2, 3, 4, 5).reshape(N, B, H*W, D, 1).squeeze(-1) return reference_points, reference_points_cam[..., :2], mask, reference_points_cam[..., 2:3] def simple_test(self, **kwargs): preds_plan_dicts = self.inner_forward(**kwargs) pred_traj = self.get_bboxes( preds_plan_dicts, **kwargs ) img_metas = kwargs['img_metas'] output_list = [dict() for _ in range(len(img_metas))] for i, result_dict in enumerate(output_list): result_dict['pred_ego_traj'] = pred_traj[i] result_dict['index'] = img_metas[i]['index'] pred_ego_fut_trajs = output_list[0]['pred_ego_traj']['pred_ego_fut_trajs'] if not self.training: pred_ego_fut_trajs_ = torch.cat([pred_ego_fut_trajs.new_zeros(1, 2), pred_ego_fut_trajs], 0) rotate_angle_list=[] rotate_angle = 0 for i in range(pred_ego_fut_trajs_.size(0)-1): delta = pred_ego_fut_trajs_[i+1] - pred_ego_fut_trajs_[i] cur_rotate_angle = torch.atan2(*delta[[1, 0]]) if delta.norm()<1: cur_rotate_angle = 0 rotate_angle = cur_rotate_angle rotate_angle_list.append(rotate_angle) fut_gt_bboxes_3d = kwargs['fut_boxes_in_cur_ego_list'][0][0] rgb_image_list = [] rgb_image, front_img = self.visual_sample(output_list, gt_bboxes_3d_=kwargs['gt_bboxes_3d'][0][0], ego_info=None, cam_params=kwargs['img_inputs'][0][1:], front_img=kwargs['img_inputs'][0][0][0, 1], metric_dict = pred_traj[0]['metric_dict'], **kwargs) print(f'sc_{img_metas[0]["index"]}') # mmcv.imwrite(rgb_image, f'sc_{img_metas[0]["index"]}.png') # mmcv.mkdir_or_exist(f'vis/{img_metas[0]["scene_name"]}/') mmcv.imwrite(front_img, f'vis/go_stright/{img_metas[0]["scene_name"]}/{img_metas[0]["index"]}.jpg') # for i, gt_bboxes_3d in enumerate(fut_gt_bboxes_3d): # ego_info = [pred_ego_fut_trajs[i][0].item(), pred_ego_fut_trajs[i][1].item(), 0], [1.85, 4.084, 1], rotate_angle_list[i].item() # rgb_image = self.visual_sample(output_list, gt_bboxes_3d_=gt_bboxes_3d, ego_info=ego_info, **kwargs) # rgb_image_list.append(rgb_image) return output_list def extract_feat(self): pass @force_fp32(apply_to=('preds_dicts')) def get_bboxes(self, preds_dicts, img_metas=None, rescale=False, gt_ego_fut_trajs=None, gt_ego_fut_cmd=None, gt_ego_fut_masks=None, gt_fut_segmentations_plus=None, gt_fut_segmentations=None, vad_ego_fut_trajs=None, **kwargs): """Generate bboxes from bbox head predictions. Args: preds_dicts (tuple[list[dict]]): Prediction results. img_metas (list[dict]): Point cloud and image's meta info. Returns: list[dict]: Decoded bbox, scores and labels after nms. 
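Note (editor's addition, summarizing the implementation below): the trajectory
decoded from `ego_fut_preds` is immediately overwritten by
`preds_dicts['fut_traj_from_velo']`, a constant-velocity rollout of the current
ego velocity, so the planning metrics returned here evaluate that
velocity-extrapolation baseline rather than the MLP output.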
""" pred_ego_fut_trajs = preds_dicts['ego_fut_preds'] gt_ego_fut_trajs = torch.stack(gt_ego_fut_trajs).to(pred_ego_fut_trajs.device) gt_ego_fut_cmd = torch.stack(gt_ego_fut_cmd).to(pred_ego_fut_trajs.device) gt_ego_fut_masks = torch.stack(gt_ego_fut_masks).to(pred_ego_fut_trajs.device) pred_ego_fut_trajs = torch.cumsum(pred_ego_fut_trajs[gt_ego_fut_cmd==1], 1) # pred_ego_fut_trajs = vad_ego_fut_trajs[0][None] pred_ego_fut_trajs = preds_dicts['fut_traj_from_velo'] ego_trajs = torch.cat([torch.zeros_like(pred_ego_fut_trajs[:,:1]), pred_ego_fut_trajs], 1) ego_trajs = torch.cat([ego_trajs, torch.zeros_like(ego_trajs[..., :1])], -1) ego_trajs_in_global = transform_reference_points(ego_trajs, preds_dicts['data']['ego_pose'], reverse=False)[..., :2] # pred_ego_fut_trajs = gt_ego_fut_trajs metric_dict = { 'plan_L2_1s':0, 'plan_L2_2s':0, 'plan_L2_3s':0, 'plan_obj_col_1s':0, 'plan_obj_col_2s':0, 'plan_obj_col_3s':0, 'plan_obj_box_col_1s':0, 'plan_obj_box_col_2s':0, 'plan_obj_box_col_3s':0, 'plan_obj_col_plus_1s':0, 'plan_obj_col_plus_2s':0, 'plan_obj_col_plus_3s':0, 'plan_obj_box_col_plus_1s':0, 'plan_obj_box_col_plus_2s':0, 'plan_obj_box_col_plus_3s':0, 'l2_dist': 0, } fut_valid_flag = gt_ego_fut_masks.all() future_second = 3 metric_dict['fut_valid_flag'] = fut_valid_flag.cpu().item() for i in range(future_second): if fut_valid_flag: cur_time = (i+1)*2 traj_L2 = self.planning_metric.compute_L2( pred_ego_fut_trajs[0, :cur_time].detach().to(gt_ego_fut_trajs.device), gt_ego_fut_trajs[0, :cur_time] ) obj_coll, obj_box_coll = self.planning_metric.evaluate_coll( pred_ego_fut_trajs[:, :cur_time].detach().to(gt_ego_fut_trajs.device), gt_ego_fut_trajs[:, :cur_time], gt_fut_segmentations, index = [each['index'] for each in img_metas], ignore_gt=False, ) metric_dict['plan_L2_{}s'.format(i+1)] = traj_L2 metric_dict['plan_obj_col_{}s'.format(i+1)] = obj_coll.mean().item() metric_dict['plan_obj_box_col_{}s'.format(i+1)] = obj_box_coll.max().item() for i in range(future_second): if fut_valid_flag: cur_time = (i+1)*2 obj_coll, obj_box_coll = self.planning_metric.evaluate_coll( pred_ego_fut_trajs[:, :cur_time].detach().to(gt_ego_fut_trajs.device), gt_ego_fut_trajs[:, :cur_time], gt_fut_segmentations_plus, index = [each['index'] for each in img_metas], ignore_gt=False, ) metric_dict['plan_obj_col_plus_{}s'.format(i+1)] = obj_coll.mean().item() metric_dict['plan_obj_box_col_plus_{}s'.format(i+1)] = obj_box_coll.max().item() l2_dist = (pred_ego_fut_trajs-gt_ego_fut_trajs).norm(dim=-1) * gt_ego_fut_masks[:, None] l2_dist[gt_ego_fut_masks[:, None]==0] = -1 metric_dict['l2_dist'] = l2_dist[0].cpu() ret_list = [] num_samples = len(pred_ego_fut_trajs) assert num_samples == 1 index_w_scene = img_metas[0]['scene_name'] + '-' + str(img_metas[0]['index']) for i in range(num_samples): ret_list.append( dict( pred_ego_fut_trajs = pred_ego_fut_trajs[i].cpu(), gt_ego_fut_trajs = gt_ego_fut_trajs[i].cpu(), metric_dict = metric_dict, l2_dist=l2_dist[i].cpu(), index_w_scene = index_w_scene, ego_trajs_in_global = ego_trajs_in_global[i].cpu(), gt_ego_fut_cmd = gt_ego_fut_cmd[i].cpu(), index = img_metas[i]['index'] ) ) return ret_list def world2bev_vis(self, x, y): return int((x + 50) * 5), int((y + 50) * 5) def visual_sample(self, results, gt_bboxes_3d_=None, ego_info=None, cam_params=None, front_img=None, metric_dict = None, **kwargs): import matplotlib.pyplot as plt import random import math import pyquaternion from nuscenes.utils.data_classes import Box as NuScenesBox from mmdet3d.core.bbox import CustomBox # nusc = 
NuScenes(version='v1.0-trainval', dataroot='./data/nuscenes', verbose=True) # _, boxes_gt, _ = nusc.get_sample_data(sample_data_token, box_vis_level=box_vis_level) ratio=1 # plt.figure(figsize=(10, 10*ratio), dpi=300) fig, axes = plt.subplots(1, 1, figsize=(10, 10*ratio), dpi=300) plt.gca().set_axis_off() plt.axis('off') fig.tight_layout() margin=50.0 coor_range = self.world2bev_vis(-margin, margin) axes.set_xlim(np.array(coor_range)) axes.set_ylim(np.array(coor_range)) axes.grid(False) # ax = plt.gca() axes.set_aspect('equal', adjustable='box') axes.invert_yaxis() random.seed(0) colors = ['#%02X%02X%02X' % (random.randint(0, 255), random.randint(0, 255), random.randint(0, 255) ) for _ in range(40)] ego_center = self.world2bev_vis(0, 0) axes.scatter(ego_center[0], ego_center[1], s=15, marker='o',color='r', zorder=2) if gt_bboxes_3d_ is not None: # gt_bboxes_3d = kwargs['gt_bboxes_3d'][0][0] # bev_coor = gt_bboxes_3d.bev.cpu().numpy() # rects = [(tuple(coor[:2]), tuple(coor[2:4]), math.degrees(coor[4])) for coor in bev_coor] # boxes = np.array([cv2.boxPoints(rect) for rect in rects]) # raw = gt_bboxes_3d.corners[:, [4, 7, 3, 0], :2] boxes = gt_bboxes_3d_.tensor.numpy().copy() for i, box in enumerate(boxes): center = box[:3] wlh = box[[4, 3, 5]] box_yaw = box[6] box_vel = box[7:].tolist() box_vel.append(0) quat = pyquaternion.Quaternion(axis=[0, 0, 1], radians=box_yaw) center[:2]=np.array(self.world2bev_vis(center[0],center[1])) wlh[0]=wlh[0]*5 wlh[1]=wlh[1]*5 nusc_box = CustomBox(center, wlh, quat, velocity=box_vel) c = colors[i % len(colors)] nusc_box.render(axes, view=np.eye(4), colors=(c, c, c), linewidth=1) # if ego_info is not None: # center, wlh, yaw = ego_info # center[:2]=np.array(self.world2bev_vis(center[0],center[1])) # quat = pyquaternion.Quaternion(axis=[0, 0, 1], radians=yaw) # wlh[0]=wlh[0]*5 # wlh[1]=wlh[1]*5 # nusc_box = CustomBox(center, wlh, quat, velocity=[0, 0, 0]) # c = colors[-1] # nusc_box.render(axes, view=np.eye(4), colors=(c, c, c), linewidth=1) points_per_step=5 if results[0].get('pred_ego_traj') is not None: pred_ego_fut_trajs = results[0]['pred_ego_traj']['pred_ego_fut_trajs'] # pred_ego_fut_trajs = kwargs['gt_ego_fut_trajs'][0].cpu() pred_ego_fut_trajs = pred_ego_fut_trajs.numpy() points = np.array([self.world2bev_vis(*point) for point in pred_ego_fut_trajs]) points = np.insert(points, 0, np.array(ego_center), axis=0) points, colors = self._render_traj_v2(points, colormap='autumn') x_coords, y_coords = zip(*points) for j in range(len(points) - 1): axes.plot([x_coords[j], x_coords[j + 1]], [y_coords[j], y_coords[j + 1]], '-',c=colors[j], linewidth=1.5, zorder=2) if j != 0 and j % points_per_step==0: axes.scatter(x_coords[j], y_coords[j], s=5, marker='o',color=colors[j], zorder=3) axes.scatter(x_coords[-1], y_coords[-1], s=5, marker='o',color=colors[-1], zorder=3) if front_img is not None: pred_ego_fut_trajs = results[0]['pred_ego_traj']['pred_ego_fut_trajs'] # pred_ego_fut_trajs = kwargs['gt_ego_fut_trajs'][0].cpu() pred_ego_fut_trajs = torch.cat([torch.tensor([[4, 0]]), pred_ego_fut_trajs], 0) pred_ego_fut_trajs = torch.cat([pred_ego_fut_trajs, torch.zeros_like(pred_ego_fut_trajs[:, :1])], -1) traj_on_img = self.point_sampling(pred_ego_fut_trajs[None, None].to(cam_params[0].device), cam_params)[1][1, 0, 0].cpu().numpy() front_img = front_img.permute(1, 2, 0)[:, :, [2, 1, 0]].cpu().numpy() front_img = np.ascontiguousarray(front_img, dtype=np.uint8) traj_on_img, colors = self._render_traj_v2(traj_on_img, colormap='autumn') traj_on_img = 
np.ascontiguousarray(traj_on_img, dtype=np.int32) for i in range(len(traj_on_img)-1): front_img = cv2.line(front_img, traj_on_img[i], traj_on_img[i+1] , color=colors[i] * 255, thickness=5) avg_l2 = 0 for i in range(1,4): avg_l2 += metric_dict[f'plan_L2_{i}s'] avg_l2/=3 avg_coli = 0 for i in range(1,4): avg_coli += metric_dict[f'plan_obj_box_col_{i}s'] avg_coli = (avg_coli/3)>0 avg_intersect = 0 for i in range(1,4): avg_intersect += metric_dict[f'plan_obj_box_col_plus_{i}s'] avg_intersect = (avg_intersect/3)>0 # org org = (50, 50) # fontScale fontScale = 1.5 # Blue color in BGR color = (10, 10, 254) # Line thickness of 2 px thickness = 2 # Using cv2.putText() method # front_img = cv2.rectangle(front_img, (0, 0), (300, 150), (255, 255, 255), -1) front_img = cv2.putText(front_img, 'Avg.L2: %.2f'%avg_l2, (10, 40), cv2.FONT_HERSHEY_SIMPLEX , fontScale, color, thickness, cv2.LINE_AA) # front_img = cv2.putText(front_img, f'Collision: NaN', (10, 90), cv2.FONT_HERSHEY_SIMPLEX , fontScale, color, thickness, cv2.LINE_AA) # front_img = cv2.putText(front_img, f'Intersection: NaN', (10, 140), cv2.FONT_HERSHEY_SIMPLEX , fontScale, color, thickness, cv2.LINE_AA) front_img = cv2.putText(front_img, f'Collision: {str(avg_coli)}', (10, 90), cv2.FONT_HERSHEY_SIMPLEX , fontScale, color, thickness, cv2.LINE_AA) front_img = cv2.putText(front_img, f'Intersection: {str(avg_intersect)}', (10, 140), cv2.FONT_HERSHEY_SIMPLEX , fontScale, color, thickness, cv2.LINE_AA) # mmcv.imwrite(front_img, '') if kwargs.get('map_gt_bboxes_3d', False): map_gt_bboxes_3d = kwargs['map_gt_bboxes_3d'][0][0] map_gt_labels_3d = kwargs['map_gt_labels_3d'][0][0] for i, instance in enumerate(map_gt_bboxes_3d.instance_list): # if map_gt_labels_3d[i]!=2: continue line = np.array(list(instance.coords)) corners = np.array([self.world2bev_vis(*corner) for corner in line]) corners = [each for each in corners if ((each>=0).all() & (each<512).all())] if len(corners)<1: continue x_coords, y_coords = zip(*corners) for k, corner in enumerate(corners[:-1]): axes.plot([x_coords[k], x_coords[k + 1]], [y_coords[k], y_coords[k + 1]], c='dimgray', linewidth=1, zorder=1,) if kwargs.get('gt_agent_fut_traj', False): gt_agent_fut_traj = kwargs['gt_agent_fut_traj'][0][0].cpu() gt_agent_fut_traj_mask = kwargs['gt_agent_fut_traj_mask'][0][0].cpu() centers = kwargs['gt_bboxes_3d'][0][0].center[..., :2].cpu() tmp = torch.cat([centers[:, None], gt_agent_fut_traj], 1) trajs = torch.cumsum(tmp, 1) for k, traj in enumerate(trajs): traj = traj.cpu().numpy() # center = np.array(self.world2bev_vis(*centers[k])) agent_fut_traj = np.array([self.world2bev_vis(*corner) for corner in traj]) corners, colors = self._render_traj_v2(agent_fut_traj, colormap='winter',points_per_step=points_per_step) corners = [each for each in corners if ((each>=0).all() & (each<1536).all())] x_coords, y_coords = zip(*corners) for j in range(len(corners) - 1): # plot line between box center and the first traj point if j//points_per_step == 0 and gt_agent_fut_traj_mask[k, j//points_per_step].sum()==2: axes.plot([x_coords[j], x_coords[j + 1]], [y_coords[j], y_coords[j + 1]], '-',c=colors[j], linewidth=0.8, zorder=2) continue elif gt_agent_fut_traj_mask[k, j//points_per_step].sum()<2 or gt_agent_fut_traj_mask[k, j//points_per_step-1].sum()<2: continue axes.plot([x_coords[j], x_coords[j + 1]], [y_coords[j], y_coords[j + 1]], '-',c=colors[j], linewidth=0.8, zorder=2) plt.margins(0, 0) # plt.savefig(f'pred_bev_{results[0]["index"]}.png') canvas = FigureCanvasAgg(plt.gcf()) canvas.draw() w, h = 
canvas.get_width_height() buf = np.fromstring(canvas.tostring_argb(), dtype=np.uint8) buf.shape = (w, h, 4) buf = np.roll(buf, 3, axis=2) image = Image.frombytes("RGBA", (w, h), buf.tostring()) image = np.asarray(image) rgb_image = image[:, :, :3] plt.close() return rgb_image, front_img def _render_traj(self, future_traj, traj_score=1, colormap='winter', points_per_step=5, line_color=None, dot_color=None, dot_size=25): total_steps = (len(future_traj)-1) * points_per_step + 1 dot_colors = matplotlib.colormaps[colormap]( np.linspace(0, 1, total_steps))[:, :3] * 255 dot_colors = dot_colors*traj_score + \ (1-traj_score)*np.ones_like(dot_colors) total_xy = np.zeros((total_steps, 2)) for i in range(total_steps-1): unit_vec = future_traj[i//points_per_step + 1] - future_traj[i//points_per_step] total_xy[i] = (i/points_per_step - i//points_per_step) * \ unit_vec + future_traj[i//points_per_step] total_xy[-1] = future_traj[-1] return total_xy, dot_colors def _render_traj_v2(self, future_traj, traj_score=1, colormap='winter', points_per_step=5, line_color=None, dot_color=None, dot_size=25): total_steps = (len(future_traj)-1) * points_per_step + 1 dot_colors = matplotlib.colormaps[colormap]( np.linspace(0, 1, total_steps))[:, :3] # dot_colors = dot_colors*traj_score + \ # (1-traj_score)*np.ones_like(dot_colors) total_xy = np.zeros((total_steps, 2)) for i in range(total_steps-1): unit_vec = future_traj[i//points_per_step + 1] - future_traj[i//points_per_step] total_xy[i] = (i/points_per_step - i//points_per_step) * \ unit_vec + future_traj[i//points_per_step] total_xy[-1] = future_traj[-1] return total_xy, dot_colors ================================================ FILE: mmdet3d/models/fbbev/planner_head/__init__.py ================================================ from .plan_loss import * from .plan_loss_gt import * from .naive_planner import NaivePlannerHead from .AD_mlp import AD_MLP ================================================ FILE: mmdet3d/models/fbbev/planner_head/metric_stp3.py ================================================ ''' calculate planner metric same as stp3 ''' import numpy as np import torch import cv2 import copy import matplotlib.pyplot as plt from skimage.draw import polygon from nuscenes.utils.data_classes import Box from scipy.spatial.transform import Rotation as R ego_width, ego_length = 1.85, 4.084 class PlanningMetric(): def __init__(self): super().__init__() self.X_BOUND = [-50.0, 50.0, 0.1] # Forward self.Y_BOUND = [-50.0, 50.0, 0.1] # Sides self.Z_BOUND = [-10.0, 10.0, 20.0] # Height dx, bx, _ = self.gen_dx_bx(self.X_BOUND, self.Y_BOUND, self.Z_BOUND) self.dx, self.bx = dx[:2], bx[:2] bev_resolution, bev_start_position, bev_dimension = self.calculate_birds_eye_view_parameters( self.X_BOUND, self.Y_BOUND, self.Z_BOUND ) self.bev_resolution = bev_resolution.numpy() self.bev_start_position = bev_start_position.numpy() self.bev_dimension = bev_dimension.numpy() self.W = ego_width self.H = ego_length self.category_index = { 'human':[2,3,4,5,6,7,8], 'vehicle':[14,15,16,17,18,19,20,21,22,23] } def gen_dx_bx(self, xbound, ybound, zbound): dx = torch.Tensor([row[2] for row in [xbound, ybound, zbound]]) bx = torch.Tensor([row[0] + row[2]/2.0 for row in [xbound, ybound, zbound]]) nx = torch.LongTensor([(row[1] - row[0]) / row[2] for row in [xbound, ybound, zbound]]) return dx, bx, nx def calculate_birds_eye_view_parameters(self, x_bounds, y_bounds, z_bounds): """ Parameters ---------- x_bounds: Forward direction in the ego-car. 
y_bounds: Sides z_bounds: Height Returns ------- bev_resolution: Bird's-eye view bev_resolution bev_start_position Bird's-eye view first element bev_dimension Bird's-eye view tensor spatial dimension """ bev_resolution = torch.tensor([row[2] for row in [x_bounds, y_bounds, z_bounds]]) bev_start_position = torch.tensor([row[0] + row[2] / 2.0 for row in [x_bounds, y_bounds, z_bounds]]) bev_dimension = torch.tensor([(row[1] - row[0]) / row[2] for row in [x_bounds, y_bounds, z_bounds]], dtype=torch.long) return bev_resolution, bev_start_position, bev_dimension def evaluate_single_coll(self, traj, segmentation, input_gt, gt_traj=None, index=None): ''' traj: torch.Tensor (n_future, 2), trajectory expressed in the ego (IMU) frame 0-------> | x | |y segmentation: torch.Tensor (n_future, 200, 200) ''' # 0.985793 is the distance between the LiDAR and the IMU (ego). import mmcv pts = np.array([ [-self.H / 2. + 0.5 + 0.985793, self.W / 2.], [self.H / 2. + 0.5 + 0.985793, self.W / 2.], [self.H / 2. + 0.5 + 0.985793, -self.W / 2.], [-self.H / 2. + 0.5 + 0.985793, -self.W / 2.], ]) pts = (pts - self.bx.cpu().numpy() ) / (self.dx.cpu().numpy()) pts[:, [0, 1]] = pts[:, [1, 0]] rr, cc = polygon(pts[:,1], pts[:,0]) rc = np.concatenate([rr[:,None], cc[:,None]], axis=-1) rc_ori = rc + (self.bx.cpu().numpy() / self.dx.cpu().numpy()) traj_with_ego = torch.cat([traj.new_zeros(1, 2), traj], 0) rc_yaw = [] rotate_angle = 0 for i in range(traj.size(0)): delta = traj_with_ego[i+1] - traj_with_ego[i] cur_rotate_angle = torch.atan2(*delta[[1, 0]]) if delta.norm()<1: cur_rotate_angle = 0 rotate_angle = cur_rotate_angle rotate_angle = -torch.tensor(rotate_angle) rot_sin = torch.sin(rotate_angle) rot_cos = torch.cos(rotate_angle) rot_mat = torch.Tensor([[rot_cos, -rot_sin], [rot_sin, rot_cos]]) tmp = rc_ori @ rot_mat.cpu().numpy() - (self.bx.cpu().numpy() / self.dx.cpu().numpy()) tmp = tmp.round().astype(int) rc_yaw.append(tmp) rc_yaw = np.stack(rc_yaw) # n_future, _ = traj.shape # trajs = traj.view(n_future, 1, 2) # trajs_ = copy.deepcopy(trajs) # trajs_ = trajs_ / self.dx.to(trajs.device) # trajs_ = trajs_.cpu().numpy() + rc # (n_future, 32, 2) # r = trajs_[:,:,0].astype(np.int32) # r = np.clip(r, 0, self.bev_dimension[0] - 1) # c = trajs_[:,:,1].astype(np.int32) # c = np.clip(c, 0, self.bev_dimension[1] - 1) # collision = np.full(n_future, False) # for t in range(n_future): # rr = r[t] # cc = c[t] # I = np.logical_and( # np.logical_and(rr >= 0, rr < self.bev_dimension[0]), # np.logical_and(cc >= 0, cc < self.bev_dimension[1]), # ) # collision[t] = np.any(segmentation[t, cc[I], rr[I]].cpu().numpy()) n_future, _ = traj.shape trajs = traj.view(n_future, 1, 2) trajs_ = copy.deepcopy(trajs) trajs_ = trajs_ / self.dx.to(trajs.device) trajs_ = trajs_.cpu().numpy() + rc_yaw # (n_future, 32, 2) r = trajs_[:,:,0].astype(np.int32) r = np.clip(r, 0, self.bev_dimension[0] - 1) c = trajs_[:,:,1].astype(np.int32) c = np.clip(c, 0, self.bev_dimension[1] - 1) collision2 = np.full(n_future, False) # obs_occ = copy.deepcopy(segmentation).cpu().numpy() * 0 for t in range(n_future): rr = r[t] cc = c[t] I = np.logical_and( np.logical_and(rr >= 0, rr < self.bev_dimension[0]), np.logical_and(cc >= 0, cc < self.bev_dimension[1]), ) collision2[t] = np.any(segmentation[t, cc[I], rr[I]].cpu().numpy()) return torch.from_numpy(collision2).to(device=traj.device) def evaluate_coll( self, trajs, gt_trajs, segmentation, index=None, ignore_gt=True, ): ''' trajs: torch.Tensor (B, n_future, 2), trajectories expressed in the ego (IMU) frame 0-------> | x | |y gt_trajs: torch.Tensor (B, n_future, 2) segmentation:
torch.Tensor (B, n_future, 200, 200) ''' B, n_future, _ = trajs.shape # trajs = trajs * torch.tensor([-1, 1], device=trajs.device) # gt_trajs = gt_trajs * torch.tensor([-1, 1], device=gt_trajs.device) obj_coll_sum = torch.zeros(n_future, device=segmentation.device) obj_box_coll_sum = torch.zeros(n_future, device=segmentation.device) for i in range(B): gt_box_coll = self.evaluate_single_coll(gt_trajs[i], segmentation[i], input_gt=True) xx, yy = trajs[i,:,0], trajs[i, :, 1] xi = ((-self.bx[0] + xx) / self.dx[0]).long() yi = ((-self.bx[1] + yy) / self.dx[1]).long() m1 = torch.logical_and( torch.logical_and(xi >= 0, xi < self.bev_dimension[0]), torch.logical_and(yi >= 0, yi < self.bev_dimension[1]), ).to(gt_box_coll.device) m1 = torch.logical_and(m1, torch.logical_not(gt_box_coll)) ti = torch.arange(n_future).to(segmentation.device) # segmentation: B, T, H, W obj_coll_sum[ti[m1]] += segmentation[i, ti[m1], yi[m1], xi[m1]].long() m2 = torch.logical_not(gt_box_coll) box_coll = self.evaluate_single_coll(trajs[i], segmentation[i], gt_traj=gt_trajs[i], input_gt=False, index=index[i], ).to(segmentation.device) if ignore_gt: obj_box_coll_sum += (gt_box_coll).long() else: obj_box_coll_sum[ti[m2]] += (box_coll[ti[m2]]).long() return obj_coll_sum, obj_box_coll_sum def compute_L2(self, trajs, gt_trajs): ''' trajs: torch.Tensor (n_future, 2) gt_trajs: torch.Tensor (n_future, 2) ''' # return torch.sqrt(((trajs[:, :, :2] - gt_trajs[:, :, :2]) ** 2).sum(dim=-1)) pred_len = trajs.shape[0] ade = float( sum( torch.sqrt( (trajs[i, 0] - gt_trajs[i, 0]) ** 2 + (trajs[i, 1] - gt_trajs[i, 1]) ** 2 ) for i in range(pred_len) ) / pred_len ) return ade ================================================ FILE: mmdet3d/models/fbbev/planner_head/naive_planner.py ================================================ # Copyright (c) 2023-2024, NVIDIA Corporation & Affiliates. All rights reserved. # # This work is made available under the Nvidia Source Code License-NC. 
# To view a copy of this license, visit # TODO: add license here import torch import torch.nn as nn from mmcv.cnn import Linear, bias_init_with_prob, Scale from mmcv.runner import force_fp32 from mmdet.core import (build_assigner, build_sampler, multi_apply, reduce_mean) from mmdet.models.utils import build_transformer from mmdet.models import HEADS, build_loss from mmdet.models.dense_heads.anchor_free_head import AnchorFreeHead from mmdet.models.utils.transformer import inverse_sigmoid from mmdet3d.core.bbox.coders import build_bbox_coder from ..streampetr.streampetr_utils import * import copy from mmdet.models.utils import NormedLinear from mmdet3d.core import bbox3d2result, merge_aug_bboxes_3d from mmdet3d.models.fbbev.utils import save_tensor from mmcv.runner.base_module import BaseModule from mmcv.cnn.bricks.transformer import build_transformer_layer_sequence from .metric_stp3 import PlanningMetric def get_ego_pos(points, pc_range): if points.size(-1) == 3: points = points * (pc_range[3:6] - pc_range[0:3]) + pc_range[0:3] elif points.size(-1) == 2: points = points * (pc_range[3:5] - pc_range[0:2]) + pc_range[0:2] return points def get_rel_pos(points, pc_range): if points.size(-1) == 3: return (points - pc_range[0:3]) / (pc_range[3:6] - pc_range[0:3]) elif points.size(-1) == 2: return (points - pc_range[0:2]) / (pc_range[3:5] - pc_range[0:2]) @HEADS.register_module() class NaivePlannerHead(BaseModule): _version = 2 def __init__(self, # num_classes=1, in_channels=256, stride=[16], embed_dims=256, num_query=1, num_reg_fcs=2, memory_len=12, topk_proposals=4, num_propagated=0, with_dn=True, with_ego_pos=True, match_with_velo=True, match_costs=None, transformer=None, sync_cls_avg_factor=False, code_weights=None, bbox_coder=None, init_cfg=None, normedlinear=False, point_cloud_range=[-51.2, -51.2, -5.0, 51.2, 51.2, 3.0], loss_plan_reg=dict(type='L1Loss', loss_weight=5.0), loss_plan_col=dict(type='PlanCollisionLoss', loss_weight=5.0), with_ego_status=False, dist_func_type='MDE', use_map_info=False, **kwargs): if 'code_size' in kwargs: self.code_size = kwargs['code_size'] else: self.code_size = 2 self.use_map_info = use_map_info self.with_ego_status = with_ego_status self.num_query = num_query self.in_channels = in_channels self.num_reg_fcs = num_reg_fcs # self.train_cfg = train_cfg # self.test_cfg = test_cfg self.fp16_enabled = False self.embed_dims = embed_dims self.num_motion_mode = 6 self.fut_steps = 6 self.memory_len = 6 self.ego_fut_mode = 3 super(NaivePlannerHead, self).__init__() self.pc_range = nn.Parameter(torch.tensor( point_cloud_range), requires_grad=False) self.loss_plan_reg = build_loss(loss_plan_reg) loss_plan_col.update(point_cloud_range=point_cloud_range) self.loss_plan_col = build_loss(loss_plan_col) ego_img_decoder = dict( type='CustomTransformerDecoder', num_layers=1, return_intermediate=False, transformerlayers=dict( type='BaseTransformerLayer', batch_first=True, attn_cfgs=dict( type='MultiheadAttention', embed_dims=256, num_heads=8, attn_drop=0.1, proj_drop=0.1, ), feedforward_channels=1024, ffn_dropout=0.1, operation_order=('cross_attn', 'norm', 'ffn', 'norm'))) if self.use_map_info: ego_agent_decoder = dict( type='CustomTransformerDecoder', num_layers=1, return_intermediate=False, transformerlayers=dict( type='BaseTransformerLayer', batch_first=True, attn_cfgs=dict( type='MotionSelfAttention', embed_dims=256, num_heads=8, dropout=0.1, dist_func_type=dist_func_type, pc_range=[-51.2, -51.2, -5.0, 51.2, 51.2, 3.0], consider_map_quality=True, ), feedforward_channels=2048, 
ffn_dropout=0.1, operation_order=('cross_attn', 'norm', 'ffn', 'norm'))) self.ego_agent_decoder = build_transformer_layer_sequence(ego_agent_decoder) self.gamma = nn.Parameter(torch.ones(256)*0.5, requires_grad=True) self.ego_img_decoder = build_transformer_layer_sequence(ego_img_decoder) # self.ego_decoder = build_transformer_layer_sequence(ego_agent_decoder) self.ego_info = MLN(3) self._init_layers() self.reset_memory() self.planning_metric = PlanningMetric() self.count = 0 def reset_memory(self): self.memory_traj = None # self.memory_ego_embed = None def pre_update_memory(self, data, fut_traj_from_velo): x = 1-data['start_of_sequence'] # original prev_exist, so we need do `not` B = x.size(0) # refresh the memory when the scene changes if self.memory_traj is None: self.memory_traj = fut_traj_from_velo.unsqueeze(1).repeat(1, self.memory_len, 1, 1) * 0 # self.memory_ego_embed = x.new_zeros(B, self.memory_len, self.embed_dims * 2) else: self.memory_traj = transform_reference_points(self.memory_traj, data['ego_pose_inv'], reverse=False)[..., :2] self.memory_traj = memory_refresh(self.memory_traj[:, :self.memory_len], x) for i in range(B): if not x[i]: self.memory_traj[i, 0] = fut_traj_from_velo[i] * 0 # self.memory_ego_embed = memory_refresh(self.memory_ego_embed[:, :self.memory_len], x) def post_update_memory(self, data, ego_fut_trajs, ego_embeds): self.memory_traj = torch.cat([ego_fut_trajs, self.memory_traj], dim=1) self.memory_traj = torch.cat([self.memory_traj, torch.zeros_like(self.memory_traj[..., :1])], -1) self.memory_traj = transform_reference_points(self.memory_traj, data['ego_pose'], reverse=False) # self.memory_ego_embed = torch.cat([ego_embeds, self.memory_ego_embed], dim=1) def _init_layers(self): """Initialize layers of the transformer head.""" ego_fut_decoder = [] ego_fut_dec_in_dim = self.embed_dims if self.with_ego_status: ego_fut_dec_in_dim += 9 for i in range(self.num_reg_fcs): if i ==0: ego_fut_decoder.append(Linear(ego_fut_dec_in_dim, self.embed_dims)) else: ego_fut_decoder.append(Linear(self.embed_dims, self.embed_dims)) ego_fut_decoder.append(nn.ReLU()) ego_fut_decoder.append(Linear(self.embed_dims, self.ego_fut_mode*self.fut_steps*2)) self.ego_fut_decoder = nn.Sequential(*ego_fut_decoder) self.query_feat_embedding = nn.Embedding(self.num_query, self.embed_dims) def calc_MDE(self, reference_points_q, reference_points_v, pc_range, map_scores=None): """ mim mean distance between the map lane and traj. 
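(Editor's elaboration, inferred from the implementation below.) For every
ego-trajectory query and every map element, the pairwise point distances are
reduced by a minimum over the element's points and a mean over the trajectory
steps ("min mean distance"). When map_scores is given, each distance is scaled
by the element's rounded inverse confidence plus `self.map_alpha`, and the
result is negated so that closer and more confident map elements receive
larger attention weights.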
""" reference_points_q = reference_points_q[..., :2] q_shape = reference_points_q.shape v_shape = reference_points_v.shape reference_points_q = reference_points_q.flatten(1, 2) reference_points_v = reference_points_v.flatten(1, 2) dist = [] code_size = reference_points_q.size(-1) for b in range(reference_points_q.shape[0]): dist_b = torch.norm(reference_points_q[b].reshape(-1, 1, code_size) - reference_points_v[b].reshape(1, -1, code_size), dim=-1) dist.append(dist_b[None, ...]) dist = torch.cat(dist, dim=0) # [B, Q, K] dist = dist.view(q_shape[0], q_shape[1], q_shape[2], v_shape[1], v_shape[2]) dist = dist.min(-1).values.mean(2) if map_scores is not None: map_scores = map_scores.sigmoid().max(-1)[0] # smaller, better map_scores = torch.round(1-map_scores, decimals=1) + self.map_alpha dist = dist * map_scores.unsqueeze(1) dist = -dist return dist def forward(self, results, gt_ego_lcf_feat, gt_ego_fut_cmd, gt_ego_his_traj=None, gt_ego_fut_trajs=None, img_metas=None, map_results=None): # agent_queries = map_results['queries'] if self.use_map_info: map_queries = map_results['queries'].clone() map_lines = map_results['lines'].clone() map_scores = map_results['scores'].clone() B, NMQ, K2 = map_lines.shape map_lines = map_lines.reshape(B, NMQ, K2//2, 2) # map_pos = self.query_embedding(bevpos2posemb(map_lines.mean(-2))) map_lines = get_ego_pos(map_lines, self.pc_range) img_context = results['img_bev_feat'][0].flatten(-2, -1).permute(0, 2, 1) gt_ego_lcf_feat = torch.stack(gt_ego_lcf_feat).to(img_context.device) gt_ego_fut_cmd = torch.stack(gt_ego_fut_cmd).to(img_context.device) start_of_sequence = torch.FloatTensor([ single_img_metas['start_of_sequence'] for single_img_metas in img_metas]).to(img_context.device) timestamp = torch.FloatTensor([ single_img_metas['timestamp'] for single_img_metas in img_metas]).to(img_context.device) ego_pose_inv = torch.stack([ single_img_metas['ego_pose_inv'] for single_img_metas in img_metas], 0).to(img_context.device) ego_pose = torch.stack([ single_img_metas['ego_pose'] for single_img_metas in img_metas], 0).to(img_context.device) data = dict( start_of_sequence = start_of_sequence, timestamp = timestamp, ego_pose_inv = ego_pose_inv, ego_pose = ego_pose, ) fut_traj_from_velo = gt_ego_lcf_feat[:, :2].unsqueeze(1).repeat(1, self.fut_steps, 1) * torch.arange(1, self.fut_steps+1)[None,:, None].to(img_context.device) * 0.5 self.pre_update_memory(data, fut_traj_from_velo) bs = img_context.size(0) ego_query = self.query_feat_embedding.weight.repeat(bs, 1) ego_query = self.ego_info(ego_query, gt_ego_fut_cmd.to(ego_query.dtype)).unsqueeze(1) init_ego_traj = self.memory_traj[:, 0:1] if self.use_map_info: ego_query = (1-self.gamma) * self.ego_agent_decoder( query = ego_query, key = map_queries, val = map_queries, reference_points_q=init_ego_traj, reference_points_v=map_lines, pc_range=self.pc_range, map_scores=map_scores ) + self.gamma * self.ego_img_decoder( query = ego_query, key = img_context, val = img_context, # query_pos = ego_pose ) else: ego_query =self.ego_img_decoder( query = ego_query, key = img_context, val = img_context, ) if self.with_ego_status: ego_query = torch.cat([ego_query[:, 0], gt_ego_lcf_feat], -1) outputs_ego_trajs = self.ego_fut_decoder(ego_query) outputs_ego_trajs = outputs_ego_trajs.reshape(outputs_ego_trajs.shape[0], self.ego_fut_mode, self.fut_steps, 2) self.post_update_memory(data, torch.cumsum(outputs_ego_trajs[gt_ego_fut_cmd==1], 1)[:, None], ego_query) ego_trajs = torch.cumsum(outputs_ego_trajs[gt_ego_fut_cmd==1], 1) ego_trajs = 
torch.cat([torch.zeros_like(ego_trajs[:,:1]), ego_trajs], 1) ego_trajs = torch.cat([ego_trajs, torch.zeros_like(ego_trajs[..., :1])], -1) ego_trajs_in_global = transform_reference_points(ego_trajs, data['ego_pose'], reverse=False)[..., :2] return dict( ego_fut_preds=outputs_ego_trajs, ego_trajs_in_global = ego_trajs_in_global, data=data ) @force_fp32(apply_to=('preds_plan_dicts')) def loss(self, gt_ego_fut_trajs=None, gt_ego_fut_cmd=None, gt_ego_fut_masks=None, preds_plan_dicts=None, img_metas=None, ): ego_fut_preds = preds_plan_dicts['ego_fut_preds'] gt_ego_fut_trajs = torch.stack(gt_ego_fut_trajs) gt_ego_fut_cmd = torch.stack(gt_ego_fut_cmd) gt_ego_fut_masks = torch.stack(gt_ego_fut_masks) gt_ego_fut_trajs = torch.cat([gt_ego_fut_trajs[:,:1], (gt_ego_fut_trajs[:,1:] - gt_ego_fut_trajs[:,:-1])], 1) gt_ego_fut_trajs = gt_ego_fut_trajs.unsqueeze(1).repeat(1, self.ego_fut_mode, 1, 1) loss_plan_l1_weight = gt_ego_fut_cmd[..., None, None] * gt_ego_fut_masks[:, None, :, None] loss_plan_l1_weight = loss_plan_l1_weight.repeat(1, 1, 1, 2) loss_plan_l1 = self.loss_plan_reg( ego_fut_preds, gt_ego_fut_trajs, loss_plan_l1_weight ) loss_plan_l1 = torch.nan_to_num(loss_plan_l1) loss_plan_dict = dict() loss_plan_dict['loss_plan_reg'] = loss_plan_l1 return loss_plan_dict @force_fp32(apply_to=('preds_dicts')) def get_bboxes(self, preds_dicts, img_metas, rescale=False, gt_ego_fut_trajs=None, gt_ego_fut_cmd=None, gt_ego_fut_masks=None, gt_fut_segmentations=None, gt_fut_segmentations_plus=None, vad_ego_fut_trajs=None, **kwargs, ): """Generate bboxes from bbox head predictions. Args: preds_dicts (tuple[list[dict]]): Prediction results. img_metas (list[dict]): Point cloud and image's meta info. Returns: list[dict]: Decoded bbox, scores and labels after nms. """ pred_ego_fut_trajs = preds_dicts['ego_fut_preds'] gt_ego_fut_trajs = torch.stack(gt_ego_fut_trajs).to(pred_ego_fut_trajs.device) gt_ego_fut_cmd = torch.stack(gt_ego_fut_cmd).to(pred_ego_fut_trajs.device) gt_ego_fut_masks = torch.stack(gt_ego_fut_masks).to(pred_ego_fut_trajs.device) pred_ego_fut_trajs = torch.cumsum(pred_ego_fut_trajs[gt_ego_fut_cmd==1], 1) # pred_ego_fut_trajs = vad_ego_fut_trajs[0][None] ego_trajs = torch.cat([torch.zeros_like(pred_ego_fut_trajs[:,:1]), pred_ego_fut_trajs], 1) ego_trajs = torch.cat([ego_trajs, torch.zeros_like(ego_trajs[..., :1])], -1) ego_trajs_in_global = transform_reference_points(ego_trajs, preds_dicts['data']['ego_pose'], reverse=False)[..., :2] metric_dict = { 'plan_L2_1s':0, 'plan_L2_2s':0, 'plan_L2_3s':0, 'plan_obj_col_1s':0, 'plan_obj_col_2s':0, 'plan_obj_col_3s':0, 'plan_obj_box_col_1s':0, 'plan_obj_box_col_2s':0, 'plan_obj_box_col_3s':0, 'plan_obj_col_plus_1s':0, 'plan_obj_col_plus_2s':0, 'plan_obj_col_plus_3s':0, 'plan_obj_box_col_plus_1s':0, 'plan_obj_box_col_plus_2s':0, 'plan_obj_box_col_plus_3s':0, 'l2_dist': 0, } fut_valid_flag = gt_ego_fut_masks.all() future_second = 3 metric_dict['fut_valid_flag'] = fut_valid_flag.cpu().item() for i in range(future_second): if fut_valid_flag: cur_time = (i+1)*2 traj_L2 = self.planning_metric.compute_L2( pred_ego_fut_trajs[0, :cur_time].detach().to(gt_ego_fut_trajs.device), gt_ego_fut_trajs[0, :cur_time] ) obj_coll, obj_box_coll = self.planning_metric.evaluate_coll( pred_ego_fut_trajs[:, :cur_time].detach().to(gt_ego_fut_trajs.device), gt_ego_fut_trajs[:, :cur_time], gt_fut_segmentations, index = [each['index'] for each in img_metas], ignore_gt=False, ) metric_dict['plan_L2_{}s'.format(i+1)] = traj_L2 metric_dict['plan_obj_col_{}s'.format(i+1)] = obj_coll.max().item() 
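# Editor's note: `obj_coll` counts hits of the planned waypoint cell against the future
# occupancy, while `obj_box_coll` rasterizes the ego box footprint along the predicted
# heading (see PlanningMetric.evaluate_coll / evaluate_single_coll); timesteps where the
# ground-truth trajectory itself collides are excluded from both counts.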
metric_dict['plan_obj_box_col_{}s'.format(i+1)] = obj_box_coll.max().item() for i in range(future_second): if fut_valid_flag: cur_time = (i+1)*2 obj_coll, obj_box_coll = self.planning_metric.evaluate_coll( pred_ego_fut_trajs[:, :cur_time].detach().to(gt_ego_fut_trajs.device), gt_ego_fut_trajs[:, :cur_time], gt_fut_segmentations_plus, index = [each['index'] for each in img_metas], ignore_gt=False, ) metric_dict['plan_obj_col_plus_{}s'.format(i+1)] = obj_coll.mean().item() metric_dict['plan_obj_box_col_plus_{}s'.format(i+1)] = obj_box_coll.max().item() l2_dist = (pred_ego_fut_trajs-gt_ego_fut_trajs).norm(dim=-1) * gt_ego_fut_masks[:, None] l2_dist[gt_ego_fut_masks[:, None]==0] = -1 metric_dict['l2_dist'] = l2_dist[0].cpu() ret_list = [] num_samples = len(pred_ego_fut_trajs) assert num_samples == 1 index_w_scene = img_metas[0]['scene_name'] + '-' + str(img_metas[0]['index']) for i in range(num_samples): ret_list.append( dict( pred_ego_fut_trajs = pred_ego_fut_trajs[i].cpu(), gt_ego_fut_trajs = gt_ego_fut_trajs[i].cpu(), metric_dict = metric_dict, l2_dist=l2_dist[i].cpu(), index_w_scene = index_w_scene, ego_trajs_in_global = ego_trajs_in_global[i].cpu(), gt_ego_fut_cmd = gt_ego_fut_cmd[i].cpu(), index = img_metas[i]['index'] ) ) return ret_list class MLN(nn.Module): ''' Args: c_dim (int): dimension of latent code c f_dim (int): feature dimension ''' def __init__(self, c_dim, f_dim=256, use_ln=True): super().__init__() self.c_dim = c_dim self.f_dim = f_dim self.use_ln = use_ln self.reduce = nn.Sequential( nn.Linear(c_dim, f_dim), nn.ReLU(), ) self.gamma = nn.Linear(f_dim, f_dim) self.beta = nn.Linear(f_dim, f_dim) if self.use_ln: self.ln = nn.LayerNorm(f_dim, elementwise_affine=False) self.init_weight() def init_weight(self): nn.init.zeros_(self.gamma.weight) nn.init.zeros_(self.beta.weight) nn.init.ones_(self.gamma.bias) nn.init.zeros_(self.beta.bias) def forward(self, x, c): if self.use_ln: x = self.ln(x) c = self.reduce(c) gamma = self.gamma(c) beta = self.beta(c) out = gamma * x + beta return out ================================================ FILE: mmdet3d/models/fbbev/planner_head/plan_loss.py ================================================ import math import mmcv import torch from torch import nn as nn from mmdet.models import weighted_loss from mmdet.models.builder import LOSSES @LOSSES.register_module() class PlanMapBoundLoss(nn.Module): """Planning constraint to push ego vehicle away from the lane boundary. Args: reduction (str, optional): The method to reduce the loss. Options are "none", "mean" and "sum". loss_weight (float, optional): The weight of loss. map_thresh (float, optional): confidence threshold to filter map predictions. lane_bound_cls_idx (float, optional): lane_boundary class index. dis_thresh (float, optional): distance threshold between ego vehicle and lane bound. point_cloud_range (list, optional): point cloud range. """ def __init__( self, reduction='mean', loss_weight=1.0, map_thresh=0.5, lane_bound_cls_idx=2, dis_thresh=1.0, point_cloud_range=[-15.0, -30.0, -2.0, 15.0, 30.0, 2.0], perception_detach=False ): super(PlanMapBoundLoss, self).__init__() self.reduction = reduction self.loss_weight = loss_weight self.map_thresh = map_thresh self.lane_bound_cls_idx = lane_bound_cls_idx self.dis_thresh = dis_thresh self.pc_range = point_cloud_range self.perception_detach = perception_detach def forward(self, ego_fut_preds, lane_preds, lane_score_preds, weight=None, avg_factor=None, reduction_override=None): """Forward function. 
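(Editor's addition, summarizing plan_map_bound_loss below.) Ego waypoints closer
than `dis_thresh` to the nearest predicted lane-boundary point are penalized by
(dis_thresh - distance); waypoints beyond the threshold incur no loss, and from
the first timestep at which the trajectory crosses a boundary segment onward the
loss is zeroed out.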
Args: ego_fut_preds (Tensor): [B, fut_ts, 2] lane_preds (Tensor): [B, num_vec, num_pts, 2] lane_score_preds (Tensor): [B, num_vec, 3] weight (torch.Tensor, optional): The weight of loss for each prediction. Defaults to None. avg_factor (int, optional): Average factor that is used to average the loss. Defaults to None. reduction_override (str, optional): The reduction method used to override the original reduction method of the loss. Defaults to None. """ assert reduction_override in (None, 'none', 'mean', 'sum') reduction = ( reduction_override if reduction_override else self.reduction) if self.perception_detach: lane_preds = lane_preds.detach() lane_score_preds = lane_score_preds.detach() # filter lane element according to confidence score and class not_lane_bound_mask = lane_score_preds[..., self.lane_bound_cls_idx] < self.map_thresh # denormalize map pts lane_bound_preds = lane_preds.clone() lane_bound_preds[...,0:1] = (lane_bound_preds[..., 0:1] * (self.pc_range[3] - self.pc_range[0]) + self.pc_range[0]) lane_bound_preds[...,1:2] = (lane_bound_preds[..., 1:2] * (self.pc_range[4] - self.pc_range[1]) + self.pc_range[1]) # pad not-lane-boundary cls and low confidence preds lane_bound_preds[not_lane_bound_mask] = 1e6 loss_bbox = self.loss_weight * plan_map_bound_loss(ego_fut_preds, lane_bound_preds, weight=weight, dis_thresh=self.dis_thresh, reduction=reduction, avg_factor=avg_factor) return loss_bbox @mmcv.jit(derivate=True, coderize=True) @weighted_loss def plan_map_bound_loss(pred, target, dis_thresh=1.0): """Planning map bound constraint (L1 distance). Args: pred (torch.Tensor): ego_fut_preds, [B, fut_ts, 2]. target (torch.Tensor): lane_bound_preds, [B, num_vec, num_pts, 2]. weight (torch.Tensor): [B, fut_ts] Returns: torch.Tensor: Calculated loss [B, fut_ts] """ pred = pred.cumsum(dim=-2) ego_traj_starts = pred[:, :-1, :] ego_traj_ends = pred B, T, _ = ego_traj_ends.size() padding_zeros = torch.zeros((B, 1, 2), dtype=pred.dtype, device=pred.device) # initial position ego_traj_starts = torch.cat((padding_zeros, ego_traj_starts), dim=1) _, V, P, _ = target.size() ego_traj_expanded = ego_traj_ends.unsqueeze(2).unsqueeze(3) # [B, T, 1, 1, 2] maps_expanded = target.unsqueeze(1) # [1, 1, M, P, 2] dist = torch.linalg.norm(ego_traj_expanded - maps_expanded, dim=-1) # [B, T, M, P] dist = dist.min(dim=-1, keepdim=False)[0] min_inst_idxs = torch.argmin(dist, dim=-1).tolist() batch_idxs = [[i] for i in range(dist.shape[0])] ts_idxs = [[i for i in range(dist.shape[1])] for j in range(dist.shape[0])] bd_target = target.unsqueeze(1).repeat(1, pred.shape[1], 1, 1, 1) min_bd_insts = bd_target[batch_idxs, ts_idxs, min_inst_idxs] # [B, T, P, 2] bd_inst_starts = min_bd_insts[:, :, :-1, :].flatten(0, 2) bd_inst_ends = min_bd_insts[:, :, 1:, :].flatten(0, 2) ego_traj_starts = ego_traj_starts.unsqueeze(2).repeat(1, 1, P-1, 1).flatten(0, 2) ego_traj_ends = ego_traj_ends.unsqueeze(2).repeat(1, 1, P-1, 1).flatten(0, 2) intersect_mask = segments_intersect(ego_traj_starts, ego_traj_ends, bd_inst_starts, bd_inst_ends) intersect_mask = intersect_mask.reshape(B, T, P-1) intersect_mask = intersect_mask.any(dim=-1) intersect_idx = (intersect_mask == True).nonzero() target = target.view(target.shape[0], -1, target.shape[-1]) # [B, fut_ts, num_vec*num_pts] dist = torch.linalg.norm(pred[:, :, None, :] - target[:, None, :, :], dim=-1) min_idxs = torch.argmin(dist, dim=-1).tolist() batch_idxs = [[i] for i in range(dist.shape[0])] ts_idxs = [[i for i in range(dist.shape[1])] for j in range(dist.shape[0])] min_dist = 
dist[batch_idxs, ts_idxs, min_idxs]
    loss = min_dist
    safe_idx = loss > dis_thresh
    unsafe_idx = loss <= dis_thresh
    loss[safe_idx] = 0
    loss[unsafe_idx] = dis_thresh - loss[unsafe_idx]

    for i in range(len(intersect_idx)):
        loss[intersect_idx[i, 0], intersect_idx[i, 1]:] = 0

    return loss


def segments_intersect(line1_start, line1_end, line2_start, line2_end):
    # Calculating the differences
    dx1 = line1_end[:, 0] - line1_start[:, 0]
    dy1 = line1_end[:, 1] - line1_start[:, 1]
    dx2 = line2_end[:, 0] - line2_start[:, 0]
    dy2 = line2_end[:, 1] - line2_start[:, 1]

    # Calculating determinants
    det = dx1 * dy2 - dx2 * dy1
    det_mask = det != 0

    # Checking if lines are parallel or coincident
    parallel_mask = torch.logical_not(det_mask)

    # Calculating intersection parameters
    t1 = ((line2_start[:, 0] - line1_start[:, 0]) * dy2 -
          (line2_start[:, 1] - line1_start[:, 1]) * dx2) / det
    t2 = ((line2_start[:, 0] - line1_start[:, 0]) * dy1 -
          (line2_start[:, 1] - line1_start[:, 1]) * dx1) / det

    # Checking intersection conditions
    intersect_mask = torch.logical_and(
        torch.logical_and(t1 >= 0, t1 <= 1),
        torch.logical_and(t2 >= 0, t2 <= 1)
    )

    # Handling parallel or coincident lines
    intersect_mask[parallel_mask] = False

    return intersect_mask


@LOSSES.register_module()
class PlanCollisionLoss(nn.Module):
    """Planning constraint to push ego vehicle away from other agents.

    Args:
        reduction (str, optional): The method to reduce the loss.
            Options are "none", "mean" and "sum".
        loss_weight (float, optional): The weight of loss.
        agent_thresh (float, optional): confidence threshold to filter
            agent predictions.
        x_dis_thresh (float, optional): distance threshold between ego
            and other agents in x-axis.
        y_dis_thresh (float, optional): distance threshold between ego
            and other agents in y-axis.
        point_cloud_range (list, optional): point cloud range.
    """

    def __init__(
        self,
        reduction='mean',
        loss_weight=1.0,
        agent_thresh=0.5,
        x_dis_thresh=3.0,
        y_dis_thresh=1.5,
        point_cloud_range=[-15.0, -30.0, -2.0, 15.0, 30.0, 2.0]
    ):
        super(PlanCollisionLoss, self).__init__()
        self.reduction = reduction
        self.loss_weight = loss_weight
        self.agent_thresh = agent_thresh
        self.x_dis_thresh = x_dis_thresh
        self.y_dis_thresh = y_dis_thresh
        self.pc_range = point_cloud_range

    def forward(self,
                ego_fut_preds,
                agent_fut_preds,
                agent_score_preds,
                agent_fut_cls_preds,
                weight=None,
                avg_factor=None,
                reduction_override=None):
        """Forward function.

        Args:
            ego_fut_preds (Tensor): [B, fut_ts, 2]
            agent_fut_preds (Tensor): [B, num_agent, fut_mode, fut_ts, 2]
            agent_fut_cls_preds (Tensor): [B, num_agent, fut_mode]
            agent_score_preds (Tensor): [B, num_agent]
            weight (torch.Tensor, optional): The weight of loss for each
                prediction. Defaults to None.
            avg_factor (int, optional): Average factor that is used to
                average the loss. Defaults to None.
            reduction_override (str, optional): The reduction method used
                to override the original reduction method of the loss.
                Defaults to None.
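
        Example (a minimal shape-only sketch; the sizes and random tensors
        below are illustrative assumptions, not values used by this repo):
            >>> crit = PlanCollisionLoss(loss_weight=1.0, agent_thresh=0.5)
            >>> ego_fut_preds = torch.zeros(1, 6, 2)         # [B, fut_ts, 2]
            >>> agent_fut_preds = torch.rand(1, 4, 6, 6, 2)  # [B, num_agent, fut_mode, fut_ts, 2]
            >>> agent_score_preds = torch.rand(1, 4)         # [B, num_agent]
            >>> agent_fut_cls_preds = torch.rand(1, 4, 6)    # [B, num_agent, fut_mode]
            >>> loss = crit(ego_fut_preds, agent_fut_preds,
            ...             agent_score_preds, agent_fut_cls_preds)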
""" assert reduction_override in (None, 'none', 'mean', 'sum') reduction = ( reduction_override if reduction_override else self.reduction) # filter agent element according to confidence score # agent_max_score_preds, agent_max_score_idxs = agent_score_preds # .max(dim=-1) not_valid_agent_mask = agent_score_preds < self.agent_thresh # filter low confidence preds agent_fut_preds[not_valid_agent_mask] = 1e6 # filter not vehicle preds # not_veh_pred_mask = agent_max_score_idxs > 4 # veh idxs are 0-4 # agent_fut_preds[not_veh_pred_mask] = 1e6 # only use best mode pred best_mode_idxs = torch.argmax(agent_fut_cls_preds, dim=-1).tolist() batch_idxs = [[i] for i in range(agent_fut_cls_preds.shape[0])] agent_num_idxs = [[i for i in range(agent_fut_cls_preds.shape[1])] for j in range(agent_fut_cls_preds.shape[0])] agent_fut_preds = agent_fut_preds[batch_idxs, agent_num_idxs, best_mode_idxs] loss_bbox = self.loss_weight * plan_col_loss(ego_fut_preds, target=agent_fut_preds, weight=weight, x_dis_thresh=self.x_dis_thresh, y_dis_thresh=self.y_dis_thresh, reduction=reduction, avg_factor=avg_factor) return loss_bbox @mmcv.jit(derivate=True, coderize=True) @weighted_loss def plan_col_loss( pred, target, x_dis_thresh=3.0, y_dis_thresh=1.5, dis_thresh=3.0 ): """Planning ego-agent collsion constraint. Args: pred (torch.Tensor): ego_fut_preds, [B, fut_ts, 2]. target (torch.Tensor): agent_preds, [B, num_agent, 2]. agent_fut_preds (Tensor): [B, num_agent, fut_ts, 2]. weight (torch.Tensor): [B, fut_ts, 2]. x_dis_thresh (float, optional): distance threshold between ego and other agents in x-axis. y_dis_thresh (float, optional): distance threshold between ego and other agents in y-axis. dis_thresh (float, optional): distance threshold to filter distant agents. Returns: torch.Tensor: Calculated loss [B, fut_mode, fut_ts, 2] """ pred = pred.cumsum(dim=-2) # agent_fut_preds = agent_fut_preds.cumsum(dim=-2) # target = target[:, :, None, :] + agent_fut_preds # filter distant agents from ego vehicle dist = torch.linalg.norm(pred[:, None, :, :] - target, dim=-1) dist_mask = dist > dis_thresh target[dist_mask] = 1e6 # [B, num_agent, fut_ts] x_dist = torch.abs(pred[:, None, :, 0] - target[..., 0]) y_dist = torch.abs(pred[:, None, :, 1] - target[..., 1]) x_min_idxs = torch.argmin(x_dist, dim=1).tolist() y_min_idxs = torch.argmin(y_dist, dim=1).tolist() batch_idxs = [[i] for i in range(y_dist.shape[0])] ts_idxs = [[i for i in range(y_dist.shape[-1])] for j in range(y_dist.shape[0])] # [B, fut_ts] x_min_dist = x_dist[batch_idxs, x_min_idxs, ts_idxs] y_min_dist = y_dist[batch_idxs, y_min_idxs, ts_idxs] x_loss = x_min_dist safe_idx = x_loss > x_dis_thresh unsafe_idx = x_loss <= x_dis_thresh x_loss[safe_idx] = 0 x_loss[unsafe_idx] = x_dis_thresh - x_loss[unsafe_idx] y_loss = y_min_dist safe_idx = y_loss > y_dis_thresh unsafe_idx = y_loss <= y_dis_thresh y_loss[safe_idx] = 0 y_loss[unsafe_idx] = y_dis_thresh - y_loss[unsafe_idx] loss = torch.cat([x_loss.unsqueeze(-1), y_loss.unsqueeze(-1)], dim=-1) return loss @LOSSES.register_module() class PlanMapDirectionLoss(nn.Module): """Planning loss to force the ego heading angle consistent with lane direction. Args: reduction (str, optional): The method to reduce the loss. Options are "none", "mean" and "sum". loss_weight (float, optional): The weight of loss. theta_thresh (float, optional): angle diff thresh between ego and lane. point_cloud_range (list, optional): point cloud range. 
""" def __init__( self, reduction='mean', loss_weight=1.0, map_thresh=0.5, dis_thresh=2.0, lane_div_cls_idx=1, point_cloud_range = [-15.0, -30.0, -2.0, 15.0, 30.0, 2.0] ): super(PlanMapDirectionLoss, self).__init__() self.reduction = reduction self.loss_weight = loss_weight self.map_thresh = map_thresh self.dis_thresh = dis_thresh self.lane_div_cls_idx = lane_div_cls_idx self.pc_range = point_cloud_range def forward(self, ego_fut_preds, lane_preds, lane_score_preds, weight=None, avg_factor=None, reduction_override=None): """Forward function. Args: ego_fut_preds (Tensor): [B, fut_ts, 2] lane_preds (Tensor): [B, num_vec, num_pts, 2] lane_score_preds (Tensor): [B, num_vec, 3] weight (torch.Tensor, optional): The weight of loss for each prediction. Defaults to None. avg_factor (int, optional): Average factor that is used to average the loss. Defaults to None. reduction_override (str, optional): The reduction method used to override the original reduction method of the loss. Defaults to None. """ assert reduction_override in (None, 'none', 'mean', 'sum') reduction = ( reduction_override if reduction_override else self.reduction) # filter lane element according to confidence score and class not_lane_div_mask = lane_score_preds[..., self.lane_div_cls_idx] < self.map_thresh # denormalize map pts lane_div_preds = lane_preds.clone() lane_div_preds[...,0:1] = (lane_div_preds[..., 0:1] * (self.pc_range[3] - self.pc_range[0]) + self.pc_range[0]) lane_div_preds[...,1:2] = (lane_div_preds[..., 1:2] * (self.pc_range[4] - self.pc_range[1]) + self.pc_range[1]) # pad not-lane-divider cls and low confidence preds lane_div_preds[not_lane_div_mask] = 1e6 loss_bbox = self.loss_weight * plan_map_dir_loss(ego_fut_preds, lane_div_preds, weight=weight, dis_thresh=self.dis_thresh, reduction=reduction, avg_factor=avg_factor) return loss_bbox @mmcv.jit(derivate=True, coderize=True) @weighted_loss def plan_map_dir_loss(pred, target, dis_thresh=2.0): """Planning ego-map directional loss. Args: pred (torch.Tensor): ego_fut_preds, [B, fut_ts, 2]. target (torch.Tensor): lane_div_preds, [B, num_vec, num_pts, 2]. 
weight (torch.Tensor): [B, fut_ts] Returns: torch.Tensor: Calculated loss [B, fut_ts] """ num_map_pts = target.shape[2] pred = pred.cumsum(dim=-2) traj_dis = torch.linalg.norm(pred[:, -1, :] - pred[:, 0, :], dim=-1) static_mask = traj_dis < 1.0 target = target.unsqueeze(1).repeat(1, pred.shape[1], 1, 1, 1) # find the closest map instance for ego at each timestamp dist = torch.linalg.norm(pred[:, :, None, None, :] - target, dim=-1) dist = dist.min(dim=-1, keepdim=False)[0] min_inst_idxs = torch.argmin(dist, dim=-1).tolist() batch_idxs = [[i] for i in range(dist.shape[0])] ts_idxs = [[i for i in range(dist.shape[1])] for j in range(dist.shape[0])] target_map_inst = target[batch_idxs, ts_idxs, min_inst_idxs] # [B, fut_ts, num_pts, 2] # calculate distance dist = torch.linalg.norm(pred[:, :, None, :] - target_map_inst, dim=-1) min_pts_idxs = torch.argmin(dist, dim=-1) min_pts_next_idxs = min_pts_idxs.clone() is_end_point = (min_pts_next_idxs == num_map_pts-1) not_end_point = (min_pts_next_idxs != num_map_pts-1) min_pts_next_idxs[is_end_point] = num_map_pts - 2 min_pts_next_idxs[not_end_point] = min_pts_next_idxs[not_end_point] + 1 min_pts_idxs = min_pts_idxs.tolist() min_pts_next_idxs = min_pts_next_idxs.tolist() traj_yaw = torch.atan2(torch.diff(pred[..., 1]), torch.diff(pred[..., 0])) # [B, fut_ts-1] # last ts yaw assume same as previous traj_yaw = torch.cat([traj_yaw, traj_yaw[:, [-1]]], dim=-1) # [B, fut_ts] min_pts = target_map_inst[batch_idxs, ts_idxs, min_pts_idxs] dist = torch.linalg.norm(min_pts - pred, dim=-1) dist_mask = dist > dis_thresh min_pts = min_pts.unsqueeze(2) min_pts_next = target_map_inst[batch_idxs, ts_idxs, min_pts_next_idxs].unsqueeze(2) map_pts = torch.cat([min_pts, min_pts_next], dim=2) lane_yaw = torch.atan2(torch.diff(map_pts[..., 1]).squeeze(-1), torch.diff(map_pts[..., 0]).squeeze(-1)) # [B, fut_ts] yaw_diff = traj_yaw - lane_yaw yaw_diff[yaw_diff > math.pi] = yaw_diff[yaw_diff > math.pi] - math.pi yaw_diff[yaw_diff > math.pi/2] = yaw_diff[yaw_diff > math.pi/2] - math.pi yaw_diff[yaw_diff < -math.pi] = yaw_diff[yaw_diff < -math.pi] + math.pi yaw_diff[yaw_diff < -math.pi/2] = yaw_diff[yaw_diff < -math.pi/2] + math.pi yaw_diff[dist_mask] = 0 # loss = 0 if no lane around ego yaw_diff[static_mask] = 0 # loss = 0 if ego is static loss = torch.abs(yaw_diff) return loss # [B, fut_ts] @LOSSES.register_module() class PlanMapDirectionLoss2(nn.Module): """Planning loss to force the ego heading angle consistent with lane direction. Args: reduction (str, optional): The method to reduce the loss. Options are "none", "mean" and "sum". loss_weight (float, optional): The weight of loss. theta_thresh (float, optional): angle diff thresh between ego and lane. point_cloud_range (list, optional): point cloud range. """ def __init__( self, reduction='mean', loss_weight=1.0, map_thresh=0.5, dis_thresh=2.0, lane_div_cls_idx=1, point_cloud_range = [-15.0, -30.0, -2.0, 15.0, 30.0, 2.0] ): super(PlanMapDirectionLoss2, self).__init__() self.reduction = reduction self.loss_weight = loss_weight self.map_thresh = map_thresh self.dis_thresh = dis_thresh self.lane_div_cls_idx = lane_div_cls_idx self.pc_range = point_cloud_range def forward(self, ego_fut_preds, lane_preds, lane_score_preds, weight=None, avg_factor=None, reduction_override=None): """Forward function. Args: ego_fut_preds (Tensor): [B, fut_ts, 2] lane_preds (Tensor): [B, num_vec, num_pts, 2] lane_score_preds (Tensor): [B, num_vec, 3] weight (torch.Tensor, optional): The weight of loss for each prediction. Defaults to None. 
avg_factor (int, optional): Average factor that is used to average the loss. Defaults to None. reduction_override (str, optional): The reduction method used to override the original reduction method of the loss. Defaults to None. """ assert reduction_override in (None, 'none', 'mean', 'sum') reduction = ( reduction_override if reduction_override else self.reduction) # filter lane element according to confidence score and class not_lane_div_mask = lane_score_preds[..., self.lane_div_cls_idx] < self.map_thresh # denormalize map pts lane_div_preds = lane_preds.clone() lane_div_preds[...,0:1] = (lane_div_preds[..., 0:1] * (self.pc_range[3] - self.pc_range[0]) + self.pc_range[0]) lane_div_preds[...,1:2] = (lane_div_preds[..., 1:2] * (self.pc_range[4] - self.pc_range[1]) + self.pc_range[1]) # pad not-lane-divider cls and low confidence preds lane_div_preds[not_lane_div_mask] = 1e6 loss_bbox = self.loss_weight * plan_map_dir_loss2(ego_fut_preds, lane_div_preds, weight=weight, dis_thresh=self.dis_thresh, reduction=reduction, avg_factor=avg_factor) return loss_bbox @mmcv.jit(derivate=True, coderize=True) @weighted_loss def plan_map_dir_loss2(pred, target, dis_thresh=2.0): """Planning ego-map directional loss. Args: pred (torch.Tensor): ego_fut_preds, [B, fut_ts, 2]. target (torch.Tensor): lane_div_preds, [B, num_vec, num_pts, 2]. weight (torch.Tensor): [B, fut_ts] Returns: torch.Tensor: Calculated loss [B, fut_ts] """ num_map_pts = target.shape[2] pred = pred.cumsum(dim=-2) traj_dis = torch.linalg.norm(pred[:, -1, :] - pred[:, 0, :], dim=-1) static_mask = traj_dis < 1.0 target = target.unsqueeze(1).repeat(1, pred.shape[1], 1, 1, 1) # find the closest map instance for ego at each timestamp dist = torch.linalg.norm(pred[:, :, None, None, :] - target, dim=-1) dist = dist.min(dim=-1, keepdim=False)[0] min_inst_idxs = torch.argmin(dist, dim=-1).tolist() batch_idxs = [[i] for i in range(dist.shape[0])] ts_idxs = [[i for i in range(dist.shape[1])] for j in range(dist.shape[0])] target_map_inst = target[batch_idxs, ts_idxs, min_inst_idxs] # [B, fut_ts, num_pts, 2] # calculate distance dist = torch.linalg.norm(pred[:, :, None, :] - target_map_inst, dim=-1) min_pts_idxs = torch.argmin(dist, dim=-1) min_pts_next_idxs = min_pts_idxs.clone() is_end_point = (min_pts_next_idxs == num_map_pts-1) not_end_point = (min_pts_next_idxs != num_map_pts-1) min_pts_next_idxs[is_end_point] = num_map_pts - 2 min_pts_next_idxs[not_end_point] = min_pts_next_idxs[not_end_point] + 1 min_pts_idxs = min_pts_idxs.tolist() min_pts_next_idxs = min_pts_next_idxs.tolist() traj_yaw = torch.atan2(torch.diff(pred[..., 0]), torch.diff(pred[..., 1])) # [B, fut_ts-1] # last ts yaw assume same as previous traj_yaw = torch.cat([traj_yaw, traj_yaw[:, [-1]]], dim=-1) # [B, fut_ts] min_pts = target_map_inst[batch_idxs, ts_idxs, min_pts_idxs] dist = torch.linalg.norm(min_pts - pred, dim=-1) dist_mask = dist > dis_thresh min_pts = min_pts.unsqueeze(2) min_pts_next = target_map_inst[batch_idxs, ts_idxs, min_pts_next_idxs].unsqueeze(2) map_pts = torch.cat([min_pts, min_pts_next], dim=2) lane_yaw = torch.atan2(torch.diff(map_pts[..., 0]).squeeze(-1), torch.diff(map_pts[..., 1]).squeeze(-1)) # [B, fut_ts] yaw_diff = traj_yaw - lane_yaw yaw_diff[yaw_diff > math.pi] = yaw_diff[yaw_diff > math.pi] - math.pi yaw_diff[yaw_diff > math.pi/2] = yaw_diff[yaw_diff > math.pi/2] - math.pi yaw_diff[yaw_diff < -math.pi] = yaw_diff[yaw_diff < -math.pi] + math.pi yaw_diff[yaw_diff < -math.pi/2] = yaw_diff[yaw_diff < -math.pi/2] + math.pi yaw_diff[dist_mask] = 0 # loss 
= 0 if no lane around ego yaw_diff[static_mask] = 0 # loss = 0 if ego is static loss = torch.abs(yaw_diff) return loss # [B, fut_ts] ================================================ FILE: mmdet3d/models/fbbev/planner_head/plan_loss_gt.py ================================================ # Copyright (c) 2023-2024, NVIDIA Corporation & Affiliates. All rights reserved. # # This work is made available under the Nvidia Source Code License-NC. # To view a copy of this license, visit # TODO: add license here import math import mmcv import torch from torch import nn as nn from mmdet.models import weighted_loss from mmdet.models.builder import LOSSES @LOSSES.register_module() class PlanMapBoundLoss_gt(nn.Module): """Planning constraint to push ego vehicle away from the lane boundary. Args: reduction (str, optional): The method to reduce the loss. Options are "none", "mean" and "sum". loss_weight (float, optional): The weight of loss. map_thresh (float, optional): confidence threshold to filter map predictions. lane_bound_cls_idx (float, optional): lane_boundary class index. dis_thresh (float, optional): distance threshold between ego vehicle and lane bound. point_cloud_range (list, optional): point cloud range. """ def __init__( self, reduction='mean', loss_weight=1.0, map_thresh=0.5, lane_bound_cls_idx=2, dis_thresh=1.0, point_cloud_range=[-15.0, -30.0, -2.0, 15.0, 30.0, 2.0], perception_detach=False ): super(PlanMapBoundLoss_gt, self).__init__() self.reduction = reduction self.loss_weight = loss_weight self.map_thresh = map_thresh self.lane_bound_cls_idx = lane_bound_cls_idx self.dis_thresh = dis_thresh self.pc_range = point_cloud_range self.perception_detach = perception_detach def forward(self, ego_fut_preds, lane_gt, lane_labels, weight=None, avg_factor=None, reduction_override=None): """Forward function. Args: ego_fut_preds (Tensor): [B, fut_ts, 2] lane_preds (Tensor): [B, num_vec, num_pts, 2] lane_score_preds (Tensor): [B, num_vec, 3] weight (torch.Tensor, optional): The weight of loss for each prediction. Defaults to None. avg_factor (int, optional): Average factor that is used to average the loss. Defaults to None. reduction_override (str, optional): The reduction method used to override the original reduction method of the loss. Defaults to None. """ assert reduction_override in (None, 'none', 'mean', 'sum') reduction = ( reduction_override if reduction_override else self.reduction) # filter lane element according to confidence score and class not_lane_bound_mask = lane_labels != self.lane_bound_cls_idx # denormalize map pts lane_bound_preds = lane_gt.clone() lane_bound_preds[...,0:1] = (lane_bound_preds[..., 0:1] * (self.pc_range[3] - self.pc_range[0]) + self.pc_range[0]) lane_bound_preds[...,1:2] = (lane_bound_preds[..., 1:2] * (self.pc_range[4] - self.pc_range[1]) + self.pc_range[1]) # pad not-lane-boundary cls and low confidence preds lane_bound_preds[not_lane_bound_mask] = 1e6 loss_bbox = self.loss_weight * plan_map_bound_loss(ego_fut_preds, lane_bound_preds, weight=weight, dis_thresh=self.dis_thresh, reduction=reduction, avg_factor=avg_factor) return loss_bbox @mmcv.jit(derivate=True, coderize=True) @weighted_loss def plan_map_bound_loss(pred, target, dis_thresh=1.0): """Planning map bound constraint (L1 distance). Args: pred (torch.Tensor): ego_fut_preds, [B, fut_ts, 2]. target (torch.Tensor): lane_bound_preds, [B, num_vec, num_pts, 2]. 
weight (torch.Tensor): [B, fut_ts] Returns: torch.Tensor: Calculated loss [B, fut_ts] """ pred = pred.cumsum(dim=-2) ego_traj_starts = pred[:, :-1, :] ego_traj_ends = pred B, T, _ = ego_traj_ends.size() padding_zeros = torch.zeros((B, 1, 2), dtype=pred.dtype, device=pred.device) # initial position ego_traj_starts = torch.cat((padding_zeros, ego_traj_starts), dim=1) _, V, P, _ = target.size() ego_traj_expanded = ego_traj_ends.unsqueeze(2).unsqueeze(3) # [B, T, 1, 1, 2] maps_expanded = target.unsqueeze(1) # [1, 1, M, P, 2] dist = torch.linalg.norm(ego_traj_expanded - maps_expanded, dim=-1) # [B, T, M, P] dist = dist.min(dim=-1, keepdim=False)[0] min_inst_idxs = torch.argmin(dist, dim=-1).tolist() batch_idxs = [[i] for i in range(dist.shape[0])] ts_idxs = [[i for i in range(dist.shape[1])] for j in range(dist.shape[0])] bd_target = target.unsqueeze(1).repeat(1, pred.shape[1], 1, 1, 1) min_bd_insts = bd_target[batch_idxs, ts_idxs, min_inst_idxs] # [B, T, P, 2] bd_inst_starts = min_bd_insts[:, :, :-1, :].flatten(0, 2) bd_inst_ends = min_bd_insts[:, :, 1:, :].flatten(0, 2) ego_traj_starts = ego_traj_starts.unsqueeze(2).repeat(1, 1, P-1, 1).flatten(0, 2) ego_traj_ends = ego_traj_ends.unsqueeze(2).repeat(1, 1, P-1, 1).flatten(0, 2) intersect_mask = segments_intersect(ego_traj_starts, ego_traj_ends, bd_inst_starts, bd_inst_ends) intersect_mask = intersect_mask.reshape(B, T, P-1) intersect_mask = intersect_mask.any(dim=-1) intersect_idx = (intersect_mask == True).nonzero() target = target.view(target.shape[0], -1, target.shape[-1]) # [B, fut_ts, num_vec*num_pts] dist = torch.linalg.norm(pred[:, :, None, :] - target[:, None, :, :], dim=-1) min_idxs = torch.argmin(dist, dim=-1).tolist() batch_idxs = [[i] for i in range(dist.shape[0])] ts_idxs = [[i for i in range(dist.shape[1])] for j in range(dist.shape[0])] min_dist = dist[batch_idxs, ts_idxs, min_idxs] loss = min_dist safe_idx = loss > dis_thresh unsafe_idx = loss <= dis_thresh loss[safe_idx] = 0 loss[unsafe_idx] = dis_thresh - loss[unsafe_idx] for i in range(len(intersect_idx)): loss[intersect_idx[i, 0], intersect_idx[i, 1]:] = 0 return loss def segments_intersect(line1_start, line1_end, line2_start, line2_end): # Calculating the differences dx1 = line1_end[:, 0] - line1_start[:, 0] dy1 = line1_end[:, 1] - line1_start[:, 1] dx2 = line2_end[:, 0] - line2_start[:, 0] dy2 = line2_end[:, 1] - line2_start[:, 1] # Calculating determinants det = dx1 * dy2 - dx2 * dy1 det_mask = det != 0 # Checking if lines are parallel or coincident parallel_mask = torch.logical_not(det_mask) # Calculating intersection parameters t1 = ((line2_start[:, 0] - line1_start[:, 0]) * dy2 - (line2_start[:, 1] - line1_start[:, 1]) * dx2) / det t2 = ((line2_start[:, 0] - line1_start[:, 0]) * dy1 - (line2_start[:, 1] - line1_start[:, 1]) * dx1) / det # Checking intersection conditions intersect_mask = torch.logical_and( torch.logical_and(t1 >= 0, t1 <= 1), torch.logical_and(t2 >= 0, t2 <= 1) ) # Handling parallel or coincident lines intersect_mask[parallel_mask] = False return intersect_mask @LOSSES.register_module() class PlanCollisionLoss_gt(nn.Module): """Planning constraint to push ego vehicle away from other agents. Args: reduction (str, optional): The method to reduce the loss. Options are "none", "mean" and "sum". loss_weight (float, optional): The weight of loss. agent_thresh (float, optional): confidence threshold to filter agent predictions. x_dis_thresh (float, optional): distance threshold between ego and other agents in x-axis. 
y_dis_thresh (float, optional): distance threshold between ego and other agents in y-axis. point_cloud_range (list, optional): point cloud range. """ def __init__( self, reduction='mean', loss_weight=1.0, agent_thresh=0.5, x_dis_thresh=3.0, y_dis_thresh=1.5, point_cloud_range = [-15.0, -30.0, -2.0, 15.0, 30.0, 2.0] ): super(PlanCollisionLoss_gt, self).__init__() self.reduction = reduction self.loss_weight = loss_weight self.agent_thresh = agent_thresh self.x_dis_thresh = x_dis_thresh self.y_dis_thresh = y_dis_thresh self.pc_range = point_cloud_range def forward(self, ego_fut_preds, agent_fut_preds, # agent_score_preds, # agent_fut_cls_preds, weight=None, avg_factor=None, reduction_override=None): """Forward function. Args: ego_fut_preds (Tensor): [B, fut_ts, 2] agent_preds (Tensor): [B, num_agent, 2] agent_fut_preds (Tensor): [B, num_agent, fut_ts, 2] agent_fut_cls_preds (Tensor): [B, num_agent, fut_mode] agent_score_preds (Tensor): [B, num_agent] weight (torch.Tensor, optional): The weight of loss for each prediction. Defaults to None. avg_factor (int, optional): Average factor that is used to average the loss. Defaults to None. reduction_override (str, optional): The reduction method used to override the original reduction method of the loss. Defaults to None. """ assert reduction_override in (None, 'none', 'mean', 'sum') reduction = ( reduction_override if reduction_override else self.reduction) # filter agent element according to confidence score # agent_max_score_preds, agent_max_score_idxs = agent_score_preds # .max(dim=-1) # not_valid_agent_mask = agent_score_preds < self.agent_thresh # filter low confidence preds # agent_fut_preds[not_valid_agent_mask] = 1e6 # filter not vehicle preds # not_veh_pred_mask = agent_max_score_idxs > 4 # veh idxs are 0-4 # agent_fut_preds[not_veh_pred_mask] = 1e6 # only use best mode pred # best_mode_idxs = torch.argmax(agent_fut_cls_preds, dim=-1).tolist() # batch_idxs = [[i] for i in range(agent_fut_cls_preds.shape[0])] # agent_num_idxs = [[i for i in range(agent_fut_cls_preds.shape[1])] for j in range(agent_fut_cls_preds.shape[0])] # agent_fut_preds = agent_fut_preds[batch_idxs, agent_num_idxs, best_mode_idxs] loss_bbox = self.loss_weight * plan_col_loss(ego_fut_preds, target=agent_fut_preds, weight=weight, x_dis_thresh=self.x_dis_thresh, y_dis_thresh=self.y_dis_thresh, reduction=reduction, avg_factor=avg_factor) return loss_bbox @mmcv.jit(derivate=True, coderize=True) @weighted_loss def plan_col_loss( pred, target, x_dis_thresh=3.0, y_dis_thresh=1.5, dis_thresh=3.0 ): """Planning ego-agent collsion constraint. Args: pred (torch.Tensor): ego_fut_preds, [B, fut_ts, 2]. target (torch.Tensor): agent_preds, [B, num_agent, 2]. agent_fut_preds (Tensor): [B, num_agent, fut_ts, 2]. weight (torch.Tensor): [B, fut_ts, 2]. x_dis_thresh (float, optional): distance threshold between ego and other agents in x-axis. y_dis_thresh (float, optional): distance threshold between ego and other agents in y-axis. dis_thresh (float, optional): distance threshold to filter distant agents. 
Returns: torch.Tensor: Calculated loss [B, fut_mode, fut_ts, 2] """ pred = pred.cumsum(dim=-2) # agent_fut_preds = agent_fut_preds.cumsum(dim=-2) # target = target[:, :, None, :] + agent_fut_preds # filter distant agents from ego vehicle dist = torch.linalg.norm(pred[:, None, :, :] - target, dim=-1) dist_mask = dist > dis_thresh target[dist_mask] = 1e6 # [B, num_agent, fut_ts] x_dist = torch.abs(pred[:, None, :, 0] - target[..., 0]) y_dist = torch.abs(pred[:, None, :, 1] - target[..., 1]) x_min_idxs = torch.argmin(x_dist, dim=1).tolist() y_min_idxs = torch.argmin(y_dist, dim=1).tolist() batch_idxs = [[i] for i in range(y_dist.shape[0])] ts_idxs = [[i for i in range(y_dist.shape[-1])] for j in range(y_dist.shape[0])] # [B, fut_ts] x_min_dist = x_dist[batch_idxs, x_min_idxs, ts_idxs] y_min_dist = y_dist[batch_idxs, y_min_idxs, ts_idxs] x_loss = x_min_dist safe_idx = x_loss > x_dis_thresh unsafe_idx = x_loss <= x_dis_thresh x_loss[safe_idx] = 0 x_loss[unsafe_idx] = x_dis_thresh - x_loss[unsafe_idx] y_loss = y_min_dist safe_idx = y_loss > y_dis_thresh unsafe_idx = y_loss <= y_dis_thresh y_loss[safe_idx] = 0 y_loss[unsafe_idx] = y_dis_thresh - y_loss[unsafe_idx] loss = torch.cat([x_loss.unsqueeze(-1), y_loss.unsqueeze(-1)], dim=-1) return loss @LOSSES.register_module() class PlanMapDirectionLoss_gt(nn.Module): """Planning loss to force the ego heading angle consistent with lane direction. Args: reduction (str, optional): The method to reduce the loss. Options are "none", "mean" and "sum". loss_weight (float, optional): The weight of loss. theta_thresh (float, optional): angle diff thresh between ego and lane. point_cloud_range (list, optional): point cloud range. """ def __init__( self, reduction='mean', loss_weight=1.0, map_thresh=0.5, dis_thresh=2.0, lane_div_cls_idx=1, point_cloud_range = [-15.0, -30.0, -2.0, 15.0, 30.0, 2.0] ): super(PlanMapDirectionLoss_gt, self).__init__() self.reduction = reduction self.loss_weight = loss_weight self.map_thresh = map_thresh self.dis_thresh = dis_thresh self.lane_div_cls_idx = lane_div_cls_idx self.pc_range = point_cloud_range def forward(self, ego_fut_preds, lane_gt, lane_labels, weight=None, avg_factor=None, reduction_override=None): """Forward function. Args: ego_fut_preds (Tensor): [B, fut_ts, 2] lane_preds (Tensor): [B, num_vec, num_pts, 2] lane_score_preds (Tensor): [B, num_vec, 3] weight (torch.Tensor, optional): The weight of loss for each prediction. Defaults to None. avg_factor (int, optional): Average factor that is used to average the loss. Defaults to None. reduction_override (str, optional): The reduction method used to override the original reduction method of the loss. Defaults to None. 
""" assert reduction_override in (None, 'none', 'mean', 'sum') reduction = ( reduction_override if reduction_override else self.reduction) # filter lane element according to confidence score and class not_lane_div_mask = lane_labels !=self.lane_div_cls_idx # denormalize map pts lane_div_preds = lane_gt.clone() lane_div_preds[...,0:1] = (lane_div_preds[..., 0:1] * (self.pc_range[3] - self.pc_range[0]) + self.pc_range[0]) lane_div_preds[...,1:2] = (lane_div_preds[..., 1:2] * (self.pc_range[4] - self.pc_range[1]) + self.pc_range[1]) # pad not-lane-divider cls and low confidence preds lane_div_preds[not_lane_div_mask] = 1e6 loss_bbox = self.loss_weight * plan_map_dir_loss(ego_fut_preds, lane_div_preds, weight=weight, dis_thresh=self.dis_thresh, reduction=reduction, avg_factor=avg_factor) return loss_bbox @mmcv.jit(derivate=True, coderize=True) @weighted_loss def plan_map_dir_loss(pred, target, dis_thresh=2.0): """Planning ego-map directional loss. Args: pred (torch.Tensor): ego_fut_preds, [B, fut_ts, 2]. target (torch.Tensor): lane_div_preds, [B, num_vec, num_pts, 2]. weight (torch.Tensor): [B, fut_ts] Returns: torch.Tensor: Calculated loss [B, fut_ts] """ num_map_pts = target.shape[2] pred = pred.cumsum(dim=-2) traj_dis = torch.linalg.norm(pred[:, -1, :] - pred[:, 0, :], dim=-1) static_mask = traj_dis < 1.0 target = target.unsqueeze(1).repeat(1, pred.shape[1], 1, 1, 1) # find the closest map instance for ego at each timestamp dist = torch.linalg.norm(pred[:, :, None, None, :] - target, dim=-1) dist = dist.min(dim=-1, keepdim=False)[0] min_inst_idxs = torch.argmin(dist, dim=-1).tolist() batch_idxs = [[i] for i in range(dist.shape[0])] ts_idxs = [[i for i in range(dist.shape[1])] for j in range(dist.shape[0])] target_map_inst = target[batch_idxs, ts_idxs, min_inst_idxs] # [B, fut_ts, num_pts, 2] # calculate distance dist = torch.linalg.norm(pred[:, :, None, :] - target_map_inst, dim=-1) min_pts_idxs = torch.argmin(dist, dim=-1) min_pts_next_idxs = min_pts_idxs.clone() is_end_point = (min_pts_next_idxs == num_map_pts-1) not_end_point = (min_pts_next_idxs != num_map_pts-1) min_pts_next_idxs[is_end_point] = num_map_pts - 2 min_pts_next_idxs[not_end_point] = min_pts_next_idxs[not_end_point] + 1 min_pts_idxs = min_pts_idxs.tolist() min_pts_next_idxs = min_pts_next_idxs.tolist() traj_yaw = torch.atan2(torch.diff(pred[..., 1]), torch.diff(pred[..., 0])) # [B, fut_ts-1] # last ts yaw assume same as previous traj_yaw = torch.cat([traj_yaw, traj_yaw[:, [-1]]], dim=-1) # [B, fut_ts] min_pts = target_map_inst[batch_idxs, ts_idxs, min_pts_idxs] dist = torch.linalg.norm(min_pts - pred, dim=-1) dist_mask = dist > dis_thresh min_pts = min_pts.unsqueeze(2) min_pts_next = target_map_inst[batch_idxs, ts_idxs, min_pts_next_idxs].unsqueeze(2) map_pts = torch.cat([min_pts, min_pts_next], dim=2) lane_yaw = torch.atan2(torch.diff(map_pts[..., 1]).squeeze(-1), torch.diff(map_pts[..., 0]).squeeze(-1)) # [B, fut_ts] yaw_diff = traj_yaw - lane_yaw yaw_diff[yaw_diff > math.pi] = yaw_diff[yaw_diff > math.pi] - math.pi yaw_diff[yaw_diff > math.pi/2] = yaw_diff[yaw_diff > math.pi/2] - math.pi yaw_diff[yaw_diff < -math.pi] = yaw_diff[yaw_diff < -math.pi] + math.pi yaw_diff[yaw_diff < -math.pi/2] = yaw_diff[yaw_diff < -math.pi/2] + math.pi yaw_diff[dist_mask] = 0 # loss = 0 if no lane around ego yaw_diff[static_mask] = 0 # loss = 0 if ego is static loss = torch.abs(yaw_diff) return loss # [B, fut_ts] ================================================ FILE: mmdet3d/models/fbbev/streammapnet/CustomMSDeformableAttention.py 
================================================ # --------------------------------------------- # Copyright (c) OpenMMLab. All rights reserved. # --------------------------------------------- # Modified by Zhiqi Li # --------------------------------------------- from mmcv.ops.multi_scale_deform_attn import multi_scale_deformable_attn_pytorch import mmcv import cv2 as cv import copy import warnings from matplotlib import pyplot as plt import numpy as np import torch import torch.nn as nn import torch.nn.functional as F from mmcv.cnn import xavier_init, constant_init from mmcv.cnn.bricks.registry import (ATTENTION, TRANSFORMER_LAYER_SEQUENCE) from mmcv.cnn.bricks.transformer import TransformerLayerSequence import math from mmcv.runner.base_module import BaseModule, ModuleList, Sequential from mmcv.utils import (ConfigDict, build_from_cfg, deprecated_api_warning, to_2tuple) from mmcv.utils import ext_loader from mmcv.ops.multi_scale_deform_attn import (MultiScaleDeformableAttnFunction, multi_scale_deformable_attn_pytorch) from .fp16_dattn import MultiScaleDeformableAttnFunctionFp32 @ATTENTION.register_module() class CustomMSDeformableAttention(BaseModule): """An attention module used in Deformable-Detr. `Deformable DETR: Deformable Transformers for End-to-End Object Detection. `_. Args: embed_dims (int): The embedding dimension of Attention. Default: 256. num_heads (int): Parallel attention heads. Default: 64. num_levels (int): The number of feature map used in Attention. Default: 4. num_points (int): The number of sampling points for each query in each head. Default: 4. im2col_step (int): The step used in image_to_column. Default: 64. dropout (float): A Dropout layer on `inp_identity`. Default: 0.1. batch_first (bool): Key, Query and Value are shape of (batch, n, embed_dim) or (n, batch, embed_dim). Default to False. norm_cfg (dict): Config dict for normalization layer. Default: None. init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. Default: None. 
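        use_sampling_offsets (bool): Whether to learn per-query sampling
            offsets. When False, no offset branch is built and features are
            sampled directly at the (zero-offset) reference points.
            Default: True.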
""" def __init__(self, embed_dims=256, num_heads=8, num_levels=4, num_points=4, im2col_step=64, dropout=0.1, use_sampling_offsets=True, batch_first=False, norm_cfg=None, init_cfg=None): super().__init__(init_cfg) if embed_dims % num_heads != 0: raise ValueError(f'embed_dims must be divisible by num_heads, ' f'but got {embed_dims} and {num_heads}') dim_per_head = embed_dims // num_heads self.norm_cfg = norm_cfg self.dropout = nn.Dropout(dropout) self.batch_first = batch_first self.fp16_enabled = False # you'd better set dim_per_head to a power of 2 # which is more efficient in the CUDA implementation def _is_power_of_2(n): if (not isinstance(n, int)) or (n < 0): raise ValueError( 'invalid input for _is_power_of_2: {} (type: {})'.format( n, type(n))) return (n & (n - 1) == 0) and n != 0 if not _is_power_of_2(dim_per_head): warnings.warn( "You'd better set embed_dims in " 'MultiScaleDeformAttention to make ' 'the dimension of each attention head a power of 2 ' 'which is more efficient in our CUDA implementation.') self.im2col_step = im2col_step self.embed_dims = embed_dims self.num_levels = num_levels self.num_heads = num_heads self.num_points = num_points self.use_sampling_offsets = use_sampling_offsets if use_sampling_offsets: self.sampling_offsets = nn.Linear( embed_dims, num_heads * num_levels * num_points * 2) self.attention_weights = nn.Linear(embed_dims, num_heads * num_levels * num_points) self.value_proj = nn.Linear(embed_dims, embed_dims) self.output_proj = nn.Linear(embed_dims, embed_dims) self.init_weights() def init_weights(self): """Default initialization for Parameters of Module.""" if self.use_sampling_offsets: constant_init(self.sampling_offsets, 0.) thetas = torch.arange( self.num_heads, dtype=torch.float32) * (2.0 * math.pi / self.num_heads) grid_init = torch.stack([thetas.cos(), thetas.sin()], -1) grid_init = (grid_init / grid_init.abs().max(-1, keepdim=True)[0]).view( self.num_heads, 1, 1, 2).repeat(1, self.num_levels, self.num_points, 1) for i in range(self.num_points): grid_init[:, :, i, :] *= i + 1 self.sampling_offsets.bias.data = grid_init.view(-1) constant_init(self.attention_weights, val=0., bias=0.) xavier_init(self.value_proj, distribution='uniform', bias=0.) xavier_init(self.output_proj, distribution='uniform', bias=0.) self._is_init = True @deprecated_api_warning({'residual': 'identity'}, cls_name='MultiScaleDeformableAttention') def forward(self, query, key=None, value=None, identity=None, query_pos=None, key_padding_mask=None, reference_points=None, spatial_shapes=None, level_start_index=None, flag='decoder', **kwargs): """Forward Function of MultiScaleDeformAttention. Args: query (Tensor): Query of Transformer with shape (num_query, bs, embed_dims). key (Tensor): The key tensor with shape `(num_key, bs, embed_dims)`. value (Tensor): The value tensor with shape `(num_key, bs, embed_dims)`. identity (Tensor): The tensor used for addition, with the same shape as `query`. Default None. If None, `query` will be used. query_pos (Tensor): The positional encoding for `query`. Default: None. key_pos (Tensor): The positional encoding for `key`. Default None. reference_points (Tensor): The normalized reference points with shape (bs, num_query, num_levels, num_points, 2), all elements is range in [0, 1], top-left (0,0), bottom-right (1, 1), including padding area. key_padding_mask (Tensor): ByteTensor for `query`, with shape [bs, num_key]. spatial_shapes (Tensor): Spatial shape of features in different levels. 
With shape (num_levels, 2), last dimension represents (h, w). level_start_index (Tensor): The start index of each level. A tensor has shape ``(num_levels, )`` and can be represented as [0, h_0*w_0, h_0*w_0+h_1*w_1, ...]. Returns: Tensor: forwarded results with shape [num_query, bs, embed_dims]. """ if value is None: value = query if identity is None: identity = query if query_pos is not None: query = query + query_pos if not self.batch_first: # change to (bs, num_query ,embed_dims) query = query.permute(1, 0, 2) value = value.permute(1, 0, 2) bs, num_query, _ = query.shape bs, num_value, _ = value.shape assert (spatial_shapes[:, 0] * spatial_shapes[:, 1]).sum() == num_value value = self.value_proj(value) if key_padding_mask is not None: value = value.masked_fill(key_padding_mask[..., None], 0.0) value = value.view(bs, num_value, self.num_heads, -1) if self.use_sampling_offsets: sampling_offsets = self.sampling_offsets(query).view( bs, num_query, self.num_heads, self.num_levels, self.num_points, 2) else: sampling_offsets = query.new_zeros((bs, num_query, self.num_heads, self.num_levels, self.num_points, 2)) attention_weights = self.attention_weights(query).view( bs, num_query, self.num_heads, self.num_levels * self.num_points) attention_weights = attention_weights.softmax(-1) attention_weights = attention_weights.view(bs, num_query, self.num_heads, self.num_levels, self.num_points) # TODO: try remove sampling offsets offset_normalizer = torch.stack( [spatial_shapes[..., 1], spatial_shapes[..., 0]], -1) # changed to (h, w) _, _, num_points, _ = reference_points.shape # (bs, num_queries, num_pts, 2) -> # (bs, num_queries, num_heads, num_lvls, num_pts, 2) reference_points = reference_points[:, :, None, None, :, :] # reference_points[..., 1:2] = -reference_points[..., 1:2] sampling_locations = reference_points + \ (sampling_offsets # (bs, num_queries, num_heads, num_lvls, num_pts, 2) / offset_normalizer[None, None, None, :, None, :]) assert list(sampling_locations.shape) == [bs, num_query, self.num_heads, self.num_levels, num_points, 2] if torch.cuda.is_available() and value.is_cuda: # using fp16 deformable attention is unstable because it performs many sum operations output = MultiScaleDeformableAttnFunctionFp32.apply( value, spatial_shapes, level_start_index, sampling_locations, attention_weights, self.im2col_step) else: output = multi_scale_deformable_attn_pytorch( value, spatial_shapes, sampling_locations, attention_weights) output = self.output_proj(output) if not self.batch_first: # (num_query, bs ,embed_dims) output = output.permute(1, 0, 2) return self.dropout(output) + identity ================================================ FILE: mmdet3d/models/fbbev/streammapnet/__init__.py ================================================ from .cost import * from .hungarian_lines_assigner import * from .loss import * from .streammapnet_head import MapDetectorHead from .transformer import MapTransformerDecoder_new, MapTransformerLayer, MapTransformer ================================================ FILE: mmdet3d/models/fbbev/streammapnet/cost.py ================================================ import torch from mmdet.core.bbox.match_costs.builder import MATCH_COST from mmdet.core.bbox.match_costs import build_match_cost from torch.nn.functional import smooth_l1_loss from mmdet.core.bbox.iou_calculators import bbox_overlaps from mmdet.core.bbox.transforms import bbox_cxcywh_to_xyxy def chamfer_distance(line1, line2) -> float: ''' Calculate chamfer distance between two lines. 
Make sure the lines are interpolated. Args: line1 (tensor): shape (num_pts, 2) line2 (tensor): shape (num_pts, 2) Returns: distance (float): chamfer distance ''' dist_matrix = torch.cdist(line1, line2, p=2) dist12 = dist_matrix.min(-1)[0].sum() / len(line1) dist21 = dist_matrix.min(-2)[0].sum() / len(line2) return (dist12 + dist21) / 2 @MATCH_COST.register_module() class ClsSigmoidCost: """ClsSoftmaxCost. Args: weight (int | float, optional): loss_weight """ def __init__(self, weight=1.): self.weight = weight def __call__(self, cls_pred, gt_labels): """ Args: cls_pred (Tensor): Predicted classification logits, shape [num_query, num_class]. gt_labels (Tensor): Label of `gt_bboxes`, shape (num_gt,). Returns: torch.Tensor: cls_cost value with weight """ # Following the official DETR repo, contrary to the loss that # NLL is used, we approximate it in 1 - cls_score[gt_label]. # The 1 is a constant that doesn't change the matching, # so it can be omitted. cls_score = cls_pred.sigmoid() cls_cost = -cls_score[:, gt_labels] return cls_cost * self.weight @MATCH_COST.register_module() class LinesFixNumChamferCost(object): """BBox3DL1Cost. Args: weight (int | float, optional): loss_weight """ def __init__(self, weight=1.0, permute=False): self.weight = weight self.permute = permute def __call__(self, lines_pred, gt_lines): """ Args: lines_pred (Tensor): predicted normalized lines: [num_query, 2*num_points] gt_lines (Tensor): Ground truth lines [num_gt, 2*num_points] or [num_gt, num_permute, 2*num_points] Returns: torch.Tensor: reg_cost value with weight shape [num_pred, num_gt] """ if self.permute: assert len(gt_lines.shape) == 3 else: assert len(gt_lines.shape) == 2 num_gt, num_pred = len(gt_lines), len(lines_pred) if self.permute: gt_lines = gt_lines.flatten(0, 1) # (num_gt*num_permute, 2*num_pts) num_pts = lines_pred.shape[-1] // 2 lines_pred = lines_pred.view(-1, 2) # [num_query*num_points, 2] gt_lines = gt_lines.view(-1, 2) # [num_gt*num_points, 2] dist_mat = torch.cdist(lines_pred, gt_lines, p=2) # (num_query*num_points, num_gt*num_points) dist_mat = torch.stack(torch.split(dist_mat, num_pts, dim=-1)) # (num_gt, num_query*num_points, num_pts) dist_mat = torch.stack(torch.split(dist_mat, num_pts, dim=1)) # (num_q, num_gt, num_pts, num_pts) dist1 = dist_mat.min(-1)[0].sum(-1) dist2 = dist_mat.min(-2)[0].sum(-1) dist_mat = (dist1 + dist2) / (2 * num_pts) # (num_pred, num_gt) if self.permute: # dist_mat: (num_pred, num_gt*num_permute) dist_mat = dist_mat.view(num_pred, num_gt, -1) # (num_pred, num_gt, num_permute) dist_mat, gt_permute_index = dist_mat.min(-1) return dist_mat * self.weight, gt_permute_index return dist_mat * self.weight @MATCH_COST.register_module() class LinesL1Cost(object): """LinesL1Cost. 
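    Pairwise matching cost between predicted and ground-truth polylines:
    the L1 distance (smooth-L1 when ``beta`` > 0) between flattened lines,
    averaged over the number of points. With ``permute=True``, the minimum
    cost over the permuted ground-truth orderings is taken and the chosen
    permutation index is returned alongside the cost matrix.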
Args: weight (int | float, optional): loss_weight """ def __init__(self, weight=1.0, beta=0.0, permute=False): self.weight = weight self.permute = permute self.beta = beta def __call__(self, lines_pred, gt_lines, **kwargs): """ Args: lines_pred (Tensor): predicted normalized lines: [num_query, num_points, 2] or [num_query, num_points * 2] gt_lines (Tensor): Ground truth lines [num_gt, num_points, 2] or [num_gt, num_permute, num_points, 2] Returns: torch.Tensor: reg_cost value with weight shape [num_pred, num_gt] """ if self.permute: assert len(gt_lines.shape) == 4 else: assert len(gt_lines.shape) == 3 if lines_pred.dim() == 3: lines_pred = lines_pred.flatten(-2, -1) gt_lines = gt_lines.flatten(-2, -1) num_pred, num_gt = len(lines_pred), len(gt_lines) if self.permute: # permute-invarint labels gt_lines = gt_lines.flatten(0, 1) # (num_gt*num_permute, 2*num_pts) num_pts = lines_pred.shape[-1]//2 if self.beta > 0: lines_pred = lines_pred.unsqueeze(1).repeat(1, len(gt_lines), 1) gt_lines = gt_lines.unsqueeze(0).repeat(num_pred, 1, 1) dist_mat = smooth_l1_loss(lines_pred, gt_lines, reduction='none', beta=self.beta).sum(-1) else: dist_mat = torch.cdist(lines_pred, gt_lines, p=1) dist_mat = dist_mat / num_pts if self.permute: # dist_mat: (num_pred, num_gt*num_permute) dist_mat = dist_mat.view(num_pred, num_gt, -1) # (num_pred, num_gt, num_permute) dist_mat, gt_permute_index = torch.min(dist_mat, 2) return dist_mat * self.weight, gt_permute_index return dist_mat * self.weight @MATCH_COST.register_module() class BBoxCostC: """BBoxL1Cost. Args: weight (int | float, optional): loss_weight box_format (str, optional): 'xyxy' for DETR, 'xywh' for Sparse_RCNN Examples: >>> from mmdet.core.bbox.match_costs.match_cost import BBoxL1Cost >>> import torch >>> self = BBoxL1Cost() >>> bbox_pred = torch.rand(1, 4) >>> gt_bboxes= torch.FloatTensor([[0, 0, 2, 4], [1, 2, 3, 4]]) >>> factor = torch.tensor([10, 8, 10, 8]) >>> self(bbox_pred, gt_bboxes, factor) tensor([[1.6172, 1.6422]]) """ def __init__(self, weight=1., box_format='xyxy'): self.weight = weight assert box_format in ['xyxy', 'xywh'] self.box_format = box_format def __call__(self, bbox_pred, gt_bboxes): """ Args: bbox_pred (Tensor): Predicted boxes with normalized coordinates (cx, cy, w, h), which are all in range [0, 1]. Shape [num_query, 4]. gt_bboxes (Tensor): Ground truth boxes with normalized coordinates (x1, y1, x2, y2). Shape [num_gt, 4]. Returns: torch.Tensor: bbox_cost value with weight """ # if self.box_format == 'xywh': # gt_bboxes = bbox_xyxy_to_cxcywh(gt_bboxes) # elif self.box_format == 'xyxy': # bbox_pred = bbox_cxcywh_to_xyxy(bbox_pred) bbox_cost = torch.cdist(bbox_pred, gt_bboxes, p=1) return bbox_cost * self.weight @MATCH_COST.register_module() class IoUCostC: """IoUCost. Args: iou_mode (str, optional): iou mode such as 'iou' | 'giou' weight (int | float, optional): loss weight Examples: >>> from mmdet.core.bbox.match_costs.match_cost import IoUCost >>> import torch >>> self = IoUCost() >>> bboxes = torch.FloatTensor([[1,1, 2, 2], [2, 2, 3, 4]]) >>> gt_bboxes = torch.FloatTensor([[0, 0, 2, 4], [1, 2, 3, 4]]) >>> self(bboxes, gt_bboxes) tensor([[-0.1250, 0.1667], [ 0.1667, -0.5000]]) """ def __init__(self, iou_mode='giou', weight=1., box_format='xywh'): self.weight = weight self.iou_mode = iou_mode assert box_format in ['xyxy', 'xywh'] self.box_format = box_format def __call__(self, bboxes, gt_bboxes): """ Args: bboxes (Tensor): Predicted boxes with unnormalized coordinates (x1, y1, x2, y2). Shape [num_query, 4]. 
gt_bboxes (Tensor): Ground truth boxes with unnormalized coordinates (x1, y1, x2, y2). Shape [num_gt, 4]. Returns: torch.Tensor: iou_cost value with weight """ if self.box_format == 'xywh': bboxes = bbox_cxcywh_to_xyxy(bboxes) gt_bboxes = bbox_cxcywh_to_xyxy(gt_bboxes) # overlaps: [num_bboxes, num_gt] overlaps = bbox_overlaps( bboxes, gt_bboxes, mode=self.iou_mode, is_aligned=False) # The 1 is a constant that doesn't change the matching, so omitted. iou_cost = -overlaps return iou_cost * self.weight @MATCH_COST.register_module() class DynamicLinesCost(object): """LinesL1Cost. Args: weight (int | float, optional): loss_weight """ def __init__(self, weight=1.): self.weight = weight def __call__(self, lines_pred, lines_gt, masks_pred, masks_gt): """ Args: lines_pred (Tensor): predicted normalized lines: [nP, num_points, 2] lines_gt (Tensor): Ground truth lines [nG, num_points, 2] masks_pred: [nP, num_points] masks_gt: [nG, num_points] Returns: dist_mat: reg_cost value with weight shape [nP, nG] """ dist_mat = self.cal_dist(lines_pred, lines_gt) dist_mat = self.get_dynamic_line(dist_mat, masks_pred, masks_gt) dist_mat = dist_mat * self.weight return dist_mat def cal_dist(self, x1, x2): ''' Args: x1: B1,N,2 x2: B2,N,2 Return: dist_mat: B1,B2,N ''' x1 = x1.permute(1, 0, 2) x2 = x2.permute(1, 0, 2) dist_mat = torch.cdist(x1, x2, p=2) dist_mat = dist_mat.permute(1, 2, 0) return dist_mat def get_dynamic_line(self, mat, m1, m2): ''' get dynamic line with difference approach mat: N1xN2xnpts m1: N1xnpts m2: N2xnpts ''' # nPxnGxnum_points m1 = m1.unsqueeze(1).sigmoid() > 0.5 m2 = m2.unsqueeze(0) valid_points_mask = (m1 + m2)/2. average_factor_mask = valid_points_mask.sum(-1) > 0 average_factor = average_factor_mask.masked_fill( ~average_factor_mask, 1) # takes the average mat = mat * valid_points_mask mat = mat.sum(-1) / average_factor return mat @MATCH_COST.register_module() class BBoxLogitsCost(object): """BBoxLogits. Args: weight (int | float, optional): loss_weight """ def __init__(self, weight=1.): self.weight = weight def calNLL(self, logits, value): ''' Args: logits: B1, 8, cls_dim value: B2, 8, Return: log_likelihood: B1,B2,8 ''' logits = logits[:, None] value = value[None] value = value.long().unsqueeze(-1) value, log_pmf = torch.broadcast_tensors(value, logits) value = value[..., :1] return log_pmf.gather(-1, value).squeeze(-1) def __call__(self, bbox_pred, bbox_gt, **kwargs): """ Args: bbox_pred: nproposal, 4*2, pos_dim bbox_gt: ngt, 4*2 Returns: cost: nproposal, ngt """ cost = self.calNLL(bbox_pred, bbox_gt).mean(-1) return cost * self.weight @MATCH_COST.register_module() class MapQueriesCost(object): def __init__(self, cls_cost, reg_cost, iou_cost=None): self.cls_cost = build_match_cost(cls_cost) self.reg_cost = build_match_cost(reg_cost) self.iou_cost = None if iou_cost is not None: self.iou_cost = build_match_cost(iou_cost) def __call__(self, preds: dict, gts: dict): # classification and bboxcost. cls_cost = self.cls_cost(preds['scores'], gts['labels']) # regression cost regkwargs = {} if 'masks' in preds and 'masks' in gts: assert isinstance(self.reg_cost, DynamicLinesCost), ' Issues!!' 
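            # masks are only consumed by DynamicLinesCost, which averages the
            # point-wise distances over each line's valid (visible) points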
regkwargs = { 'masks_pred': preds['masks'], 'masks_gt': gts['masks'], } reg_cost = self.reg_cost(preds['lines'], gts['lines'], **regkwargs) if self.reg_cost.permute: reg_cost, gt_permute_idx = reg_cost # weighted sum of above three costs cost = cls_cost + reg_cost # Iou if self.iou_cost is not None: iou_cost = self.iou_cost(preds['lines'],gts['lines']) cost += iou_cost if self.reg_cost.permute: return cost, gt_permute_idx return cost ================================================ FILE: mmdet3d/models/fbbev/streammapnet/fp16_dattn.py ================================================ import warnings try: from mmcv.ops.multi_scale_deform_attn import MultiScaleDeformableAttention except ImportError: warnings.warn( '`MultiScaleDeformableAttention` in MMCV has been moved to ' '`mmcv.ops.multi_scale_deform_attn`, please update your MMCV') from mmcv.cnn.bricks.transformer import MultiScaleDeformableAttention from mmcv.runner import force_fp32, auto_fp16 from mmcv.cnn.bricks.registry import ATTENTION from mmcv.runner.base_module import BaseModule, ModuleList, Sequential from mmcv.cnn.bricks.transformer import build_attention import math import warnings import torch import torch.nn as nn import torch.nn.functional as F from torch.autograd.function import Function, once_differentiable from mmcv import deprecated_api_warning from mmcv.cnn import constant_init, xavier_init from mmcv.cnn.bricks.registry import ATTENTION from mmcv.runner import BaseModule from mmcv.utils import ext_loader ext_module = ext_loader.load_ext( '_ext', ['ms_deform_attn_backward', 'ms_deform_attn_forward']) from torch.cuda.amp import custom_bwd, custom_fwd @ATTENTION.register_module() class MultiScaleDeformableAttentionFp16(BaseModule): def __init__(self, attn_cfg=None,init_cfg=None,**kwarg): super(MultiScaleDeformableAttentionFp16,self).__init__(init_cfg) # import ipdb; ipdb.set_trace() self.deformable_attention = build_attention(attn_cfg) self.deformable_attention.init_weights() self.fp16_enabled = False @force_fp32(apply_to=('query', 'key', 'value', 'query_pos', 'reference_points','identity')) def forward(self, query, key=None, value=None, identity=None, query_pos=None, key_padding_mask=None, reference_points=None, spatial_shapes=None, level_start_index=None, **kwargs): # import ipdb; ipdb.set_trace() return self.deformable_attention(query, key=key, value=value, identity=identity, query_pos=query_pos, key_padding_mask=key_padding_mask, reference_points=reference_points, spatial_shapes=spatial_shapes, level_start_index=level_start_index,**kwargs) class MultiScaleDeformableAttnFunctionFp32(Function): @staticmethod @custom_fwd(cast_inputs=torch.float32) def forward(ctx, value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, im2col_step): """GPU version of multi-scale deformable attention. Args: value (Tensor): The value has shape (bs, num_keys, mum_heads, embed_dims//num_heads) value_spatial_shapes (Tensor): Spatial shape of each feature map, has shape (num_levels, 2), last dimension 2 represent (h, w) sampling_locations (Tensor): The location of sampling points, has shape (bs ,num_queries, num_heads, num_levels, num_points, 2), the last dimension 2 represent (x, y). attention_weights (Tensor): The weight of sampling points used when calculate the attention, has shape (bs ,num_queries, num_heads, num_levels, num_points), im2col_step (Tensor): The step used in image to column. 
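        Note:
            When called inside an autocast (AMP) region,
            ``@custom_fwd(cast_inputs=torch.float32)`` casts floating-point
            inputs to fp32 and disables autocast for this Function, so the
            deformable-attention kernel accumulates in fp32 (fp16
            accumulation is noted as unstable at the call site).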
Returns: Tensor: has shape (bs, num_queries, embed_dims) """ ctx.im2col_step = im2col_step output = ext_module.ms_deform_attn_forward( value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, im2col_step=ctx.im2col_step) ctx.save_for_backward(value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights) return output @staticmethod @once_differentiable @custom_bwd def backward(ctx, grad_output): """GPU version of backward function. Args: grad_output (Tensor): Gradient of output tensor of forward. Returns: Tuple[Tensor]: Gradient of input tensors in forward. """ value, value_spatial_shapes, value_level_start_index,\ sampling_locations, attention_weights = ctx.saved_tensors grad_value = torch.zeros_like(value) grad_sampling_loc = torch.zeros_like(sampling_locations) grad_attn_weight = torch.zeros_like(attention_weights) ext_module.ms_deform_attn_backward( value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, grad_output.contiguous(), grad_value, grad_sampling_loc, grad_attn_weight, im2col_step=ctx.im2col_step) return grad_value, None, None, \ grad_sampling_loc, grad_attn_weight, None def multi_scale_deformable_attn_pytorch(value, value_spatial_shapes, sampling_locations, attention_weights): """CPU version of multi-scale deformable attention. Args: value (Tensor): The value has shape (bs, num_keys, mum_heads, embed_dims//num_heads) value_spatial_shapes (Tensor): Spatial shape of each feature map, has shape (num_levels, 2), last dimension 2 represent (h, w) sampling_locations (Tensor): The location of sampling points, has shape (bs ,num_queries, num_heads, num_levels, num_points, 2), the last dimension 2 represent (x, y). attention_weights (Tensor): The weight of sampling points used when calculate the attention, has shape (bs ,num_queries, num_heads, num_levels, num_points), Returns: Tensor: has shape (bs, num_queries, embed_dims) """ bs, _, num_heads, embed_dims = value.shape _, num_queries, num_heads, num_levels, num_points, _ =\ sampling_locations.shape value_list = value.split([H_ * W_ for H_, W_ in value_spatial_shapes], dim=1) sampling_grids = 2 * sampling_locations - 1 sampling_value_list = [] for level, (H_, W_) in enumerate(value_spatial_shapes): # bs, H_*W_, num_heads, embed_dims -> # bs, H_*W_, num_heads*embed_dims -> # bs, num_heads*embed_dims, H_*W_ -> # bs*num_heads, embed_dims, H_, W_ value_l_ = value_list[level].flatten(2).transpose(1, 2).reshape( bs * num_heads, embed_dims, H_, W_) # bs, num_queries, num_heads, num_points, 2 -> # bs, num_heads, num_queries, num_points, 2 -> # bs*num_heads, num_queries, num_points, 2 sampling_grid_l_ = sampling_grids[:, :, :, level].transpose(1, 2).flatten(0, 1) # bs*num_heads, embed_dims, num_queries, num_points sampling_value_l_ = F.grid_sample( value_l_, sampling_grid_l_, mode='bilinear', padding_mode='zeros', align_corners=False) sampling_value_list.append(sampling_value_l_) # (bs, num_queries, num_heads, num_levels, num_points) -> # (bs, num_heads, num_queries, num_levels, num_points) -> # (bs, num_heads, 1, num_queries, num_levels*num_points) attention_weights = attention_weights.transpose(1, 2).reshape( bs * num_heads, 1, num_queries, num_levels * num_points) output = (torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights).sum(-1).view(bs, num_heads * embed_dims, num_queries) return output.transpose(1, 2).contiguous() @ATTENTION.register_module() class MultiScaleDeformableAttentionFP32(BaseModule): """An attention 
module used in Deformable-Detr. `Deformable DETR: Deformable Transformers for End-to-End Object Detection. `_. Args: embed_dims (int): The embedding dimension of Attention. Default: 256. num_heads (int): Parallel attention heads. Default: 64. num_levels (int): The number of feature map used in Attention. Default: 4. num_points (int): The number of sampling points for each query in each head. Default: 4. im2col_step (int): The step used in image_to_column. Default: 64. dropout (float): A Dropout layer on `inp_identity`. Default: 0.1. batch_first (bool): Key, Query and Value are shape of (batch, n, embed_dim) or (n, batch, embed_dim). Default to False. norm_cfg (dict): Config dict for normalization layer. Default: None. init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. Default: None. """ def __init__(self, embed_dims=256, num_heads=8, num_levels=4, num_points=4, im2col_step=64, dropout=0.1, batch_first=False, norm_cfg=None, init_cfg=None): super().__init__(init_cfg) if embed_dims % num_heads != 0: raise ValueError(f'embed_dims must be divisible by num_heads, ' f'but got {embed_dims} and {num_heads}') dim_per_head = embed_dims // num_heads self.norm_cfg = norm_cfg self.dropout = nn.Dropout(dropout) self.batch_first = batch_first # you'd better set dim_per_head to a power of 2 # which is more efficient in the CUDA implementation def _is_power_of_2(n): if (not isinstance(n, int)) or (n < 0): raise ValueError( 'invalid input for _is_power_of_2: {} (type: {})'.format( n, type(n))) return (n & (n - 1) == 0) and n != 0 if not _is_power_of_2(dim_per_head): warnings.warn( "You'd better set embed_dims in " 'MultiScaleDeformAttention to make ' 'the dimension of each attention head a power of 2 ' 'which is more efficient in our CUDA implementation.') self.im2col_step = im2col_step self.embed_dims = embed_dims self.num_levels = num_levels self.num_heads = num_heads self.num_points = num_points self.sampling_offsets = nn.Linear( embed_dims, num_heads * num_levels * num_points * 2) self.attention_weights = nn.Linear(embed_dims, num_heads * num_levels * num_points) self.value_proj = nn.Linear(embed_dims, embed_dims) self.output_proj = nn.Linear(embed_dims, embed_dims) self.init_weights() def init_weights(self): """Default initialization for Parameters of Module.""" constant_init(self.sampling_offsets, 0.) thetas = torch.arange( self.num_heads, dtype=torch.float32) * (2.0 * math.pi / self.num_heads) grid_init = torch.stack([thetas.cos(), thetas.sin()], -1) grid_init = (grid_init / grid_init.abs().max(-1, keepdim=True)[0]).view( self.num_heads, 1, 1, 2).repeat(1, self.num_levels, self.num_points, 1) for i in range(self.num_points): grid_init[:, :, i, :] *= i + 1 self.sampling_offsets.bias.data = grid_init.view(-1) constant_init(self.attention_weights, val=0., bias=0.) xavier_init(self.value_proj, distribution='uniform', bias=0.) xavier_init(self.output_proj, distribution='uniform', bias=0.) self._is_init = True @deprecated_api_warning({'residual': 'identity'}, cls_name='MultiScaleDeformableAttention') def forward(self, query, key=None, value=None, identity=None, query_pos=None, key_padding_mask=None, reference_points=None, spatial_shapes=None, level_start_index=None, **kwargs): """Forward Function of MultiScaleDeformAttention. Args: query (Tensor): Query of Transformer with shape (num_query, bs, embed_dims). key (Tensor): The key tensor with shape `(num_key, bs, embed_dims)`. value (Tensor): The value tensor with shape `(num_key, bs, embed_dims)`. 
identity (Tensor): The tensor used for addition, with the same shape as `query`. Default None. If None, `query` will be used. query_pos (Tensor): The positional encoding for `query`. Default: None. key_pos (Tensor): The positional encoding for `key`. Default None. reference_points (Tensor): The normalized reference points with shape (bs, num_query, num_levels, 2), all elements is range in [0, 1], top-left (0,0), bottom-right (1, 1), including padding area. or (N, Length_{query}, num_levels, 4), add additional two dimensions is (w, h) to form reference boxes. key_padding_mask (Tensor): ByteTensor for `query`, with shape [bs, num_key]. spatial_shapes (Tensor): Spatial shape of features in different levels. With shape (num_levels, 2), last dimension represents (h, w). level_start_index (Tensor): The start index of each level. A tensor has shape ``(num_levels, )`` and can be represented as [0, h_0*w_0, h_0*w_0+h_1*w_1, ...]. Returns: Tensor: forwarded results with shape [num_query, bs, embed_dims]. """ if value is None: value = query if identity is None: identity = query if query_pos is not None: query = query + query_pos if not self.batch_first: # change to (bs, num_query ,embed_dims) query = query.permute(1, 0, 2) value = value.permute(1, 0, 2) bs, num_query, _ = query.shape bs, num_value, _ = value.shape assert (spatial_shapes[:, 0] * spatial_shapes[:, 1]).sum() == num_value value = self.value_proj(value) if key_padding_mask is not None: value = value.masked_fill(key_padding_mask[..., None], 0.0) value = value.view(bs, num_value, self.num_heads, -1) sampling_offsets = self.sampling_offsets(query).view( bs, num_query, self.num_heads, self.num_levels, self.num_points, 2) attention_weights = self.attention_weights(query).view( bs, num_query, self.num_heads, self.num_levels * self.num_points) attention_weights = attention_weights.softmax(-1) attention_weights = attention_weights.view(bs, num_query, self.num_heads, self.num_levels, self.num_points) if reference_points.shape[-1] == 2: offset_normalizer = torch.stack( [spatial_shapes[..., 1], spatial_shapes[..., 0]], -1) sampling_locations = reference_points[:, :, None, :, None, :] \ + sampling_offsets \ / offset_normalizer[None, None, None, :, None, :] elif reference_points.shape[-1] == 4: sampling_locations = reference_points[:, :, None, :, None, :2] \ + sampling_offsets / self.num_points \ * reference_points[:, :, None, :, None, 2:] \ * 0.5 else: raise ValueError( f'Last dim of reference_points must be' f' 2 or 4, but get {reference_points.shape[-1]} instead.') if torch.cuda.is_available(): output = MultiScaleDeformableAttnFunctionFp32.apply( value, spatial_shapes, level_start_index, sampling_locations, attention_weights, self.im2col_step) else: output = multi_scale_deformable_attn_pytorch( value, spatial_shapes, level_start_index, sampling_locations, attention_weights, self.im2col_step) output = self.output_proj(output) if not self.batch_first: # (num_query, bs ,embed_dims) output = output.permute(1, 0, 2) return self.dropout(output) + identity ================================================ FILE: mmdet3d/models/fbbev/streammapnet/hungarian_lines_assigner.py ================================================ import torch from mmdet.core.bbox.builder import BBOX_ASSIGNERS from mmdet.core.bbox.assigners import AssignResult from mmdet.core.bbox.assigners import BaseAssigner from mmdet.core.bbox.match_costs import build_match_cost from scipy.optimize import linear_sum_assignment @BBOX_ASSIGNERS.register_module() class 
HungarianLinesAssigner(BaseAssigner): """ Computes one-to-one matching between predictions and ground truth. This class computes an assignment between the targets and the predictions based on the costs. The costs are weighted sum of three components: classification cost and regression L1 cost. The targets don't include the no_object, so generally there are more predictions than targets. After the one-to-one matching, the un-matched are treated as backgrounds. Thus each query prediction will be assigned with `0` or a positive integer indicating the ground truth index: - 0: negative sample, no assigned gt - positive integer: positive sample, index (1-based) of assigned gt Args: cls_weight (int | float, optional): The scale factor for classification cost. Default 1.0. bbox_weight (int | float, optional): The scale factor for regression L1 cost. Default 1.0. """ def __init__(self, cost=dict( type='MapQueriesCost', cls_cost=dict(type='ClassificationCost', weight=1.), reg_cost=dict(type='LinesCost', weight=1.0), ), **kwargs): self.cost = build_match_cost(cost) def assign(self, preds: dict, gts: dict, gt_bboxes_ignore=None, eps=1e-7): """ Computes one-to-one matching based on the weighted costs. This method assign each query prediction to a ground truth or background. The `assigned_gt_inds` with -1 means don't care, 0 means negative sample, and positive number is the index (1-based) of assigned gt. The assignment is done in the following steps, the order matters. 1. assign every prediction to -1 2. compute the weighted costs 3. do Hungarian matching on CPU based on the costs 4. assign all to 0 (background) first, then for each matched pair between predictions and gts, treat this prediction as foreground and assign the corresponding gt index (plus 1) to it. Args: lines_pred (Tensor): predicted normalized lines: [num_query, num_points, 2] cls_pred (Tensor): Predicted classification logits, shape [num_query, num_class]. lines_gt (Tensor): Ground truth lines [num_gt, num_points, 2]. labels_gt (Tensor): Label of `gt_bboxes`, shape (num_gt,). gt_bboxes_ignore (Tensor, optional): Ground truth bboxes that are labelled as `ignored`. Default None. eps (int | float, optional): A value added to the denominator for numerical stability. Default 1e-7. Returns: :obj:`AssignResult`: The assigned result. """ assert gt_bboxes_ignore is None, \ 'Only case when gt_bboxes_ignore is None is supported.' num_gts, num_lines = gts['lines'].size(0), preds['lines'].size(0) # 1. assign -1 by default assigned_gt_inds = \ preds['lines'].new_full((num_lines,), -1, dtype=torch.long) assigned_labels = \ preds['lines'].new_full((num_lines,), -1, dtype=torch.long) if num_gts == 0 or num_lines == 0: # No ground truth or boxes, return empty assignment if num_gts == 0: # No ground truth, assign all to background assigned_gt_inds[:] = 0 return AssignResult( num_gts, assigned_gt_inds, None, labels=assigned_labels), None # 2. compute the weighted costs gt_permute_idx = None # (num_preds, num_gts) if self.cost.reg_cost.permute: cost, gt_permute_idx = self.cost(preds, gts) else: cost = self.cost(preds, gts) # 3. 
do Hungarian matching on CPU using linear_sum_assignment
        cost = cost.detach().cpu().numpy()
        if linear_sum_assignment is None:
            raise ImportError('Please run "pip install scipy" '
                              'to install scipy first.')
        try:
            matched_row_inds, matched_col_inds = linear_sum_assignment(cost)
        except ValueError as err:
            # surface the cost range (e.g. NaN / inf costs) instead of failing silently
            raise ValueError(
                'linear_sum_assignment failed (cost max {}, min {}): {}'.format(
                    cost.max(), cost.min(), err)) from err
        matched_row_inds = torch.from_numpy(matched_row_inds).to(
            preds['lines'].device)
        matched_col_inds = torch.from_numpy(matched_col_inds).to(
            preds['lines'].device)

        # 4. assign backgrounds and foregrounds
        # assign all indices to backgrounds first
        assigned_gt_inds[:] = 0
        # assign foregrounds based on matching results
        assigned_gt_inds[matched_row_inds] = matched_col_inds + 1
        assigned_labels[matched_row_inds] = gts['labels'][matched_col_inds]
        return AssignResult(
            num_gts, assigned_gt_inds, None,
            labels=assigned_labels), gt_permute_idx



================================================
FILE: mmdet3d/models/fbbev/streammapnet/loss.py
================================================
import torch
from torch import nn as nn
from torch.nn import functional as F
from mmdet.models.losses import l1_loss, smooth_l1_loss
from mmdet.models.losses.utils import weighted_loss
import mmcv
from mmdet.models.builder import LOSSES


@LOSSES.register_module()
class LinesL1Loss(nn.Module):

    def __init__(self, reduction='mean', loss_weight=1.0, beta=0.5):
        """L1 loss over polyline points, implemented as a smooth L1 loss
        when `beta` > 0 and normalized by the number of points per line.

        Args:
            reduction (str, optional): The method to reduce the loss.
                Options are "none", "mean" and "sum".
            loss_weight (float, optional): The weight of loss.
            beta (float, optional): Threshold of the smooth L1 loss. A plain
                L1 loss is used when `beta` <= 0.
        """
        super().__init__()
        self.reduction = reduction
        self.loss_weight = loss_weight
        self.beta = beta

    def forward(self,
                pred,
                target,
                weight=None,
                avg_factor=None,
                reduction_override=None):
        """Forward function.

        Args:
            pred (torch.Tensor): The prediction. shape: [bs, ...]
            target (torch.Tensor): The learning target of the prediction.
                shape: [bs, ...]
            weight (torch.Tensor, optional): The weight of loss for each
                prediction. Defaults to None. It's useful when the
                predictions are not all valid.
            avg_factor (int, optional): Average factor that is used to
                average the loss. Defaults to None.
            reduction_override (str, optional): The reduction method used to
                override the original reduction method of the loss.
                Defaults to None.
        """
        assert reduction_override in (None, 'none', 'mean', 'sum')
        reduction = (
            reduction_override if reduction_override else self.reduction)

        if self.beta > 0:
            loss = smooth_l1_loss(
                pred,
                target,
                weight,
                reduction=reduction,
                avg_factor=avg_factor,
                beta=self.beta)
        else:
            loss = l1_loss(
                pred,
                target,
                weight,
                reduction=reduction,
                avg_factor=avg_factor)

        # normalize by the number of points per line (last dim holds 2*num_points)
        num_points = pred.shape[-1] // 2
        loss = loss / num_points

        return loss * self.loss_weight


@mmcv.jit(derivate=True, coderize=True)
@weighted_loss
def bce(pred, label, class_weight=None):
    """Binary cross-entropy with logits.

    pred: B, nquery, npts
    label: B, nquery, npts
    """
    if label.numel() == 0:
        return pred.sum() * 0
    assert pred.size() == label.size()
    loss = F.binary_cross_entropy_with_logits(
        pred, label.float(), pos_weight=class_weight, reduction='none')
    return loss


@LOSSES.register_module()
class MasksLoss(nn.Module):

    def __init__(self, reduction='mean', loss_weight=1.0):
        super(MasksLoss, self).__init__()
        self.reduction = reduction
        self.loss_weight = loss_weight

    def forward(self,
                pred,
                target,
                weight=None,
                avg_factor=None,
                reduction_override=None):
        """Forward function.
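        Binary cross-entropy (with logits) over per-point mask predictions,
        computed by the ``bce`` helper above.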
Args: xxx """ assert reduction_override in (None, 'none', 'mean', 'sum') reduction = ( reduction_override if reduction_override else self.reduction) loss = bce(pred, target, weight, reduction=reduction, avg_factor=avg_factor) return loss*self.loss_weight @mmcv.jit(derivate=True, coderize=True) @weighted_loss def ce(pred, label, class_weight=None): """ pred: B*nquery,npts label: B*nquery, """ if label.numel() == 0: return pred.sum() * 0 loss = F.cross_entropy( pred, label, weight=class_weight, reduction='none') return loss @LOSSES.register_module() class LenLoss(nn.Module): def __init__(self, reduction='mean', loss_weight=1.0): super(LenLoss, self).__init__() self.reduction = reduction self.loss_weight = loss_weight def forward(self, pred, target, weight=None, avg_factor=None, reduction_override=None): """Forward function. Args: xxx """ assert reduction_override in (None, 'none', 'mean', 'sum') reduction = ( reduction_override if reduction_override else self.reduction) loss = ce(pred, target, weight, reduction=reduction, avg_factor=avg_factor) return loss*self.loss_weight ================================================ FILE: mmdet3d/models/fbbev/streammapnet/map_utils.py ================================================ from mmdet.core.bbox.transforms import bbox_xyxy_to_cxcywh, bbox_cxcywh_to_xyxy def normalize_2d_bbox(bboxes, pc_range): patch_h = pc_range[4]-pc_range[1] patch_w = pc_range[3]-pc_range[0] cxcywh_bboxes = bbox_xyxy_to_cxcywh(bboxes) cxcywh_bboxes[...,0:1] = cxcywh_bboxes[..., 0:1] - pc_range[0] cxcywh_bboxes[...,1:2] = cxcywh_bboxes[...,1:2] - pc_range[1] factor = bboxes.new_tensor([patch_w, patch_h,patch_w,patch_h]) normalized_bboxes = cxcywh_bboxes / factor return normalized_bboxes def normalize_2d_pts(pts, pc_range): patch_h = pc_range[4]-pc_range[1] patch_w = pc_range[3]-pc_range[0] new_pts = pts.clone() new_pts[...,0:1] = pts[..., 0:1] - pc_range[0] new_pts[...,1:2] = pts[...,1:2] - pc_range[1] factor = pts.new_tensor([patch_w, patch_h]) normalized_pts = new_pts / factor return normalized_pts def denormalize_2d_bbox(bboxes, pc_range): bboxes = bbox_cxcywh_to_xyxy(bboxes) bboxes[..., 0::2] = (bboxes[..., 0::2]*(pc_range[3] - pc_range[0]) + pc_range[0]) bboxes[..., 1::2] = (bboxes[..., 1::2]*(pc_range[4] - pc_range[1]) + pc_range[1]) return bboxes def denormalize_2d_pts(pts, pc_range): new_pts = pts.clone() new_pts[...,0:1] = (pts[..., 0:1]*(pc_range[3] - pc_range[0]) + pc_range[0]) new_pts[...,1:2] = (pts[...,1:2]*(pc_range[4] - pc_range[1]) + pc_range[1]) return new_pts ================================================ FILE: mmdet3d/models/fbbev/streammapnet/streammapnet_head.py ================================================ import copy import numpy as np import torch import torch.nn as nn import torch.nn.functional as F import mmcv from mmcv.cnn import Conv2d, Linear, build_activation_layer, bias_init_with_prob, xavier_init from mmcv.runner import force_fp32 from mmcv.cnn.bricks.transformer import build_positional_encoding from mmdet.models.utils import build_transformer from mmdet.models import build_loss from mmdet.models.dense_heads.anchor_free_head import AnchorFreeHead from mmdet.core import multi_apply, reduce_mean, build_assigner, build_sampler from mmdet.models import HEADS from mmdet.models.utils.transformer import inverse_sigmoid from .utils import StreamTensorMemory from .utils import MotionMLP @HEADS.register_module(force=True) class MapDetectorHead(nn.Module): def __init__(self, num_queries, num_classes=3, in_channels=256, embed_dims=256, score_thr=0.1, 
num_points=20, coord_dim=2, roi_size=(60, 30), different_heads=True, predict_refine=False, bev_pos=None, sync_cls_avg_factor=True, bg_cls_weight=0., streaming_cfg=dict(), transformer=dict(), loss_cls=dict(), loss_reg=dict(), assigner=dict(), map_layer_index=-1, **kwargs, ): super().__init__() self.num_queries = num_queries self.num_classes = num_classes self.in_channels = in_channels self.embed_dims = embed_dims self.different_heads = different_heads self.predict_refine = predict_refine self.bev_pos = bev_pos self.num_points = num_points self.coord_dim = coord_dim self.sync_cls_avg_factor = sync_cls_avg_factor self.bg_cls_weight = bg_cls_weight self.map_layer_index = map_layer_index if streaming_cfg: self.streaming_query = streaming_cfg['streaming'] else: self.streaming_query = False if self.streaming_query: self.batch_size = streaming_cfg['batch_size'] self.topk_query = streaming_cfg['topk'] self.trans_loss_weight = streaming_cfg.get('trans_loss_weight', 0.0) self.query_memory = StreamTensorMemory( self.batch_size, ) self.reference_points_memory = StreamTensorMemory( self.batch_size, ) c_dim = 12 self.query_update = MotionMLP(c_dim=c_dim, f_dim=self.embed_dims, identity=True) self.target_memory = StreamTensorMemory(self.batch_size) self.register_buffer('roi_size', torch.tensor(roi_size, dtype=torch.float32)) origin = (-roi_size[0]/2, -roi_size[1]/2) self.register_buffer('origin', torch.tensor(origin, dtype=torch.float32)) sampler_cfg = dict(type='PseudoSampler') self.sampler = build_sampler(sampler_cfg, context=self) self.transformer = build_transformer(transformer) self.loss_cls = build_loss(loss_cls) self.loss_reg = build_loss(loss_reg) self.assigner = build_assigner(assigner) if self.loss_cls.use_sigmoid: self.cls_out_channels = num_classes else: self.cls_out_channels = num_classes + 1 self._init_embedding() self._init_branch() self.init_weights() def init_weights(self): """Initialize weights of the DeformDETR head.""" for p in self.input_proj.parameters(): if p.dim() > 1: nn.init.xavier_uniform_(p) xavier_init(self.reference_points_embed, distribution='uniform', bias=0.) 
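        # reference_points_embed maps each query embedding to 2 * num_points
        # values which, after a sigmoid, become the initial normalized (x, y)
        # reference points of the predicted polyline.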
self.transformer.init_weights() # init prediction branch for m in self.reg_branches: for param in m.parameters(): if param.dim() > 1: nn.init.xavier_uniform_(param) # focal loss init if self.loss_cls.use_sigmoid: bias_init = bias_init_with_prob(0.01) if isinstance(self.cls_branches, nn.ModuleList): for m in self.cls_branches: if hasattr(m, 'bias'): nn.init.constant_(m.bias, bias_init) else: m = self.cls_branches nn.init.constant_(m.bias, bias_init) if self.streaming_query: if isinstance(self.query_update, MotionMLP): self.query_update.init_weights() if hasattr(self, 'query_alpha'): for m in self.query_alpha: for param in m.parameters(): if param.dim() > 1: nn.init.zeros_(param) def _init_embedding(self): positional_encoding = dict( type='SinePositionalEncoding', num_feats=self.embed_dims//2, normalize=True ) self.bev_pos_embed = build_positional_encoding(positional_encoding) # query_pos_embed & query_embed self.query_embedding = nn.Embedding(self.num_queries, self.embed_dims) self.reference_points_embed = nn.Linear(self.embed_dims, self.num_points * 2) def _init_branch(self,): """Initialize classification branch and regression branch of head.""" self.input_proj = Conv2d( self.in_channels, self.embed_dims, kernel_size=1) cls_branch = Linear(self.embed_dims, self.cls_out_channels) reg_branch = [ Linear(self.embed_dims, 2*self.embed_dims), nn.LayerNorm(2*self.embed_dims), nn.ReLU(), Linear(2*self.embed_dims, 2*self.embed_dims), nn.LayerNorm(2*self.embed_dims), nn.ReLU(), Linear(2*self.embed_dims, self.num_points * self.coord_dim), ] reg_branch = nn.Sequential(*reg_branch) num_layers = self.transformer.decoder.num_layers if self.different_heads: cls_branches = nn.ModuleList( [copy.deepcopy(cls_branch) for _ in range(num_layers)]) reg_branches = nn.ModuleList( [copy.deepcopy(reg_branch) for _ in range(num_layers)]) else: cls_branches = nn.ModuleList( [cls_branch for _ in range(num_layers)]) reg_branches = nn.ModuleList( [reg_branch for _ in range(num_layers)]) self.reg_branches = reg_branches self.cls_branches = cls_branches def _prepare_context(self, bev_features): """Prepare class label and vertex context.""" device = bev_features.device # Add 2D coordinate grid embedding B, C, H, W = bev_features.shape bev_mask = bev_features.new_zeros(B, H, W) bev_pos_embeddings = self.bev_pos_embed(bev_mask) # (bs, embed_dims, H, W) bev_features = self.input_proj(bev_features) + bev_pos_embeddings # (bs, embed_dims, H, W) assert list(bev_features.shape) == [B, self.embed_dims, H, W] return bev_features def propagate(self, query_embedding, img_metas, start_of_sequence, ego_pose_inv, return_loss=True): bs = query_embedding.shape[0] propagated_query_list = [] prop_reference_points_list = [] tmp = self.query_memory.get(img_metas) query_memory, pose_memory = tmp['tensor'], tmp['img_metas'] tmp = self.reference_points_memory.get(img_metas) ref_pts_memory, pose_memory = tmp['tensor'], tmp['img_metas'] if return_loss: target_memory = self.target_memory.get(img_metas)['tensor'] trans_loss = query_embedding.new_zeros((1,)) num_pos = 0 is_first_frame_list = start_of_sequence for i in range(bs): is_first_frame = is_first_frame_list[i] if is_first_frame: padding = query_embedding.new_zeros((self.topk_query, self.embed_dims)) if return_loss: trans_loss += self.query_update( padding, # (topk, embed_dims) padding.new_zeros((self.topk_query, 12)) ).sum() * 0 propagated_query_list.append(padding) padding = query_embedding.new_zeros((self.topk_query, self.num_points, 2)) prop_reference_points_list.append(padding) else: 
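                # Not the first frame of the sequence: update the top-k queries kept
                # from the previous frame with the MotionMLP (note the ego-motion
                # encoding is zeroed out in the call below) and warp their reference
                # points, stored in global coordinates, back into the current ego frame.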
curr_to_prev_ego_rt = query_embedding.new_tensor(img_metas[i]['curr_to_prev_ego_rt'], dtype=torch.float64).to(query_embedding.device) pos_encoding = curr_to_prev_ego_rt.float()[:3].view(-1) prop_q = query_memory[i] # query_memory_updated = prop_q query_memory_updated = self.query_update( prop_q, # (topk, embed_dims) pos_encoding.view(1, -1).repeat(len(query_memory[i]), 1) * 0, ) propagated_query_list.append(query_memory_updated.clone()) pred = self.reg_branches[-1](query_memory_updated).sigmoid() # (num_prop, 2*num_pts) assert list(pred.shape) == [self.topk_query, 2*self.num_points] if return_loss: targets = target_memory[i] weights = targets.new_ones((self.topk_query, 2*self.num_points)) bg_idx = torch.all(targets.view(self.topk_query, -1) == -1e5, dim=1) num_pos = num_pos + (self.topk_query - bg_idx.sum()) weights[bg_idx, :] = 0.0 # global -> ego curr_targets = torch.einsum('lk,ijk->ijl', ego_pose_inv[i].float(), targets) normed_targets = (curr_targets[..., :2] - self.origin) / self.roi_size # (num_prop, num_pts, 2) normed_targets = torch.clip(normed_targets, min=0., max=1.).reshape(-1, 2*self.num_points) # (num_prop, 2*num_pts) trans_loss += self.loss_reg(pred, normed_targets, weights, avg_factor=1.0) # trans_loss = None # ref pts prev_ref_pts = ref_pts_memory[i] curr_ref_pts = torch.einsum('lk,ijk->ijl', ego_pose_inv[i].double(), prev_ref_pts.double()).float() normed_ref_pts = (curr_ref_pts[..., :2] - self.origin) / self.roi_size # (num_prop, num_pts, 2) # self.visual_sample(normed_ref_pts, img_metas[i]['index'], prev=True) normed_ref_pts = torch.clip(normed_ref_pts, min=0., max=1.) prop_reference_points_list.append(normed_ref_pts) prop_query_embedding = torch.stack(propagated_query_list) # (bs, topk, embed_dims) prop_ref_pts = torch.stack(prop_reference_points_list) # (bs, topk, num_pts, 2) assert list(prop_query_embedding.shape) == [bs, self.topk_query, self.embed_dims] assert list(prop_ref_pts.shape) == [bs, self.topk_query, self.num_points, 2] init_reference_points = self.reference_points_embed(query_embedding).sigmoid() # (bs, num_q, 2*num_pts) init_reference_points = init_reference_points.view(bs, self.num_queries, self.num_points, 2) # (bs, num_q, num_pts, 2) memory_query_embedding = None if return_loss: trans_loss = self.trans_loss_weight * trans_loss / (num_pos + 1e-10) return query_embedding, prop_query_embedding, init_reference_points, prop_ref_pts, memory_query_embedding, is_first_frame_list, trans_loss else: return query_embedding, prop_query_embedding, init_reference_points, prop_ref_pts, memory_query_embedding, is_first_frame_list def forward_train(self, input_dict, img_metas, map_gt_bboxes_3d, map_gt_labels_3d): ''' Args: bev_feature (List[Tensor]): shape [B, C, H, W] feature in bev view Outs: preds_dict (list[dict]): lines (Tensor): Classification score of all decoder layers, has shape [bs, num_query, 2*num_points] scores (Tensor): [bs, num_query,] ''' if input_dict['img_bev_feat'][0].dim() == 5: bev_features = [level.mean(-1) for level in input_dict['img_bev_feat']][0] else: bev_features = input_dict['img_bev_feat'][0] start_of_sequence = torch.FloatTensor([ single_img_metas['start_of_sequence'] for single_img_metas in img_metas]).to(input_dict['img_bev_feat'][0].device) ego_pose_inv = torch.stack([ single_img_metas['ego_pose_inv'] for single_img_metas in img_metas], 0).to(input_dict['img_bev_feat'][0].device) ego_pose = torch.stack([ single_img_metas['ego_pose'] for single_img_metas in img_metas], 0).to(input_dict['img_bev_feat'][0].device) bev_features = 
self._prepare_context(bev_features) bs, C, H, W = bev_features.shape img_masks = bev_features.new_zeros((bs, H, W)) # pos_embed = self.positional_encoding(img_masks) pos_embed = None query_embedding = self.query_embedding.weight[None, ...].repeat(bs, 1, 1) # [B, num_q, embed_dims] input_query_num = self.num_queries # num query: self.num_query + self.topk if self.streaming_query: query_embedding, prop_query_embedding, init_reference_points, prop_ref_pts, memory_query, is_first_frame_list, trans_loss = \ self.propagate(query_embedding, img_metas, start_of_sequence, ego_pose_inv, return_loss=True) else: init_reference_points = self.reference_points_embed(query_embedding).sigmoid() # (bs, num_q, 2*num_pts) init_reference_points = init_reference_points.view(-1, self.num_queries, self.num_points, 2) # (bs, num_q, num_pts, 2) prop_query_embedding = None prop_ref_pts = None is_first_frame_list = [True for i in range(bs)] assert list(init_reference_points.shape) == [bs, self.num_queries, self.num_points, 2] assert list(query_embedding.shape) == [bs, self.num_queries, self.embed_dims] # outs_dec: (num_layers, num_qs, bs, embed_dims) inter_queries, init_reference, inter_references = self.transformer( mlvl_feats=[bev_features,], mlvl_masks=[img_masks.type(torch.bool)], query_embed=query_embedding, prop_query=prop_query_embedding, mlvl_pos_embeds=[pos_embed], # not used memory_query=None, init_reference_points=init_reference_points, prop_reference_points=prop_ref_pts, reg_branches=self.reg_branches, cls_branches=self.cls_branches, predict_refine=self.predict_refine, is_first_frame_list=is_first_frame_list, query_key_padding_mask=query_embedding.new_zeros((bs, self.num_queries), dtype=torch.bool), # mask used in self-attn, ) outputs = [] for i, (queries) in enumerate(inter_queries): reg_points = inter_references[i] # (bs, num_q, num_points, 2) bs = reg_points.shape[0] reg_points = reg_points.view(bs, -1, 2*self.num_points) # (bs, num_q, 2*num_points) scores = self.cls_branches[i](queries) # (bs, num_q, num_classes) reg_points_list = [] scores_list = [] for j in range(len(scores)): # padding queries should not be output reg_points_list.append(reg_points[j]) scores_list.append(scores[j]) pred_dict = { 'lines': torch.stack(reg_points_list), 'scores': torch.stack(scores_list), 'queries': queries, } # if i == len(inter_queries)-1: # map_queries = queries # map_lines = map_lines # map_scores = map_scores outputs.append(pred_dict) loss_dict, det_match_idxs, det_match_gt_idxs, gt_lines_list = self.loss(map_gt_bboxes_3d, map_gt_labels_3d, outputs, img_metas) if self.streaming_query: query_list = [] ref_pts_list = [] gt_targets_list = [] lines, scores = outputs[self.map_layer_index]['lines'], outputs[self.map_layer_index]['scores'] gt_lines = gt_lines_list[ self.map_layer_index] # take results from the last layer for i in range(bs): _lines = lines[i] _queries = inter_queries[self.map_layer_index][i] _scores = scores[i] _gt_targets = gt_lines[i] # (num_q or num_q+topk, 20, 2) assert len(_lines) == len(_queries) assert len(_lines) == len(_gt_targets) _scores, _ = _scores.max(-1) topk_score, topk_idx = _scores.topk(k=self.topk_query, dim=-1) _queries = _queries[topk_idx] # (topk, embed_dims) _lines = _lines[topk_idx] # (topk, 2*num_pts) _gt_targets = _gt_targets[topk_idx] # (topk, 20, 2) query_list.append(_queries) _lines = _lines.view(-1, self.num_points, 2) _lines = _lines * self.roi_size + self.origin _lines = torch.cat([_lines, torch.zeros_like(_lines[..., 0:1]), torch.ones_like(_lines[..., 0:1])], dim=-1) 
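                # Lift the 2-D BEV points to homogeneous (x, y, 0, 1) and map them
                # into the global frame with the 4x4 ego pose, so the stored
                # reference points can be warped back into the ego frame on the
                # next iteration of propagate().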
_lines = (ego_pose[i] @ _lines.unsqueeze(-1)).squeeze(-1) ref_pts_list.append(_lines) _gt_targets = _gt_targets.view(-1, self.num_points, 2) mask = _gt_targets == 0.0 _gt_targets =_gt_targets * self.roi_size + self.origin _gt_targets = torch.cat([_gt_targets, torch.zeros_like(_lines[..., 0:1]), torch.ones_like(_lines[..., 0:1])], dim=-1) _gt_targets = (ego_pose[i] @ _gt_targets.unsqueeze(-1)).squeeze(-1) _gt_targets[mask.repeat(1, 1, 2)] = -1e5 gt_targets_list.append(_gt_targets) self.query_memory.update(query_list, img_metas) self.reference_points_memory.update(ref_pts_list, img_metas) self.target_memory.update(gt_targets_list, img_metas) loss_dict['trans_loss'] = trans_loss return loss_dict, outputs # return outputs, loss_dict, det_match_idxs, det_match_gt_idxs def forward_test(self, input_dict, img_metas, map_gt_bboxes_3d=None, map_gt_labels_3d=None): ''' Args: bev_feature (List[Tensor]): shape [B, C, H, W] feature in bev view Outs: preds_dict (list[dict]): lines (Tensor): Classification score of all decoder layers, has shape [bs, num_query, 2*num_points] scores (Tensor): [bs, num_query,] ''' if input_dict['img_bev_feat'][0].dim() == 5: bev_features = [level.mean(-1) for level in input_dict['img_bev_feat']][0] else: bev_features = input_dict['img_bev_feat'][0] start_of_sequence = torch.FloatTensor([ single_img_metas['start_of_sequence'] for single_img_metas in img_metas]).to(input_dict['img_bev_feat'][0].device) ego_pose_inv = torch.stack([ single_img_metas['ego_pose_inv'] for single_img_metas in img_metas], 0).to(input_dict['img_bev_feat'][0].device) ego_pose = torch.stack([ single_img_metas['ego_pose'] for single_img_metas in img_metas], 0).to(input_dict['img_bev_feat'][0].device) bev_features = self._prepare_context(bev_features) bs, C, H, W = bev_features.shape img_masks = bev_features.new_zeros((bs, H, W)) # pos_embed = self.positional_encoding(img_masks) pos_embed = None query_embedding = self.query_embedding.weight[None, ...].repeat(bs, 1, 1) # [B, num_q, embed_dims] input_query_num = self.num_queries # num query: self.num_query + self.topk if self.streaming_query: query_embedding, prop_query_embedding, init_reference_points, prop_ref_pts, memory_query, is_first_frame_list = \ self.propagate(query_embedding, img_metas, start_of_sequence, ego_pose_inv, return_loss=False) else: init_reference_points = self.reference_points_embed(query_embedding).sigmoid() # (bs, num_q, 2*num_pts) init_reference_points = init_reference_points.view(-1, self.num_queries, self.num_points, 2) # (bs, num_q, num_pts, 2) prop_query_embedding = None prop_ref_pts = None is_first_frame_list = [True for i in range(bs)] assert list(init_reference_points.shape) == [bs, input_query_num, self.num_points, 2] assert list(query_embedding.shape) == [bs, input_query_num, self.embed_dims] # outs_dec: (num_layers, num_qs, bs, embed_dims) inter_queries, init_reference, inter_references = self.transformer( mlvl_feats=[bev_features,], mlvl_masks=[img_masks.type(torch.bool)], query_embed=query_embedding, prop_query=prop_query_embedding, mlvl_pos_embeds=[pos_embed], # not used memory_query=None, init_reference_points=init_reference_points, prop_reference_points=prop_ref_pts, reg_branches=self.reg_branches, cls_branches=self.cls_branches, predict_refine=self.predict_refine, is_first_frame_list=is_first_frame_list, query_key_padding_mask=query_embedding.new_zeros((bs, self.num_queries), dtype=torch.bool), # mask used in self-attn, ) outputs = [] for i, (queries) in enumerate(inter_queries): reg_points = inter_references[i] # 
(bs, num_q, num_points, 2) bs = reg_points.shape[0] reg_points = reg_points.view(bs, -1, 2*self.num_points) # (bs, num_q, 2*num_points) scores = self.cls_branches[i](queries) # (bs, num_q, num_classes) reg_points_list = [] scores_list = [] prop_mask_list = [] for i in range(len(scores)): # padding queries should not be output reg_points_list.append(reg_points[i]) scores_list.append(scores[i]) prop_mask = scores.new_ones((len(scores[i]), ), dtype=torch.bool) prop_mask[-self.num_queries:] = False prop_mask_list.append(prop_mask) pred_dict = { 'lines': torch.stack(reg_points_list), 'scores': torch.stack(scores_list), 'prop_mask': torch.stack(prop_mask_list), 'queries': queries } outputs.append(pred_dict) if self.streaming_query: query_list = [] ref_pts_list = [] lines, scores = outputs[self.map_layer_index]['lines'], outputs[ self.map_layer_index]['scores'] for i in range(bs): _lines = lines[i] _queries = inter_queries[ self.map_layer_index][i] _scores = scores[i] assert len(_lines) == len(_queries) _scores, _ = _scores.max(-1) topk_score, topk_idx = _scores.topk(k=self.topk_query, dim=-1) _queries = _queries[topk_idx] # (topk, embed_dims) _lines = _lines[topk_idx] # (topk, 2*num_pts) query_list.append(_queries) _lines = _lines.view(-1, self.num_points, 2) # self.visual_sample(_lines, img_metas[i]['index'], pre=False) _lines = _lines * self.roi_size + self.origin _lines = torch.cat([_lines, torch.zeros_like(_lines[..., 0:1]), torch.ones_like(_lines[..., 0:1])], dim=-1) _lines = (ego_pose[i] @ _lines.unsqueeze(-1)).squeeze(-1) ref_pts_list.append(_lines) self.query_memory.update(query_list, img_metas) self.reference_points_memory.update(ref_pts_list, img_metas) gt_lane = map_gt_bboxes_3d[0][0].fixed_num_sampled_points.to(ego_pose[0].device) gt_lane = torch.cat([gt_lane, torch.zeros_like(gt_lane[..., 0:1]), torch.ones_like(gt_lane[..., 0:1])], dim=-1) gt_lane = (ego_pose[0] @ gt_lane.unsqueeze(-1)).squeeze(-1)[..., :2] gt_lane_label = map_gt_labels_3d[0][0] outputs[-1]['gt_lane_in_global'] = gt_lane outputs[-1]['gt_lane_label'] = gt_lane_label return outputs def world2bev_vis(self, x, y): return int(x * 640), int(y*320) def visual_sample(self, lines, index, pre=False, **kwargs): import cv2 bev_img = np.ones([640, 640, 3], dtype=np.float32) * 255 bev_img = bev_img.astype(np.float32) bev_img = cv2.circle(bev_img, self.world2bev_vis(0.5, 1.5), 5, (0, 255, 0), thickness=-1) for k, line in enumerate(lines): label = 0 line = line.cpu().numpy() corners = np.array([self.world2bev_vis(*corner) for corner in line]) corners = [each for each in corners if ((each>=0).all() & (each<1500).all())] corners = [(x, y+320) for (x, y) in corners ] colors = [(255, 255, 0), (255, 0, 0), (0, 255, 0)] for i, corner in enumerate(corners[:-1]): bev_img = cv2.circle(bev_img, corners[i], 2, (61, 102, 255)) bev_img = cv2.line(bev_img, corners[i], corners[i+1], color=colors[label], thickness=1) if pre: mmcv.imwrite(bev_img, f'pred_bev_{index}_prev.png') else: mmcv.imwrite(bev_img, f'pred_bev_{index}_after.png') @force_fp32(apply_to=('score_pred', 'lines_pred', 'gt_lines')) def _get_target_single(self, score_pred, lines_pred, gt_labels, gt_lines, valid_map, gt_bboxes_ignore=None): """ Compute regression and classification targets for one image. Outputs from a single decoder layer of a single feature level are used. Args: score_pred (Tensor): Box score logits from a single decoder layer for one image. Shape [num_query, cls_out_channels]. 
lines_pred (Tensor): shape [num_query, 2*num_points] gt_labels (torch.LongTensor) shape [num_gt, ] gt_lines (Tensor): shape [num_gt, 2*num_points]. Returns: tuple[Tensor]: a tuple containing the following for one sample. - labels (LongTensor): Labels of each image. shape [num_query, 1] - label_weights (Tensor]): Label weights of each image. shape [num_query, 1] - lines_target (Tensor): Lines targets of each image. shape [num_query, num_points, 2] - lines_weights (Tensor): Lines weights of each image. shape [num_query, num_points, 2] - pos_inds (Tensor): Sampled positive indices for each image. - neg_inds (Tensor): Sampled negative indices for each image. """ num_pred_lines = len(lines_pred) # assigner and sampler assign_result, gt_permute_idx = self.assigner.assign(preds=dict(lines=lines_pred, scores=score_pred,), gts=dict(lines=gt_lines, labels=gt_labels, ), gt_bboxes_ignore=gt_bboxes_ignore) if gt_lines.dim() == 4: gt_lines = gt_lines.flatten(-2, -1) sampling_result = self.sampler.sample( assign_result, lines_pred, gt_lines) num_gt = len(gt_lines) pos_inds = sampling_result.pos_inds neg_inds = sampling_result.neg_inds pos_gt_inds = sampling_result.pos_assigned_gt_inds labels = gt_lines.new_full( (num_pred_lines, ), self.num_classes, dtype=torch.long) # (num_q, ) if valid_map: labels[pos_inds] = gt_labels[sampling_result.pos_assigned_gt_inds] label_weights = gt_lines.new_ones(num_pred_lines) # (num_q, ) lines_target = torch.zeros_like(lines_pred) # (num_q, 2*num_pts) lines_weights = torch.zeros_like(lines_pred) # (num_q, 2*num_pts) if num_gt > 0 and valid_map: if gt_permute_idx is not None: # using permute invariant label # gt_permute_idx: (num_q, num_gt) # pos_inds: which query is positive # pos_gt_inds: which gt each pos pred is assigned # single_matched_gt_permute_idx: which permute order is matched single_matched_gt_permute_idx = gt_permute_idx[ pos_inds, pos_gt_inds ] lines_target[pos_inds] = gt_lines[pos_gt_inds, single_matched_gt_permute_idx].type( lines_target.dtype) # (num_q, 2*num_pts) else: lines_target[pos_inds] = sampling_result.pos_gt_bboxes.type( lines_target.dtype) # (num_q, 2*num_pts) lines_weights[pos_inds] = 1.0 # (num_q, 2*num_pts) # normalization # n = lines_weights.sum(-1, keepdim=True) # (num_q, 1) # lines_weights = lines_weights / n.masked_fill(n == 0, 1) # (num_q, 2*num_pts) # [0, ..., 0] for neg ind and [1/npts, ..., 1/npts] for pos ind return (labels, label_weights, lines_target, lines_weights, pos_inds, neg_inds, pos_gt_inds) # @force_fp32(apply_to=('preds', 'gts')) def get_targets(self, preds, map_gt_bboxes_3d, map_gt_labels_3d, valid_map, gt_bboxes_ignore_list=None): """ Compute regression and classification targets for a batch image. Outputs from a single decoder layer of a single feature level are used. Args: preds (dict): - lines (Tensor): shape (bs, num_queries, 2*num_points) - scores (Tensor): shape (bs, num_queries, num_class_channels) gts (dict): - class_label (list[Tensor]): tensor shape (num_gts, ) - lines (list[Tensor]): tensor shape (num_gts, 2*num_points) gt_bboxes_ignore_list (list[Tensor], optional): Bounding boxes which can be ignored for each image. Default None. Returns: tuple: a tuple containing the following targets. - labels_list (list[Tensor]): Labels for all images. - label_weights_list (list[Tensor]): Label weights for all \ images. - lines_targets_list (list[Tensor]): Lines targets for all \ images. - lines_weight_list (list[Tensor]): Lines weights for all \ images. - num_total_pos (int): Number of positive samples in all \ images. 
- num_total_neg (int): Number of negative samples in all \ images. """ assert gt_bboxes_ignore_list is None, \ 'Only supports for gt_bboxes_ignore setting to None.' # format the inputs gt_labels = map_gt_labels_3d gt_lines = map_gt_bboxes_3d lines_pred = preds['lines'] (labels_list, label_weights_list, lines_targets_list, lines_weights_list, pos_inds_list, neg_inds_list,pos_gt_inds_list) = multi_apply( self._get_target_single, preds['scores'], lines_pred, gt_labels, gt_lines, valid_map, gt_bboxes_ignore=gt_bboxes_ignore_list) num_total_pos = sum((inds.numel() for inds in pos_inds_list)) num_total_neg = sum((inds.numel() for inds in neg_inds_list)) new_gts = dict( labels=labels_list, # list[Tensor(num_q, )], length=bs label_weights=label_weights_list, # list[Tensor(num_q, )], length=bs, all ones lines=lines_targets_list, # list[Tensor(num_q, 2*num_pts)], length=bs lines_weights=lines_weights_list, # list[Tensor(num_q, 2*num_pts)], length=bs ) return new_gts, num_total_pos, num_total_neg, pos_inds_list, pos_gt_inds_list # @force_fp32(apply_to=('preds', 'gts')) def loss_single(self, preds, map_gt_bboxes_3d, map_gt_labels_3d, valid_map, gt_bboxes_ignore_list=None, reduction='none'): """ Loss function for outputs from a single decoder layer of a single feature level. Args: preds (dict): - lines (Tensor): shape (bs, num_queries, 2*num_points) - scores (Tensor): shape (bs, num_queries, num_class_channels) gts (dict): - class_label (list[Tensor]): tensor shape (num_gts, ) - lines (list[Tensor]): tensor shape (num_gts, 2*num_points) gt_bboxes_ignore_list (list[Tensor], optional): Bounding boxes which can be ignored for each image. Default None. Returns: dict[str, Tensor]: A dictionary of loss components for outputs from a single decoder layer. """ # Get target for each sample new_gts, num_total_pos, num_total_neg, pos_inds_list, pos_gt_inds_list =\ self.get_targets(preds, map_gt_bboxes_3d, map_gt_labels_3d, valid_map, gt_bboxes_ignore_list) # Batched all data # for k, v in new_gts.items(): # new_gts[k] = torch.stack(v, dim=0) # tensor (bs, num_q, ...) # construct weighted avg_factor to match with the official DETR repo cls_avg_factor = num_total_pos * 1.0 + \ num_total_neg * self.bg_cls_weight if self.sync_cls_avg_factor: cls_avg_factor = reduce_mean( preds['scores'][0].new_tensor([cls_avg_factor])) cls_avg_factor = max(cls_avg_factor, 1) # Classification loss # since the inputs needs the second dim is the class dim, we permute the prediction. 
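        # Flatten the (bs, num_q, ...) predictions and the per-sample targets into
        # a single batch dimension, so the classification loss sees
        # (bs*num_q, cls_out_channels) scores and (bs*num_q,) labels.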
pred_scores = preds['scores'].flatten(0, 1) # (bs*num_q, cls_out_channles) cls_scores = pred_scores.reshape(-1, self.cls_out_channels) # (bs*num_q, cls_out_channels) cls_labels = torch.cat(new_gts['labels'], dim=0).reshape(-1) # (bs*num_q, ) cls_weights = torch.cat(new_gts['label_weights'], dim=0).reshape(-1) # (bs*num_q, ) loss_cls = self.loss_cls( cls_scores, cls_labels, cls_weights, avg_factor=cls_avg_factor) # Compute the average number of gt boxes across all gpus, for # normalization purposes num_total_pos = loss_cls.new_tensor([num_total_pos]) num_total_pos = torch.clamp(reduce_mean(num_total_pos), min=1).item() pred_lines = preds['lines'].flatten(0, 1) gt_lines = torch.cat(new_gts['lines'], dim=0) line_weights = torch.cat(new_gts['lines_weights'], dim=0) assert len(pred_lines) == len(gt_lines) assert len(gt_lines) == len(line_weights) loss_reg = self.loss_reg( pred_lines, gt_lines, line_weights, avg_factor=num_total_pos) loss_dict = dict( loss_cls=loss_cls, loss_reg=loss_reg, ) return loss_dict, pos_inds_list, pos_gt_inds_list, new_gts['lines'] @force_fp32(apply_to=('map_gt_bboxes_3d', 'preds')) def loss(self, map_gt_bboxes_3d, map_gt_labels_3d, preds, img_metas, gt_bboxes_ignore=None, reduction='mean'): """ Loss Function. Args: gts (list[dict]): list length: num_layers dict { 'label': list[tensor(num_gts, )], list length: batchsize, 'line': list[tensor(num_gts, 2*num_points)], list length: batchsize, ... } preds (list[dict]): list length: num_layers dict { 'lines': tensor(bs, num_queries, 2*num_points), 'scores': tensor(bs, num_queries, class_out_channels), } gt_bboxes_ignore (list[Tensor], optional): Bounding boxes which can be ignored for each image. Default None. Returns: dict[str, Tensor]: A dictionary of loss components. """ assert gt_bboxes_ignore is None, \ f'{self.__class__.__name__} only supports ' \ f'for gt_bboxes_ignore setting to None.' 
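        # Deep supervision: the ground truth is matched against the output of every
        # decoder layer; losses from intermediate layers are stored with a
        # 'd{layer}.' prefix below, while the last layer keeps the plain keys.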
# Since there might have multi layer num_dec_layers = len(preds) map_gt_bboxes_3d_list = [map_gt_bboxes_3d for _ in range(num_dec_layers)] map_gt_labels_3d_list = [map_gt_labels_3d for _ in range(num_dec_layers)] valid_map = torch.tensor([each['has_valid_map'] for each in img_metas], device=map_gt_bboxes_3d[0].device) valid_map_list = [valid_map for _ in range(num_dec_layers)] losses, pos_inds_lists, pos_gt_inds_lists, gt_lines_list = multi_apply( self.loss_single, preds, map_gt_bboxes_3d_list, map_gt_labels_3d_list , valid_map_list, reduction=reduction) # Format the losses loss_dict = dict() # loss from the last decoder layer for k, v in losses[-1].items(): loss_dict[k] = v # Loss from other decoder layers num_dec_layer = 0 for loss in losses[:-1]: for k, v in loss.items(): loss_dict[f'd{num_dec_layer}.{k}'] = v num_dec_layer += 1 return loss_dict, pos_inds_lists, pos_gt_inds_lists, gt_lines_list def get_bboxes(self, preds_dict, img_metas, thr=0.0): preds_dict = preds_dict[-1] lines = preds_dict['lines'] # List[Tensor(num_queries, 2*num_points)] bs = len(lines) scores = preds_dict['scores'] # (bs, num_queries, 3) prop_mask = preds_dict['prop_mask'] results = [] for i in range(bs): tmp_vectors = lines[i] tmp_prop_mask = prop_mask[i] num_preds, num_points2 = tmp_vectors.shape tmp_vectors = tmp_vectors.view(num_preds, num_points2//2, 2) # focal loss if self.loss_cls.use_sigmoid: tmp_scores, tmp_labels = scores[i].max(-1) tmp_scores = tmp_scores.sigmoid() pos = tmp_scores > thr else: assert self.num_classes + 1 == self.cls_out_channels tmp_scores, tmp_labels = scores[i].max(-1) bg_cls = self.cls_out_channels pos = tmp_labels != bg_cls tmp_vectors = tmp_vectors[pos] tmp_scores = tmp_scores[pos] tmp_labels = tmp_labels[pos] tmp_prop_mask = tmp_prop_mask[pos] tmp_vectors = tmp_vectors * self.roi_size + self.origin if len(tmp_scores) == 0: single_result = { 'map_pts_3d': [], 'map_scores_3d': [], 'map_labels_3d': [], 'prop_mask': [], 'index': img_metas[0]['index'] } else: single_result = { 'map_pts_3d': tmp_vectors.detach().cpu(), # .numpy(), 'map_scores_3d': tmp_scores.detach().cpu(), # .numpy(), 'map_labels_3d': tmp_labels.detach().cpu(), #.numpy(), 'prop_mask': tmp_prop_mask.detach().cpu(), # .numpy(), 'index': img_metas[0]['index'], 'gt_lane_in_global': preds_dict['gt_lane_in_global'].cpu().numpy(), 'gt_lane_label': preds_dict['gt_lane_label'].cpu().numpy(), } results.append(single_result) return results def train(self, *args, **kwargs): super().train(*args, **kwargs) for k, v in self.__dict__.items(): if isinstance(v, StreamTensorMemory): v.train(*args, **kwargs) def eval(self): super().eval() for k, v in self.__dict__.items(): if isinstance(v, StreamTensorMemory): v.eval() def forward(self, *args, return_loss=True, **kwargs): if return_loss: return self.forward_train(*args, **kwargs) else: return self.forward_test(*args, **kwargs) ================================================ FILE: mmdet3d/models/fbbev/streammapnet/transformer.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. 
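# Decoder stack for the streaming map head: MapTransformerDecoder_new injects the
# propagated top-k queries / reference points at `prop_add_stage` and refines the
# polyline reference points layer by layer.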
import math import warnings import copy import torch import torch.nn as nn from mmcv.cnn import build_activation_layer, build_norm_layer, xavier_init from mmcv.cnn.bricks.registry import (TRANSFORMER_LAYER, TRANSFORMER_LAYER_SEQUENCE) from mmcv.cnn.bricks.transformer import (BaseTransformerLayer, TransformerLayerSequence, build_transformer_layer) from mmcv.runner.base_module import BaseModule, ModuleList from mmdet.models.utils.builder import TRANSFORMER from mmdet.models.utils.transformer import Transformer from .CustomMSDeformableAttention import CustomMSDeformableAttention from mmdet.models.utils.transformer import inverse_sigmoid @TRANSFORMER_LAYER_SEQUENCE.register_module() class MapTransformerDecoder_new(BaseModule): """Implements the decoder in DETR transformer. Args: return_intermediate (bool): Whether to return intermediate outputs. coder_norm_cfg (dict): Config of last normalization layer. Default: `LN`. """ def __init__(self, transformerlayers=None, num_layers=None, prop_add_stage=0, return_intermediate=True, fix=False, init_cfg=None): super().__init__(init_cfg) if isinstance(transformerlayers, dict): transformerlayers = [ copy.deepcopy(transformerlayers) for _ in range(num_layers) ] else: assert isinstance(transformerlayers, list) and \ len(transformerlayers) == num_layers self.num_layers = num_layers self.layers = ModuleList() for i in range(num_layers): self.layers.append(build_transformer_layer(transformerlayers[i])) self.embed_dims = self.layers[0].embed_dims self.pre_norm = self.layers[0].pre_norm self.return_intermediate = return_intermediate self.prop_add_stage = prop_add_stage self.fix = fix assert prop_add_stage >= 0 and prop_add_stage < num_layers def forward(self, query, prop_query, key, value, query_pos, key_padding_mask, query_key_padding_mask, reference_points, prop_reference_points, spatial_shapes, level_start_index, reg_branches, cls_branches, is_first_frame_list, predict_refine, **kwargs): """Forward function for `TransformerDecoder`. Args: query (Tensor): Input query with shape `(num_query, bs, embed_dims)`. reference_points (Tensor): The reference points of offset. has shape (bs, num_query, num_points, 2). valid_ratios (Tensor): The radios of valid points on the feature map, has shape (bs, num_levels, 2) reg_branch: (obj:`nn.ModuleList`): Used for refining the regression results. Only would be passed when with_box_refine is True, otherwise would be passed a `None`. Returns: Tensor: Results with shape [1, num_query, bs, embed_dims] when return_intermediate is `False`, otherwise it has shape [num_layers, num_query, bs, embed_dims]. 
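        Note:
            At layer ``prop_add_stage`` the lowest-scoring current queries are
            replaced by ``prop_query`` / ``prop_reference_points`` propagated
            from the previous frame; this step is skipped for samples that are
            the first frame of a sequence.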
""" num_queries, bs, embed_dims = query.shape output = query intermediate = [] intermediate_reference_points = [] for lid, layer in enumerate(self.layers): if lid == self.prop_add_stage and prop_query is not None and prop_reference_points is not None: bs, topk, embed_dims = prop_query.shape output = output.permute(1, 0, 2) with torch.no_grad(): tmp_scores, _ = cls_branches[lid](output).max(-1) # (bs, num_q) new_query = [] new_refpts = [] for i in range(bs): if is_first_frame_list[i]: new_query.append(output[i]) new_refpts.append(reference_points[i]) else: _, valid_idx = torch.topk(tmp_scores[i], k=num_queries-topk, dim=-1) new_query.append(torch.cat([prop_query[i], output[i][valid_idx]], dim=0)) new_refpts.append(torch.cat([prop_reference_points[i], reference_points[i][valid_idx]], dim=0)) output = torch.stack(new_query).permute(1, 0, 2) reference_points = torch.stack(new_refpts) assert list(output.shape) == [num_queries, bs, embed_dims] tmp = reference_points.clone() if self.fix: tmp[..., 1:2] = 1.0 - reference_points[..., 1:2] # reverse y-axis # reference_points = tmp output = layer( output, key, value, query_pos=query_pos, key_padding_mask=key_padding_mask, reference_points=tmp, spatial_shapes=spatial_shapes, level_start_index=level_start_index, query_key_padding_mask=None, **kwargs) reg_points = reg_branches[lid](output.permute(1, 0, 2)) # (bs, num_q, 2*num_points) bs, num_queries, num_points2 = reg_points.shape reg_points = reg_points.view(bs, num_queries, num_points2//2, 2) # range (0, 1) if predict_refine: new_reference_points = reg_points + inverse_sigmoid( reference_points ) new_reference_points = new_reference_points.sigmoid() else: new_reference_points = reg_points.sigmoid() # (bs, num_q, num_points, 2) reference_points = new_reference_points.clone().detach() if self.return_intermediate: intermediate.append(output.permute(1, 0, 2)) # [(bs, num_q, embed_dims)] intermediate_reference_points.append(new_reference_points) # (bs, num_q, num_points, 2) if self.return_intermediate: return intermediate, intermediate_reference_points return output, reference_points @TRANSFORMER_LAYER.register_module() class MapTransformerLayer(BaseTransformerLayer): """Base `TransformerLayer` for vision transformer. It can be built from `mmcv.ConfigDict` and support more flexible customization, for example, using any number of `FFN or LN ` and use different kinds of `attention` by specifying a list of `ConfigDict` named `attn_cfgs`. It is worth mentioning that it supports `prenorm` when you specifying `norm` as the first element of `operation_order`. More details about the `prenorm`: `On Layer Normalization in the Transformer Architecture `_ . Args: attn_cfgs (list[`mmcv.ConfigDict`] | obj:`mmcv.ConfigDict` | None )): Configs for `self_attention` or `cross_attention` modules, The order of the configs in the list should be consistent with corresponding attentions in operation_order. If it is a dict, all of the attention modules in operation_order will be built with this config. Default: None. ffn_cfgs (list[`mmcv.ConfigDict`] | obj:`mmcv.ConfigDict` | None )): Configs for FFN, The order of the configs in the list should be consistent with corresponding ffn in operation_order. If it is a dict, all of the attention modules in operation_order will be built with this config. operation_order (tuple[str]): The execution order of operation in transformer. Such as ('self_attn', 'norm', 'ffn', 'norm'). Support `prenorm` when you specifying first element as `norm`. Default:None. 
norm_cfg (dict): Config dict for normalization layer. Default: dict(type='LN'). init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. Default: None. batch_first (bool): Key, Query and Value are shape of (batch, n, embed_dim) or (n, batch, embed_dim). Default to False. """ def __init__(self, attn_cfgs=None, ffn_cfgs=dict( type='FFN', embed_dims=256, feedforward_channels=1024, num_fcs=2, ffn_drop=0., act_cfg=dict(type='ReLU', inplace=True), ), operation_order=None, norm_cfg=dict(type='LN'), init_cfg=None, batch_first=False, **kwargs): super().__init__( attn_cfgs=attn_cfgs, ffn_cfgs=ffn_cfgs, operation_order=operation_order, norm_cfg=norm_cfg, init_cfg=init_cfg, batch_first=batch_first, **kwargs ) def forward(self, query, key=None, value=None, memory_query=None, query_pos=None, key_pos=None, attn_masks=None, query_key_padding_mask=None, key_padding_mask=None, **kwargs): """Forward function for `TransformerDecoderLayer`. **kwargs contains some specific arguments of attentions. Args: query (Tensor): The input query with shape [num_queries, bs, embed_dims] if self.batch_first is False, else [bs, num_queries embed_dims]. key (Tensor): The key tensor with shape [num_keys, bs, embed_dims] if self.batch_first is False, else [bs, num_keys, embed_dims] . value (Tensor): The value tensor with same shape as `key`. query_pos (Tensor): The positional encoding for `query`. Default: None. key_pos (Tensor): The positional encoding for `key`. Default: None. attn_masks (List[Tensor] | None): 2D Tensor used in calculation of corresponding attention. The length of it should equal to the number of `attention` in `operation_order`. Default: None. query_key_padding_mask (Tensor): ByteTensor for `query`, with shape [bs, num_queries]. Only used in `self_attn` layer. Defaults to None. key_padding_mask (Tensor): ByteTensor for `query`, with shape [bs, num_keys]. Default: None. Returns: Tensor: forwarded results with shape [num_queries, bs, embed_dims]. """ norm_index = 0 attn_index = 0 ffn_index = 0 identity = query if attn_masks is None: attn_masks = [None for _ in range(self.num_attn)] elif isinstance(attn_masks, torch.Tensor): attn_masks = [ copy.deepcopy(attn_masks) for _ in range(self.num_attn) ] warnings.warn(f'Use same attn_mask in all attentions in ' f'{self.__class__.__name__} ') else: assert len(attn_masks) == self.num_attn, f'The length of ' \ f'attn_masks {len(attn_masks)} must be equal ' \ f'to the number of attention in ' \ f'operation_order {self.num_attn}' for layer in self.operation_order: if layer == 'self_attn': if memory_query is None: temp_key = temp_value = query else: temp_key = temp_value = torch.cat([memory_query, query], dim=0) query = self.attentions[attn_index]( query, temp_key, temp_value, identity if self.pre_norm else None, query_pos=query_pos, key_pos=query_pos, attn_mask=attn_masks[attn_index], key_padding_mask=query_key_padding_mask, **kwargs) attn_index += 1 identity = query elif layer == 'norm': query = self.norms[norm_index](query) norm_index += 1 elif layer == 'cross_attn': query = self.attentions[attn_index]( query, key, value, identity if self.pre_norm else None, query_pos=query_pos, key_pos=key_pos, attn_mask=attn_masks[attn_index], key_padding_mask=key_padding_mask, **kwargs) attn_index += 1 identity = query elif layer == 'ffn': query = self.ffns[ffn_index]( query, identity if self.pre_norm else None) ffn_index += 1 return query @TRANSFORMER.register_module() class MapTransformer(Transformer): """Implements the DeformableDETR transformer. 
Args: as_two_stage (bool): Generate query from encoder features. Default: False. num_feature_levels (int): Number of feature maps from FPN: Default: 4. two_stage_num_proposals (int): Number of proposals when set `as_two_stage` as True. Default: 300. """ def __init__(self, num_feature_levels=1, num_points=20, coord_dim=2, **kwargs): super().__init__(**kwargs) self.num_feature_levels = num_feature_levels self.embed_dims = self.encoder.embed_dims self.coord_dim = coord_dim self.num_points = num_points self.init_layers() def init_layers(self): """Initialize layers of the DeformableDetrTransformer.""" # self.level_embeds = nn.Parameter( # torch.Tensor(self.num_feature_levels, self.embed_dims)) def init_weights(self): """Initialize the transformer weights.""" for p in self.parameters(): if p.dim() > 1: nn.init.xavier_uniform_(p) for m in self.modules(): if isinstance(m, CustomMSDeformableAttention): m.init_weights() # normal_(self.level_embeds) def forward(self, mlvl_feats, mlvl_masks, query_embed, mlvl_pos_embeds, init_reference_points, reg_branches=None, cls_branches=None, memory_query=None, prop_query=None, prop_reference_points=None, **kwargs): """Forward function for `Transformer`. Args: mlvl_feats (list(Tensor)): Input queries from different level. Each element has shape [bs, embed_dims, h, w]. mlvl_masks (list(Tensor)): The key_padding_mask from different level used for encoder and decoder, each element has shape [bs, h, w]. query_embed (Tensor): The query embedding for decoder, with shape [num_query, c]. mlvl_pos_embeds (list(Tensor)): The positional encoding of feats from different level, has the shape [bs, embed_dims, h, w]. reg_branches (obj:`nn.ModuleList`): Regression heads for feature maps from each decoder layer. Only would be passed when `with_box_refine` is True. Default to None. cls_branches (obj:`nn.ModuleList`): Classification heads for feature maps from each decoder layer. Only would be passed when `as_two_stage` is True. Default to None. Returns: tuple[Tensor]: results of decoder containing the following tensor. - inter_states: Outputs from decoder. If return_intermediate_dec is True output has shape \ (num_dec_layers, bs, num_query, embed_dims), else has \ shape (1, bs, num_query, embed_dims). - init_reference_out: The initial value of reference \ points, has shape (bs, num_queries, 4). - inter_references_out: The internal value of reference \ points in decoder, has shape \ (num_dec_layers, bs,num_query, embed_dims) - enc_outputs_class: The classification score of \ proposals generated from \ encoder's feature maps, has shape \ (batch, h*w, num_classes). \ Only would be returned when `as_two_stage` is True, \ otherwise None. - enc_outputs_coord_unact: The regression results \ generated from encoder's feature maps., has shape \ (batch, h*w, 4). Only would \ be returned when `as_two_stage` is True, \ otherwise None. 
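        Example (illustrative shape sketch; the BEV resolution and query
        count are assumptions)::

            # a single BEV feature level of 100 x 100, batch size 2
            mlvl_feats  = [torch.randn(2, 256, 100, 100)]
            mlvl_masks  = [torch.zeros(2, 100, 100, dtype=torch.bool)]
            query_embed = torch.randn(2, 100, 256)    # (bs, num_query, C)
            init_ref    = torch.rand(2, 100, 20, 2)   # normalized reference points
            # forward() permutes query_embed to (num_query, bs, C), flattens
            # each feature level to (H*W, bs, C) and derives spatial_shapes /
            # level_start_index for the deformable cross-attention.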
""" feat_flatten = [] mask_flatten = [] # lvl_pos_embed_flatten = [] spatial_shapes = [] for lvl, (feat, mask, pos_embed) in enumerate( zip(mlvl_feats, mlvl_masks, mlvl_pos_embeds)): bs, c, h, w = feat.shape spatial_shape = (h, w) spatial_shapes.append(spatial_shape) feat = feat.flatten(2).transpose(1, 2) mask = mask.flatten(1) # pos_embed = pos_embed.flatten(2).transpose(1, 2) # lvl_pos_embed = pos_embed + self.level_embeds[lvl].view(1, 1, -1) # lvl_pos_embed_flatten.append(lvl_pos_embed) feat_flatten.append(feat) mask_flatten.append(mask) feat_flatten = torch.cat(feat_flatten, 1) mask_flatten = torch.cat(mask_flatten, 1) # lvl_pos_embed_flatten = torch.cat(lvl_pos_embed_flatten, 1) spatial_shapes = torch.as_tensor( spatial_shapes, dtype=torch.long, device=feat_flatten.device) level_start_index = torch.cat((spatial_shapes.new_zeros( (1, )), spatial_shapes.prod(1).cumsum(0)[:-1])) feat_flatten = feat_flatten.permute(1, 0, 2) # (H*W, bs, embed_dims) # decoder query = query_embed.permute(1, 0, 2) # (num_q, bs, embed_dims) if memory_query is not None: memory_query = memory_query.permute(1, 0, 2) inter_states, inter_references = self.decoder( query=query, key=None, value=feat_flatten, query_pos=None, key_padding_mask=mask_flatten, reference_points=init_reference_points, spatial_shapes=spatial_shapes, level_start_index=level_start_index, reg_branches=reg_branches, cls_branches=cls_branches, memory_query=memory_query, prop_query=prop_query, prop_reference_points=prop_reference_points, **kwargs) return inter_states, init_reference_points, inter_references @TRANSFORMER_LAYER_SEQUENCE.register_module() class PlaceHolderEncoder(nn.Module): def __init__(self, *args, embed_dims=None, **kwargs): super(PlaceHolderEncoder, self).__init__() self.embed_dims = embed_dims def forward(self, *args, query=None, **kwargs): return query ================================================ FILE: mmdet3d/models/fbbev/streammapnet/utils.py ================================================ import torch import copy import math import torch import torch.nn as nn import numpy as np from mmcv.cnn import bias_init_with_prob, xavier_init class StreamTensorMemory(object): def __init__(self, batch_size): self.train_bs = batch_size self.training = True self.bs = self.train_bs self.train_memory_list = [None for i in range(self.bs)] self.train_img_metas_memory = [None for i in range(self.bs)] self.test_memory_list = [None] # bs = 1 when testing self.test_img_metas_memory = [None] @property def memory_list(self): if self.training: return self.train_memory_list else: return self.test_memory_list @property def img_metas_memory(self): if self.training: return self.train_img_metas_memory else: return self.test_img_metas_memory def update(self, memory, img_metas): for i in range(self.bs): self.memory_list[i] = memory[i].clone().detach() self.img_metas_memory[i] = copy.deepcopy(img_metas[i]) def reset_single(self, idx): self.memory_list[idx] = None self.img_metas_memory[idx] = None def get(self, img_metas): ''' img_metas: list[img_metas] ''' tensor_list = [] img_metas_list = [] is_first_frame_list = [] for i in range(self.bs): if not self.img_metas_memory[i]: is_first_frame = True else: is_first_frame = (img_metas[i]['scene_name'] != self.img_metas_memory[i]['scene_name']) if is_first_frame: self.reset_single(i) tensor_list.append(self.memory_list[i]) img_metas_list.append(self.img_metas_memory[i]) is_first_frame_list.append(is_first_frame) result = { 'tensor': tensor_list, 'img_metas': img_metas_list, 'is_first_frame': 
is_first_frame_list, } return result def train(self, mode=True): self.training = mode if mode: self.bs = self.train_bs else: self.bs = 1 def eval(self): self.train(False) class MotionMLP(nn.Module): ''' Args: c_dim (int): dimension of latent code c f_dim (int): feature dimension ''' def __init__(self, c_dim, f_dim=512, identity=True): super().__init__() self.c_dim = c_dim self.f_dim = f_dim self.identity = identity self.fc = nn.Sequential( nn.Linear(c_dim + f_dim, 2*f_dim), nn.LayerNorm(2*f_dim), nn.ReLU(), nn.Linear(2*f_dim, f_dim) ) self.init_weights() def init_weights(self): for m in self.fc: for param in m.parameters(): if param.dim() > 1: if self.identity: nn.init.zeros_(param) else: nn.init.xavier_uniform_(param) def forward(self, x, c): xc = torch.cat([x, c], dim=-1) out = self.fc(xc) if self.identity: out = out + x return out ================================================ FILE: mmdet3d/models/fbbev/streampetr/__init__.py ================================================ from .streampetr_v2 import SparseHead4BEV # ok from .petr_transformer import * # ok from .hungarian_assigner_2d import * from .hungarian_assigner_3d import * from .match_cost import BBox3DL1Cost from .nms_free_coder import NMSFreeCoder ================================================ FILE: mmdet3d/models/fbbev/streampetr/hungarian_assigner_2d.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. # --------------------------------------------- # Modified by Shihao Wang # --------------------------------------------- import torch from mmdet.core.bbox.builder import BBOX_ASSIGNERS from mmdet.core.bbox.assigners import AssignResult from mmdet.core.bbox.assigners import BaseAssigner from mmdet.core.bbox.match_costs import build_match_cost from mmdet.core import bbox_cxcywh_to_xyxy try: from scipy.optimize import linear_sum_assignment except ImportError: linear_sum_assignment = None @BBOX_ASSIGNERS.register_module() class HungarianAssigner2D(BaseAssigner): """Computes one-to-one matching between predictions and ground truth. This class computes an assignment between the targets and the predictions based on the costs. The costs are weighted sum of three components: classification cost, regression L1 cost and regression iou cost. The targets don't include the no_object, so generally there are more predictions than targets. After the one-to-one matching, the un-matched are treated as backgrounds. Thus each query prediction will be assigned with `0` or a positive integer indicating the ground truth index: - 0: negative sample, no assigned gt - positive integer: positive sample, index (1-based) of assigned gt Args: cls_weight (int | float, optional): The scale factor for classification cost. Default 1.0. bbox_weight (int | float, optional): The scale factor for regression L1 cost. Default 1.0. iou_weight (int | float, optional): The scale factor for regression iou cost. Default 1.0. iou_calculator (dict | optional): The config for the iou calculation. Default type `BboxOverlaps2D`. iou_mode (str | optional): "iou" (intersection over union), "iof" (intersection over foreground), or "giou" (generalized intersection over union). Default "giou". 
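        Example (illustrative sketch; all tensors are random and the default
        cost configs from ``__init__`` are used)::

            assigner = HungarianAssigner2D()
            bbox_pred      = torch.rand(900, 4)        # (cx, cy, w, h) in [0, 1]
            cls_pred       = torch.randn(900, 10)      # classification logits
            pred_centers2d = torch.rand(900, 2)
            gt_bboxes      = torch.tensor([[50., 60., 150., 200.]])
            gt_labels      = torch.tensor([3])
            centers2d      = torch.tensor([[100., 130.]])
            img_meta       = dict(pad_shape=(320, 800, 3))
            result = assigner.assign(bbox_pred, cls_pred, pred_centers2d,
                                     gt_bboxes, gt_labels, centers2d, img_meta)
            # result.gt_inds: 0 marks background, i + 1 the matched gt index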
""" def __init__(self, cls_cost=dict(type='ClassificationCost', weight=1.), reg_cost=dict(type='BBoxL1Cost', weight=1.0), iou_cost=dict(type='IoUCost', iou_mode='giou', weight=1.0), centers2d_cost=dict(type='BBox3DL1Cost', weight=1.0)): self.cls_cost = build_match_cost(cls_cost) self.reg_cost = build_match_cost(reg_cost) self.iou_cost = build_match_cost(iou_cost) self.centers2d_cost = build_match_cost(centers2d_cost) def assign(self, bbox_pred, cls_pred, pred_centers2d, gt_bboxes, gt_labels, centers2d, img_meta, gt_bboxes_ignore=None, eps=1e-7): """Computes one-to-one matching based on the weighted costs. This method assign each query prediction to a ground truth or background. The `assigned_gt_inds` with -1 means don't care, 0 means negative sample, and positive number is the index (1-based) of assigned gt. The assignment is done in the following steps, the order matters. 1. assign every prediction to -1 2. compute the weighted costs 3. do Hungarian matching on CPU based on the costs 4. assign all to 0 (background) first, then for each matched pair between predictions and gts, treat this prediction as foreground and assign the corresponding gt index (plus 1) to it. Args: bbox_pred (Tensor): Predicted boxes with normalized coordinates (cx, cy, w, h), which are all in range [0, 1]. Shape [num_query, 4]. cls_pred (Tensor): Predicted classification logits, shape [num_query, num_class]. gt_bboxes (Tensor): Ground truth boxes with unnormalized coordinates (x1, y1, x2, y2). Shape [num_gt, 4]. gt_labels (Tensor): Label of `gt_bboxes`, shape (num_gt,). img_meta (dict): Meta information for current image. gt_bboxes_ignore (Tensor, optional): Ground truth bboxes that are labelled as `ignored`. Default None. eps (int | float, optional): A value added to the denominator for numerical stability. Default 1e-7. Returns: :obj:`AssignResult`: The assigned result. """ assert gt_bboxes_ignore is None, \ 'Only case when gt_bboxes_ignore is None is supported.' num_gts, num_bboxes = gt_bboxes.size(0), bbox_pred.size(0) # 1. assign -1 by default assigned_gt_inds = bbox_pred.new_full((num_bboxes, ), -1, dtype=torch.long) assigned_labels = bbox_pred.new_full((num_bboxes, ), -1, dtype=torch.long) if num_gts == 0 or num_bboxes == 0: # No ground truth or boxes, return empty assignment if num_gts == 0: # No ground truth, assign all to background assigned_gt_inds[:] = 0 return AssignResult( num_gts, assigned_gt_inds, None, labels=assigned_labels) img_h, img_w, _ = img_meta['pad_shape'] factor = gt_bboxes.new_tensor([img_w, img_h, img_w, img_h]).unsqueeze(0) # 2. compute the weighted costs # classification and bboxcost. cls_cost = self.cls_cost(cls_pred, gt_labels) # regression L1 cost normalize_gt_bboxes = gt_bboxes / factor reg_cost = self.reg_cost(bbox_pred, normalize_gt_bboxes) # regression iou cost, defaultly giou is used in official DETR. bboxes = bbox_cxcywh_to_xyxy(bbox_pred) * factor iou_cost = self.iou_cost(bboxes, gt_bboxes) # center2d L1 cost normalize_centers2d = centers2d / factor[:, 0:2] centers2d_cost = self.centers2d_cost(pred_centers2d, normalize_centers2d) # weighted sum of above four costs cost = cls_cost + reg_cost + iou_cost + centers2d_cost cost = torch.nan_to_num(cost, nan=100.0, posinf=100.0, neginf=-100.0) # 3. 
do Hungarian matching on CPU using linear_sum_assignment cost = cost.detach().cpu() if linear_sum_assignment is None: raise ImportError('Please run "pip install scipy" ' 'to install scipy first.') matched_row_inds, matched_col_inds = linear_sum_assignment(cost) matched_row_inds = torch.from_numpy(matched_row_inds).to( bbox_pred.device) matched_col_inds = torch.from_numpy(matched_col_inds).to( bbox_pred.device) # 4. assign backgrounds and foregrounds # assign all indices to backgrounds first assigned_gt_inds[:] = 0 # assign foregrounds based on matching results assigned_gt_inds[matched_row_inds] = matched_col_inds + 1 assigned_labels[matched_row_inds] = gt_labels[matched_col_inds] return AssignResult( num_gts, assigned_gt_inds, None, labels=assigned_labels) ================================================ FILE: mmdet3d/models/fbbev/streampetr/hungarian_assigner_3d.py ================================================ # ------------------------------------------------------------------------ # Modified from DETR3D (https://github.com/WangYueFt/detr3d) # Copyright (c) 2021 Wang, Yue # ------------------------------------------------------------------------ import torch from mmdet.core.bbox.builder import BBOX_ASSIGNERS from mmdet.core.bbox.assigners import AssignResult from mmdet.core.bbox.assigners import BaseAssigner from mmdet.core.bbox.match_costs import build_match_cost from .streampetr_utils import normalize_bbox try: from scipy.optimize import linear_sum_assignment except ImportError: linear_sum_assignment = None @BBOX_ASSIGNERS.register_module() class HungarianAssigner3D(BaseAssigner): def __init__(self, cls_cost=dict(type='ClassificationCost', weight=1.), reg_cost=dict(type='BBoxL1Cost', weight=1.0), iou_cost=dict(type='IoUCost', weight=0.0), pc_range=None): self.cls_cost = build_match_cost(cls_cost) self.reg_cost = build_match_cost(reg_cost) self.iou_cost = build_match_cost(iou_cost) self.pc_range = pc_range def assign(self, bbox_pred, cls_pred, gt_bboxes, gt_labels, gt_bboxes_ignore=None, code_weights=None, with_velo=False, eps=1e-7): assert gt_bboxes_ignore is None, \ 'Only case when gt_bboxes_ignore is None is supported.' num_gts, num_bboxes = gt_bboxes.size(0), bbox_pred.size(0) # 1. assign -1 by default assigned_gt_inds = bbox_pred.new_full((num_bboxes, ), -1, dtype=torch.long) assigned_labels = bbox_pred.new_full((num_bboxes, ), -1, dtype=torch.long) if num_gts == 0 or num_bboxes == 0: # No ground truth or boxes, return empty assignment if num_gts == 0: # No ground truth, assign all to background assigned_gt_inds[:] = 0 return AssignResult( num_gts, assigned_gt_inds, None, labels=assigned_labels) # 2. compute the weighted costs # classification and bboxcost. cls_cost = self.cls_cost(cls_pred, gt_labels) # regression L1 cost normalized_gt_bboxes = normalize_bbox(gt_bboxes, self.pc_range) if code_weights is not None: bbox_pred = bbox_pred * code_weights normalized_gt_bboxes = normalized_gt_bboxes * code_weights if with_velo: reg_cost = self.reg_cost(bbox_pred, normalized_gt_bboxes) else: reg_cost = self.reg_cost(bbox_pred[:, :8], normalized_gt_bboxes[:, :8]) # weighted sum of above two costs cost = cls_cost + reg_cost # 3. 
do Hungarian matching on CPU using linear_sum_assignment cost = cost.detach().cpu() if linear_sum_assignment is None: raise ImportError('Please run "pip install scipy" ' 'to install scipy first.') cost = torch.nan_to_num(cost, nan=100.0, posinf=100.0, neginf=-100.0) matched_row_inds, matched_col_inds = linear_sum_assignment(cost) matched_row_inds = torch.from_numpy(matched_row_inds).to( bbox_pred.device) matched_col_inds = torch.from_numpy(matched_col_inds).to( bbox_pred.device) # 4. assign backgrounds and foregrounds # assign all indices to backgrounds first assigned_gt_inds[:] = 0 # assign foregrounds based on matching results assigned_gt_inds[matched_row_inds] = matched_col_inds + 1 assigned_labels[matched_row_inds] = gt_labels[matched_col_inds] return AssignResult( num_gts, assigned_gt_inds, None, labels=assigned_labels) ================================================ FILE: mmdet3d/models/fbbev/streampetr/match_cost.py ================================================ import torch from mmdet.core.bbox.match_costs.builder import MATCH_COST @MATCH_COST.register_module() class BBox3DL1Cost(object): """BBox3DL1Cost. Args: weight (int | float, optional): loss_weight """ def __init__(self, weight=1.): self.weight = weight def __call__(self, bbox_pred, gt_bboxes): """ Args: bbox_pred (Tensor): Predicted boxes with normalized coordinates (cx, cy, w, h), which are all in range [0, 1]. Shape [num_query, 4]. gt_bboxes (Tensor): Ground truth boxes with normalized coordinates (x1, y1, x2, y2). Shape [num_gt, 4]. Returns: torch.Tensor: bbox_cost value with weight """ bbox_cost = torch.cdist(bbox_pred, gt_bboxes, p=1) return bbox_cost * self.weight ================================================ FILE: mmdet3d/models/fbbev/streampetr/nms_free_coder.py ================================================ import torch from mmdet.core.bbox import BaseBBoxCoder from mmdet.core.bbox.builder import BBOX_CODERS from .streampetr_utils import denormalize_bbox @BBOX_CODERS.register_module() class NMSFreeCoder(BaseBBoxCoder): """Bbox coder for NMS-free detector. Args: pc_range (list[float]): Range of point cloud. post_center_range (list[float]): Limit of the center. Default: None. max_num (int): Max number to be kept. Default: 100. score_threshold (float): Threshold to filter boxes based on score. Default: None. code_size (int): Code size of bboxes. Default: 9 """ def __init__(self, pc_range=None, voxel_size=None, post_center_range=None, max_num=100, score_threshold=None, num_classes=10): self.pc_range = pc_range self.voxel_size = voxel_size self.post_center_range = post_center_range self.max_num = max_num self.score_threshold = score_threshold self.num_classes = num_classes def encode(self): pass def decode_single(self, cls_scores, bbox_preds): """Decode bboxes. Args: cls_scores (Tensor): Outputs from the classification head, \ shape [num_query, cls_out_channels]. Note \ cls_out_channels should includes background. bbox_preds (Tensor): Outputs from the regression \ Shape [num_query, 9]. Returns: list[dict]: Decoded boxes. 
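        Example (illustrative sketch; the pc_range / post_center_range
        values are assumptions, not a released config)::

            coder = NMSFreeCoder(
                pc_range=[-51.2, -51.2, -5.0, 51.2, 51.2, 3.0],
                post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0],
                max_num=300,
                num_classes=10)
            cls_scores = torch.randn(900, 10)   # per-query classification logits
            bbox_preds = torch.randn(900, 10)   # (cx, cy, cz, w, l, h, sin, cos, vx, vy)
            out = coder.decode_single(cls_scores, bbox_preds)
            # out['bboxes'], out['scores'], out['labels'] hold the top-k boxes
            # whose centers fall inside post_center_range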
""" max_num = self.max_num cls_scores = cls_scores.sigmoid() scores, indexs = cls_scores.view(-1).topk(max_num) labels = indexs % self.num_classes bbox_index = torch.div(indexs, self.num_classes, rounding_mode='floor') bbox_preds = bbox_preds[bbox_index] final_box_preds = denormalize_bbox(bbox_preds, self.pc_range) final_scores = scores final_preds = labels # use score threshold if self.score_threshold is not None: thresh_mask = final_scores >= self.score_threshold if self.post_center_range is not None: self.post_center_range = torch.tensor(self.post_center_range, device=scores.device) mask = (final_box_preds[..., :3] >= self.post_center_range[:3]).all(1) mask &= (final_box_preds[..., :3] <= self.post_center_range[3:]).all(1) if self.score_threshold: mask &= thresh_mask boxes3d = final_box_preds[mask] scores = final_scores[mask] labels = final_preds[mask] predictions_dict = { 'bboxes': boxes3d, 'scores': scores, 'labels': labels } else: raise NotImplementedError( 'Need to reorganize output as a batch, only ' 'support post_center_range is not None for now!') return predictions_dict def decode(self, preds_dicts, layer_index=-1): """Decode bboxes. Args: all_cls_scores (Tensor): Outputs from the classification head, \ shape [nb_dec, bs, num_query, cls_out_channels]. Note \ cls_out_channels should includes background. all_bbox_preds (Tensor): Sigmoid outputs from the regression \ head with normalized coordinate format \ Shape [nb_dec, bs, num_query, 9]. Returns: list[dict]: Decoded boxes. """ all_cls_scores = preds_dicts['all_cls_scores'][layer_index] all_bbox_preds = preds_dicts['all_bbox_preds'][layer_index] batch_size = all_cls_scores.size()[0] predictions_list = [] for i in range(batch_size): predictions_list.append(self.decode_single(all_cls_scores[i], all_bbox_preds[i])) return predictions_list ================================================ FILE: mmdet3d/models/fbbev/streampetr/petr_transformer.py ================================================ import numpy as np import torch import torch.nn as nn import torch.nn.functional as F from mmcv.cnn import xavier_init, constant_init, build_norm_layer from mmcv.cnn.bricks.transformer import (BaseTransformerLayer, TransformerLayerSequence, build_transformer_layer_sequence, build_attention, build_feedforward_network) from mmcv.ops.multi_scale_deform_attn import MultiScaleDeformableAttnFunction from mmcv.runner.base_module import BaseModule from mmcv.cnn.bricks.registry import (ATTENTION,TRANSFORMER_LAYER, TRANSFORMER_LAYER_SEQUENCE) from mmdet.models.utils.builder import TRANSFORMER from .streampetr_utils import pos2posemb3d, bevpos2posemb from mmdet.models.utils.transformer import inverse_sigmoid from mmcv.utils import deprecated_api_warning, ConfigDict import warnings import copy from torch.nn import ModuleList import torch.utils.checkpoint as cp from mmcv.runner import force_fp32 from torch.cuda.amp import autocast # Disable warnings warnings.filterwarnings("ignore") def get_ego_pos(points, pc_range): if points.size(-1) == 3: points = points * (pc_range[3:6] - pc_range[0:3]) + pc_range[0:3] elif points.size(-1) == 2: points = points * (pc_range[3:5] - pc_range[0:2]) + pc_range[0:2] return points def get_rel_pos(points, pc_range): if points.size(-1) == 3: return (points - pc_range[0:3]) / (pc_range[3:6] - pc_range[0:3]) elif points.size(-1) == 2: return (points - pc_range[0:2]) / (pc_range[3:5] - pc_range[0:2]) @TRANSFORMER.register_module() class Detr3DTransformer(BaseModule): """Implements the Detr3D transformer. 
Args: as_two_stage (bool): Generate query from encoder features. Default: False. num_feature_levels (int): Number of feature maps from FPN: Default: 4. two_stage_num_proposals (int): Number of proposals when set `as_two_stage` as True. Default: 300. """ def __init__(self, decoder=None, **kwargs): super(Detr3DTransformer, self).__init__(**kwargs) self.decoder = build_transformer_layer_sequence(decoder) def init_weights(self): """Initialize the transformer weights.""" for p in self.parameters(): if p.dim() > 1: nn.init.xavier_uniform_(p) for m in self.modules(): if hasattr(m, "init_weight"): m.init_weight() def forward(self, query, query_pos, feat_flatten, spatial_flatten, level_start_index, temp_memory, temp_pos, attn_masks, reference_points, pc_range, data, img_metas, temp_reference_points=None, reg_branches=None, query_embedding=None, return_intermediate_pts=False, cam_params=None, debug_info=None, ): """Forward function for `Detr3DTransformer`. Args: mlvl_feats (list(Tensor)): Input queries from different level. Each element has shape [bs, embed_dims, h, w]. query_embed (Tensor): The query embedding for decoder, with shape [num_query, c]. mlvl_pos_embeds (list(Tensor)): The positional encoding of feats from different level, has the shape [bs, embed_dims, h, w]. reg_branches (obj:`nn.ModuleList`): Regression heads for feature maps from each decoder layer. Only would be passed when `with_box_refine` is True. Default to None. Returns: tuple[Tensor]: results of decoder containing the following tensor. - inter_states: Outputs from decoder. If return_intermediate_dec is True output has shape \ (num_dec_layers, bs, num_query, embed_dims), else has \ shape (1, bs, num_query, embed_dims). - init_reference_out: The initial value of reference \ points, has shape (bs, num_queries, 4). - inter_references_out: The internal value of reference \ points in decoder, has shape \ (num_dec_layers, bs,num_query, embed_dims) - enc_outputs_class: The classification score of \ proposals generated from \ encoder's feature maps, has shape \ (batch, h*w, num_classes). \ Only would be returned when `as_two_stage` is True, \ otherwise None. - enc_outputs_coord_unact: The regression results \ generated from encoder's feature maps., has shape \ (batch, h*w, 4). Only would \ be returned when `as_two_stage` is True, \ otherwise None. """ lidar2img = None # data['lidar2img'] inter_states = self.decoder( query=query, query_pos=query_pos, mlvl_feats=feat_flatten, temp_memory=temp_memory, temp_pos=temp_pos, reference_points=reference_points, spatial_flatten=spatial_flatten, level_start_index=level_start_index, pc_range=pc_range, lidar2img=lidar2img, img_metas=img_metas, attn_masks=attn_masks, reg_branches=reg_branches, query_embedding=query_embedding, return_intermediate_pts=return_intermediate_pts, cam_params=cam_params, debug_info=debug_info, temp_reference_points=temp_reference_points, ) return inter_states @TRANSFORMER_LAYER_SEQUENCE.register_module() class Detr3DTransformerDecoder(TransformerLayerSequence): """Implements the decoder in DETR3D transformer. Args: return_intermediate (bool): Whether to return intermediate outputs. coder_norm_cfg (dict): Config of last normalization layer. Default: `LN`. 
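        Example (illustrative config sketch; the layer and attention settings
        below are assumptions, not a released BEV-Planner config)::

            decoder=dict(
                type='Detr3DTransformerDecoder',
                embed_dims=256,
                predict_refine=True,
                num_layers=6,
                transformerlayers=dict(
                    type='Detr3DTemporalDecoderLayer',
                    batch_first=True,
                    attn_cfgs=[
                        dict(type='MultiheadAttention',
                             embed_dims=256, num_heads=8),
                        dict(type='DeformableFeatureAggregationCuda',
                             embed_dims=256, num_levels=4, num_pts=13),
                    ],
                    operation_order=('self_attn', 'norm', 'cross_attn',
                                     'norm', 'ffn', 'norm')))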
""" def __init__(self, embed_dims, *args, predict_refine=True, **kwargs): self.predict_refine =predict_refine super(Detr3DTransformerDecoder, self).__init__(*args, **kwargs) def forward(self, query, query_pos, mlvl_feats, temp_memory, temp_pos, reference_points, spatial_flatten, level_start_index, pc_range, lidar2img, img_metas, attn_masks, temp_reference_points=None, reg_branches=None, query_embedding=None, return_intermediate_pts=False, cam_params=None, debug_info=None, ): """Forward function for `Detr3DTransformerDecoder`. Args: query (Tensor): Input query with shape `(num_query, bs, embed_dims)`. reference_points (Tensor): The reference points of offset. has shape (bs, num_query, 4) when as_two_stage, otherwise has shape ((bs, num_query, 2). reg_branch: (obj:`nn.ModuleList`): Used for refining the regression results. Only would be passed when with_box_refine is True, otherwise would be passed a `None`. Returns: Tensor: Results with shape [1, num_query, bs, embed_dims] when return_intermediate is `False`, otherwise it has shape [num_layers, num_query, bs, embed_dims]. """ intermediate = [] intermediate_reference_points = [] ori_reference_points = reference_points.clone() for lid, layer in enumerate(self.layers): query = layer( query, query_pos, mlvl_feats, temp_memory, temp_pos, reference_points, spatial_flatten, level_start_index, pc_range, lidar2img, img_metas, attn_masks, temp_reference_points=temp_reference_points, cam_params=cam_params, debug_info=debug_info, ) if reg_branches is not None: ref_shape = reference_points.shape if len(ref_shape) == 3: # Detection reg_points = reg_branches[lid](query)[..., :3].reshape(*ref_shape) elif len(ref_shape) == 4: # Map reg_points = reg_branches[lid](query).reshape(*ref_shape) if self.predict_refine: new_reference_points = reg_points + inverse_sigmoid(reference_points) else: if len(ref_shape) == 3: # Detection predicts the offset from the initial reference_points new_reference_points = reg_points + inverse_sigmoid(ori_reference_points) elif len(ref_shape) == 4: # Map predcits absolute reference points new_reference_points = reg_points new_reference_points = new_reference_points.sigmoid() reference_points = new_reference_points.clone().detach() intermediate_reference_points.append(new_reference_points) # Look twice from DINO if lid < len(self.layers)-1 and query_embedding is not None: if len(ref_shape) == 3: # Detection query_pos = query_embedding(pos2posemb3d(reference_points)) elif len(ref_shape) == 4: # Map query_pos = query_embedding(bevpos2posemb(reference_points, 32).flatten(-2, -1)) intermediate.append(query) if return_intermediate_pts: return torch.stack(intermediate), torch.stack(intermediate_reference_points) return torch.stack(intermediate) @TRANSFORMER_LAYER.register_module() class Detr3DTemporalDecoderLayer(BaseModule): """Base `TransformerLayer` for vision transformer. It can be built from `mmcv.ConfigDict` and support more flexible customization, for example, using any number of `FFN or LN ` and use different kinds of `attention` by specifying a list of `ConfigDict` named `attn_cfgs`. It is worth mentioning that it supports `prenorm` when you specifying `norm` as the first element of `operation_order`. More details about the `prenorm`: `On Layer Normalization in the Transformer Architecture `_ . 
Args: attn_cfgs (list[`mmcv.ConfigDict`] | obj:`mmcv.ConfigDict` | None )): Configs for `self_attention` or `cross_attention` modules, The order of the configs in the list should be consistent with corresponding attentions in operation_order. If it is a dict, all of the attention modules in operation_order will be built with this config. Default: None. ffn_cfgs (list[`mmcv.ConfigDict`] | obj:`mmcv.ConfigDict` | None )): Configs for FFN, The order of the configs in the list should be consistent with corresponding ffn in operation_order. If it is a dict, all of the attention modules in operation_order will be built with this config. operation_order (tuple[str]): The execution order of operation in transformer. Such as ('self_attn', 'norm', 'ffn', 'norm'). Support `prenorm` when you specifying first element as `norm`. Default:None. norm_cfg (dict): Config dict for normalization layer. Default: dict(type='LN'). init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. Default: None. batch_first (bool): Key, Query and Value are shape of (batch, n, embed_dim) or (n, batch, embed_dim). Default to False. """ def __init__(self, attn_cfgs=None, ffn_cfgs=dict( type='FFN', embed_dims=256, feedforward_channels=1024, num_fcs=2, ffn_drop=0., act_cfg=dict(type='ReLU', inplace=True), ), operation_order=None, norm_cfg=dict(type='LN'), init_cfg=None, batch_first=False, with_cp=True, **kwargs): super().__init__(init_cfg) self.batch_first = batch_first assert set(operation_order) & { 'self_attn', 'norm', 'ffn', 'cross_attn'} == \ set(operation_order), f'The operation_order of' \ f' {self.__class__.__name__} should ' \ f'contains all four operation type ' \ f"{['self_attn', 'norm', 'ffn', 'cross_attn']}" num_attn = operation_order.count('self_attn') + operation_order.count( 'cross_attn') if isinstance(attn_cfgs, dict): attn_cfgs = [copy.deepcopy(attn_cfgs) for _ in range(num_attn)] else: assert num_attn == len(attn_cfgs), f'The length ' \ f'of attn_cfg {num_attn} is ' \ f'not consistent with the number of attention' \ f'in operation_order {operation_order}.' self.num_attn = num_attn self.operation_order = operation_order self.norm_cfg = norm_cfg self.pre_norm = operation_order[0] == 'norm' self.attentions = ModuleList() index = 0 for operation_name in operation_order: if operation_name in ['self_attn', 'cross_attn']: if 'batch_first' in attn_cfgs[index]: assert self.batch_first == attn_cfgs[index]['batch_first'] else: attn_cfgs[index]['batch_first'] = self.batch_first attention = build_attention(attn_cfgs[index]) # Some custom attentions used as `self_attn` # or `cross_attn` can have different behavior. 
attention.operation_name = operation_name self.attentions.append(attention) index += 1 self.embed_dims = self.attentions[0].embed_dims self.ffns = ModuleList() num_ffns = operation_order.count('ffn') if isinstance(ffn_cfgs, dict): ffn_cfgs = ConfigDict(ffn_cfgs) if isinstance(ffn_cfgs, dict): ffn_cfgs = [copy.deepcopy(ffn_cfgs) for _ in range(num_ffns)] assert len(ffn_cfgs) == num_ffns for ffn_index in range(num_ffns): if 'embed_dims' not in ffn_cfgs[ffn_index]: ffn_cfgs[ffn_index]['embed_dims'] = self.embed_dims else: assert ffn_cfgs[ffn_index]['embed_dims'] == self.embed_dims self.ffns.append( build_feedforward_network(ffn_cfgs[ffn_index], dict(type='FFN'))) self.norms = ModuleList() num_norms = operation_order.count('norm') for _ in range(num_norms): self.norms.append(build_norm_layer(norm_cfg, self.embed_dims)[1]) self.use_checkpoint = with_cp def _forward(self, query, query_pos, mlvl_feats, temp_memory, temp_pos, reference_points, spatial_flatten, level_start_index, pc_range, lidar2img, img_metas, attn_masks=None, query_key_padding_mask=None, key_padding_mask=None, temp_reference_points=None, cam_params=None, debug_info=None, **kwargs): """Forward function for `TransformerDecoderLayer`. **kwargs contains some specific arguments of attentions. Args: query (Tensor): The input query with shape [num_queries, bs, embed_dims] if self.batch_first is False, else [bs, num_queries embed_dims]. key (Tensor): The key tensor with shape [num_keys, bs, embed_dims] if self.batch_first is False, else [bs, num_keys, embed_dims] . value (Tensor): The value tensor with same shape as `key`. query_pos (Tensor): The positional encoding for `query`. Default: None. key_pos (Tensor): The positional encoding for `key`. Default: None. attn_masks (List[Tensor] | None): 2D Tensor used in calculation of corresponding attention. The length of it should equal to the number of `attention` in `operation_order`. Default: None. query_key_padding_mask (Tensor): ByteTensor for `query`, with shape [bs, num_queries]. Only used in `self_attn` layer. Defaults to None. key_padding_mask (Tensor): ByteTensor for `query`, with shape [bs, num_keys]. Default: None. Returns: Tensor: forwarded results with shape [num_queries, bs, embed_dims]. 
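        Example (illustrative shape sketch; batch_first=True and the query
        counts are assumptions)::

            query       = torch.randn(2, 900, 256)   # current-frame queries
            temp_memory = torch.randn(2, 256, 256)   # queries propagated from t-1
            # inside 'self_attn' the propagated memory is concatenated along
            # the query dimension, so keys/values become (2, 900 + 256, 256)
            # while the attended queries stay (2, 900, 256)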
""" norm_index = 0 attn_index = 0 ffn_index = 0 identity = query if attn_masks is None: attn_masks = [None for _ in range(self.num_attn)] elif isinstance(attn_masks, torch.Tensor): attn_masks = [ copy.deepcopy(attn_masks) for _ in range(self.num_attn) ] warnings.warn(f'Use same attn_mask in all attentions in ' f'{self.__class__.__name__} ') else: assert len(attn_masks) == self.num_attn, f'The length of ' \ f'attn_masks {len(attn_masks)} must be equal ' \ f'to the number of attention in ' \ f'operation_order {self.num_attn}' for layer in self.operation_order: if layer == 'self_attn': if temp_memory is not None: temp_key = temp_value = torch.cat([query, temp_memory], dim=1) if query_pos is not None and temp_pos is not None: temp_pos = torch.cat([query_pos, temp_pos], dim=1) temp_reference_points = torch.cat([reference_points, temp_reference_points], dim=1) else: temp_key = temp_value = query temp_pos = query_pos temp_reference_points = reference_points query = self.attentions[attn_index]( query, temp_key, temp_value, identity if self.pre_norm else None, query_pos=query_pos, key_pos=temp_pos, attn_mask=attn_masks[attn_index], key_padding_mask=query_key_padding_mask, reference_points=reference_points, temp_reference_points=temp_reference_points, pc_range=pc_range, **kwargs) attn_index += 1 identity = query elif layer == 'norm': query = self.norms[norm_index](query) norm_index += 1 elif layer == 'cross_attn': query = self.attentions[attn_index]( query, query_pos, mlvl_feats, reference_points, spatial_flatten, level_start_index, pc_range, lidar2img, img_metas, cam_params=cam_params, debug_info=debug_info, **kwargs) attn_index += 1 identity = query elif layer == 'ffn': query = self.ffns[ffn_index]( query, identity if self.pre_norm else None) ffn_index += 1 return query def forward(self, query, query_pos, mlvl_feats, temp_memory, temp_pos, reference_points, spatial_flatten, level_start_index, pc_range, lidar2img, img_metas, attn_masks=None, query_key_padding_mask=None, key_padding_mask=None, temp_reference_points=None, cam_params=None, debug_info=None, ): """Forward function for `TransformerCoder`. Returns: Tensor: forwarded results with shape [num_query, bs, embed_dims]. 
""" if self.use_checkpoint and self.training: x = cp.checkpoint( self._forward, query, query_pos, mlvl_feats, temp_memory, temp_pos, reference_points, spatial_flatten, level_start_index, pc_range, lidar2img, img_metas, attn_masks, query_key_padding_mask, key_padding_mask, temp_reference_points, cam_params, debug_info ) else: x = self._forward( query, query_pos, mlvl_feats, temp_memory, temp_pos, reference_points, spatial_flatten, level_start_index, pc_range, lidar2img, img_metas, attn_masks, query_key_padding_mask, key_padding_mask, temp_reference_points=temp_reference_points, cam_params=cam_params, debug_info=debug_info, ) return x @ATTENTION.register_module() class DeformableFeatureAggregationCuda(BaseModule): def __init__( self, embed_dims=256, num_groups=8, num_levels=4, num_cams=6, dropout=0.1, num_pts=13, num_anchor_pts=1, im2col_step=64, batch_first=True, code_size=3, bias=1., ): super(DeformableFeatureAggregationCuda, self).__init__() self.embed_dims = embed_dims self.num_groups = num_groups self.num_anchor_pts = num_anchor_pts self.group_dims = (self.embed_dims // self.num_groups) self.num_levels = num_levels self.num_cams = num_cams self.num_pts = num_pts self.code_size = code_size self.weights_fc = nn.Linear(self.embed_dims, self.num_groups * self.num_levels * num_pts * self.num_anchor_pts) self.output_proj = nn.Linear(self.embed_dims, self.embed_dims) self.learnable_fc = nn.Linear(self.embed_dims, self.num_anchor_pts * num_pts * code_size) # self.cam_embed = nn.Sequential( # nn.Linear(12, self.embed_dims // 2), # nn.ReLU(inplace=True), # nn.Linear(self.embed_dims // 2, self.embed_dims), # nn.ReLU(inplace=True), # nn.LayerNorm(self.embed_dims), # ) self.drop = nn.Dropout(dropout) self.im2col_step = im2col_step self.bias = bias def init_weight(self): constant_init(self.weights_fc, val=0.0, bias=0.0) xavier_init(self.output_proj, distribution="uniform", bias=0.0) nn.init.uniform_(self.learnable_fc.bias.data, -self.bias, self.bias) @force_fp32() def forward(self, instance_feature, query_pos, feat_flatten, reference_points, spatial_flatten, level_start_index, pc_range, lidar2img_mat, img_metas, cam_params=None, debug_info=None): bs, num_query = reference_points.shape[:2] reference_points = get_ego_pos(reference_points, pc_range) if reference_points.dim()==3 and self.num_anchor_pts==1: key_points = reference_points.unsqueeze(-2) + self.learnable_fc(instance_feature).reshape(bs, num_query, -1, self.code_size) elif reference_points.dim()==4 and self.num_anchor_pts==reference_points.size(2): # one query has more than 1 reference points key_points = reference_points.unsqueeze(-2) + self.learnable_fc(instance_feature).reshape(bs, num_query, self.num_anchor_pts, -1, self.code_size) key_points = key_points.reshape(bs, num_query, self.num_anchor_pts * self.num_pts, self.code_size) key_points = get_rel_pos(key_points, pc_range) weights = self._get_weights(instance_feature, query_pos, lidar2img_mat) features = self.feature_sampling(feat_flatten, spatial_flatten, level_start_index, key_points, weights, lidar2img_mat, img_metas) output = self.output_proj(features) output = self.drop(output) + instance_feature return output def _get_weights(self, instance_feature, anchor_embed, lidar2img_mat): bs, num_query = instance_feature.shape[:2] # lidar2img = lidar2img_mat[..., :3, :].flatten(-2) # cam_embed = self.cam_embed(lidar2img) # B, N, C if anchor_embed is not None: feat_pos = (instance_feature + anchor_embed) # .unsqueeze(2) # + cam_embed.unsqueeze(1) else: feat_pos = instance_feature if 
self.num_anchor_pts==1: weights = self.weights_fc(feat_pos).reshape(bs, num_query, self.num_groups, -1).softmax(dim=-1) weights = weights.reshape(bs, num_query, self.num_groups, self.num_levels, self.num_pts).contiguous() else: weights = self.weights_fc(feat_pos).reshape(bs, num_query, self.num_groups, self.num_anchor_pts, -1).softmax(dim=-1) / self.num_anchor_pts weights = weights.reshape(bs, num_query, self.num_groups, self.num_anchor_pts, self.num_levels, self.num_pts) weights = weights.permute(0, 1, 2, 4, 3, 5).flatten(-2).contiguous() return weights def feature_sampling(self, feat_flatten, spatial_flatten, level_start_index, key_points, weights, lidar2img_mat, img_metas): bs, num_query, _ = key_points.shape[:3] # pts_extand = torch.cat([key_points, torch.ones_like(key_points[..., :1])], dim=-1) # points_2d = torch.matmul(lidar2img_mat[:, :, None, None], pts_extand[:, None, ..., None]).squeeze(-1) # points_2d = points_2d[..., :2] / torch.clamp(points_2d[..., 2:3], min=1e-5) # points_2d[..., 0:1] = points_2d[..., 0:1] / img_metas[0]['pad_shape'][0][1] # points_2d[..., 1:2] = points_2d[..., 1:2] / img_metas[0]['pad_shape'][0][0] # points_2d = points_2d.flatten(end_dim=1) #[b*6, 900, 13, 2] # points_2d = points_2d[:, :, None, None, :, :].repeat(1, 1, self.num_groups, self.num_levels, 1, 1) points_2d = key_points[..., :2] points_2d = points_2d[:, :, None, None, :, :].repeat(1, 1, self.num_groups, self.num_levels, 1, 1) bn, num_value, _ = feat_flatten.size() feat_flatten = feat_flatten.reshape(bn, num_value, self.num_groups, -1) # attention_weights = weights * mask with autocast(enabled=False): output = MultiScaleDeformableAttnFunction.apply( feat_flatten, spatial_flatten, level_start_index, points_2d, weights, self.im2col_step) output = output.reshape(bs, num_query, -1) return output @ATTENTION.register_module() class DeformableFeatureAggregationCuda_v2(BaseModule): def __init__( self, embed_dims=256, num_groups=8, num_levels=4, num_cams=6, dropout=0.1, num_pts=13, num_anchor_pts=1, im2col_step=64, batch_first=True, bias=1., ): super(DeformableFeatureAggregationCuda_v2, self).__init__() self.embed_dims = embed_dims self.num_groups = num_groups self.num_anchor_pts = num_anchor_pts self.group_dims = (self.embed_dims // self.num_groups) self.num_levels = num_levels self.num_cams = num_cams self.num_pts = num_pts self.weights_fc = nn.Linear(self.embed_dims, self.num_groups * self.num_levels * num_pts * self.num_anchor_pts) self.output_proj = nn.Linear(self.embed_dims, self.embed_dims) self.learnable_fc = nn.Linear(self.embed_dims, self.num_anchor_pts * num_pts * 3) # self.cam_embed = nn.Sequential( # nn.Linear(12, self.embed_dims // 2), # nn.ReLU(inplace=True), # nn.Linear(self.embed_dims // 2, self.embed_dims), # nn.ReLU(inplace=True), # nn.LayerNorm(self.embed_dims), # ) self.drop = nn.Dropout(dropout) self.im2col_step = im2col_step self.bias = bias def init_weight(self): constant_init(self.weights_fc, val=0.0, bias=0.0) xavier_init(self.output_proj, distribution="uniform", bias=0.0) nn.init.uniform_(self.learnable_fc.bias.data, -self.bias, self.bias) @force_fp32() def forward(self, instance_feature, query_pos, feat_flatten, reference_points, spatial_flatten, level_start_index, pc_range, lidar2img_mat, img_metas, cam_params=None, debug_info=None): bs, num_query = reference_points.shape[:2] reference_points = get_ego_pos(reference_points, pc_range) if reference_points.dim()==3 and self.num_anchor_pts==1: key_points = reference_points.unsqueeze(-2) + 
self.learnable_fc(instance_feature+query_pos).reshape(bs, num_query, -1, 3) elif reference_points.dim()==4 and self.num_anchor_pts==reference_points.size(2): # one query has more than 1 reference points key_points = reference_points.unsqueeze(-2) + self.learnable_fc(instance_feature+query_pos).reshape(bs, num_query, self.num_anchor_pts, -1, 3) key_points = key_points.reshape(bs, num_query, self.num_anchor_pts * self.num_pts, 3) key_points = get_rel_pos(key_points, pc_range) weights = self._get_weights(instance_feature, query_pos, lidar2img_mat) features = self.feature_sampling(feat_flatten, spatial_flatten, level_start_index, key_points, weights, lidar2img_mat, img_metas) output = self.output_proj(features) output = self.drop(output) + instance_feature return output def _get_weights(self, instance_feature, anchor_embed, lidar2img_mat): bs, num_query = instance_feature.shape[:2] # lidar2img = lidar2img_mat[..., :3, :].flatten(-2) # cam_embed = self.cam_embed(lidar2img) # B, N, C feat_pos = instance_feature + anchor_embed # .unsqueeze(2) # + cam_embed.unsqueeze(1) if self.num_anchor_pts==1: weights = self.weights_fc(feat_pos).reshape(bs, num_query, self.num_groups, -1).softmax(dim=-1) weights = weights.reshape(bs, num_query, self.num_groups, self.num_levels, self.num_pts).contiguous() else: weights = self.weights_fc(feat_pos).reshape(bs, num_query, self.num_groups, self.num_anchor_pts, -1).softmax(dim=-1) / self.num_anchor_pts weights = weights.reshape(bs, num_query, self.num_groups, self.num_anchor_pts, self.num_levels, self.num_pts) weights = weights.permute(0, 1, 2, 4, 3, 5).flatten(-2).contiguous() return weights def feature_sampling(self, feat_flatten, spatial_flatten, level_start_index, key_points, weights, lidar2img_mat, img_metas): bs, num_query, _ = key_points.shape[:3] # pts_extand = torch.cat([key_points, torch.ones_like(key_points[..., :1])], dim=-1) # points_2d = torch.matmul(lidar2img_mat[:, :, None, None], pts_extand[:, None, ..., None]).squeeze(-1) # points_2d = points_2d[..., :2] / torch.clamp(points_2d[..., 2:3], min=1e-5) # points_2d[..., 0:1] = points_2d[..., 0:1] / img_metas[0]['pad_shape'][0][1] # points_2d[..., 1:2] = points_2d[..., 1:2] / img_metas[0]['pad_shape'][0][0] # points_2d = points_2d.flatten(end_dim=1) #[b*6, 900, 13, 2] # points_2d = points_2d[:, :, None, None, :, :].repeat(1, 1, self.num_groups, self.num_levels, 1, 1) points_2d = key_points[..., :2] points_2d = points_2d[:, :, None, None, :, :].repeat(1, 1, self.num_groups, self.num_levels, 1, 1) bn, num_value, _ = feat_flatten.size() feat_flatten = feat_flatten.reshape(bn, num_value, self.num_groups, -1) # attention_weights = weights * mask with autocast(enabled=False): output = MultiScaleDeformableAttnFunction.apply( feat_flatten, spatial_flatten, level_start_index, points_2d, weights, self.im2col_step) output = output.reshape(bs, num_query, -1) return output @ATTENTION.register_module() class MVDeformableFeatureAggregationCuda(BaseModule): def __init__( self, embed_dims=256, num_groups=8, num_levels=4, num_cams=6, dropout=0.1, num_pts=13, im2col_step=64, batch_first=True, bias=1., ): super(MVDeformableFeatureAggregationCuda, self).__init__() self.embed_dims = embed_dims self.num_groups = num_groups self.group_dims = (self.embed_dims // self.num_groups) self.num_levels = num_levels self.num_cams = num_cams self.weights_fc = nn.Linear(self.embed_dims, self.num_groups * self.num_levels * num_pts) self.output_proj = nn.Linear(self.embed_dims, self.embed_dims) self.learnable_fc = nn.Linear(self.embed_dims, 
num_pts * 3) self.cam_embed = nn.Sequential( nn.Linear(26, self.embed_dims // 2), nn.ReLU(inplace=True), nn.Linear(self.embed_dims // 2, self.embed_dims), nn.ReLU(inplace=True), nn.LayerNorm(self.embed_dims), ) self.drop = nn.Dropout(dropout) self.im2col_step = im2col_step self.bias = bias def init_weight(self): constant_init(self.weights_fc, val=0.0, bias=0.0) xavier_init(self.output_proj, distribution="uniform", bias=0.0) nn.init.uniform_(self.learnable_fc.bias.data, -self.bias, self.bias) def forward(self, instance_feature, query_pos, feat_flatten, reference_points, spatial_flatten, level_start_index, pc_range, lidar2img_mat, img_metas, cam_params=None, debug_info=None): bs, num_anchor = reference_points.shape[:2] reference_points = get_ego_pos(reference_points, pc_range) key_points = reference_points.unsqueeze(-2) + self.learnable_fc(instance_feature).reshape(bs, num_anchor, -1, 3) weights = self._get_weights(instance_feature, query_pos, lidar2img_mat, cam_params) features = self.feature_sampling(feat_flatten, spatial_flatten, level_start_index, key_points, weights, lidar2img_mat, img_metas, cam_params=cam_params, debug_info=debug_info) output = self.output_proj(features) output = self.drop(output) + instance_feature return output def _get_weights(self, instance_feature, anchor_embed, lidar2img_mat, cam_params=None): bs, num_anchor = instance_feature.shape[:2] # lidar2img = lidar2img_mat[..., :3, :].flatten(-2) rots, trans, intrins, post_rots, post_trans, bda = cam_params mln_input = torch.cat([intrins[..., 0, 0:1], intrins[..., 1,1:2], rots.flatten(-2), trans, post_rots.flatten(-2), post_trans], dim=-1) cam_embed = self.cam_embed(mln_input) # B, N, C feat_pos = (instance_feature + anchor_embed).unsqueeze(2) + cam_embed.unsqueeze(1) weights = self.weights_fc(feat_pos).reshape(bs, num_anchor, -1, self.num_groups).softmax(dim=-2) weights = weights.reshape(bs, num_anchor, self.num_cams, -1, self.num_groups).permute(0, 2, 1, 4, 3).contiguous() return weights.flatten(end_dim=1) @force_fp32(apply_to=('feat_flatten', 'key_points')) def feature_sampling(self, feat_flatten, spatial_flatten, level_start_index, key_points, weights, lidar2img_mat, img_metas, cam_params=None, debug_info=None): bs, num_anchor, _ = key_points.shape[:3] rots, trans, intrins, post_rots, post_trans, bda = cam_params B, N, _ = trans.shape eps = 1e-5 ogfH, ogfW = img_metas[0]['input_size'] reference_points = key_points # reference_points = debug_info['centers3d'][0][:, :3][None, :, None, :].to(rots.device) with autocast(enabled=False): reference_points = reference_points[:, None].repeat(1, N, 1, 1, 1) reference_points = torch.inverse(bda).view(B, 1, 1, 1, 3, 3).matmul(reference_points.unsqueeze(-1)).squeeze(-1) reference_points -= trans.view(B, N, 1, 1, 3) combine = rots.matmul(torch.inverse(intrins)).inverse() points_2d = combine.view(B, N, 1, 1, 3, 3).matmul(reference_points.unsqueeze(-1)).squeeze(-1) points_2d = torch.cat([points_2d[..., 0:2] / torch.maximum( points_2d[..., 2:3], torch.ones_like(points_2d[..., 2:3])*eps), points_2d[..., 2:3]], 4 ) points_2d = post_rots.view(B, N, 1, 1, 3, 3).matmul(points_2d.unsqueeze(-1)).squeeze(-1) points_2d += post_trans.view(B, N, 1, 1, 3) # imgs = debug_info['img'][0] # import cv2 # from IPython import embed # embed() # exit() # for i in range(6): # img2 = imgs[i].permute(1, 2, 0).cpu().numpy().astype(np.float32) # img = np.ones([320, 800, 3], dtype=np.float32) * 255 # img = img.astype(np.uint8) # for corner in points_2d[0][i]: # corner = corner[0] # if (05] = 1000 if 
self.consider_map_quality and map_scores is not None: map_scores = map_scores.sigmoid().max(-1)[0] # smaller, better # map_scores = torch.round(1-map_scores, decimals=1) + self.map_alpha dist[map_scores.unsqueeze(1)<0.2] = 1000 dist = -dist return dist @TRANSFORMER_LAYER_SEQUENCE.register_module() class CustomTransformerDecoder(TransformerLayerSequence): """Implements the decoder in DETR3D transformer. Args: return_intermediate (bool): Whether to return intermediate outputs. coder_norm_cfg (dict): Config of last normalization layer. Default: `LN`. """ def __init__(self, *args, return_intermediate=False, **kwargs): super(CustomTransformerDecoder, self).__init__(*args, **kwargs) self.return_intermediate = return_intermediate self.fp16_enabled = False def forward(self, query, key=None, value=None, query_pos=None, key_pos=None, attn_masks=None, key_padding_mask=None, *args, **kwargs): """Forward function for `Detr3DTransformerDecoder`. Args: query (Tensor): Input query with shape `(num_query, bs, embed_dims)`. Returns: Tensor: Results with shape [1, num_query, bs, embed_dims] when return_intermediate is `False`, otherwise it has shape [num_layers, num_query, bs, embed_dims]. """ intermediate = [] for lid, layer in enumerate(self.layers): query = layer( query=query, key=key, value=value, query_pos=query_pos, key_pos=key_pos, attn_masks=attn_masks, key_padding_mask=key_padding_mask, *args, **kwargs) if self.return_intermediate: intermediate.append(query) if self.return_intermediate: return torch.stack(intermediate) return query ================================================ FILE: mmdet3d/models/fbbev/streampetr/streampetr_utils.py ================================================ import torch def normalize_bbox(bboxes, pc_range): cx = bboxes[..., 0:1] cy = bboxes[..., 1:2] cz = bboxes[..., 2:3] w = bboxes[..., 3:4].log() l = bboxes[..., 4:5].log() h = bboxes[..., 5:6].log() rot = bboxes[..., 6:7] if bboxes.size(-1) > 7: vx = bboxes[..., 7:8] vy = bboxes[..., 8:9] normalized_bboxes = torch.cat( (cx, cy, cz, w, l, h, rot.sin(), rot.cos(), vx, vy), dim=-1 ) else: normalized_bboxes = torch.cat( (cx, cy, cz, w, l, h, rot.sin(), rot.cos()), dim=-1 ) return normalized_bboxes # ------------------------------------------------------------------------ # Copyright (c) 2022 megvii-model. All Rights Reserved. # ------------------------------------------------------------------------ # Modified from mmdetection (https://github.com/open-mmlab/mmdetection) # Copyright (c) OpenMMLab. All rights reserved. 
# ------------------------------------------------------------------------ # Modified by Shihao Wang # ------------------------------------------------------------------------ import math import torch import torch.nn as nn import numpy as np def denormalize_bbox(normalized_bboxes, pc_range): # rotation rot_sine = normalized_bboxes[..., 6:7] rot_cosine = normalized_bboxes[..., 7:8] rot = torch.atan2(rot_sine, rot_cosine) # center in the bev cx = normalized_bboxes[..., 0:1] cy = normalized_bboxes[..., 1:2] cz = normalized_bboxes[..., 2:3] # size w = normalized_bboxes[..., 3:4] l = normalized_bboxes[..., 4:5] h = normalized_bboxes[..., 5:6] w = w.exp() l = l.exp() h = h.exp() if normalized_bboxes.size(-1) > 8: # velocity vx = normalized_bboxes[:, 8:9] vy = normalized_bboxes[:, 9:10] denormalized_bboxes = torch.cat([cx, cy, cz, w, l, h, rot, vx, vy], dim=-1) else: denormalized_bboxes = torch.cat([cx, cy, cz, w, l, h, rot], dim=-1) return denormalized_bboxes def pos2posemb3d(pos, num_pos_feats=128, temperature=10000): scale = 2 * math.pi pos = pos * scale dim_t = torch.arange(num_pos_feats, dtype=torch.float32, device=pos.device) dim_t = temperature ** (2 * torch.div(dim_t, 2, rounding_mode='floor') / num_pos_feats) pos_x = pos[..., 0, None] / dim_t pos_y = pos[..., 1, None] / dim_t pos_z = pos[..., 2, None] / dim_t pos_x = torch.stack((pos_x[..., 0::2].sin(), pos_x[..., 1::2].cos()), dim=-1).flatten(-2) pos_y = torch.stack((pos_y[..., 0::2].sin(), pos_y[..., 1::2].cos()), dim=-1).flatten(-2) pos_z = torch.stack((pos_z[..., 0::2].sin(), pos_z[..., 1::2].cos()), dim=-1).flatten(-2) posemb = torch.cat((pos_y, pos_x, pos_z), dim=-1) return posemb def bevpos2posemb(pos, num_pos_feats=128, temperature=10000): scale = 2 * math.pi pos = pos * scale dim_t = torch.arange(num_pos_feats, dtype=torch.float32, device=pos.device) dim_t = temperature ** (2 * torch.div(dim_t, 2, rounding_mode='floor') / num_pos_feats) pos_x = pos[..., 0, None] / dim_t pos_y = pos[..., 1, None] / dim_t pos_x = torch.stack((pos_x[..., 0::2].sin(), pos_x[..., 1::2].cos()), dim=-1).flatten(-2) pos_y = torch.stack((pos_y[..., 0::2].sin(), pos_y[..., 1::2].cos()), dim=-1).flatten(-2) posemb = torch.cat((pos_y, pos_x), dim=-1) return posemb def pos2posemb1d(pos, num_pos_feats=256, temperature=10000): scale = 2 * math.pi pos = pos * scale dim_t = torch.arange(num_pos_feats, dtype=torch.float32, device=pos.device) dim_t = temperature ** (2 * torch.div(dim_t, 2, rounding_mode='floor') / num_pos_feats) pos_x = pos[..., 0, None] / dim_t pos_x = torch.stack((pos_x[..., 0::2].sin(), pos_x[..., 1::2].cos()), dim=-1).flatten(-2) return pos_x def nerf_positional_encoding( tensor, num_encoding_functions=6, include_input=False, log_sampling=True ) -> torch.Tensor: r"""Apply positional encoding to the input. Args: tensor (torch.Tensor): Input tensor to be positionally encoded. encoding_size (optional, int): Number of encoding functions used to compute a positional encoding (default: 6). include_input (optional, bool): Whether or not to include the input in the positional encoding (default: True). Returns: (torch.Tensor): Positional encoding of the input tensor. """ # TESTED # Trivially, the input tensor is added to the positional encoding. 
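    # For example, with num_encoding_functions=6 and include_input=False an
    # input of size (..., D) is mapped to (..., D * 12): one sin and one cos
    # term for each of the 6 frequency bands, the bands being 2**0 ... 2**5
    # when log_sampling=True.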
encoding = [tensor] if include_input else [] frequency_bands = None if log_sampling: frequency_bands = 2.0 ** torch.linspace( 0.0, num_encoding_functions - 1, num_encoding_functions, dtype=tensor.dtype, device=tensor.device, ) else: frequency_bands = torch.linspace( 2.0 ** 0.0, 2.0 ** (num_encoding_functions - 1), num_encoding_functions, dtype=tensor.dtype, device=tensor.device, ) for freq in frequency_bands: for func in [torch.sin, torch.cos]: encoding.append(func(tensor * freq)) # Special case, for no positional encoding if len(encoding) == 1: return encoding[0] else: return torch.cat(encoding, dim=-1) import torch import torch.nn as nn import numpy as np from mmdet.core import bbox_xyxy_to_cxcywh from mmdet.models.utils.transformer import inverse_sigmoid def memory_refresh(memory, prev_exist): memory_shape = memory.shape view_shape = [1 for _ in range(len(memory_shape))] prev_exist = prev_exist.view(-1, *view_shape[1:]) return memory * prev_exist def topk_gather(feat, topk_indexes): if topk_indexes is not None: feat_shape = feat.shape topk_shape = topk_indexes.shape view_shape = [1 for _ in range(len(feat_shape))] view_shape[:2] = topk_shape[:2] topk_indexes = topk_indexes.view(*view_shape) feat = torch.gather(feat, 1, topk_indexes.repeat(1, 1, *feat_shape[2:])) return feat def apply_ltrb(locations, pred_ltrb): """ :param locations: (1, H, W, 2) :param pred_ltrb: (N, H, W, 4) """ pred_boxes = torch.zeros_like(pred_ltrb) pred_boxes[..., 0] = (locations[..., 0] - pred_ltrb[..., 0])# x1 pred_boxes[..., 1] = (locations[..., 1] - pred_ltrb[..., 1])# y1 pred_boxes[..., 2] = (locations[..., 0] + pred_ltrb[..., 2])# x2 pred_boxes[..., 3] = (locations[..., 1] + pred_ltrb[..., 3])# y2 min_xy = pred_boxes[..., 0].new_tensor(0) max_xy = pred_boxes[..., 0].new_tensor(1) pred_boxes = torch.where(pred_boxes < min_xy, min_xy, pred_boxes) pred_boxes = torch.where(pred_boxes > max_xy, max_xy, pred_boxes) pred_boxes = bbox_xyxy_to_cxcywh(pred_boxes) return pred_boxes def apply_center_offset(locations, center_offset): """ :param locations: (1, H, W, 2) :param pred_ltrb: (N, H, W, 4) """ centers_2d = torch.zeros_like(center_offset) locations = inverse_sigmoid(locations) centers_2d[..., 0] = locations[..., 0] + center_offset[..., 0] # x1 centers_2d[..., 1] = locations[..., 1] + center_offset[..., 1] # y1 centers_2d = centers_2d.sigmoid() return centers_2d @torch.no_grad() def locations(features, stride, pad_h, pad_w): """ Arguments: features: (N, C, H, W) Return: locations: (H, W, 2) """ h, w = features.size()[-2:] device = features.device shifts_x = (torch.arange( 0, stride*w, step=stride, dtype=torch.float32, device=device ) + stride // 2 ) / pad_w shifts_y = (torch.arange( 0, h * stride, step=stride, dtype=torch.float32, device=device ) + stride // 2) / pad_h shift_y, shift_x = torch.meshgrid(shifts_y, shifts_x) shift_x = shift_x.reshape(-1) shift_y = shift_y.reshape(-1) locations = torch.stack((shift_x, shift_y), dim=1) locations = locations.reshape(h, w, 2) return locations def gaussian_2d(shape, sigma=1.0): """Generate gaussian map. Args: shape (list[int]): Shape of the map. sigma (float, optional): Sigma to generate gaussian map. Defaults to 1. Returns: np.ndarray: Generated gaussian map. """ m, n = [(ss - 1.) / 2. for ss in shape] y, x = np.ogrid[-m:m + 1, -n:n + 1] h = np.exp(-(x * x + y * y) / (2 * sigma * sigma)) h[h < np.finfo(h.dtype).eps * h.max()] = 0 return h def draw_heatmap_gaussian(heatmap, center, radius, k=1): """Get gaussian masked heatmap. 
Args: heatmap (torch.Tensor): Heatmap to be masked. center (torch.Tensor): Center coord of the heatmap. radius (int): Radius of gaussian. K (int, optional): Multiple of masked_gaussian. Defaults to 1. Returns: torch.Tensor: Masked heatmap. """ diameter = 2 * radius + 1 gaussian = gaussian_2d((diameter, diameter), sigma=diameter / 6) x, y = int(center[0]), int(center[1]) height, width = heatmap.shape[0:2] left, right = min(x, radius), min(width - x, radius + 1) top, bottom = min(y, radius), min(height - y, radius + 1) masked_heatmap = heatmap[y - top:y + bottom, x - left:x + right] masked_gaussian = torch.from_numpy( gaussian[radius - top:radius + bottom, radius - left:radius + right]).to(heatmap.device, torch.float32) if min(masked_gaussian.shape) > 0 and min(masked_heatmap.shape) > 0: torch.max(masked_heatmap, masked_gaussian * k, out=masked_heatmap) return heatmap class SELayer_Linear(nn.Module): def __init__(self, channels, act_layer=nn.ReLU, gate_layer=nn.Sigmoid): super().__init__() self.conv_reduce = nn.Linear(channels, channels) self.act1 = act_layer() self.conv_expand = nn.Linear(channels, channels) self.gate = gate_layer() def forward(self, x, x_se): x_se = self.conv_reduce(x_se) x_se = self.act1(x_se) x_se = self.conv_expand(x_se) return x * self.gate(x_se) class MLN(nn.Module): ''' Args: c_dim (int): dimension of latent code c f_dim (int): feature dimension ''' def __init__(self, c_dim, f_dim=256): super().__init__() self.c_dim = c_dim self.f_dim = f_dim self.reduce = nn.Sequential( nn.Linear(c_dim, f_dim), nn.ReLU(), ) self.gamma = nn.Linear(f_dim, f_dim) self.beta = nn.Linear(f_dim, f_dim) self.ln = nn.LayerNorm(f_dim, elementwise_affine=False) self.reset_parameters() def reset_parameters(self): nn.init.zeros_(self.gamma.weight) nn.init.zeros_(self.beta.weight) nn.init.ones_(self.gamma.bias) nn.init.zeros_(self.beta.bias) def forward(self, x, c): x = self.ln(x) c = self.reduce(c) gamma = self.gamma(c) beta = self.beta(c) out = gamma * x + beta return out def transform_reference_points(reference_points, egopose, reverse=False, translation=True): reference_points = torch.cat([reference_points, torch.ones_like(reference_points[..., 0:1])], dim=-1) if reverse: matrix = egopose.inverse() else: matrix = egopose if not translation: matrix[..., :3, 3] = 0.0 if reference_points.dim()==4: B, N, K, C = reference_points.shape reference_points = reference_points.view(B, N*K, C) reference_points = (matrix.unsqueeze(1) @ reference_points.unsqueeze(-1)).squeeze(-1)[..., :3] return reference_points.view(B, N, K, 3) else: reference_points = (matrix.unsqueeze(1) @ reference_points.unsqueeze(-1)).squeeze(-1)[..., :3] return reference_points ================================================ FILE: mmdet3d/models/fbbev/streampetr/streampetr_v2.py ================================================ # Copyright (c) 2023-2024, NVIDIA Corporation & Affiliates. All rights reserved. # # This work is made available under the Nvidia Source Code License-NC. 
# To view a copy of this license, visit # TODO: add license here import torch import torch.nn as nn from mmcv.cnn import Linear, bias_init_with_prob, Scale from mmcv.runner import force_fp32 from mmdet.core import (build_assigner, build_sampler, multi_apply, reduce_mean) from mmdet.models.utils import build_transformer from mmdet.models import HEADS, build_loss from mmdet.models.dense_heads.anchor_free_head import AnchorFreeHead from mmdet.models.utils.transformer import inverse_sigmoid from mmdet3d.core.bbox.coders import build_bbox_coder from .streampetr_utils import * import copy from mmdet.models.utils import NormedLinear from mmdet3d.core import bbox3d2result, merge_aug_bboxes_3d from mmdet3d.models.fbbev.utils import save_tensor @HEADS.register_module() class SparseHead4BEV(AnchorFreeHead): """Implements the DETR transformer head. See `paper: End-to-End Object Detection with Transformers `_ for details. Args: num_classes (int): Number of categories excluding the background. in_channels (int): Number of channels in the input feature map. num_query (int): Number of query in Transformer. num_reg_fcs (int, optional): Number of fully-connected layers used in `FFN`, which is then used for the regression head. Default 2. transformer (obj:`mmcv.ConfigDict`|dict): Config for transformer. Default: None. sync_cls_avg_factor (bool): Whether to sync the avg_factor of all ranks. Default to False. positional_encoding (obj:`mmcv.ConfigDict`|dict): Config for position encoding. loss_cls (obj:`mmcv.ConfigDict`|dict): Config of the classification loss. Default `CrossEntropyLoss`. loss_bbox (obj:`mmcv.ConfigDict`|dict): Config of the regression loss. Default `L1Loss`. loss_iou (obj:`mmcv.ConfigDict`|dict): Config of the regression iou loss. Default `GIoULoss`. tran_cfg (obj:`mmcv.ConfigDict`|dict): Training config of transformer head. test_cfg (obj:`mmcv.ConfigDict`|dict): Testing config of transformer head. init_cfg (dict or list[dict], optional): Initialization config dict. Default: None """ _version = 2 def __init__(self, num_classes, in_channels=256, stride=[16], embed_dims=256, num_query=100, num_reg_fcs=2, memory_len=1024, topk_proposals=256, num_propagated=256, with_dn=True, with_ego_pos=True, match_with_velo=True, match_costs=None, transformer=None, sync_cls_avg_factor=False, code_weights=None, bbox_coder=None, loss_cls=dict( type='CrossEntropyLoss', bg_cls_weight=0.1, use_sigmoid=False, loss_weight=1.0, class_weight=1.0), loss_bbox=dict(type='L1Loss', loss_weight=5.0), loss_iou=dict(type='GIoULoss', loss_weight=2.0), train_cfg=dict( assigner=dict( type='HungarianAssigner3D', cls_cost=dict(type='ClassificationCost', weight=1.), reg_cost=dict(type='BBoxL1Cost', weight=5.0), iou_cost=dict( type='IoUCost', iou_mode='giou', weight=2.0)),), test_cfg=dict(max_per_img=100), scalar = 5, noise_scale = 0.4, noise_trans = 0.0, dn_weight = 1.0, split = 0.5, init_cfg=None, normedlinear=False, different_heads=True, **kwargs): # NOTE here use `AnchorFreeHead` instead of `TransformerHead`, # since it brings inconvenience when the initialization of # `AnchorFreeHead` is called. 
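# Illustrative config sketch (hypothetical values): SparseHead4BEV is registered in
# mmdet's HEADS registry, so a config describes it as a dict whose keys mirror the
# __init__ arguments above. 'transformer' and 'bbox_coder' are left as None
# placeholders here; their real settings live in the configs under configs/bev_next/.
_sparse_head_cfg_sketch = dict(
    type='SparseHead4BEV',
    num_classes=10,          # example value (e.g. the nuScenes detection classes)
    in_channels=256,
    embed_dims=256,
    num_query=100,
    memory_len=1024,
    topk_proposals=256,
    num_propagated=256,
    with_dn=True,
    with_ego_pos=True,
    transformer=None,        # placeholder: decoder config; its decoder 'num_layers' sets self.num_pred
    bbox_coder=None,         # placeholder: built via build_bbox_coder(); supplies pc_range and decode()
)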
self.different_heads = different_heads if 'code_size' in kwargs: self.code_size = kwargs['code_size'] else: self.code_size = 10 if code_weights is not None: self.code_weights = code_weights else: self.code_weights = [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2] self.code_weights = self.code_weights[:self.code_size] if match_costs is not None: self.match_costs = match_costs else: self.match_costs = self.code_weights self.bg_cls_weight = 0 self.sync_cls_avg_factor = sync_cls_avg_factor class_weight = loss_cls.get('class_weight', None) if class_weight is not None and (self.__class__ is SparseHead): assert isinstance(class_weight, float), 'Expected ' \ 'class_weight to have type float. Found ' \ f'{type(class_weight)}.' # NOTE following the official DETR rep0, bg_cls_weight means # relative classification weight of the no-object class. bg_cls_weight = loss_cls.get('bg_cls_weight', class_weight) assert isinstance(bg_cls_weight, float), 'Expected ' \ 'bg_cls_weight to have type float. Found ' \ f'{type(bg_cls_weight)}.' class_weight = torch.ones(num_classes + 1) * class_weight # set background class as the last indice class_weight[num_classes] = bg_cls_weight loss_cls.update({'class_weight': class_weight}) if 'bg_cls_weight' in loss_cls: loss_cls.pop('bg_cls_weight') self.bg_cls_weight = bg_cls_weight if train_cfg: assert 'assigner' in train_cfg, 'assigner should be provided '\ 'when train_cfg is set.' assigner = train_cfg['assigner'] self.assigner = build_assigner(assigner) # DETR sampling=False, so use PseudoSampler sampler_cfg = dict(type='PseudoSampler') self.sampler = build_sampler(sampler_cfg, context=self) self.num_query = num_query self.num_classes = num_classes self.in_channels = in_channels self.memory_len = memory_len self.topk_proposals = topk_proposals self.num_propagated = num_propagated self.with_dn = with_dn self.with_ego_pos = with_ego_pos self.match_with_velo = match_with_velo self.num_reg_fcs = num_reg_fcs self.train_cfg = train_cfg self.test_cfg = test_cfg self.fp16_enabled = False self.embed_dims = embed_dims self.with_dn = with_dn self.stride=stride self.scalar = scalar self.bbox_noise_scale = noise_scale self.bbox_noise_trans = noise_trans self.dn_weight = dn_weight self.split = split self.act_cfg = transformer.get('act_cfg', dict(type='ReLU', inplace=True)) self.num_pred = transformer['decoder']['num_layers'] self.normedlinear = normedlinear super(SparseHead4BEV, self).__init__(num_classes, in_channels, init_cfg = init_cfg) self.loss_cls = build_loss(loss_cls) self.loss_bbox = build_loss(loss_bbox) self.loss_iou = build_loss(loss_iou) if self.loss_cls.use_sigmoid: self.cls_out_channels = num_classes else: self.cls_out_channels = num_classes + 1 self.transformer = build_transformer(transformer) self.code_weights = nn.Parameter(torch.tensor( self.code_weights), requires_grad=False) self.match_costs = nn.Parameter(torch.tensor( self.match_costs), requires_grad=False) self.bbox_coder = build_bbox_coder(bbox_coder) self.pc_range = nn.Parameter(torch.tensor( self.bbox_coder.pc_range), requires_grad=False) self._init_layers() self.reset_memory() self.count = 0 def _init_layers(self): """Initialize layers of the transformer head.""" cls_branch = [] for _ in range(self.num_reg_fcs): cls_branch.append(Linear(self.embed_dims, self.embed_dims)) cls_branch.append(nn.LayerNorm(self.embed_dims)) cls_branch.append(nn.ReLU(inplace=True)) if self.normedlinear: cls_branch.append(NormedLinear(self.embed_dims, self.cls_out_channels)) else: cls_branch.append(Linear(self.embed_dims, 
self.cls_out_channels)) fc_cls = nn.Sequential(*cls_branch) reg_branch = [] for _ in range(self.num_reg_fcs): reg_branch.append(Linear(self.embed_dims, self.embed_dims)) reg_branch.append(nn.ReLU()) reg_branch.append(Linear(self.embed_dims, self.code_size)) reg_branch = nn.Sequential(*reg_branch) def _get_clones(module, N): return nn.ModuleList([copy.deepcopy(module) for i in range(N)]) if self.different_heads: self.cls_branches =_get_clones(fc_cls, self.num_pred) self.reg_branches = _get_clones(reg_branch, self.num_pred) else: self.cls_branches = nn.ModuleList( [fc_cls for _ in range(self.num_pred)]) self.reg_branches = nn.ModuleList( [reg_branch for _ in range(self.num_pred)]) self.reference_points = nn.Embedding(self.num_query, 3) if self.num_propagated > 0: self.pseudo_reference_points = nn.Embedding(self.num_propagated, 3) self.query_embedding = nn.Sequential( nn.Linear(self.embed_dims*3//2, self.embed_dims), nn.ReLU(), nn.Linear(self.embed_dims, self.embed_dims), ) # self.spatial_alignment = MLN(14, use_ln=False) self.time_embedding = nn.Sequential( nn.Linear(self.embed_dims, self.embed_dims), nn.LayerNorm(self.embed_dims) ) # encoding ego pose if self.with_ego_pos: self.ego_pose_pe = MLN(180) self.ego_pose_memory = MLN(180) def temporal_alignment(self, query_pos, tgt, reference_points): B = query_pos.size(0) temp_reference_points = (self.memory_reference_point - self.pc_range[:3]) / (self.pc_range[3:6] - self.pc_range[0:3]) temp_pos = self.query_embedding(pos2posemb3d(temp_reference_points)) temp_memory = self.memory_embedding rec_ego_pose = torch.eye(4, device=query_pos.device).unsqueeze(0).unsqueeze(0).repeat(B, query_pos.size(1), 1, 1) if self.with_ego_pos: rec_ego_motion = torch.cat([torch.zeros_like(reference_points[...,:3]), rec_ego_pose[..., :3, :].flatten(-2)], dim=-1) rec_ego_motion = nerf_positional_encoding(rec_ego_motion) tgt = self.ego_pose_memory(tgt, rec_ego_motion) query_pos = self.ego_pose_pe(query_pos, rec_ego_motion) memory_ego_motion = torch.cat([self.memory_velo, self.memory_timestamp, self.memory_egopose[..., :3, :].flatten(-2)], dim=-1).float() memory_ego_motion = nerf_positional_encoding(memory_ego_motion) temp_pos = self.ego_pose_pe(temp_pos, memory_ego_motion) temp_memory = self.ego_pose_memory(temp_memory, memory_ego_motion) query_pos += self.time_embedding(pos2posemb1d(torch.zeros_like(reference_points[...,:1]))) temp_pos += self.time_embedding(pos2posemb1d(self.memory_timestamp).float()) if self.num_propagated > 0: tgt = torch.cat([tgt, temp_memory[:, :self.num_propagated]], dim=1) query_pos = torch.cat([query_pos, temp_pos[:, :self.num_propagated]], dim=1) reference_points = torch.cat([reference_points, temp_reference_points[:, :self.num_propagated]], dim=1) rec_ego_pose = torch.eye(4, device=query_pos.device).unsqueeze(0).unsqueeze(0).repeat(B, query_pos.shape[1]+self.num_propagated, 1, 1) temp_memory = temp_memory[:, self.num_propagated:] temp_pos = temp_pos[:, self.num_propagated:] temp_reference_points = temp_reference_points[:, self.num_propagated:] return tgt, query_pos, reference_points, temp_reference_points, temp_memory, temp_pos, rec_ego_pose def prepare_for_dn(self, batch_size, reference_points, img_metas, gt_bboxes_3d, gt_labels_3d ): if self.training and self.with_dn: targets = [torch.cat((each.gravity_center, each.tensor[:, 3:]),dim=1) for each in gt_bboxes_3d ] labels = [each for each in gt_labels_3d ] known = [(torch.ones_like(t)).cuda() for t in labels] know_idx = known unmask_bbox = unmask_label = torch.cat(known) #gt_num known_num = 
[t.size(0) for t in targets] labels = torch.cat([t for t in labels]) boxes = torch.cat([t for t in targets]) batch_idx = torch.cat([torch.full((t.size(0), ), i) for i, t in enumerate(targets)]) known_indice = torch.nonzero(unmask_label + unmask_bbox) known_indice = known_indice.view(-1) # add noise known_indice = known_indice.repeat(self.scalar, 1).view(-1) known_labels = labels.repeat(self.scalar, 1).view(-1).long().to(reference_points.device) known_bid = batch_idx.repeat(self.scalar, 1).view(-1) known_bboxs = boxes.repeat(self.scalar, 1).to(reference_points.device) known_bbox_center = known_bboxs[:, :3].clone() known_bbox_scale = known_bboxs[:, 3:6].clone() if self.bbox_noise_scale > 0: diff = known_bbox_scale / 2 + self.bbox_noise_trans rand_prob = torch.rand_like(known_bbox_center) * 2 - 1.0 known_bbox_center += torch.mul(rand_prob, diff) * self.bbox_noise_scale known_bbox_center[..., 0:3] = (known_bbox_center[..., 0:3] - self.pc_range[0:3]) / (self.pc_range[3:6] - self.pc_range[0:3]) known_bbox_center = known_bbox_center.clamp(min=0.0, max=1.0) mask = torch.norm(rand_prob, 2, 1) > self.split known_labels[mask] = self.num_classes single_pad = int(max(known_num)) pad_size = int(single_pad * self.scalar) padding_bbox = torch.zeros(pad_size, 3).to(reference_points.device) padded_reference_points = torch.cat([padding_bbox, reference_points], dim=0).unsqueeze(0).repeat(batch_size, 1, 1) if len(known_num): map_known_indice = torch.cat([torch.tensor(range(num)) for num in known_num]) # [1,2, 1,2,3] map_known_indice = torch.cat([map_known_indice + single_pad * i for i in range(self.scalar)]).long() if len(known_bid): padded_reference_points[(known_bid.long(), map_known_indice)] = known_bbox_center.to(reference_points.device) tgt_size = pad_size + self.num_query attn_mask = torch.ones(tgt_size, tgt_size).to(reference_points.device) < 0 # match query cannot see the reconstruct attn_mask[pad_size:, :pad_size] = True # reconstruct cannot see each other for i in range(self.scalar): if i == 0: attn_mask[single_pad * i:single_pad * (i + 1), single_pad * (i + 1):pad_size] = True if i == self.scalar - 1: attn_mask[single_pad * i:single_pad * (i + 1), :single_pad * i] = True else: attn_mask[single_pad * i:single_pad * (i + 1), single_pad * (i + 1):pad_size] = True attn_mask[single_pad * i:single_pad * (i + 1), :single_pad * i] = True # update dn mask for temporal modeling query_size = pad_size + self.num_query + self.num_propagated tgt_size = pad_size + self.num_query + self.memory_len temporal_attn_mask = torch.ones(query_size, tgt_size).to(reference_points.device) < 0 temporal_attn_mask[:attn_mask.size(0), :attn_mask.size(1)] = attn_mask temporal_attn_mask[pad_size:, :pad_size] = True attn_mask = temporal_attn_mask mask_dict = { 'known_indice': torch.as_tensor(known_indice).long(), 'batch_idx': torch.as_tensor(batch_idx).long(), 'map_known_indice': torch.as_tensor(map_known_indice).long(), 'known_lbs_bboxes': (known_labels, known_bboxs), 'know_idx': know_idx, 'pad_size': pad_size } else: padded_reference_points = reference_points.unsqueeze(0).repeat(batch_size, 1, 1) attn_mask = None mask_dict = None return padded_reference_points, attn_mask, mask_dict def init_weights(self): """Initialize weights of the transformer head.""" # The initialization for transformer is important nn.init.uniform_(self.reference_points.weight.data, 0, 1) if self.num_propagated > 0: nn.init.uniform_(self.pseudo_reference_points.weight.data, 0, 1) self.pseudo_reference_points.weight.requires_grad = False 
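# Illustrative sketch of the denoising attention mask assembled in prepare_for_dn()
# above, with the temporal extension omitted. The sizes (2 noise groups of 3 padded
# GT slots, 4 matching queries) are made up; True means "attention blocked", matching
# the convention used above.
def _dn_attn_mask_demo(scalar=2, single_pad=3, num_query=4):
    pad_size = scalar * single_pad
    tgt_size = pad_size + num_query
    attn_mask = torch.zeros(tgt_size, tgt_size, dtype=torch.bool)
    # matching queries must never attend to the denoising (reconstruction) queries
    attn_mask[pad_size:, :pad_size] = True
    # each denoising group is blinded to every other group; the empty slices at the
    # first and last group make this equivalent to the branching in the method above
    for i in range(scalar):
        attn_mask[single_pad * i:single_pad * (i + 1), single_pad * (i + 1):pad_size] = True
        attn_mask[single_pad * i:single_pad * (i + 1), :single_pad * i] = True
    return attn_mask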
self.transformer.init_weights() if self.loss_cls.use_sigmoid: bias_init = bias_init_with_prob(0.01) for m in self.cls_branches: nn.init.constant_(m[-1].bias, bias_init) def reset_memory(self): self.memory_embedding = None self.memory_reference_point = None self.memory_timestamp = None self.memory_egopose = None self.memory_velo = None def pre_update_memory(self, data): x = 1-data['start_of_sequence'] # original prev_exist, so we need do `not` B = x.size(0) # refresh the memory when the scene changes if self.memory_embedding is None: self.memory_embedding = x.new_zeros(B, self.memory_len, self.embed_dims) self.memory_reference_point = x.new_zeros(B, self.memory_len, 3) self.memory_timestamp = x.new_zeros(B, self.memory_len, 1) self.memory_egopose = x.new_zeros(B, self.memory_len, 4, 4) self.memory_velo = x.new_zeros(B, self.memory_len, 2) else: self.memory_timestamp += data['timestamp'].unsqueeze(-1).unsqueeze(-1) self.memory_egopose = data['ego_pose_inv'].unsqueeze(1) @ self.memory_egopose self.memory_reference_point = transform_reference_points(self.memory_reference_point, data['ego_pose_inv'], reverse=False) self.memory_timestamp = memory_refresh(self.memory_timestamp[:, :self.memory_len], x) self.memory_reference_point = memory_refresh(self.memory_reference_point[:, :self.memory_len], x) self.memory_embedding = memory_refresh(self.memory_embedding[:, :self.memory_len], x) self.memory_egopose = memory_refresh(self.memory_egopose[:, :self.memory_len], x) self.memory_velo = memory_refresh(self.memory_velo[:, :self.memory_len], x) # for the first frame, padding pseudo_reference_points (non-learnable) if self.num_propagated > 0: pseudo_reference_points = self.pseudo_reference_points.weight * (self.pc_range[3:6] - self.pc_range[0:3]) + self.pc_range[0:3] self.memory_reference_point[:, :self.num_propagated] = self.memory_reference_point[:, :self.num_propagated] + (1 - x).view(B, 1, 1) * pseudo_reference_points self.memory_egopose[:, :self.num_propagated] = self.memory_egopose[:, :self.num_propagated] + (1 - x).view(B, 1, 1, 1) * torch.eye(4, device=x.device) def post_update_memory(self, data, rec_ego_pose, all_cls_scores, all_bbox_preds, outs_dec, mask_dict): if self.training and mask_dict and mask_dict['pad_size'] > 0: rec_reference_points = all_bbox_preds[:, :, mask_dict['pad_size']:, :3][-1] rec_velo = all_bbox_preds[:, :, mask_dict['pad_size']:, -2:][-1] rec_memory = outs_dec[:, :, mask_dict['pad_size']:, :][-1] rec_score = all_cls_scores[:, :, mask_dict['pad_size']:, :][-1].sigmoid().topk(1, dim=-1).values[..., 0:1] rec_timestamp = torch.zeros_like(rec_score, dtype=torch.float64) else: rec_reference_points = all_bbox_preds[..., :3][-1] rec_velo = all_bbox_preds[..., -2:][-1] rec_memory = outs_dec[-1] rec_score = all_cls_scores[-1].sigmoid().topk(1, dim=-1).values[..., 0:1] rec_timestamp = torch.zeros_like(rec_score, dtype=torch.float64) # topk proposals _, topk_indexes = torch.topk(rec_score, self.topk_proposals, dim=1) rec_timestamp = topk_gather(rec_timestamp, topk_indexes) rec_reference_points = topk_gather(rec_reference_points, topk_indexes).detach() rec_memory = topk_gather(rec_memory, topk_indexes).detach() rec_ego_pose = topk_gather(rec_ego_pose, topk_indexes) rec_velo = topk_gather(rec_velo, topk_indexes).detach() # if self.count == 1: # from IPython import embed # embed() # exit() self.memory_embedding = torch.cat([rec_memory, self.memory_embedding], dim=1) self.memory_timestamp = torch.cat([rec_timestamp, self.memory_timestamp], dim=1) self.memory_egopose= 
torch.cat([rec_ego_pose, self.memory_egopose], dim=1) self.memory_reference_point = torch.cat([rec_reference_points, self.memory_reference_point], dim=1) self.memory_velo = torch.cat([rec_velo, self.memory_velo], dim=1) # self.memory_reference_point_copy = self.memory_reference_point.clone() self.memory_reference_point = transform_reference_points(self.memory_reference_point, data['ego_pose'], reverse=False) self.memory_timestamp -= data['timestamp'].unsqueeze(-1).unsqueeze(-1) self.memory_egopose = data['ego_pose'].unsqueeze(1) @ self.memory_egopose def forward(self, input_dict, img_metas, gt_bboxes_3d=None, gt_labels_3d=None, debug_info=None): """Forward function. Args: mlvl_feats (tuple[Tensor]): Features from the upstream network, each is a 5D-tensor with shape (B, N, C, H, W). Returns: all_cls_scores (Tensor): Outputs from the classification head, \ shape [nb_dec, bs, num_query, cls_out_channels]. Note \ cls_out_channels should includes background. all_bbox_preds (Tensor): Sigmoid outputs from the regression \ head with normalized coordinate format (cx, cy, w, l, cz, h, theta, vx, vy). \ Shape [nb_dec, bs, num_query, 9]. """ start_of_sequence = torch.FloatTensor([ single_img_metas['start_of_sequence'] for single_img_metas in img_metas]).to(input_dict['img_bev_feat'][0].device) timestamp = torch.FloatTensor([ single_img_metas['timestamp'] for single_img_metas in img_metas]).to(input_dict['img_bev_feat'][0].device) ego_pose_inv = torch.stack([ single_img_metas['ego_pose_inv'] for single_img_metas in img_metas], 0).to(input_dict['img_bev_feat'][0].device) ego_pose = torch.stack([ single_img_metas['ego_pose'] for single_img_metas in img_metas], 0).to(input_dict['img_bev_feat'][0].device) data = dict( start_of_sequence = start_of_sequence, timestamp = timestamp, ego_pose_inv = ego_pose_inv, ego_pose = ego_pose, ) if input_dict['img_bev_feat'][0].dim() == 5: mlvl_feats = [level.mean(-1) for level in input_dict['img_bev_feat']] else: mlvl_feats = input_dict['img_bev_feat'] self.pre_update_memory(data) # mlvl_feats = data['img_feats'] B = mlvl_feats[0].size(0) # points_to_draw = (self.memory_reference_point - self.pc_range[0:3])/(self.pc_range[3:6] - self.pc_range[0:3]) * 128 # points_to_draw = points_to_draw[0, :, :2] # # print(points_to_draw.shape) # save_tensor(mlvl_feats[0].abs().std(1), f'bev_{self.count}.png') # import cv2 # img = cv2.imread(f'bev_{self.count}.png') # for i in range(10): # img = cv2.circle(img, center=points_to_draw[256*((self.count)%4)+i].cpu().numpy().astype(np.int), thickness=1, radius=1, color=(255,0,0)) # cv2.imwrite(f'a_{self.count}.png', img) # self.count +=1 # if self.count == 10: # from IPython import embed # embed() # exit() reference_points = self.reference_points.weight dtype = reference_points.dtype feat_flatten = [] spatial_flatten = [] for i in range(len(mlvl_feats)): B, C, H, W = mlvl_feats[i].shape mlvl_feat = mlvl_feats[i].reshape(B, C, -1).transpose(1, 2) # mlvl_feat = self.spatial_alignment(mlvl_feat, mln_input) feat_flatten.append(mlvl_feat.to(dtype)) spatial_flatten.append((H, W)) feat_flatten = torch.cat(feat_flatten, dim=1) spatial_flatten = torch.as_tensor(spatial_flatten, dtype=torch.long, device=mlvl_feats[0].device) level_start_index = torch.cat((spatial_flatten.new_zeros((1, )), spatial_flatten.prod(1).cumsum(0)[:-1])) reference_points, attn_mask, mask_dict = self.prepare_for_dn(B, reference_points, img_metas, gt_bboxes_3d, gt_labels_3d) query_pos = self.query_embedding(pos2posemb3d(reference_points)) tgt = torch.zeros_like(query_pos) # 
prepare for the tgt and query_pos using mln. tgt, query_pos, reference_points, temp_reference_points, temp_memory, temp_pos, rec_ego_pose = self.temporal_alignment(query_pos, tgt, reference_points) init_reference_points = reference_points.clone() outs_dec, intermediate_reference_points = self.transformer(tgt, query_pos, feat_flatten, spatial_flatten, level_start_index, temp_memory, temp_pos, attn_mask, reference_points, self.pc_range, data, img_metas, reg_branches=self.reg_branches, return_intermediate_pts=True, query_embedding=self.query_embedding, temp_reference_points=temp_reference_points) outs_dec = torch.nan_to_num(outs_dec) outputs_classes = [] outputs_coords = [] for lvl in range(outs_dec.shape[0]): outputs_class = self.cls_branches[lvl](outs_dec[lvl]) tmp = self.reg_branches[lvl](outs_dec[lvl]) if self.different_heads: reference = inverse_sigmoid(intermediate_reference_points[lvl]) else: reference = inverse_sigmoid(init_reference_points) assert reference.shape[-1] == 3 tmp[..., 0:3] += reference[..., 0:3] tmp[..., 0:3] = tmp[..., 0:3].sigmoid() outputs_coord = tmp outputs_classes.append(outputs_class) outputs_coords.append(outputs_coord) all_cls_scores = torch.stack(outputs_classes) all_bbox_preds = torch.stack(outputs_coords) all_bbox_preds[..., 0:3] = (all_bbox_preds[..., 0:3] * (self.pc_range[3:6] - self.pc_range[0:3]) + self.pc_range[0:3]) # update the memory bank self.post_update_memory(data, rec_ego_pose, all_cls_scores, all_bbox_preds, outs_dec, mask_dict) if mask_dict and mask_dict['pad_size'] > 0: output_known_class = all_cls_scores[:, :, :mask_dict['pad_size'], :] output_known_coord = all_bbox_preds[:, :, :mask_dict['pad_size'], :] outputs_class = all_cls_scores[:, :, mask_dict['pad_size']:, :] outputs_coord = all_bbox_preds[:, :, mask_dict['pad_size']:, :] mask_dict['output_known_lbs_bboxes']=(output_known_class, output_known_coord) outs = { 'agent_queries': outs_dec[-1, :, mask_dict['pad_size']:, :], 'all_cls_scores': outputs_class, 'all_bbox_preds': outputs_coord, 'dn_mask_dict':mask_dict, } else: outs = { 'agent_queries': outs_dec[-1], 'all_cls_scores': all_cls_scores, 'all_bbox_preds': all_bbox_preds, 'dn_mask_dict':None, } return outs def prepare_for_loss(self, mask_dict): """ prepare dn components to calculate loss Args: mask_dict: a dict that contains dn information """ output_known_class, output_known_coord = mask_dict['output_known_lbs_bboxes'] known_labels, known_bboxs = mask_dict['known_lbs_bboxes'] map_known_indice = mask_dict['map_known_indice'].long() known_indice = mask_dict['known_indice'].long().cpu() batch_idx = mask_dict['batch_idx'].long() bid = batch_idx[known_indice] if len(output_known_class) > 0: output_known_class = output_known_class.permute(1, 2, 0, 3)[(bid, map_known_indice)].permute(1, 0, 2) output_known_coord = output_known_coord.permute(1, 2, 0, 3)[(bid, map_known_indice)].permute(1, 0, 2) num_tgt = known_indice.numel() return known_labels, known_bboxs, output_known_class, output_known_coord, num_tgt def _get_target_single(self, cls_score, bbox_pred, gt_labels, gt_bboxes, gt_bboxes_ignore=None): """"Compute regression and classification targets for one image. Outputs from a single decoder layer of a single feature level are used. Args: cls_score (Tensor): Box score logits from a single decoder layer for one image. Shape [num_query, cls_out_channels]. bbox_pred (Tensor): Sigmoid outputs from a single decoder layer for one image, with normalized coordinate (cx, cy, w, h) and shape [num_query, 4]. 
gt_bboxes (Tensor): Ground truth bboxes for one image with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. gt_labels (Tensor): Ground truth class indexes for one image with shape (num_gts, ). gt_bboxes_ignore (Tensor, optional): Bounding boxes which can be ignored. Default None. Returns: tuple[Tensor]: a tuple containing the following for one image. - labels (Tensor): Labels of each image. - label_weights (Tensor]): Label weights of each image. - bbox_targets (Tensor): BBox targets of each image. - bbox_weights (Tensor): BBox weights of each image. - pos_inds (Tensor): Sampled positive indexes for each image. - neg_inds (Tensor): Sampled negative indexes for each image. """ num_bboxes = bbox_pred.size(0) # assigner and sampler assign_result = self.assigner.assign(bbox_pred, cls_score, gt_bboxes, gt_labels, gt_bboxes_ignore, self.match_costs, self.match_with_velo) sampling_result = self.sampler.sample(assign_result, bbox_pred, gt_bboxes) pos_inds = sampling_result.pos_inds neg_inds = sampling_result.neg_inds # label targets labels = gt_bboxes.new_full((num_bboxes, ), self.num_classes, dtype=torch.long) label_weights = gt_bboxes.new_ones(num_bboxes) # bbox targets code_size = gt_bboxes.size(1) bbox_targets = torch.zeros_like(bbox_pred)[..., :code_size] bbox_weights = torch.zeros_like(bbox_pred) # print(gt_bboxes.size(), bbox_pred.size()) # DETR if sampling_result.num_gts > 0: bbox_targets[pos_inds] = sampling_result.pos_gt_bboxes bbox_weights[pos_inds] = 1.0 labels[pos_inds] = gt_labels[sampling_result.pos_assigned_gt_inds] return (labels, label_weights, bbox_targets, bbox_weights, pos_inds, neg_inds) def get_targets(self, cls_scores_list, bbox_preds_list, gt_bboxes_list, gt_labels_list, gt_bboxes_ignore_list=None): """"Compute regression and classification targets for a batch image. Outputs from a single decoder layer of a single feature level are used. Args: cls_scores_list (list[Tensor]): Box score logits from a single decoder layer for each image with shape [num_query, cls_out_channels]. bbox_preds_list (list[Tensor]): Sigmoid outputs from a single decoder layer for each image, with normalized coordinate (cx, cy, w, h) and shape [num_query, 4]. gt_bboxes_list (list[Tensor]): Ground truth bboxes for each image with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. gt_labels_list (list[Tensor]): Ground truth class indexes for each image with shape (num_gts, ). gt_bboxes_ignore_list (list[Tensor], optional): Bounding boxes which can be ignored for each image. Default None. Returns: tuple: a tuple containing the following targets. - labels_list (list[Tensor]): Labels for all images. - label_weights_list (list[Tensor]): Label weights for all \ images. - bbox_targets_list (list[Tensor]): BBox targets for all \ images. - bbox_weights_list (list[Tensor]): BBox weights for all \ images. - num_total_pos (int): Number of positive samples in all \ images. - num_total_neg (int): Number of negative samples in all \ images. """ assert gt_bboxes_ignore_list is None, \ 'Only supports for gt_bboxes_ignore setting to None.' 
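# Illustrative sketch of the per-image target layout produced by _get_target_single()
# above, with a hand-picked match (query 1 -> gt 0, query 3 -> gt 1) standing in for
# the Hungarian assigner; all numbers are made up.
def _target_layout_demo(num_query=5, num_classes=10):
    gt_bboxes = torch.randn(2, 9)                      # two GT boxes (gravity-centered, 9 values)
    gt_labels = torch.tensor([2, 7])
    pos_inds = torch.tensor([1, 3])                    # pretend these queries were assigned
    pos_assigned_gt_inds = torch.tensor([0, 1])
    labels = torch.full((num_query,), num_classes, dtype=torch.long)  # unmatched queries -> background
    labels[pos_inds] = gt_labels[pos_assigned_gt_inds]
    bbox_targets = torch.zeros(num_query, gt_bboxes.size(1))
    bbox_targets[pos_inds] = gt_bboxes[pos_assigned_gt_inds]
    bbox_weights = torch.zeros(num_query, 10)          # same width as bbox_pred; only matched rows count
    bbox_weights[pos_inds] = 1.0
    return labels, bbox_targets, bbox_weights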
num_imgs = len(cls_scores_list) gt_bboxes_ignore_list = [ gt_bboxes_ignore_list for _ in range(num_imgs) ] (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list, pos_inds_list, neg_inds_list) = multi_apply( self._get_target_single, cls_scores_list, bbox_preds_list, gt_labels_list, gt_bboxes_list, gt_bboxes_ignore_list) num_total_pos = sum((inds.numel() for inds in pos_inds_list)) num_total_neg = sum((inds.numel() for inds in neg_inds_list)) return (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list, num_total_pos, num_total_neg) def loss_single(self, cls_scores, bbox_preds, gt_bboxes_list, gt_labels_list, gt_bboxes_ignore_list=None): """"Loss function for outputs from a single decoder layer of a single feature level. Args: cls_scores (Tensor): Box score logits from a single decoder layer for all images. Shape [bs, num_query, cls_out_channels]. bbox_preds (Tensor): Sigmoid outputs from a single decoder layer for all images, with normalized coordinate (cx, cy, w, h) and shape [bs, num_query, 4]. gt_bboxes_list (list[Tensor]): Ground truth bboxes for each image with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. gt_labels_list (list[Tensor]): Ground truth class indexes for each image with shape (num_gts, ). gt_bboxes_ignore_list (list[Tensor], optional): Bounding boxes which can be ignored for each image. Default None. Returns: dict[str, Tensor]: A dictionary of loss components for outputs from a single decoder layer. """ num_imgs = cls_scores.size(0) cls_scores_list = [cls_scores[i] for i in range(num_imgs)] bbox_preds_list = [bbox_preds[i] for i in range(num_imgs)] cls_reg_targets = self.get_targets(cls_scores_list, bbox_preds_list, gt_bboxes_list, gt_labels_list, gt_bboxes_ignore_list) (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list, num_total_pos, num_total_neg) = cls_reg_targets labels = torch.cat(labels_list, 0) label_weights = torch.cat(label_weights_list, 0) bbox_targets = torch.cat(bbox_targets_list, 0) bbox_weights = torch.cat(bbox_weights_list, 0) # classification loss cls_scores = cls_scores.reshape(-1, self.cls_out_channels) # construct weighted avg_factor to match with the official DETR repo cls_avg_factor = num_total_pos * 1.0 + \ num_total_neg * self.bg_cls_weight if self.sync_cls_avg_factor: cls_avg_factor = reduce_mean( cls_scores.new_tensor([cls_avg_factor])) cls_avg_factor = max(cls_avg_factor, 1) loss_cls = self.loss_cls( cls_scores, labels, label_weights, avg_factor=cls_avg_factor) # Compute the average number of gt boxes accross all gpus, for # normalization purposes num_total_pos = loss_cls.new_tensor([num_total_pos]) num_total_pos = torch.clamp(reduce_mean(num_total_pos), min=1).item() # regression L1 loss bbox_preds = bbox_preds.reshape(-1, bbox_preds.size(-1)) normalized_bbox_targets = normalize_bbox(bbox_targets, self.pc_range) isnotnan = torch.isfinite(normalized_bbox_targets).all(dim=-1) bbox_weights = bbox_weights * self.code_weights loss_bbox = self.loss_bbox( bbox_preds[isnotnan, :10], normalized_bbox_targets[isnotnan, :10], bbox_weights[isnotnan, :10], avg_factor=num_total_pos) loss_cls = torch.nan_to_num(loss_cls) loss_bbox = torch.nan_to_num(loss_bbox) return loss_cls, loss_bbox def dn_loss_single(self, cls_scores, bbox_preds, known_bboxs, known_labels, num_total_pos=None): """"Loss function for outputs from a single decoder layer of a single feature level. Args: cls_scores (Tensor): Box score logits from a single decoder layer for all images. Shape [bs, num_query, cls_out_channels]. 
bbox_preds (Tensor): Sigmoid outputs from a single decoder layer for all images, with normalized coordinate (cx, cy, w, h) and shape [bs, num_query, 4]. gt_bboxes_list (list[Tensor]): Ground truth bboxes for each image with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. gt_labels_list (list[Tensor]): Ground truth class indexes for each image with shape (num_gts, ). gt_bboxes_ignore_list (list[Tensor], optional): Bounding boxes which can be ignored for each image. Default None. Returns: dict[str, Tensor]: A dictionary of loss components for outputs from a single decoder layer. """ # classification loss cls_scores = cls_scores.reshape(-1, self.cls_out_channels) # construct weighted avg_factor to match with the official DETR repo cls_avg_factor = num_total_pos * 3.14159 / 6 * self.split * self.split * self.split ### positive rate if self.sync_cls_avg_factor: cls_avg_factor = reduce_mean( cls_scores.new_tensor([cls_avg_factor])) bbox_weights = torch.ones_like(bbox_preds) label_weights = torch.ones_like(known_labels) cls_avg_factor = max(cls_avg_factor, 1) loss_cls = self.loss_cls( cls_scores, known_labels.long(), label_weights, avg_factor=cls_avg_factor) # Compute the average number of gt boxes accross all gpus, for # normalization purposes num_total_pos = loss_cls.new_tensor([num_total_pos]) num_total_pos = torch.clamp(reduce_mean(num_total_pos), min=1).item() # regression L1 loss bbox_preds = bbox_preds.reshape(-1, bbox_preds.size(-1)) normalized_bbox_targets = normalize_bbox(known_bboxs, self.pc_range) isnotnan = torch.isfinite(normalized_bbox_targets).all(dim=-1) bbox_weights = bbox_weights * self.code_weights loss_bbox = self.loss_bbox( bbox_preds[isnotnan, :10], normalized_bbox_targets[isnotnan, :10], bbox_weights[isnotnan, :10], avg_factor=num_total_pos) loss_cls = torch.nan_to_num(loss_cls) loss_bbox = torch.nan_to_num(loss_bbox) return self.dn_weight * loss_cls, self.dn_weight * loss_bbox @force_fp32(apply_to=('preds_dicts')) def loss(self, gt_bboxes_list, gt_labels_list, preds_dicts, img_metas=None, gt_bboxes_ignore=None): """"Loss function. Args: gt_bboxes_list (list[Tensor]): Ground truth bboxes for each image with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. gt_labels_list (list[Tensor]): Ground truth class indexes for each image with shape (num_gts, ). preds_dicts: all_cls_scores (Tensor): Classification score of all decoder layers, has shape [nb_dec, bs, num_query, cls_out_channels]. all_bbox_preds (Tensor): Sigmoid regression outputs of all decode layers. Each is a 4D-tensor with normalized coordinate format (cx, cy, w, h) and shape [nb_dec, bs, num_query, 4]. enc_cls_scores (Tensor): Classification scores of points on encode feature map , has shape (N, h*w, num_classes). Only be passed when as_two_stage is True, otherwise is None. enc_bbox_preds (Tensor): Regression results of each points on the encode feature map, has shape (N, h*w, 4). Only be passed when as_two_stage is True, otherwise is None. gt_bboxes_ignore (list[Tensor], optional): Bounding boxes which can be ignored for each image. Default None. Returns: dict[str, Tensor]: A dictionary of loss components. """ assert gt_bboxes_ignore is None, \ f'{self.__class__.__name__} only supports ' \ f'for gt_bboxes_ignore setting to None.' 
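# Illustrative sketch of the loss keys this head reports, assuming a 6-layer decoder
# (the layer count is an example, not a repo setting): the last decoder layer uses the
# bare 'loss_cls'/'loss_bbox' keys, earlier layers get a 'd{i}.' prefix, and the
# denoising branch adds 'dn_' variants whenever a dn_mask_dict is present.
def _loss_dict_keys_sketch(num_dec_layers=6):
    keys = ['loss_cls', 'loss_bbox', 'dn_loss_cls', 'dn_loss_bbox']
    for i in range(num_dec_layers - 1):
        keys += [f'd{i}.loss_cls', f'd{i}.loss_bbox',
                 f'd{i}.dn_loss_cls', f'd{i}.dn_loss_bbox']
    return keys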
all_cls_scores = preds_dicts['all_cls_scores'] all_bbox_preds = preds_dicts['all_bbox_preds'] num_dec_layers = len(all_cls_scores) device = gt_labels_list[0].device gt_bboxes_list = [torch.cat( (gt_bboxes.gravity_center, gt_bboxes.tensor[:, 3:]), dim=1).to(device) for gt_bboxes in gt_bboxes_list] all_gt_bboxes_list = [gt_bboxes_list for _ in range(num_dec_layers)] all_gt_labels_list = [gt_labels_list for _ in range(num_dec_layers)] all_gt_bboxes_ignore_list = [ gt_bboxes_ignore for _ in range(num_dec_layers) ] losses_cls, losses_bbox = multi_apply( self.loss_single, all_cls_scores, all_bbox_preds, all_gt_bboxes_list, all_gt_labels_list, all_gt_bboxes_ignore_list) loss_dict = dict() # loss_dict['size_loss'] = size_loss # loss from the last decoder layer loss_dict['loss_cls'] = losses_cls[-1] loss_dict['loss_bbox'] = losses_bbox[-1] # loss from other decoder layers num_dec_layer = 0 for loss_cls_i, loss_bbox_i in zip(losses_cls[:-1], losses_bbox[:-1]): loss_dict[f'd{num_dec_layer}.loss_cls'] = loss_cls_i loss_dict[f'd{num_dec_layer}.loss_bbox'] = loss_bbox_i num_dec_layer += 1 if preds_dicts['dn_mask_dict'] is not None: known_labels, known_bboxs, output_known_class, output_known_coord, num_tgt = self.prepare_for_loss(preds_dicts['dn_mask_dict']) all_known_bboxs_list = [known_bboxs for _ in range(num_dec_layers)] all_known_labels_list = [known_labels for _ in range(num_dec_layers)] all_num_tgts_list = [ num_tgt for _ in range(num_dec_layers) ] dn_losses_cls, dn_losses_bbox = multi_apply( self.dn_loss_single, output_known_class, output_known_coord, all_known_bboxs_list, all_known_labels_list, all_num_tgts_list) loss_dict['dn_loss_cls'] = dn_losses_cls[-1] loss_dict['dn_loss_bbox'] = dn_losses_bbox[-1] num_dec_layer = 0 for loss_cls_i, loss_bbox_i in zip(dn_losses_cls[:-1], dn_losses_bbox[:-1]): loss_dict[f'd{num_dec_layer}.dn_loss_cls'] = loss_cls_i loss_dict[f'd{num_dec_layer}.dn_loss_bbox'] = loss_bbox_i num_dec_layer += 1 elif self.with_dn: dn_losses_cls, dn_losses_bbox = multi_apply( self.loss_single, all_cls_scores, all_bbox_preds, all_gt_bboxes_list, all_gt_labels_list, all_gt_bboxes_ignore_list) loss_dict['dn_loss_cls'] = dn_losses_cls[-1].detach() loss_dict['dn_loss_bbox'] = dn_losses_bbox[-1].detach() num_dec_layer = 0 for loss_cls_i, loss_bbox_i in zip(dn_losses_cls[:-1], dn_losses_bbox[:-1]): loss_dict[f'd{num_dec_layer}.dn_loss_cls'] = loss_cls_i.detach() loss_dict[f'd{num_dec_layer}.dn_loss_bbox'] = loss_bbox_i.detach() num_dec_layer += 1 return loss_dict, None @force_fp32(apply_to=('preds_dicts')) def get_bboxes(self, preds_dicts, img_metas, rescale=False): """Generate bboxes from bbox head predictions. Args: preds_dicts (tuple[list[dict]]): Prediction results. img_metas (list[dict]): Point cloud and image's meta info. Returns: list[dict]: Decoded bbox, scores and labels after nms. 
""" preds_dicts = self.bbox_coder.decode(preds_dicts, layer_index=-1) num_samples = len(preds_dicts) ret_list = [] for i in range(num_samples): preds = preds_dicts[i] bboxes = preds['bboxes'] bboxes[:, 2] = bboxes[:, 2] - bboxes[:, 5] * 0.5 bboxes = img_metas[i]['box_type_3d'](bboxes, bboxes.size(-1)) scores = preds['scores'] labels = preds['labels'] bbox_results = bbox3d2result(bboxes, scores, labels) ret_list.append(bbox_results) return ret_list class MLN(nn.Module): ''' Args: c_dim (int): dimension of latent code c f_dim (int): feature dimension ''' def __init__(self, c_dim, f_dim=256, use_ln=True): super().__init__() self.c_dim = c_dim self.f_dim = f_dim self.use_ln = use_ln self.reduce = nn.Sequential( nn.Linear(c_dim, f_dim), nn.ReLU(), ) self.gamma = nn.Linear(f_dim, f_dim) self.beta = nn.Linear(f_dim, f_dim) if self.use_ln: self.ln = nn.LayerNorm(f_dim, elementwise_affine=False) self.init_weight() def init_weight(self): nn.init.zeros_(self.gamma.weight) nn.init.zeros_(self.beta.weight) nn.init.ones_(self.gamma.bias) nn.init.zeros_(self.beta.bias) def forward(self, x, c): if self.use_ln: x = self.ln(x) c = self.reduce(c) gamma = self.gamma(c) beta = self.beta(c) out = gamma * x + beta return out ================================================ FILE: mmdet3d/models/fbbev/track_head/__init__.py ================================================ from .trackpetr import TackerHead from .losses.tracking_loss_combo import TrackingLossCombo from .track_nms_free_coder import TrackNMSFreeCoder ================================================ FILE: mmdet3d/models/fbbev/track_head/instances.py ================================================ # ------------------------------------------------------------------------ # Copyright (c) 2023 toyota research instutute. # ------------------------------------------------------------------------ # Modified from MOTR (https://github.com/megvii-model/MOTR/) # ------------------------------------------------------------------------ # Modified from Detectron2 (https://github.com/facebookresearch/detectron2) # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved # ------------------------------------------------------------------------ import itertools from typing import Any, Dict, List, Tuple, Union import torch def topk_gather(feat, topk_indexes): if topk_indexes is not None: feat_shape = feat.shape topk_shape = topk_indexes.shape view_shape = [1 for _ in range(len(feat_shape))] view_shape[:2] = topk_shape[:2] topk_indexes = topk_indexes.view(*view_shape) feat = torch.gather(feat, 1, topk_indexes.repeat(1, 1, *feat_shape[2:])) return feat class Instances: """ This class represents a list of instances in an image. It stores the attributes of instances (e.g., boxes, masks, labels, scores) as "fields". All fields must have the same ``__len__`` which is the number of instances. All other (non-field) attributes of this class are considered private: they must start with '_' and are not modifiable by a user. Some basic usage: 1. Set/get/check a field: .. code-block:: python instances.gt_boxes = Boxes(...) print(instances.pred_masks) # a tensor of shape (N, H, W) print('gt_masks' in instances) 2. ``len(instances)`` returns the number of instances 3. Indexing: ``instances[indices]`` will apply the indexing on all the fields and returns a new :class:`Instances`. Typically, ``indices`` is a integer vector of indices, or a binary mask of length ``num_instances`` .. 
code-block:: python category_3_detections = instances[instances.pred_classes == 3] confident_detections = instances[instances.scores > 0.9] """ def __init__(self, image_size: Tuple[int, int], **kwargs: Any): """ Args: image_size (height, width): the spatial size of the image. kwargs: fields to add to this `Instances`. """ self._image_size = image_size self._fields: Dict[str, Any] = {} for k, v in kwargs.items(): self.set(k, v) @property def image_size(self) -> Tuple[int, int]: """ Returns: tuple: height, width """ return self._image_size def __setattr__(self, name: str, val: Any) -> None: if name.startswith("_"): super().__setattr__(name, val) else: self.set(name, val) def __getattr__(self, name: str) -> Any: if name == "_fields" or name not in self._fields: raise AttributeError("Cannot find field '{}' in the given Instances!".format(name)) return self._fields[name] def set(self, name: str, value: Any) -> None: """ Set the field named `name` to `value`. The length of `value` must be the number of instances, and must agree with other existing fields in this object. """ data_len = len(value) if len(self._fields): assert ( len(self) == data_len ), "Adding a field of length {} to a Instances of length {}".format(data_len, len(self)) self._fields[name] = value def has(self, name: str) -> bool: """ Returns: bool: whether the field called `name` exists. """ return name in self._fields def remove(self, name: str) -> None: """ Remove the field called `name`. """ del self._fields[name] def get(self, name: str) -> Any: """ Returns the field called `name`. """ return self._fields[name] def get_fields(self) -> Dict[str, Any]: """ Returns: dict: a dict which maps names (str) to data of the fields Modifying the returned dict will modify this instance. """ return self._fields # Tensor-like methods def to(self, *args: Any, **kwargs: Any) -> "Instances": """ Returns: Instances: all fields are called with a `to(device)`, if the field has this method. """ ret = Instances(self._image_size) for k, v in self._fields.items(): if hasattr(v, "to"): v = v.to(*args, **kwargs) ret.set(k, v) return ret def numpy(self): ret = Instances(self._image_size) for k, v in self._fields.items(): if hasattr(v, "numpy"): v = v.numpy() ret.set(k, v) return ret def instances_topk_gather(self, topk_indexes, valid_key_set=None): ret = Instances(self._image_size) for k, v in self._fields.items(): if valid_key_set is not None and k not in valid_key_set: pass else: # print(k, v.shape) v = topk_gather(v, topk_indexes) ret.set(k, v) return ret def __getitem__(self, item: Union[int, slice, torch.BoolTensor]) -> "Instances": """ Args: item: an index-like object and will be used to index all the fields. Returns: If `item` is a string, return the data in the corresponding field. Otherwise, returns an `Instances` where all fields are indexed by `item`. 
""" if type(item) == int: if item >= len(self) or item < -len(self): raise IndexError("Instances index out of range!") else: item = slice(item, None, len(self)) ret = Instances(self._image_size) for k, v in self._fields.items(): # print(k, type(item), 'getitem', item.type(), item.dtype) # if index by torch.BoolTensor if k == 'kalman_models' and isinstance(item, torch.Tensor): # print(item.shape, 'in get item') ret_list = [] for i, if_true in enumerate(item): if if_true: ret_list.append(self.kalman_models[i]) ret.set(k, ret_list) else: ret.set(k, v[item]) return ret def __len__(self) -> int: for v in self._fields.values(): # use __len__ because len() has to be int and is not friendly to tracing return v.__len__() raise NotImplementedError("Empty Instances does not support __len__!") def __iter__(self): raise NotImplementedError("`Instances` object is not iterable!") @staticmethod def cat(instance_lists: List["Instances"], dim=0) -> "Instances": """ Args: instance_lists (list[Instances]) Returns: Instances """ assert all(isinstance(i, Instances) for i in instance_lists) assert len(instance_lists) > 0 if len(instance_lists) == 1: return instance_lists[0] image_size = instance_lists[0].image_size for i in instance_lists[1:]: assert i.image_size == image_size ret = Instances(image_size) for k in instance_lists[0]._fields.keys(): values = [i.get(k) for i in instance_lists] v0 = values[0] if isinstance(v0, torch.Tensor): # print(k, values[0].shape, values[1].shape, dim) try: values = torch.cat(values, dim=dim) except: from IPython import embed embed() exit() elif isinstance(v0, list): values = list(itertools.chain(*values)) elif hasattr(type(v0), "cat"): values = type(v0).cat(values) else: raise ValueError("Unsupported type {} for concatenation".format(type(v0))) ret.set(k, values) return ret def clone(self): ret = Instances(self._image_size) for k, v in self._fields.items(): if hasattr(v, 'clone'): v = v.clone() ret.set(k, v) return ret def detach(self): ret = Instances(self._image_size) for k, v in self._fields.items(): if hasattr(v, 'detach'): v = v.detach() ret.set(k, v) return ret def __str__(self) -> str: s = self.__class__.__name__ + "(" s += "num_instances={}, \n".format(len(self)) s += "image_height={}, \n".format(self._image_size[0]) s += "image_width={}, \n".format(self._image_size[1]) s += "fields=[{}])".format(", ".join((f"{k}: {v.shape}\n" for k, v in self._fields.items()))) return s __repr__ = __str__ ================================================ FILE: mmdet3d/models/fbbev/track_head/losses/__init__.py ================================================ from .tracking_loss_base import TrackingLossBase from .tracking_loss import TrackingLoss from .tracking_loss_prediction import TrackingLossPrediction from .tracking_loss_mem_bank import TrackingLossMemBank from .tracking_loss_combo import TrackingLossCombo ================================================ FILE: mmdet3d/models/fbbev/track_head/losses/tracking_loss.py ================================================ # ------------------------------------------------------------------------ # Copyright (c) Toyota Research Institute # ------------------------------------------------------------------------ # Modified from PETR (https://github.com/megvii-research/PETR) # Copyright (c) 2022 megvii-model. All Rights Reserved. 
# ------------------------------------------------------------------------ import torch import torch.nn as nn import torch.nn.functional as F from mmcv.runner import force_fp32 from mmdet.models import LOSSES from mmdet.models import build_loss from mmdet.core import (build_assigner, reduce_mean, multi_apply, build_sampler) from mmdet3d.models.fbbev.track_head.streampetr_utils import normalize_bbox from .tracking_loss_base import TrackingLossBase @LOSSES.register_module() class TrackingLoss(TrackingLossBase): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) def loss_single_frame(self, frame_idx, gt_bboxes_list, gt_labels_list, instance_inds, preds_dicts, gt_bboxes_ignore): """Match according to both tracking and detection information Generate the single frame loss function, modify the ids of track instances """ assert gt_bboxes_ignore is None, \ f'{self.__class__.__name__} only supports ' \ f'for gt_bboxes_ignore setting to None.' all_cls_scores = preds_dicts['all_cls_scores'] all_bbox_preds = preds_dicts['all_bbox_preds'] # enc_cls_scores = preds_dicts['enc_cls_scores'] # enc_bbox_preds = preds_dicts['enc_bbox_preds'] track_instances = preds_dicts['track_instances'] num_dec_layers, B, num_query = all_cls_scores.shape[:3] device = gt_labels_list[0].device # after this operation, [x, y, z-h/2] becomes [x, y, z] gt_bboxes_list = [torch.cat( (gt_bboxes.gravity_center, gt_bboxes.tensor[:, 3:]), dim=1).to(device) for gt_bboxes in gt_bboxes_list] obj_idxes_list = instance_inds[0].tolist() obj_idx_to_gt_idx = [{obj_idx: gt_idx for gt_idx, obj_idx in enumerate(obj_idxes_list)} for obj_idxes_list in instance_inds] num_disappear_track = 0 # step 1. Inherit and Update the previous tracks for batch_idx in range(B): for trk_idx in range(num_query): obj_id = track_instances.obj_idxes[batch_idx, trk_idx].item() if obj_id >= 0: if obj_id in obj_idx_to_gt_idx[batch_idx]: track_instances.matched_gt_idxes[batch_idx, trk_idx] = obj_idx_to_gt_idx[batch_idx][obj_id] else: num_disappear_track += 1 track_instances.matched_gt_idxes[batch_idx, trk_idx] = -2 else: track_instances.matched_gt_idxes[batch_idx, trk_idx] = -1 full_track_idxes = torch.arange(num_query, dtype=torch.long)[None].repeat(B, 1).to(all_cls_scores.device) # previsouly tracked, which is matched by rule all_matched_track_idxes = (track_instances.obj_idxes >= 0).nonzero() # full_track_idxes[track_instances.obj_idxes >= 0] matched_track_idxes = (track_instances.matched_gt_idxes >= 0).nonzero() # full_track_idxes[track_instances.matched_gt_idxes >= 0] # step2. select the unmatched slots. # note that the FP tracks whose obj_idxes are -2 will not be selected here. unmatched_track_idxes = (track_instances.obj_idxes == -1).nonzero() # full_track_idxes[track_instances.obj_idxes == -1] m_idxes_list = [matched_track_idxes[matched_track_idxes[:, 0]==i][:, 1] for i in range(B)] um_idxes_list = [unmatched_track_idxes[unmatched_track_idxes[:, 0]==i][:, 1] for i in range(B)] # step3. select the untracked gt instances (new tracks). 
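# Illustrative sketch of the index conventions used in steps 1-2 above (values made up):
# obj_idxes >= 0 marks a query that already carries a track id and -1 a free slot;
# matched_gt_idxes >= 0 points at a GT box of the current frame, -2 marks a tracked
# object that disappeared, and -1 a slot that is not an inherited track.
def _track_matching_convention_demo():
    obj_idxes = torch.tensor([17, -1, 42, -1])       # queries 0 and 2 carry track ids 17 and 42
    current_frame_gt_ids = [42, 5]                   # object 17 is gone in this frame
    obj_id_to_gt_idx = {obj_id: gt_idx for gt_idx, obj_id in enumerate(current_frame_gt_ids)}
    matched_gt_idxes = torch.full_like(obj_idxes, -1)
    for q, obj_id in enumerate(obj_idxes.tolist()):
        if obj_id >= 0:
            matched_gt_idxes[q] = obj_id_to_gt_idx.get(obj_id, -2)
    return matched_gt_idxes                          # tensor([-2, -1,  0, -1])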
tgt_state = [torch.zeros(len(gt_bboxes_list[i])).to(all_cls_scores.device) for i in range(B)] tgt_indexes_list = [] for i in range(B): tgt_indexes = track_instances.matched_gt_idxes[i] tgt_indexes = tgt_indexes[tgt_indexes >= 0] tgt_indexes_list.append(tgt_indexes) tgt_state[i][tgt_indexes] = 1 # new tgt indexes untracked_tgt_indexes = [torch.arange(len(gt_bboxes_list[i])).to(all_cls_scores.device)[tgt_state[i] == 0] for i in range(B)] all_unmatched_gt_bboxes_list = [[gt_bboxes_list[i][untracked_tgt_indexes[i]] for i in range(B)] for _ in range(num_dec_layers)] all_unmatched_gt_labels_list = [[gt_labels_list[i][untracked_tgt_indexes[i]] for i in range(B)] for _ in range(num_dec_layers)] all_unmatched_gt_ids_list = [[torch.tensor(instance_inds[i], device=device)[untracked_tgt_indexes[i]] for i in range(B)] for _ in range(num_dec_layers)] all_unmatched_ignore_list = [None for _ in range(num_dec_layers)] # unmatched_cls_scores = [] # unmatched_bbox_preds = [] # for i in range(B): # unmatched_cls_scores.append(all_cls_scores[:, i, unmatched_track_idxes[unmatched_track_idxes[:, 0]==i]][:, 1]) # unmatched_bbox_preds.append(all_bbox_preds[:, i, unmatched_track_idxes[unmatched_track_idxes[:, 0]==i]][:, 1]) # unmatched_cls_scores = all_cls_scores[:, :, unmatched_track_idxes, :] # unmatched_bbox_preds = all_bbox_preds[:, :, unmatched_track_idxes, :] # step4. do matching between the unmatched slots and GTs. unmatched_track_matching_result = list() for dec_layer_idx in range(num_dec_layers): unmatched_cls_scores = [] unmatched_bbox_preds = [] for i in range(B): um_idxes = um_idxes_list[i] unmatched_cls_scores.append(all_cls_scores[dec_layer_idx, i, um_idxes]) unmatched_bbox_preds.append(all_bbox_preds[dec_layer_idx, i, um_idxes]) unmatched_track_dec_matching_result = self.get_targets( unmatched_cls_scores, unmatched_bbox_preds, all_unmatched_gt_bboxes_list[dec_layer_idx], all_unmatched_gt_labels_list[dec_layer_idx], all_unmatched_gt_ids_list[dec_layer_idx], all_unmatched_ignore_list[dec_layer_idx]) unmatched_track_matching_result.append(unmatched_track_dec_matching_result) if dec_layer_idx == num_dec_layers - 1: (labels_list, label_instance_ids_list, label_weights_list, bbox_targets_list, bbox_weights_list, num_total_pos, num_total_neg, gt_match_idxes_list) = unmatched_track_dec_matching_result # step5. update the obj_idxes according to the matching result with the last decoder layer for i in range(B): um_idxes = um_idxes_list[i] track_instances.obj_idxes[i][um_idxes] = label_instance_ids_list[i] track_instances.matched_gt_idxes[i][um_idxes] = gt_match_idxes_list[i] # step6. 
merge the matching results of tracking/query instances matched_labels = [gt_labels_list[i][tgt_indexes_list[i]].long() for i in range(B)] matched_label_weights = [gt_labels_list[i].new_ones(len(tgt_indexes_list[i])).float()] matched_bbox_targets = [gt_bboxes_list[i][tgt_indexes_list[i]] for i in range(B)] matched_bbox_weights = [torch.ones_like(track_instances.bboxes[i])[:len(tgt_indexes_list[i])] for i in range(B)] all_matching_list = list() # matched_track_idxes = full_track_idxes[matched_track_idxes] # unmatched_track_idxes = full_track_idxes[unmatched_track_idxes] for dec_layer_idx in range(num_dec_layers): (dec_labels, _, dec_label_weights, dec_bbox_targets, dec_bbox_weights, dec_num_total_pos, dec_num_total_neg, _) = unmatched_track_matching_result[dec_layer_idx] labels_list = [] label_weights_list = [] bbox_targets_list = [] bbox_weights_list = [] total_pos = dec_num_total_pos + len(matched_track_idxes) total_neg = dec_num_total_neg + num_disappear_track matched_gt_idxes_list = track_instances.obj_idxes.new_full((B, num_query), -1, dtype=torch.long) for i in range(B): m_idxes = m_idxes_list[i] um_idxes = um_idxes_list[i] labels = torch.ones_like(track_instances.obj_idxes[i]).long() * self.num_classes labels[m_idxes] = matched_labels[i] labels[um_idxes] = dec_labels[i] labels_list.append(labels) label_weights = torch.ones_like(track_instances.obj_idxes[i]).float() label_weights_list.append(label_weights) bbox_targets = torch.zeros_like(track_instances.bboxes[i])[:, :dec_bbox_targets[i].size(1)] bbox_targets[m_idxes] = matched_bbox_targets[i] bbox_targets[um_idxes] = dec_bbox_targets[i] bbox_targets_list.append(bbox_targets) bbox_weights = torch.zeros_like(track_instances.bboxes[i]) bbox_weights[m_idxes] = 1.0 bbox_weights[um_idxes] = dec_bbox_weights[i] bbox_weights_list.append(bbox_weights) matched_gt_idxes_list[i][m_idxes] = track_instances.matched_gt_idxes[i][m_idxes] matched_gt_idxes_list[i][um_idxes] = track_instances.matched_gt_idxes[i][um_idxes] dec_matching_results = (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list, total_pos, total_neg, matched_gt_idxes_list) all_matching_list.append(dec_matching_results) # step 7. compute the single frame losses # after getting the matching result, we no longer need contents for gt_bboxes_list etc. 
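# ------------------------------------------------------------------------
# Shape sketch of one entry of all_matching_list (illustrative; B batch
# items, Q = num_query):
#   (labels_list,        # B tensors of shape (Q,), background = num_classes
#    label_weights_list, # B tensors of shape (Q,), all ones
#    bbox_targets_list,  # B tensors of shape (Q, gt code size)
#    bbox_weights_list,  # B tensors of shape (Q, 10)
#    total_pos,          # Hungarian positives + inherited tracked slots
#    total_neg,          # Hungarian negatives + disappeared tracks
#    matched_gt_idxes_list)  # (B, Q) long tensor, -1 for background slots
# Each tuple is consumed unchanged by loss_single_decoder through its
# gt_matching argument in step 7 below.
# ------------------------------------------------------------------------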
if self.interm_loss: losses_cls, losses_bbox = multi_apply( self.loss_single_decoder, [frame_idx for _ in range(num_dec_layers)], all_cls_scores, all_bbox_preds, [None for _ in range(num_dec_layers)], [None for _ in range(num_dec_layers)], [None for _ in range(num_dec_layers)], [None for _ in range(num_dec_layers)], all_matching_list) else: losses_cls, losses_bbox = self.loss_single_decoder(frame_idx, all_cls_scores[-1], all_bbox_preds[-1], None, None, None, None, all_matching_list[-1]) losses_cls, losses_bbox = [losses_cls], [losses_bbox] loss_dict = dict() # loss from the last decoder layer loss_dict[f'f{frame_idx}.loss_cls'] = losses_cls[-1] loss_dict[f'f{frame_idx}.loss_bbox'] = losses_bbox[-1] # loss from other decoder layers num_dec_layer = 0 for loss_cls_i, loss_bbox_i in zip(losses_cls[:-1], losses_bbox[:-1]): loss_dict[f'f{frame_idx}.d{num_dec_layer}.loss_cls'] = loss_cls_i loss_dict[f'f{frame_idx}.d{num_dec_layer}.loss_bbox'] = loss_bbox_i num_dec_layer += 1 return loss_dict ================================================ FILE: mmdet3d/models/fbbev/track_head/losses/tracking_loss_base.py ================================================ # ------------------------------------------------------------------------ # Copyright (c) 2023 toyota research instutute. # ------------------------------------------------------------------------ # Modified from PETR (https://github.com/megvii-research/PETR) # Copyright (c) 2022 megvii-model. All Rights Reserved. # ------------------------------------------------------------------------ # Modified from DETR3D (https://github.com/WangYueFt/detr3d) # Copyright (c) 2021 Wang, Yue # ------------------------------------------------------------------------ # Modified from mmdetection3d (https://github.com/open-mmlab/mmdetection3d) # Copyright (c) OpenMMLab. All rights reserved. # ------------------------------------------------------------------------ import torch import torch.nn as nn import torch.nn.functional as F from mmcv.runner import force_fp32 from mmdet.models import LOSSES from mmdet.models import build_loss from mmdet.core import (build_assigner, reduce_mean, multi_apply, build_sampler) from mmdet3d.models.fbbev.track_head.streampetr_utils import normalize_bbox @LOSSES.register_module() class TrackingLossBase(nn.Module): """ Naive multi-frame loss """ def __init__(self, num_classes, code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2], sync_cls_avg_factor=False, interm_loss=True, loss_cls=dict( type='FocalLoss', use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=2.0), loss_bbox=dict(type='L1Loss', loss_weight=0.25), loss_iou=dict(type='GIoULoss', loss_weight=0.0), assigner=dict( type='HungarianAssigner3D', cls_cost=dict(type='FocalLossCost', weight=2.0), reg_cost=dict(type='BBox3DL1Cost', weight=0.25), iou_cost=dict(type='IoUCost', weight=0.0), # Fake cost. This is just to make it compatible with DETR head. 
pc_range=[-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]), match_costs=None): super().__init__() self.num_classes = num_classes self.interm_loss = interm_loss # if compute separate losses for all the decoders self.assigner = build_assigner(assigner) self.loss_cls = build_loss(loss_cls) self.loss_bbox = build_loss(loss_bbox) self.loss_iou = build_loss(loss_iou) sampler_cfg = dict(type='PseudoSampler') self.sampler = build_sampler(sampler_cfg, context=self) self.pc_range = self.assigner.pc_range if self.loss_cls.use_sigmoid: self.cls_out_channels = num_classes else: self.cls_out_channels = num_classes + 1 if code_weights is not None: self.code_weights = code_weights else: self.code_weights = [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2] if match_costs is not None: self.match_costs = match_costs else: self.match_costs = self.code_weights self.code_weights = nn.Parameter(torch.tensor( self.code_weights, requires_grad=False), requires_grad=False) self.match_costs = nn.Parameter(torch.tensor( self.match_costs), requires_grad=False) self.bg_cls_weight = 0 self.sync_cls_avg_factor = sync_cls_avg_factor class_weight = loss_cls.get('class_weight', None) if class_weight is not None: assert isinstance(class_weight, float), 'Expected ' \ 'class_weight to have type float. Found ' \ f'{type(class_weight)}.' # NOTE following the official DETR rep0, bg_cls_weight means # relative classification weight of the no-object class. bg_cls_weight = loss_cls.get('bg_cls_weight', class_weight) assert isinstance(bg_cls_weight, float), 'Expected ' \ 'bg_cls_weight to have type float. Found ' \ f'{type(bg_cls_weight)}.' class_weight = torch.ones(num_classes + 1) * class_weight # set background class as the last indice class_weight[num_classes] = bg_cls_weight loss_cls.update({'class_weight': class_weight}) if 'bg_cls_weight' in loss_cls: loss_cls.pop('bg_cls_weight') self.bg_cls_weight = bg_cls_weight def _get_target_single(self, cls_score, bbox_pred, gt_labels, gt_bboxes, instance_inds, gt_bboxes_ignore=None): """"Compute regression and classification targets for one image. Outputs from a single decoder layer of a single feature level are used. Args: cls_score (Tensor): Box score logits from a single decoder layer for one image. Shape [num_query, cls_out_channels]. bbox_pred (Tensor): Sigmoid outputs from a single decoder layer for one image, with normalized coordinate (cx, cy, w, h) and shape [num_query, 4]. gt_bboxes (Tensor): Ground truth bboxes for one image with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. gt_labels (Tensor): Ground truth class indices for one image with shape (num_gts, ). gt_bboxes_ignore (Tensor, optional): Bounding boxes which can be ignored. Default None. Returns: tuple[Tensor]: a tuple containing the following for one image. - labels (Tensor): Labels of each image. - label_weights (Tensor]): Label weights of each image. - bbox_targets (Tensor): BBox targets of each image. - bbox_weights (Tensor): BBox weights of each image. - pos_inds (Tensor): Sampled positive indices for each image. - neg_inds (Tensor): Sampled negative indices for each image. 
""" num_bboxes = bbox_pred.size(0) # assigner and sampler assign_result = self.assigner.assign(bbox_pred, cls_score, gt_bboxes, gt_labels, gt_bboxes_ignore, self.match_costs) sampling_result = self.sampler.sample(assign_result, bbox_pred, gt_bboxes) pos_inds = sampling_result.pos_inds neg_inds = sampling_result.neg_inds # label targets labels = gt_bboxes.new_full((num_bboxes, ), self.num_classes, dtype=torch.long) label_instance_ids = gt_bboxes.new_full((num_bboxes,), -1, dtype=torch.long) gt_match_idxes = gt_bboxes.new_full((num_bboxes,), -1, dtype=torch.long) labels[pos_inds] = gt_labels[sampling_result.pos_assigned_gt_inds].long() label_instance_ids[pos_inds] = instance_inds[sampling_result.pos_assigned_gt_inds].long() gt_match_idxes[pos_inds] = sampling_result.pos_assigned_gt_inds.clone().long() label_weights = gt_bboxes.new_ones(num_bboxes) # bbox targets code_size = gt_bboxes.size(1) bbox_targets = torch.zeros_like(bbox_pred)[..., :code_size] bbox_weights = torch.zeros_like(bbox_pred) bbox_weights[pos_inds] = 1.0 # hack for empty if pos_inds.numel() == 0: sampling_result.pos_gt_bboxes = gt_bboxes.new_empty((0, code_size)) bbox_targets[pos_inds] = sampling_result.pos_gt_bboxes return (labels, label_instance_ids, label_weights, bbox_targets, bbox_weights, pos_inds, neg_inds, gt_match_idxes) def get_targets(self, cls_scores_list, bbox_preds_list, gt_bboxes_list, gt_labels_list, instance_ids_list, gt_bboxes_ignore_list=None): """"Compute regression and classification targets for a batch image. Outputs from a single decoder layer of a single feature level are used. Args: cls_scores_list (list[Tensor]): Box score logits from a single decoder layer for each image with shape [num_query, cls_out_channels]. bbox_preds_list (list[Tensor]): Sigmoid outputs from a single decoder layer for each image, with normalized coordinate (cx, cy, w, h) and shape [num_query, 4]. gt_bboxes_list (list[Tensor]): Ground truth bboxes for each image with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. gt_labels_list (list[Tensor]): Ground truth class indices for each image with shape (num_gts, ). gt_bboxes_ignore_list (list[Tensor], optional): Bounding boxes which can be ignored for each image. Default None. Returns: tuple: a tuple containing the following targets. - labels_list (list[Tensor]): Labels for all images. - label_weights_list (list[Tensor]): Label weights for all \ images. - bbox_targets_list (list[Tensor]): BBox targets for all \ images. - bbox_weights_list (list[Tensor]): BBox weights for all \ images. - num_total_pos (int): Number of positive samples in all \ images. - num_total_neg (int): Number of negative samples in all \ images. """ assert gt_bboxes_ignore_list is None, \ 'Only supports for gt_bboxes_ignore setting to None.' 
num_imgs = len(cls_scores_list) gt_bboxes_ignore_list = [ gt_bboxes_ignore_list for _ in range(num_imgs) ] (labels_list, label_instance_ids_list, label_weights_list, bbox_targets_list, bbox_weights_list, pos_inds_list, neg_inds_list, gt_match_idxes_list) = multi_apply( self._get_target_single, cls_scores_list, bbox_preds_list, gt_labels_list, gt_bboxes_list, instance_ids_list, gt_bboxes_ignore_list) num_total_pos = sum((inds.numel() for inds in pos_inds_list)) num_total_neg = sum((inds.numel() for inds in neg_inds_list)) return (labels_list, label_instance_ids_list, label_weights_list, bbox_targets_list, bbox_weights_list, num_total_pos, num_total_neg, gt_match_idxes_list) def loss_single_decoder(self, frame_idx, cls_scores, bbox_preds, gt_bboxes_list, gt_labels_list, instance_ids_list, gt_bboxes_ignore_list=None, gt_matching=None, aux_infos=None): """"Loss function for outputs from a single decoder layer of a single feature level. The sub-function of frame-level loss. Args: cls_scores (Tensor): Box score logits from a single decoder layer for all images. Shape [bs, num_query, cls_out_channels]. bbox_preds (Tensor): Sigmoid outputs from a single decoder layer for all images, with normalized coordinate (cx, cy, w, h) and shape [bs, num_query, 4]. gt_bboxes_list (list[Tensor]): Ground truth bboxes for each image with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. gt_labels_list (list[Tensor]): Ground truth class indices for each image with shape (num_gts, ). gt_bboxes_ignore_list (list[Tensor], optional): Bounding boxes which can be ignored for each image. Default None. Returns: dict[str, Tensor]: A dictionary of loss components for outputs from a single decoder layer. """ num_imgs = cls_scores.size(0) cls_scores_list = [cls_scores[i] for i in range(num_imgs)] bbox_preds_list = [bbox_preds[i] for i in range(num_imgs)] if gt_matching is None: cls_reg_targets = self.get_targets(cls_scores_list, bbox_preds_list, gt_bboxes_list, gt_labels_list, instance_ids_list, gt_bboxes_ignore_list) (labels_list, _, label_weights_list, bbox_targets_list, bbox_weights_list, num_total_pos, num_total_neg, gt_match_idxes_list) = cls_reg_targets else: (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list, num_total_pos, num_total_neg, gt_match_idxes_list) = gt_matching labels = torch.cat(labels_list, 0) label_weights = torch.cat(label_weights_list, 0) bbox_targets = torch.cat(bbox_targets_list, 0) bbox_weights = torch.cat(bbox_weights_list, 0) # classification loss cls_scores = cls_scores.reshape(-1, self.cls_out_channels) # construct weighted avg_factor to match with the official DETR repo cls_avg_factor = num_total_pos * 1.0 + \ num_total_neg * self.bg_cls_weight if self.sync_cls_avg_factor: cls_avg_factor = reduce_mean( cls_scores.new_tensor([cls_avg_factor])) cls_avg_factor = max(cls_avg_factor, 1) loss_cls = self.loss_cls( cls_scores, labels, label_weights, avg_factor=cls_avg_factor) # Compute the average number of gt boxes accross all gpus, for # normalization purposes num_total_pos = loss_cls.new_tensor([num_total_pos]) num_total_pos = torch.clamp(reduce_mean(num_total_pos), min=1).item() # regression L1 loss bbox_preds = bbox_preds.reshape(-1, bbox_preds.size(-1)) normalized_bbox_targets = normalize_bbox(bbox_targets, self.pc_range) isnotnan = torch.isfinite(normalized_bbox_targets).all(dim=-1) bbox_weights = bbox_weights * torch.tensor(self.code_weights).to(bbox_preds.device) loss_bbox = self.loss_bbox( bbox_preds[isnotnan, :10], normalized_bbox_targets[isnotnan, :10], 
bbox_weights[isnotnan, :10], avg_factor=num_total_pos) try: loss_cls = torch.nan_to_num(loss_cls) loss_bbox = torch.nan_to_num(loss_bbox) except: loss_cls = nan_to_num(loss_cls) loss_bbox = nan_to_num(loss_bbox) return loss_cls, loss_bbox def loss_single_frame(self, frame_idx, gt_bboxes_list, gt_labels_list, instance_inds, preds_dicts, gt_bboxes_ignore): """Loss function on a single frame for classification and localization. Args: gt_bboxes_list (list[Tensor]): Ground truth bboxes for each image with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. gt_labels_list (list[Tensor]): Ground truth class indices for each image with shape (num_gts, ). preds_dicts: all_cls_scores (Tensor): Classification score of all decoder layers, has shape [nb_dec, bs, num_query, cls_out_channels]. all_bbox_preds (Tensor): Sigmoid regression outputs of all decode layers. Each is a 4D-tensor with normalized coordinate format (cx, cy, w, h) and shape [nb_dec, bs, num_query, 4]. enc_cls_scores (Tensor): Classification scores of points on encode feature map , has shape (N, h*w, num_classes). Only be passed when as_two_stage is True, otherwise is None. enc_bbox_preds (Tensor): Regression results of each points on the encode feature map, has shape (N, h*w, 4). Only be passed when as_two_stage is True, otherwise is None. gt_bboxes_ignore (list[Tensor], optional): Bounding boxes which can be ignored for each image. Default None. Returns: dict[str, Tensor]: A dictionary of loss components. """ assert gt_bboxes_ignore is None, \ f'{self.__class__.__name__} only supports ' \ f'for gt_bboxes_ignore setting to None.' all_cls_scores = preds_dicts['all_cls_scores'] all_bbox_preds = preds_dicts['all_bbox_preds'] # enc_cls_scores = preds_dicts['enc_cls_scores'] # enc_bbox_preds = preds_dicts['enc_bbox_preds'] num_dec_layers = len(all_cls_scores) device = gt_labels_list[0].device gt_bboxes_list = [torch.cat( (gt_bboxes.gravity_center, gt_bboxes.tensor[:, 3:]), dim=1).to(device) for gt_bboxes in gt_bboxes_list] all_gt_bboxes_list = [gt_bboxes_list for _ in range(num_dec_layers)] all_gt_labels_list = [gt_labels_list for _ in range(num_dec_layers)] all_instance_ids_list = [instance_inds for _ in range(num_dec_layers)] all_gt_bboxes_ignore_list = [ gt_bboxes_ignore for _ in range(num_dec_layers) ] if self.interm_loss: losses_cls, losses_bbox = multi_apply( self.loss_single_decoder, [frame_idx for _ in range(num_dec_layers)], all_cls_scores, all_bbox_preds, all_gt_bboxes_list, all_gt_labels_list, all_instance_ids_list, all_gt_bboxes_ignore_list) else: losses_cls, losses_bbox = self.loss_single_decoder(num_dec_layers - 1, all_cls_scores[-1], all_bbox_preds[-1], all_gt_bboxes_list[-1], all_gt_labels_list[-1], all_instance_ids_list[-1], all_gt_bboxes_ignore_list[-1]) losses_cls, losses_bbox = [losses_cls], [losses_bbox] loss_dict = dict() # loss from the last decoder layer loss_dict[f'f{frame_idx}.loss_cls'] = losses_cls[-1] loss_dict[f'f{frame_idx}.loss_bbox'] = losses_bbox[-1] # loss from other decoder layers num_dec_layer = 0 for loss_cls_i, loss_bbox_i in zip(losses_cls[:-1], losses_bbox[:-1]): loss_dict[f'f{frame_idx}.d{num_dec_layer}.loss_cls'] = loss_cls_i loss_dict[f'f{frame_idx}.d{num_dec_layer}.loss_bbox'] = loss_bbox_i num_dec_layer += 1 return loss_dict @force_fp32(apply_to=('preds_dicts')) def forward(self, preds_dicts): """Loss function for multi-frame tracking """ frame_num = len(preds_dicts) losses_dicts = [p.pop('loss_dict') for p in preds_dicts] loss_dict = dict() for key in losses_dicts[-1].keys(): # example 
loss_dict["d2.loss_cls"] = losses_dicts[-1]["f0.d2.loss_cls"] loss_dict[key[3:]] = losses_dicts[-1][key] for frame_loss in losses_dicts[:-1]: loss_dict.update(frame_loss) return loss_dict def nan_to_num(x, nan=0.0, posinf=None, neginf=None): x[torch.isnan(x)]= nan if posinf is not None: x[torch.isposinf(x)] = posinf if neginf is not None: x[torch.isneginf(x)] = posinf return x ================================================ FILE: mmdet3d/models/fbbev/track_head/losses/tracking_loss_combo.py ================================================ # ------------------------------------------------------------------------ # Copyright (c) Toyota Research Institute # ------------------------------------------------------------------------ # Modified from PETR (https://github.com/megvii-research/PETR) # Copyright (c) 2022 megvii-model. All Rights Reserved. # ------------------------------------------------------------------------ # Modified from DETR3D (https://github.com/WangYueFt/detr3d) # Copyright (c) 2021 Wang, Yue # ------------------------------------------------------------------------ # Modified from mmdetection3d (https://github.com/open-mmlab/mmdetection3d) # Copyright (c) OpenMMLab. All rights reserved. # ------------------------------------------------------------------------ import torch import torch.nn as nn import torch.nn.functional as F from mmcv.runner import force_fp32 from mmdet.models import LOSSES from mmdet.models import build_loss from mmdet.core import (build_assigner, reduce_mean, multi_apply, build_sampler) from mmdet3d.models.fbbev.track_head.streampetr_utils import normalize_bbox from .tracking_loss import TrackingLoss @LOSSES.register_module() class TrackingLossCombo(TrackingLoss): """ Tracking loss with reference point supervision """ def __init__(self, num_classes, code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2], sync_cls_avg_factor=False, interm_loss=True, loss_cls=dict( type='FocalLoss', use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=2.0), loss_bbox=dict(type='L1Loss', loss_weight=0.25), loss_iou=dict(type='GIoULoss', loss_weight=0.0), loss_prediction=dict(type='L1Loss', loss_weight=1.0), assigner=dict( type='HungarianAssigner3D', cls_cost=dict(type='FocalLossCost', weight=2.0), reg_cost=dict(type='BBox3DL1Cost', weight=0.25), iou_cost=dict(type='IoUCost', weight=0.0), # Fake cost. This is just to make it compatible with DETR head. 
pc_range=[-51.2, -51.2, -5.0, 51.2, 51.2, 3.0])): super(TrackingLoss, self).__init__( num_classes, code_weights, sync_cls_avg_factor, interm_loss, loss_cls, loss_bbox, loss_iou, assigner) self.loss_traj = build_loss(loss_prediction) self.loss_mem_cls = build_loss(loss_cls) # self.loc_refine_code_weights = [1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0] self.loc_refine_code_weights = [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2] def loss_prediction(self, frame_idx, loss_dict, gt_trajs, gt_masks, pred_trajs, loss_key='for'): loss_prediction = self.loss_traj( gt_trajs[..., :2] * gt_masks.unsqueeze(-1), pred_trajs[..., :2] * gt_masks.unsqueeze(-1)) loss_dict[f'f{frame_idx}.loss_{loss_key}'] = loss_prediction return loss_dict def loss_mem_bank(self, frame_idx, loss_dict, gt_bboxes_list, gt_labels_list, instance_inds, track_instances): obj_idxes_list = instance_inds[0].detach().cpu().numpy().tolist() obj_idx_to_gt_idx = {obj_idx: gt_idx for gt_idx, obj_idx in enumerate(obj_idxes_list)} device = track_instances.query_feats.device # classification loss matched_labels = torch.ones((len(track_instances), ), dtype=torch.long, device=device) * self.num_classes matched_label_weights = torch.ones((len(track_instances), ), dtype=torch.float32, device=device) num_pos, num_neg = 0, 0 for track_idx, id in enumerate(track_instances.obj_idxes): cpu_id = id.cpu().numpy().tolist() if cpu_id not in obj_idx_to_gt_idx.keys(): num_neg += 1 continue index = obj_idx_to_gt_idx[cpu_id] matched_labels[track_idx] = gt_labels_list[0][index].long() num_pos += 1 labels_list = matched_labels label_weights_list = matched_label_weights cls_scores = track_instances.cache_logits cls_avg_factor = num_pos * 1.0 + \ num_neg * self.bg_cls_weight if self.sync_cls_avg_factor: cls_avg_factor = reduce_mean( cls_scores.new_tensor([cls_avg_factor])) cls_avg_factor = max(cls_avg_factor, 1) loss_cls = self.loss_mem_cls( cls_scores, labels_list, label_weights_list, avg_factor=cls_avg_factor) loss_cls = torch.nan_to_num(loss_cls) loss_dict[f'f{frame_idx}.loss_mem_cls'] = loss_cls # location refinement loss gt_bboxes_list = [torch.cat( (gt_bboxes.gravity_center, gt_bboxes.tensor[:, 3:]), dim=1).to(device) for gt_bboxes in gt_bboxes_list] pos_bbox_num = 0 matched_bbox_targets = torch.zeros((len(track_instances), gt_bboxes_list[0].shape[1]), dtype=torch.float32, device=device) matched_bbox_weights = torch.zeros((len(track_instances),len(self.loc_refine_code_weights)), dtype=torch.float32, device=device) for track_idx, id in enumerate(track_instances.obj_idxes): cpu_id = id.cpu().numpy().tolist() if cpu_id not in obj_idx_to_gt_idx.keys(): matched_bbox_weights[track_idx] = 0.0 continue index = obj_idx_to_gt_idx[cpu_id] matched_bbox_targets[track_idx] = gt_bboxes_list[0][index].float() matched_bbox_weights[track_idx] = 1.0 pos_bbox_num += 1 normalized_bbox_targets = normalize_bbox(matched_bbox_targets, self.pc_range) isnotnan = torch.isfinite(normalized_bbox_targets).all(dim=-1) bbox_weights = matched_bbox_weights * torch.tensor(self.loc_refine_code_weights).to(device) loss_bbox = self.loss_bbox( track_instances.cache_bboxes[isnotnan, :10], normalized_bbox_targets[isnotnan, :10], bbox_weights[isnotnan, :10], avg_factor=pos_bbox_num) loss_dict[f'f{frame_idx}.loss_mem_bbox'] = loss_bbox return loss_dict @force_fp32(apply_to=('preds_dicts')) def forward(self, preds_dicts): """Loss function for multi-frame tracking """ frame_num = len(preds_dicts) losses_dicts = [p.pop('loss_dict') for p in preds_dicts] loss_dict = dict() for key in 
losses_dicts[-1].keys(): # example loss_dict["d2.loss_cls"] = losses_dicts[-1]["f0.d2.loss_cls"] loss_dict[key[3:]] = losses_dicts[-1][key] for frame_loss in losses_dicts[:-1]: loss_dict.update(frame_loss) return loss_dict def nan_to_num(x, nan=0.0, posinf=None, neginf=None): x[torch.isnan(x)]= nan if posinf is not None: x[torch.isposinf(x)] = posinf if neginf is not None: x[torch.isneginf(x)] = posinf return x ================================================ FILE: mmdet3d/models/fbbev/track_head/losses/tracking_loss_mem_bank.py ================================================ # ------------------------------------------------------------------------ # Copyright (c) Toyota Research Institute # ------------------------------------------------------------------------ import torch import torch.nn as nn import torch.nn.functional as F from mmcv.runner import force_fp32 from mmdet.models import LOSSES from mmdet.models import build_loss from mmdet.core import (build_assigner, reduce_mean, multi_apply, build_sampler) from mmdet3d.models.fbbev.track_head.streampetr_utils import normalize_bbox from .tracking_loss import TrackingLoss @LOSSES.register_module() class TrackingLossMemBank(TrackingLoss): def __init__(self, num_classes, code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2], sync_cls_avg_factor=False, interm_loss=True, loss_cls=dict( type='FocalLoss', use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=2.0), loss_bbox=dict(type='L1Loss', loss_weight=0.25), loss_iou=dict(type='GIoULoss', loss_weight=0.0), assigner=dict( type='HungarianAssigner3D', cls_cost=dict(type='FocalLossCost', weight=2.0), reg_cost=dict(type='BBox3DL1Cost', weight=0.25), iou_cost=dict(type='IoUCost', weight=0.0), # Fake cost. This is just to make it compatible with DETR head. 
pc_range=[-51.2, -51.2, -5.0, 51.2, 51.2, 3.0])): super(TrackingLoss, self).__init__( num_classes, code_weights, sync_cls_avg_factor, interm_loss, loss_cls, loss_bbox, loss_iou, assigner) self.loss_mem_cls = build_loss(loss_cls) self.loc_refine_code_weights = [1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0] def loss_mem_bank(self, frame_idx, loss_dict, gt_bboxes_list, gt_labels_list, instance_inds, track_instances): obj_idxes_list = instance_inds[0].detach().cpu().numpy().tolist() obj_idx_to_gt_idx = {obj_idx: gt_idx for gt_idx, obj_idx in enumerate(obj_idxes_list)} device = track_instances.output_embedding.device # classification loss matched_labels = torch.ones((len(track_instances), ), dtype=torch.long, device=device) * self.num_classes matched_label_weights = torch.ones((len(track_instances), ), dtype=torch.float32, device=device) num_pos, num_neg = 0, 0 for track_idx, id in enumerate(track_instances.obj_idxes): cpu_id = id.cpu().numpy().tolist() if cpu_id not in obj_idx_to_gt_idx.keys(): num_neg += 1 continue index = obj_idx_to_gt_idx[cpu_id] matched_labels[track_idx] = gt_labels_list[0][index].long() num_pos += 1 labels_list = matched_labels label_weights_list = matched_label_weights cls_scores = track_instances.mem_pred_logits[:, -1, :] cls_avg_factor = num_pos * 1.0 + \ num_neg * self.bg_cls_weight if self.sync_cls_avg_factor: cls_avg_factor = reduce_mean( cls_scores.new_tensor([cls_avg_factor])) cls_avg_factor = max(cls_avg_factor, 1) loss_cls = self.loss_mem_cls( cls_scores, labels_list, label_weights_list, avg_factor=cls_avg_factor) loss_cls = torch.nan_to_num(loss_cls) loss_dict[f'f{frame_idx}.loss_mem_cls'] = loss_cls # location refinement loss gt_bboxes_list = [torch.cat( (gt_bboxes.gravity_center, gt_bboxes.tensor[:, 3:]), dim=1).to(device) for gt_bboxes in gt_bboxes_list] pos_bbox_num = 0 matched_bbox_targets = torch.zeros((len(track_instances), gt_bboxes_list[0].shape[1]), dtype=torch.float32, device=device) matched_bbox_weights = torch.zeros((len(track_instances),len(self.loc_refine_code_weights)), dtype=torch.float32, device=device) for track_idx, id in enumerate(track_instances.obj_idxes): cpu_id = id.cpu().numpy().tolist() if cpu_id not in obj_idx_to_gt_idx.keys(): matched_bbox_weights[track_idx] = 0.0 continue index = obj_idx_to_gt_idx[cpu_id] matched_bbox_targets[track_idx] = gt_bboxes_list[0][index].float() matched_bbox_weights[track_idx] = 1.0 pos_bbox_num += 1 normalized_bbox_targets = normalize_bbox(matched_bbox_targets, self.pc_range) isnotnan = torch.isfinite(normalized_bbox_targets).all(dim=-1) bbox_weights = matched_bbox_weights * torch.tensor(self.loc_refine_code_weights).to(device) loss_bbox = self.loss_bbox( track_instances.bbox_preds[isnotnan, :10], normalized_bbox_targets[isnotnan, :10], bbox_weights[isnotnan, :10], avg_factor=pos_bbox_num) loss_dict[f'f{frame_idx}.loss_mem_bbox'] = loss_bbox return loss_dict @force_fp32(apply_to=('preds_dicts')) def forward(self, preds_dicts): """Loss function for multi-frame tracking """ frame_num = len(preds_dicts) losses_dicts = [p.pop('loss_dict') for p in preds_dicts] loss_dict = dict() for key in losses_dicts[-1].keys(): # example loss_dict["d2.loss_cls"] = losses_dicts[-1]["f0.d2.loss_cls"] loss_dict[key[3:]] = losses_dicts[-1][key] for frame_loss in losses_dicts[:-1]: loss_dict.update(frame_loss) return loss_dict ================================================ FILE: mmdet3d/models/fbbev/track_head/losses/tracking_loss_prediction.py ================================================ # 
------------------------------------------------------------------------ # Copyright (c) Toyota Research Institute # ------------------------------------------------------------------------ # Modified from mmdetection3d (https://github.com/open-mmlab/mmdetection3d) # Copyright (c) OpenMMLab. All rights reserved. # ------------------------------------------------------------------------ import torch import torch.nn as nn import torch.nn.functional as F from mmcv.runner import force_fp32 from mmdet.models import LOSSES from mmdet.models import build_loss from mmdet.core import (build_assigner, reduce_mean, multi_apply, build_sampler) from mmdet3d.models.fbbev.track_head.streampetr_utils import normalize_bbox from .tracking_loss import TrackingLoss @LOSSES.register_module() class TrackingLossPrediction(TrackingLoss): """ Tracking loss with reference point supervision """ def __init__(self, num_classes, code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2], sync_cls_avg_factor=False, interm_loss=True, loss_cls=dict( type='FocalLoss', use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=2.0), loss_bbox=dict(type='L1Loss', loss_weight=0.25), loss_iou=dict(type='GIoULoss', loss_weight=0.0), loss_prediction=dict(type='L1Loss', loss_weight=1.0), assigner=dict( type='HungarianAssigner3D', cls_cost=dict(type='FocalLossCost', weight=2.0), reg_cost=dict(type='BBox3DL1Cost', weight=0.25), iou_cost=dict(type='IoUCost', weight=0.0), # Fake cost. This is just to make it compatible with DETR head. pc_range=[-51.2, -51.2, -5.0, 51.2, 51.2, 3.0])): super(TrackingLoss, self).__init__( num_classes, code_weights, sync_cls_avg_factor, interm_loss, loss_cls, loss_bbox, loss_iou, assigner) self.loss_traj = build_loss(loss_prediction) def loss_prediction(self, frame_idx, loss_dict, gt_trajs, gt_masks, pred_trajs, loss_key='for'): loss_prediction = self.loss_traj( gt_trajs[..., :2] * gt_masks.unsqueeze(-1), pred_trajs[..., :2] * gt_masks.unsqueeze(-1)) loss_dict[f'f{frame_idx}.loss_{loss_key}'] = loss_prediction return loss_dict @force_fp32(apply_to=('preds_dicts')) def forward(self, preds_dicts): """Loss function for multi-frame tracking """ frame_num = len(preds_dicts) losses_dicts = [p.pop('loss_dict') for p in preds_dicts] loss_dict = dict() for key in losses_dicts[-1].keys(): # example loss_dict["d2.loss_cls"] = losses_dicts[-1]["f0.d2.loss_cls"] loss_dict[key[3:]] = losses_dicts[-1][key] for frame_loss in losses_dicts[:-1]: loss_dict.update(frame_loss) return loss_dict def nan_to_num(x, nan=0.0, posinf=None, neginf=None): x[torch.isnan(x)]= nan if posinf is not None: x[torch.isposinf(x)] = posinf if neginf is not None: x[torch.isneginf(x)] = posinf return x ================================================ FILE: mmdet3d/models/fbbev/track_head/runtime_tracker.py ================================================ # ------------------------------------------------------------------------ # Copyright (c) 2023 toyota research instutute. 
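# ------------------------------------------------------------------------
# Life-cycle sketch for the RunTimeTracker defined below (illustrative):
#   * the first call to update_active_tracks simply keeps the active slots;
#   * afterwards an active slot resets disappear_time to 0, while an
#     inactive track query ages by one frame and is kept only while
#     disappear_time < max_age_since_update;
#   * during training, get_active_mask marks a slot as active when it is
#     matched to a ground-truth instance (matched_gt_idxes >= 0).
# ------------------------------------------------------------------------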
# ------------------------------------------------------------------------
from .instances import Instances
import torch
import numpy as np


class RunTimeTracker:
    def __init__(self, output_threshold=0.2, score_threshold=0.4, max_age_since_update=1, **kwargs):
        self.current_id = 1
        self.current_seq = 0
        self.timestamp = None
        self.time_delta = None
        self.query_embeddings = None
        self.reference_points = None
        self.frame_index = 0
        self.track_instances = None
        self.first_frame = None
        self.threshold = score_threshold
        self.output_threshold = output_threshold
        self.max_age_since_update = max_age_since_update

    def update_active_tracks(self, track_instances, active_mask):
        # first frame
        if self.track_instances is None:
            self.track_instances = track_instances[active_mask]
            return
        live_mask = torch.zeros_like(track_instances.obj_idxes).bool().detach()
        for i in range(len(track_instances)):
            if active_mask[i]:
                track_instances.disappear_time[i] = 0
                live_mask[i] = True
            elif track_instances.track_query_mask[i]:
                track_instances.disappear_time[i] += 1
                if track_instances.disappear_time[i] < self.max_age_since_update:
                    live_mask[i] = True
        self.track_instances = track_instances[live_mask]
        return

    def get_active_mask(self, track_instances, training=True):
        if training:
            active_mask = (track_instances.matched_gt_idxes >= 0)
        return active_mask

    def empty(self):
        """Reset the runtime buffers, mirroring the initialisation in __init__."""
        self.current_id = 1
        self.current_seq = 0
        self.timestamp = None
        self.query_embeddings = None
        self.reference_points = None
        self.frame_index = 0
        self.track_instances = None
        self.first_frame = None



================================================
FILE: mmdet3d/models/fbbev/track_head/streampetr_utils.py
================================================
import torch


def normalize_bbox(bboxes, pc_range):
    cx = bboxes[..., 0:1]
    cy = bboxes[..., 1:2]
    cz = bboxes[..., 2:3]
    w = bboxes[..., 3:4].log()
    l = bboxes[..., 4:5].log()
    h = bboxes[..., 5:6].log()

    rot = bboxes[..., 6:7]
    if bboxes.size(-1) > 7:
        vx = bboxes[..., 7:8]
        vy = bboxes[..., 8:9]
        normalized_bboxes = torch.cat(
            (cx, cy, cz, w, l, h, rot.sin(), rot.cos(), vx, vy), dim=-1
        )
    else:
        normalized_bboxes = torch.cat(
            (cx, cy, cz, w, l, h, rot.sin(), rot.cos()), dim=-1
        )
    return normalized_bboxes

# ------------------------------------------------------------------------
# Copyright (c) 2022 megvii-model. All Rights Reserved.
# ------------------------------------------------------------------------
# Modified from mmdetection (https://github.com/open-mmlab/mmdetection)
# Copyright (c) OpenMMLab. All rights reserved.
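# ------------------------------------------------------------------------
# Usage sketch for normalize_bbox above (illustrative values; pc_range is
# accepted for API compatibility but not used by this function):
#   box = torch.tensor([[1.0, 2.0, 0.5, 4.0, 2.0, 1.5, 0.3, 5.0, 0.1]])
#   out = normalize_bbox(box, pc_range=None)
#   # out is 10-dimensional: (cx, cy, cz, log w, log l, log h,
#   #                         sin(yaw), cos(yaw), vx, vy)
# denormalize_bbox, defined further below, inverts this encoding and
# recovers yaw with atan2(sin, cos).
# ------------------------------------------------------------------------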
# ------------------------------------------------------------------------ # Modified by Shihao Wang # ------------------------------------------------------------------------ import math import torch import torch.nn as nn import numpy as np def denormalize_bbox(normalized_bboxes, pc_range): # rotation rot_sine = normalized_bboxes[..., 6:7] rot_cosine = normalized_bboxes[..., 7:8] rot = torch.atan2(rot_sine, rot_cosine) # center in the bev cx = normalized_bboxes[..., 0:1] cy = normalized_bboxes[..., 1:2] cz = normalized_bboxes[..., 2:3] # size w = normalized_bboxes[..., 3:4] l = normalized_bboxes[..., 4:5] h = normalized_bboxes[..., 5:6] w = w.exp() l = l.exp() h = h.exp() if normalized_bboxes.size(-1) > 8: # velocity vx = normalized_bboxes[:, 8:9] vy = normalized_bboxes[:, 9:10] denormalized_bboxes = torch.cat([cx, cy, cz, w, l, h, rot, vx, vy], dim=-1) else: denormalized_bboxes = torch.cat([cx, cy, cz, w, l, h, rot], dim=-1) return denormalized_bboxes def pos2posemb3d(pos, num_pos_feats=128, temperature=10000): scale = 2 * math.pi pos = pos * scale dim_t = torch.arange(num_pos_feats, dtype=torch.float32, device=pos.device) dim_t = temperature ** (2 * torch.div(dim_t, 2, rounding_mode='floor') / num_pos_feats) pos_x = pos[..., 0, None] / dim_t pos_y = pos[..., 1, None] / dim_t pos_z = pos[..., 2, None] / dim_t pos_x = torch.stack((pos_x[..., 0::2].sin(), pos_x[..., 1::2].cos()), dim=-1).flatten(-2) pos_y = torch.stack((pos_y[..., 0::2].sin(), pos_y[..., 1::2].cos()), dim=-1).flatten(-2) pos_z = torch.stack((pos_z[..., 0::2].sin(), pos_z[..., 1::2].cos()), dim=-1).flatten(-2) posemb = torch.cat((pos_y, pos_x, pos_z), dim=-1) return posemb def bevpos2posemb(pos, num_pos_feats=128, temperature=10000): scale = 2 * math.pi pos = pos * scale dim_t = torch.arange(num_pos_feats, dtype=torch.float32, device=pos.device) dim_t = temperature ** (2 * torch.div(dim_t, 2, rounding_mode='floor') / num_pos_feats) pos_x = pos[..., 0, None] / dim_t pos_y = pos[..., 1, None] / dim_t pos_x = torch.stack((pos_x[..., 0::2].sin(), pos_x[..., 1::2].cos()), dim=-1).flatten(-2) pos_y = torch.stack((pos_y[..., 0::2].sin(), pos_y[..., 1::2].cos()), dim=-1).flatten(-2) posemb = torch.cat((pos_y, pos_x), dim=-1) return posemb def pos2posemb1d(pos, num_pos_feats=256, temperature=10000): scale = 2 * math.pi pos = pos * scale dim_t = torch.arange(num_pos_feats, dtype=torch.float32, device=pos.device) dim_t = temperature ** (2 * torch.div(dim_t, 2, rounding_mode='floor') / num_pos_feats) pos_x = pos[..., 0, None] / dim_t pos_x = torch.stack((pos_x[..., 0::2].sin(), pos_x[..., 1::2].cos()), dim=-1).flatten(-2) return pos_x def nerf_positional_encoding( tensor, num_encoding_functions=6, include_input=False, log_sampling=True ) -> torch.Tensor: r"""Apply positional encoding to the input. Args: tensor (torch.Tensor): Input tensor to be positionally encoded. encoding_size (optional, int): Number of encoding functions used to compute a positional encoding (default: 6). include_input (optional, bool): Whether or not to include the input in the positional encoding (default: True). Returns: (torch.Tensor): Positional encoding of the input tensor. """ # TESTED # Trivially, the input tensor is added to the positional encoding. 
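# Output-size sketch (illustrative): with num_encoding_functions=6 and a
# (..., 3) input, the loop below appends sin(x * f) and cos(x * f) for
# f in {2^0, ..., 2^5} when log_sampling=True, giving a (..., 36) tensor,
# or (..., 39) when include_input=True.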
encoding = [tensor] if include_input else [] frequency_bands = None if log_sampling: frequency_bands = 2.0 ** torch.linspace( 0.0, num_encoding_functions - 1, num_encoding_functions, dtype=tensor.dtype, device=tensor.device, ) else: frequency_bands = torch.linspace( 2.0 ** 0.0, 2.0 ** (num_encoding_functions - 1), num_encoding_functions, dtype=tensor.dtype, device=tensor.device, ) for freq in frequency_bands: for func in [torch.sin, torch.cos]: encoding.append(func(tensor * freq)) # Special case, for no positional encoding if len(encoding) == 1: return encoding[0] else: return torch.cat(encoding, dim=-1) import torch import torch.nn as nn import numpy as np from mmdet.core import bbox_xyxy_to_cxcywh from mmdet.models.utils.transformer import inverse_sigmoid def memory_refresh(memory, prev_exist): memory_shape = memory.shape view_shape = [1 for _ in range(len(memory_shape))] prev_exist = prev_exist.view(-1, *view_shape[1:]) return memory * prev_exist def topk_gather(feat, topk_indexes): if topk_indexes is not None: feat_shape = feat.shape topk_shape = topk_indexes.shape view_shape = [1 for _ in range(len(feat_shape))] view_shape[:2] = topk_shape[:2] topk_indexes = topk_indexes.view(*view_shape) feat = torch.gather(feat, 1, topk_indexes.repeat(1, 1, *feat_shape[2:])) return feat def apply_ltrb(locations, pred_ltrb): """ :param locations: (1, H, W, 2) :param pred_ltrb: (N, H, W, 4) """ pred_boxes = torch.zeros_like(pred_ltrb) pred_boxes[..., 0] = (locations[..., 0] - pred_ltrb[..., 0])# x1 pred_boxes[..., 1] = (locations[..., 1] - pred_ltrb[..., 1])# y1 pred_boxes[..., 2] = (locations[..., 0] + pred_ltrb[..., 2])# x2 pred_boxes[..., 3] = (locations[..., 1] + pred_ltrb[..., 3])# y2 min_xy = pred_boxes[..., 0].new_tensor(0) max_xy = pred_boxes[..., 0].new_tensor(1) pred_boxes = torch.where(pred_boxes < min_xy, min_xy, pred_boxes) pred_boxes = torch.where(pred_boxes > max_xy, max_xy, pred_boxes) pred_boxes = bbox_xyxy_to_cxcywh(pred_boxes) return pred_boxes def apply_center_offset(locations, center_offset): """ :param locations: (1, H, W, 2) :param pred_ltrb: (N, H, W, 4) """ centers_2d = torch.zeros_like(center_offset) locations = inverse_sigmoid(locations) centers_2d[..., 0] = locations[..., 0] + center_offset[..., 0] # x1 centers_2d[..., 1] = locations[..., 1] + center_offset[..., 1] # y1 centers_2d = centers_2d.sigmoid() return centers_2d @torch.no_grad() def locations(features, stride, pad_h, pad_w): """ Arguments: features: (N, C, H, W) Return: locations: (H, W, 2) """ h, w = features.size()[-2:] device = features.device shifts_x = (torch.arange( 0, stride*w, step=stride, dtype=torch.float32, device=device ) + stride // 2 ) / pad_w shifts_y = (torch.arange( 0, h * stride, step=stride, dtype=torch.float32, device=device ) + stride // 2) / pad_h shift_y, shift_x = torch.meshgrid(shifts_y, shifts_x) shift_x = shift_x.reshape(-1) shift_y = shift_y.reshape(-1) locations = torch.stack((shift_x, shift_y), dim=1) locations = locations.reshape(h, w, 2) return locations def gaussian_2d(shape, sigma=1.0): """Generate gaussian map. Args: shape (list[int]): Shape of the map. sigma (float, optional): Sigma to generate gaussian map. Defaults to 1. Returns: np.ndarray: Generated gaussian map. """ m, n = [(ss - 1.) / 2. for ss in shape] y, x = np.ogrid[-m:m + 1, -n:n + 1] h = np.exp(-(x * x + y * y) / (2 * sigma * sigma)) h[h < np.finfo(h.dtype).eps * h.max()] = 0 return h def draw_heatmap_gaussian(heatmap, center, radius, k=1): """Get gaussian masked heatmap. 
Args: heatmap (torch.Tensor): Heatmap to be masked. center (torch.Tensor): Center coord of the heatmap. radius (int): Radius of gaussian. K (int, optional): Multiple of masked_gaussian. Defaults to 1. Returns: torch.Tensor: Masked heatmap. """ diameter = 2 * radius + 1 gaussian = gaussian_2d((diameter, diameter), sigma=diameter / 6) x, y = int(center[0]), int(center[1]) height, width = heatmap.shape[0:2] left, right = min(x, radius), min(width - x, radius + 1) top, bottom = min(y, radius), min(height - y, radius + 1) masked_heatmap = heatmap[y - top:y + bottom, x - left:x + right] masked_gaussian = torch.from_numpy( gaussian[radius - top:radius + bottom, radius - left:radius + right]).to(heatmap.device, torch.float32) if min(masked_gaussian.shape) > 0 and min(masked_heatmap.shape) > 0: torch.max(masked_heatmap, masked_gaussian * k, out=masked_heatmap) return heatmap class SELayer_Linear(nn.Module): def __init__(self, channels, act_layer=nn.ReLU, gate_layer=nn.Sigmoid): super().__init__() self.conv_reduce = nn.Linear(channels, channels) self.act1 = act_layer() self.conv_expand = nn.Linear(channels, channels) self.gate = gate_layer() def forward(self, x, x_se): x_se = self.conv_reduce(x_se) x_se = self.act1(x_se) x_se = self.conv_expand(x_se) return x * self.gate(x_se) class MLN(nn.Module): ''' Args: c_dim (int): dimension of latent code c f_dim (int): feature dimension ''' def __init__(self, c_dim, f_dim=256): super().__init__() self.c_dim = c_dim self.f_dim = f_dim self.reduce = nn.Sequential( nn.Linear(c_dim, f_dim), nn.ReLU(), ) self.gamma = nn.Linear(f_dim, f_dim) self.beta = nn.Linear(f_dim, f_dim) self.ln = nn.LayerNorm(f_dim, elementwise_affine=False) self.reset_parameters() def reset_parameters(self): nn.init.zeros_(self.gamma.weight) nn.init.zeros_(self.beta.weight) nn.init.ones_(self.gamma.bias) nn.init.zeros_(self.beta.bias) def forward(self, x, c): x = self.ln(x) c = self.reduce(c) gamma = self.gamma(c) beta = self.beta(c) out = gamma * x + beta return out def transform_reference_points(reference_points, egopose, reverse=False, translation=True): reference_points = torch.cat([reference_points, torch.ones_like(reference_points[..., 0:1])], dim=-1) if reverse: matrix = egopose.inverse() else: matrix = egopose if not translation: matrix[..., :3, 3] = 0.0 if reference_points.dim()==4: B, N, K, C = reference_points.shape reference_points = reference_points.view(B, N*K, C) reference_points = (matrix.unsqueeze(1) @ reference_points.unsqueeze(-1)).squeeze(-1)[..., :3] return reference_points.view(B, N, K, -1) else: reference_points = (matrix.unsqueeze(1) @ reference_points.unsqueeze(-1)).squeeze(-1)[..., :3] return reference_points def transform_velo(velo, egopose, reverse=False, translation=False): # velo = torch.cat([velo, torch.ones_like(velo[..., 0:1])], dim=-1) if reverse: matrix = egopose.inverse() else: matrix = egopose if not translation: matrix[..., :3, 3] = 0.0 if velo.dim()==4: B, N, K, C = velo.shape velo = velo.view(B, N*K, C) velo = (matrix.unsqueeze(1)[..., :2, :2] @ velo.unsqueeze(-1)).squeeze(-1) return velo.view(B, N, K, -1) else: velo = (matrix.unsqueeze(1)[..., :2, :2] @ velo.unsqueeze(-1)).squeeze(-1) return velo ================================================ FILE: mmdet3d/models/fbbev/track_head/track_nms_free_coder.py ================================================ # ------------------------------------------------------------------------ # Copyright (c) 2023 toyota research instutute. 
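# ------------------------------------------------------------------------
# Output sketch (illustrative): for each sample, decode_single of the coder
# below returns a dict of aligned tensors
#   {'bboxes': (N, 7 or 9) denormalized boxes, 'scores': (N,), 'labels': (N,),
#    'track_scores': (N,) or None, 'obj_idxes': (N,) or None,
#    'forecasting': (N, T, 2) trajectories or None}
# where N is the number of predictions surviving the top-k selection, the
# optional score threshold and the post_center_range / ego-box filters.
# ------------------------------------------------------------------------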
# ------------------------------------------------------------------------ # Modified from DETR3D (https://github.com/WangYueFt/detr3d) # Copyright (c) 2021 Wang, Yue # ------------------------------------------------------------------------ # Modified from mmdetection3d (https://github.com/open-mmlab/mmdetection3d) # Copyright (c) OpenMMLab. All rights reserved. # ------------------------------------------------------------------------ import torch from mmdet.core.bbox import BaseBBoxCoder from mmdet.core.bbox.builder import BBOX_CODERS from .streampetr_utils import denormalize_bbox import torch.nn.functional as F @BBOX_CODERS.register_module() class TrackNMSFreeCoder(BaseBBoxCoder): """Bbox coder for NMS-free detector. Including the fields for tracking Args: pc_range (list[float]): Range of point cloud. post_center_range (list[float]): Limit of the center. Default: None. max_num (int): Max number to be kept. Default: 100. score_threshold (float): Threshold to filter boxes based on score. Default: None. code_size (int): Code size of bboxes. Default: 9 """ def __init__(self, pc_range, voxel_size=None, post_center_range=None, max_num=100, score_threshold=None, remove_ego_car=False, num_classes=10): self.pc_range = pc_range self.voxel_size = voxel_size self.post_center_range = post_center_range self.max_num = max_num self.score_threshold = score_threshold self.num_classes = num_classes self.remove_ego_car = remove_ego_car def encode(self): pass def decode_single(self, cls_scores, bbox_preds, obj_idxes=None, track_scores=None, motion_forecasting=None, masks=None): """Decode bboxes. Args: cls_scores (Tensor): Outputs from the classification head, \ shape [num_query, cls_out_channels]. Note \ cls_out_channels should includes background. bbox_preds (Tensor): Outputs from the regression \ head with normalized coordinate format (cx, cy, w, l, cz, h, rot_sine, rot_cosine, vx, vy). \ Shape [num_query, 9]. obj_idxes (Tensor): The idxes of the track instances track_scores (Tensor): The scores of the bbox motion_forecasting (Tensor): The predicted trajectories, [num_query, T, 2] all_masks (Tensor): The masks for valid query output Returns: list[dict]: Decoded boxes. """ max_num = self.max_num cls_scores = cls_scores.sigmoid() if masks is not None: """ If we remove the low scores """ # cls_scores = cls_scores[masks] # bbox_preds = bbox_preds[masks] # obj_idxes = obj_idxes[masks] # track_scores = track_scores[masks] det_scores = track_scores.clone() track_scores[~masks] = -1. 
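# Note: det_scores is cloned *before* the mask is applied, so the top-k
# selection below still ranks every query, while queries that fail the
# mask keep track_scores == -1 and can be filtered out by downstream
# consumers of the decoded dict.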
#if motion_forecasting is not None: # motion_forecasting = motion_forecasting[masks] # tracking mode decode if obj_idxes is not None: _, indexs = cls_scores.max(dim=-1) labels = indexs % self.num_classes _, bbox_index = det_scores.topk(min(max_num, len(obj_idxes))) det_scores = det_scores[bbox_index] track_scores = track_scores[bbox_index] obj_idxes = obj_idxes[bbox_index] bbox_preds = bbox_preds[bbox_index] labels = labels[bbox_index] # scores = track_scores if motion_forecasting is not None: motion_forecasting = motion_forecasting[bbox_index] # detection mode decode else: cls_scores_topk = cls_scores.view(-1) # scores, indexs = cls_scores_topk.topk(min(max_num, cls_scores_topk.size(0))) # labels = indexs % self.num_classes det_scores, indexs = cls_scores_topk.topk(min(max_num, cls_scores_topk.size(0))) labels = indexs % self.num_classes bbox_index = torch.div(indexs, self.num_classes, rounding_mode='floor') bbox_preds = bbox_preds[bbox_index] final_box_preds = denormalize_bbox(bbox_preds, self.pc_range) final_scores = det_scores final_preds = labels final_motion_forecasting = motion_forecasting # use score threshold if self.score_threshold is not None: thresh_mask = final_scores >= self.score_threshold if self.post_center_range is not None: self.post_center_range = torch.tensor(self.post_center_range, device=final_scores.device) mask = (final_box_preds[..., :3] >= self.post_center_range[:3]).all(1) mask &= (final_box_preds[..., :3] <= self.post_center_range[3:]).all(1) if self.remove_ego_car: ego_range = torch.tensor([2.5, 1], device=final_scores.device) mask &= (final_box_preds[..., :2].abs() >= ego_range).any(1) if self.score_threshold: mask &= thresh_mask boxes3d = final_box_preds[mask] det_scores = final_scores[mask] labels = final_preds[mask] if final_motion_forecasting is not None: motion_forecasting = final_motion_forecasting[mask] if obj_idxes is not None: obj_idxes = obj_idxes[mask] if track_scores is not None: track_scores = track_scores[mask] predictions_dict = { 'bboxes': boxes3d, 'scores': det_scores, 'labels': labels, 'track_scores': track_scores, 'obj_idxes': obj_idxes, 'forecasting': motion_forecasting } else: raise NotImplementedError( 'Need to reorganize output as a batch, only ' 'support post_center_range is not None for now!') return predictions_dict def decode(self, preds_dicts, layer_index=-1): """Decode bboxes. Args: all_cls_scores (Tensor): Outputs from the classification head, \ shape [nb_dec, bs, num_query, cls_out_channels]. Note \ cls_out_channels should includes background. all_bbox_preds (Tensor): Sigmoid outputs from the regression \ head with normalized coordinate format (cx, cy, w, l, cz, h, rot_sine, rot_cosine, vx, vy). \ Shape [nb_dec, bs, num_query, 9]. track_instances (Instances): Instances containing track information. Available for tracking evaluation. Returns: list[dict]: Decoded boxes. 
""" all_cls_scores = preds_dicts['all_cls_scores'][layer_index].clone() all_bbox_preds = preds_dicts['all_bbox_preds'][layer_index].clone() batch_size = all_cls_scores.size()[0] if 'track_instances' in preds_dicts.keys(): track_instances = preds_dicts['track_instances'].clone() obj_idxes = track_instances.obj_idxes.clone() track_scores = track_instances.scores.clone() if 'all_masks' in preds_dicts.keys(): all_masks = preds_dicts['all_masks'].clone() else: all_masks = [None] if 'all_motion_forecasting' in preds_dicts.keys() and preds_dicts['all_motion_forecasting'] is not None: motion_forecasting = preds_dicts['all_motion_forecasting'].clone() else: motion_forecasting = [None] else: obj_idxes = [None for _ in range(batch_size)] track_scores = [None for _ in range(batch_size)] motion_forecasting = [None for _ in range(batch_size)] all_masks = [None for _ in range(batch_size)] predictions_list = [] for i in range(batch_size): predictions_list.append(self.decode_single( all_cls_scores[i], all_bbox_preds[i], obj_idxes[i], track_scores[i], motion_forecasting[i], all_masks[i])) return predictions_list ================================================ FILE: mmdet3d/models/fbbev/track_head/trackpetr.py ================================================ # Copyright (c) 2023-2024, NVIDIA Corporation & Affiliates. All rights reserved. # # This work is made available under the Nvidia Source Code License-NC. # To view a copy of this license, visit # TODO: add license here import torch import torch.nn as nn from mmcv.cnn import Linear, bias_init_with_prob, Scale from mmcv.runner import force_fp32 from mmdet.core import (build_assigner, build_sampler, multi_apply, reduce_mean) from mmdet.models.utils import build_transformer from mmdet.models import HEADS, build_loss from mmdet.models.dense_heads.anchor_free_head import AnchorFreeHead from mmdet.models.utils.transformer import inverse_sigmoid from mmdet3d.core.bbox.coders import build_bbox_coder from .streampetr_utils import * from .instances import Instances from .runtime_tracker import RunTimeTracker import copy from mmdet.models.utils import NormedLinear from mmdet3d.core import bbox3d2result, merge_aug_bboxes_3d from mmdet3d.models.fbbev.utils import save_tensor @HEADS.register_module() class TackerHead(AnchorFreeHead): """Implements the DETR transformer head. See `paper: End-to-End Object Detection with Transformers `_ for details. Args: num_classes (int): Number of categories excluding the background. in_channels (int): Number of channels in the input feature map. num_query (int): Number of query in Transformer. num_reg_fcs (int, optional): Number of fully-connected layers used in `FFN`, which is then used for the regression head. Default 2. transformer (obj:`mmcv.ConfigDict`|dict): Config for transformer. Default: None. sync_cls_avg_factor (bool): Whether to sync the avg_factor of all ranks. Default to False. positional_encoding (obj:`mmcv.ConfigDict`|dict): Config for position encoding. loss_cls (obj:`mmcv.ConfigDict`|dict): Config of the classification loss. Default `CrossEntropyLoss`. loss_bbox (obj:`mmcv.ConfigDict`|dict): Config of the regression loss. Default `L1Loss`. loss_iou (obj:`mmcv.ConfigDict`|dict): Config of the regression iou loss. Default `GIoULoss`. tran_cfg (obj:`mmcv.ConfigDict`|dict): Training config of transformer head. test_cfg (obj:`mmcv.ConfigDict`|dict): Testing config of transformer head. init_cfg (dict or list[dict], optional): Initialization config dict. 
Default: None """ _version = 2 def __init__(self, num_classes, in_channels=256, stride=[16], embed_dims=256, num_query=100, num_reg_fcs=2, memory_len=1024, topk_proposals=256, num_propagated=256, with_dn=True, with_ego_pos=True, match_with_velo=True, match_costs=None, transformer=None, sync_cls_avg_factor=False, code_weights=None, bbox_coder=None, loss=dict( type='TrackingLossCombo', num_classes=10, interm_loss=True, code_weights = [2.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], loss_cls=dict( type='FocalLoss', use_sigmoid=True, gamma=2.0, alpha=0.25, loss_weight=2.0), loss_bbox=dict(type='L1Loss', loss_weight=0.25), loss_iou=dict(type='GIoULoss', loss_weight=0.0), # loss_prediction=dict(type='L1Loss', loss_weight=0.5), assigner=dict( type='HungarianAssigner3D', cls_cost=dict(type='FocalLossCost', weight=2.0), reg_cost=dict(type='BBox3DL1Cost', weight=0.25), iou_cost=dict(type='IoUCost', weight=0.0), # Fake cost. This is just to make it compatible with DETR head. pc_range=[-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]) ), train_cfg=dict( assigner=dict( type='HungarianAssigner3D', cls_cost=dict(type='ClassificationCost', weight=1.), reg_cost=dict(type='BBoxL1Cost', weight=5.0), iou_cost=dict( type='IoUCost', iou_mode='giou', weight=2.0)),), test_cfg=dict(max_per_img=100), scalar = 5, noise_scale = 0.4, noise_trans = 0.0, dn_weight = 1.0, split = 0.5, init_cfg=None, normedlinear=False, runtime_tracker=dict( output_threshold=0.2, score_threshold=0.2, record_threshold=0.4, max_age_since_update=7), tracking=True, layer_index=-1, **kwargs): # NOTE here use `AnchorFreeHead` instead of `TransformerHead`, # since it brings inconvenience when the initialization of # `AnchorFreeHead` is called. self.num_query = num_query self.num_classes = num_classes self.in_channels = in_channels self.memory_len = memory_len self.topk_proposals = topk_proposals self.num_propagated = num_propagated self.with_dn = with_dn self.with_ego_pos = with_ego_pos self.match_with_velo = match_with_velo self.num_reg_fcs = num_reg_fcs self.train_cfg = train_cfg self.test_cfg = test_cfg self.fp16_enabled = False self.embed_dims = embed_dims self.with_dn = with_dn self.stride=stride self.layer_index = layer_index if 'code_size' in kwargs: self.code_size = kwargs['code_size'] else: self.code_size = 10 self.scalar = scalar self.bbox_noise_scale = noise_scale self.bbox_noise_trans = noise_trans self.dn_weight = dn_weight self.split = split self.act_cfg = transformer.get('act_cfg', dict(type='ReLU', inplace=True)) self.num_pred = transformer['decoder']['num_layers'] self.normedlinear = normedlinear self.tracking = tracking super(TackerHead, self).__init__(num_classes, in_channels, init_cfg = init_cfg) self.criterion = build_loss(loss) self.transformer = build_transformer(transformer) self.bbox_coder = build_bbox_coder(bbox_coder) self.pc_range = nn.Parameter(torch.tensor( self.bbox_coder.pc_range), requires_grad=False) self._init_layers() self.reset_history_track_instances() self.count = 0 self.hist_len = 4 # self.fut_len = 8 if runtime_tracker: self.runtime_tracker = RunTimeTracker(**runtime_tracker) self.runtime_tracker.empty() def _init_layers(self): """Initialize layers of the transformer head.""" cls_branch = [] for _ in range(self.num_reg_fcs): cls_branch.append(Linear(self.embed_dims, self.embed_dims)) cls_branch.append(nn.LayerNorm(self.embed_dims)) cls_branch.append(nn.ReLU(inplace=True)) if self.normedlinear: cls_branch.append(NormedLinear(self.embed_dims, self.cls_out_channels)) else: cls_branch.append(Linear(self.embed_dims, 
self.cls_out_channels)) fc_cls = nn.Sequential(*cls_branch) reg_branch = [] for _ in range(self.num_reg_fcs): reg_branch.append(Linear(self.embed_dims, self.embed_dims)) reg_branch.append(nn.ReLU()) reg_branch.append(Linear(self.embed_dims, self.code_size)) reg_branch = nn.Sequential(*reg_branch) # self.cls_branches = nn.ModuleList( # [fc_cls for _ in range(self.num_pred)]) # self.reg_branches = nn.ModuleList( # [reg_branch for _ in range(self.num_pred)]) def _get_clones(module, N): return nn.ModuleList([copy.deepcopy(module) for i in range(N)]) self.cls_branches =_get_clones(fc_cls, self.num_pred) self.reg_branches = _get_clones(reg_branch, self.num_pred) self.reference_points = nn.Embedding(self.num_query, 3) if self.num_propagated > 0: self.pseudo_reference_points = nn.Embedding(self.num_propagated, 3) self.query_embedding = nn.Sequential( nn.Linear(self.embed_dims*3//2, self.embed_dims), nn.ReLU(), nn.Linear(self.embed_dims, self.embed_dims), ) if self.tracking: self.query_feat_embedding = nn.Embedding(self.num_query, self.embed_dims) # self.spatial_alignment = MLN(14, use_ln=False) self.time_embedding = nn.Sequential( nn.Linear(self.embed_dims, self.embed_dims), nn.LayerNorm(self.embed_dims) ) # encoding ego pose if self.with_ego_pos: self.ego_pose_pe = MLN(180) self.ego_pose_memory = MLN(180) def init_weights(self): """Initialize weights of the transformer head.""" # The initialization for transformer is important nn.init.uniform_(self.reference_points.weight.data, 0, 1) if self.num_propagated > 0: nn.init.uniform_(self.pseudo_reference_points.weight.data, 0, 1) self.pseudo_reference_points.weight.requires_grad = False self.transformer.init_weights() if self.loss_cls.use_sigmoid: bias_init = bias_init_with_prob(0.01) for m in self.cls_branches: nn.init.constant_(m[-1].bias, bias_init) def reset_history_track_instances(self): self.history_track_instances = None def generate_empty_instance(self, B, init_memory_instances=False): """Generate empty instance slots at the beginning of tracking""" track_instances = Instances((1, 1)) device = self.reference_points.weight.device """Detection queries""" # reference points, query embeds, and query targets (features) if init_memory_instances: reference_points = self.reference_points.weight.new_zeros(self.memory_len, 3)[None].repeat(B, 1, 1) len_track_instances = self.memory_len else: reference_points = self.reference_points.weight[None].repeat(B, 1, 1) len_track_instances = self.num_query query_pos = self.query_embedding(pos2posemb3d(reference_points)) track_instances.reference_points = reference_points.clone() track_instances.query_pos = query_pos.clone() if self.tracking: if init_memory_instances: track_instances.query_feats = self.query_feat_embedding.weight.new_zeros(len_track_instances, self.embed_dims)[None].repeat(B, 1, 1) else: track_instances.query_feats = self.query_feat_embedding.weight.clone()[None].repeat(B, 1, 1) else: track_instances.query_feats = torch.zeros_like(query_pos) """ StreamPETR memory information""" track_instances.timestamp = torch.zeros(B, len_track_instances, 1, dtype=torch.float, device=device) track_instances.ego_pose = torch.zeros(B, len_track_instances, 4, 4, dtype=torch.float, device=device) track_instances.velo = torch.zeros(B, len_track_instances, 2, dtype=torch.float, device=device) """Tracking information""" # id for the tracks track_instances.obj_idxes = torch.full( (B, len_track_instances,), -1, dtype=torch.long, device=device) # matched gt indexes, for loss computation track_instances.matched_gt_idxes = 
torch.full( (B, len_track_instances,), -1, dtype=torch.long, device=device) # life cycle management track_instances.disappear_time = torch.zeros( (B, len_track_instances, ), dtype=torch.long, device=device) track_instances.track_query_mask = torch.zeros( (B, len_track_instances, ), dtype=torch.bool, device=device) """Current frame information""" # classification scores track_instances.logits = torch.zeros( (B, len_track_instances, self.num_classes), dtype=torch.float, device=device) # bounding boxes track_instances.bboxes = torch.zeros( (B, len_track_instances, 10), dtype=torch.float, device=device) # track scores, normally the scores for the highest class track_instances.scores = torch.zeros( (B, len_track_instances, 1), dtype=torch.float, device=device) # # motion prediction, not normalized # track_instances.motion_predictions = torch.zeros( # (B, len_track_instances, self.fut_len, 2), dtype=torch.float, device=device) # """Cache for current frame information, loading temporary data for spatial-temporal reasoining""" # track_instances.cache_logits = torch.zeros( # (B, len_track_instances, self.num_classes), dtype=torch.float, device=device) # track_instances.cache_bboxes = torch.zeros( # (B, len_track_instances, 10), dtype=torch.float, device=device) # track_instances.cache_scores = torch.zeros( # (B, len_track_instances,), dtype=torch.float, device=device) # track_instances.cache_reference_points = reference_points.clone() # track_instances.cache_query_pos = query_pos.clone() # if self.tracking: # track_instances.cache_query_feats = self.query_feat_embedding.weight.clone()[None].repeat(B, 1, 1) # else: # track_instances.cache_query_feats = torch.zeros_like(query_pos) # track_instances.cache_motion_predictions = torch.zeros_like(track_instances.motion_predictions) # """History Reasoning""" # # embeddings track_instances.hist_query_feats = torch.zeros( (B, len_track_instances, self.hist_len, self.embed_dims), dtype=torch.float32, device=device) # # padding mask, follow MultiHeadAttention, 1 indicates padded # track_instances.hist_padding_masks = torch.ones( # (B, len_track_instances, self.hist_len), dtype=torch.bool, device=device) # # positions, global coord track_instances.hist_xyz = torch.zeros( (B, len_track_instances, self.hist_len, 3), dtype=torch.float, device=device) # # positional embeds # track_instances.hist_position_embeds = torch.zeros( # (B, len_track_instances, self.hist_len, self.embed_dims), dtype=torch.float32, device=device) # # bboxes track_instances.hist_velo = torch.zeros( (B, len_track_instances, self.hist_len, 2), dtype=torch.float, device=device) track_instances.hist_mask = torch.zeros( (B, len_track_instances, self.hist_len), dtype=torch.float, device=device) # # logits # track_instances.hist_logits = torch.zeros( # (B, len_track_instances, self.hist_len, self.num_classes), dtype=torch.float, device=device) # # scores # track_instances.hist_scores = torch.zeros( # (B, len_track_instances, self.hist_len), dtype=torch.float, device=device) # """Future Reasoning""" # # embeddings # track_instances.fut_embeds = torch.zeros( # (B, len_track_instances, self.fut_len, self.embed_dims), dtype=torch.float32, device=device) # # padding mask, follow MultiHeadAttention, 1 indicates padded # track_instances.fut_padding_masks = torch.ones( # (B, len_track_instances, self.fut_len), dtype=torch.bool, device=device) # # positions # track_instances.fut_xyz = torch.zeros( # (B, len_track_instances, self.fut_len, 3), dtype=torch.float, device=device) # # positional embeds # 
track_instances.fut_position_embeds = torch.zeros( # (B, len_track_instances, self.fut_len, self.embed_dims), dtype=torch.float32, device=device) # # bboxes # track_instances.fut_bboxes = torch.zeros( # (B, len_track_instances, self.fut_len, 10), dtype=torch.float, device=device) # # logits # track_instances.fut_logits = torch.zeros( # (B, len_track_instances, self.fut_len, self.num_classes), dtype=torch.float, device=device) # # scores # track_instances.fut_scores = torch.zeros( # (B, len_track_instances, self.fut_len), dtype=torch.float, device=device) return track_instances def instance_temporal_alignment(self): B = self.track_instances.query_pos.size(0) temp_history_track_instances = self.history_track_instances.clone() temp_reference_points = (temp_history_track_instances.reference_points - self.pc_range[:3]) / (self.pc_range[3:6] - self.pc_range[0:3]) temp_history_track_instances.query_pos = self.query_embedding(pos2posemb3d(temp_reference_points)) rec_ego_pose = torch.eye(4, device= self.track_instances.query_pos.device).unsqueeze(0).unsqueeze(0).repeat(B, self.track_instances.query_pos.size(1), 1, 1) tmp_ego_pose = torch.eye(4, device= self.track_instances.query_pos.device).unsqueeze(0).unsqueeze(0).repeat(B, temp_history_track_instances.query_pos.size(1), 1, 1) if self.with_ego_pos: "current ego pose" rec_ego_motion = torch.cat([torch.zeros_like(self.track_instances.reference_points[...,:3]), rec_ego_pose[..., :3, :].flatten(-2)], dim=-1) rec_ego_motion = nerf_positional_encoding(rec_ego_motion) self.track_instances.query_pos = self.ego_pose_pe(self.track_instances.query_pos, rec_ego_motion) self.track_instances.query_feats = self.ego_pose_memory(self.track_instances.query_feats, rec_ego_motion) "memory ego pose" memory_ego_motion = torch.cat([ temp_history_track_instances.velo, temp_history_track_instances.timestamp, tmp_ego_pose[..., :3, :].flatten(-2)], dim=-1).float() memory_ego_motion = nerf_positional_encoding(memory_ego_motion) temp_history_track_instances.query_pos = self.ego_pose_pe(temp_history_track_instances.query_pos, memory_ego_motion) temp_history_track_instances.query_feats = self.ego_pose_memory(temp_history_track_instances.query_feats, memory_ego_motion) self.track_instances.query_pos += self.time_embedding(pos2posemb1d(torch.zeros_like(self.track_instances.reference_points[...,:1]))) temp_history_track_instances.query_pos += self.time_embedding(pos2posemb1d(temp_history_track_instances.timestamp).float()) if self.num_propagated > 0: reference_points = torch.cat([self.track_instances.reference_points, temp_reference_points[:, :self.num_propagated]], dim=1) self.track_instances = Instances.cat([self.track_instances, temp_history_track_instances[:, :self.num_propagated]], dim=1) temp_history_track_instances = temp_history_track_instances[:, self.num_propagated:] temp_reference_points = temp_reference_points[:, self.num_propagated:] rec_ego_pose = torch.eye(4, device=self.track_instances.query_pos.device).unsqueeze(0).unsqueeze(0).repeat(B, self.track_instances.query_pos.shape[1], 1, 1) return reference_points, temp_history_track_instances, temp_reference_points, rec_ego_pose def pre_update_instances(self, data): x = 1-data['start_of_sequence'] # original prev_exist, so we need do `not` B = x.size(0) self.track_instances = self.generate_empty_instance(B, init_memory_instances=False) if self.history_track_instances is None: self.history_track_instances = self.generate_empty_instance(B, init_memory_instances=True) else: self.history_track_instances.timestamp += 
data['timestamp'].unsqueeze(-1).unsqueeze(-1) self.history_track_instances.ego_pose = data['ego_pose_inv'].unsqueeze(1) @ self.history_track_instances.ego_pose self.history_track_instances.reference_points = transform_reference_points(self.history_track_instances.reference_points, data['ego_pose_inv'], reverse=False) ## hist self.history_track_instances.hist_xyz = transform_reference_points(self.history_track_instances.hist_xyz, data['ego_pose_inv'], reverse=False) self.history_track_instances.hist_velo = transform_velo(self.history_track_instances.hist_velo, data['ego_pose_inv'], reverse=False) # hist self.history_track_instances = self.history_track_instances[:, :self.memory_len] if data['start_of_sequence'].any(): self.history_track_instances.timestamp = memory_refresh(self.history_track_instances.timestamp, x) self.history_track_instances.reference_points = memory_refresh(self.history_track_instances.reference_points, x) self.history_track_instances.query_feats = memory_refresh(self.history_track_instances.query_feats, x) self.history_track_instances.ego_pose = memory_refresh(self.history_track_instances.ego_pose, x) self.history_track_instances.velo = memory_refresh(self.history_track_instances.velo, x) self.history_track_instances.scores = memory_refresh(self.history_track_instances.scores, x) ## hist self.history_track_instances.hist_xyz = memory_refresh(self.history_track_instances.hist_xyz, x) self.history_track_instances.hist_velo = memory_refresh(self.history_track_instances.hist_velo, x) self.history_track_instances.hist_mask = memory_refresh(self.history_track_instances.hist_mask, x) ## device = self.reference_points.weight.device self.history_track_instances.matched_gt_idxes = (memory_refresh(self.history_track_instances.matched_gt_idxes, x) + (1 - x).view(B, 1) * torch.full( (B, self.memory_len,), -1, dtype=torch.long, device=device)).to(torch.long) self.history_track_instances.obj_idxes = (memory_refresh(self.history_track_instances.obj_idxes, x) + (1 - x).view(B, 1) * torch.full( (B, self.memory_len,), -1, dtype=torch.long, device=device)).to(torch.long) # for the first frame, padding pseudo_reference_points (non-learnable) if self.num_propagated > 0: pseudo_reference_points = self.pseudo_reference_points.weight * (self.pc_range[3:6] - self.pc_range[0:3]) + self.pc_range[0:3] self.history_track_instances.reference_points[:, :self.num_propagated] = self.history_track_instances.reference_points[:, :self.num_propagated] + (1 - x).view(B, 1, 1) * pseudo_reference_points self.history_track_instances.ego_pose[:, :self.num_propagated] = self.history_track_instances.ego_pose[:, :self.num_propagated] + (1 - x).view(B, 1, 1, 1) * torch.eye(4, device=x.device) def post_update_instances(self, data, rec_ego_pose, all_cls_scores, all_bbox_preds, outs_dec, mask_dict): if self.training and mask_dict and mask_dict['pad_size'] > 0: rec_reference_points = all_bbox_preds[:, :, mask_dict['pad_size']:, :3][self.layer_index] rec_velo = all_bbox_preds[:, :, mask_dict['pad_size']:, -2:][self.layer_index] rec_memory = outs_dec[:, :, mask_dict['pad_size']:, :][self.layer_index] rec_score = all_cls_scores[:, :, mask_dict['pad_size']:, :][self.layer_index].sigmoid().topk(1, dim=-1).values[..., 0:1] rec_timestamp = torch.zeros_like(rec_score, dtype=torch.float64) rec_bboxes = all_bbox_preds[:, :, mask_dict['pad_size']:, :][self.layer_index] else: rec_reference_points = all_bbox_preds[..., :3][self.layer_index] rec_velo = all_bbox_preds[..., -2:][self.layer_index] rec_memory = outs_dec[self.layer_index] 
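# NOTE (illustrative, hedged): the `hist_*` updates a few lines below maintain
# fixed-length FIFO buffers over the last `self.hist_len` frames by dropping the
# oldest entry and appending the newest one, e.g. with hypothetical sizes:
#   hist = torch.zeros(2, 900, 4, 3)                                # (B, Q, hist_len, 3)
#   new = torch.randn(2, 900, 3)                                    # current-frame values
#   hist = torch.cat([hist[:, :, 1:], new.unsqueeze(-2)], dim=-2)   # still (B, Q, 4, 3)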
rec_score = all_cls_scores[self.layer_index].sigmoid().topk(1, dim=-1).values[..., 0:1] rec_timestamp = torch.zeros_like(rec_score, dtype=torch.float64) rec_bboxes = all_bbox_preds[self.layer_index] # topk proposals self.track_instances.timestamp = rec_timestamp self.track_instances.query_feats = rec_memory self.track_instances.ego_pose = rec_ego_pose self.track_instances.velo = rec_velo self.track_instances.reference_points = rec_reference_points self.track_instances.scores = rec_score self.track_instances.bboxes = rec_bboxes ## update hist self.track_instances.hist_xyz = torch.cat([self.track_instances.hist_xyz[:, :, 1:], rec_reference_points.unsqueeze(-2)], -2) self.track_instances.hist_velo = torch.cat([self.track_instances.hist_velo[:, :, 1:], rec_velo.unsqueeze(-2)], -2) self.track_instances.hist_query_feats = torch.cat([self.track_instances.hist_query_feats[:, :, 1:], rec_memory.unsqueeze(-2)], -2) self.track_instances.hist_mask[..., -1] = 1 def post_merge_instances(self, data, kept_indicator=0): """During training, we kept all activate instances, so the mergeing part should be after the assignment. """ active_instances = (self.track_instances.matched_gt_idxes>=kept_indicator).nonzero() B = len(self.track_instances) topk_indexes_list = [] for i in range(B): active_idxes_i = active_instances[active_instances[:, 0] == i][:, 1] scores = self.track_instances.scores[i:i+1].clone() scores[:, active_idxes_i] = -1 _, topk_indexes = torch.topk(scores, self.topk_proposals - min(len(active_idxes_i), self.topk_proposals), dim=1) self.track_instances.obj_idxes[i, topk_indexes[0, :, 0]] = -1 topk_indexes_list.append(torch.cat([active_idxes_i[None, :, None], topk_indexes], 1)) topk_indexes = torch.cat(topk_indexes_list) # valid_key_set = ['reference_points', 'query_pos', 'query_feats', 'timestamp', 'velo', 'ego_pose', 'obj_idxes', 'matched_gt_idxes', 'disappear_time'] topk_instances = self.track_instances.instances_topk_gather(topk_indexes, valid_key_set=None) re_track_instances = Instances.detach(topk_instances) self.history_track_instances = Instances.cat([re_track_instances, self.history_track_instances], dim=1) # self.memory_reference_point_copy = self.memory_reference_point.clone() self.history_track_instances.reference_points = transform_reference_points(self.history_track_instances.reference_points, data['ego_pose'], reverse=False) self.history_track_instances.timestamp -= data['timestamp'].unsqueeze(-1).unsqueeze(-1) self.history_track_instances.ego_pose = data['ego_pose'].unsqueeze(1) @ self.history_track_instances.ego_pose self.history_track_instances.hist_xyz = transform_reference_points(self.history_track_instances.hist_xyz, data['ego_pose'], reverse=False) self.history_track_instances.hist_velo = transform_velo(self.history_track_instances.hist_velo, data['ego_pose'], reverse=False) return topk_instances def forward(self, input_dict, img_metas, gt_bboxes_3d=None, gt_labels_3d=None, debug_info=None): """Forward function. Args: mlvl_feats (tuple[Tensor]): Features from the upstream network, each is a 5D-tensor with shape (B, N, C, H, W). Returns: all_cls_scores (Tensor): Outputs from the classification head, \ shape [nb_dec, bs, num_query, cls_out_channels]. Note \ cls_out_channels should includes background. all_bbox_preds (Tensor): Sigmoid outputs from the regression \ head with normalized coordinate format (cx, cy, w, l, cz, h, theta, vx, vy). \ Shape [nb_dec, bs, num_query, 9]. 
""" start_of_sequence = torch.FloatTensor([ single_img_metas['start_of_sequence'] for single_img_metas in img_metas]).to(input_dict['img_bev_feat'][0].device) timestamp = torch.FloatTensor([ single_img_metas['timestamp'] for single_img_metas in img_metas]).to(input_dict['img_bev_feat'][0].device) ego_pose_inv = torch.stack([ single_img_metas['ego_pose_inv'] for single_img_metas in img_metas], 0).to(input_dict['img_bev_feat'][0].device) ego_pose = torch.stack([ single_img_metas['ego_pose'] for single_img_metas in img_metas], 0).to(input_dict['img_bev_feat'][0].device) data = dict( start_of_sequence = start_of_sequence, timestamp = timestamp, ego_pose_inv = ego_pose_inv, ego_pose = ego_pose, ) if input_dict['img_bev_feat'][0].dim() == 5: mlvl_feats = [level.mean(-1) for level in input_dict['img_bev_feat']] else: mlvl_feats = input_dict['img_bev_feat'] # self.pre_update_memory(data) self.pre_update_instances(data) # mlvl_feats = data['img_feats'] B = mlvl_feats[0].size(0) # reference_points = self.reference_points.weight dtype = self.track_instances.reference_points.dtype feat_flatten = [] spatial_flatten = [] for i in range(len(mlvl_feats)): B, C, H, W = mlvl_feats[i].shape mlvl_feat = mlvl_feats[i].reshape(B, C, -1).transpose(1, 2) # mlvl_feat = self.spatial_alignment(mlvl_feat, mln_input) feat_flatten.append(mlvl_feat.to(dtype)) spatial_flatten.append((H, W)) feat_flatten = torch.cat(feat_flatten, dim=1) spatial_flatten = torch.as_tensor(spatial_flatten, dtype=torch.long, device=mlvl_feats[0].device) level_start_index = torch.cat((spatial_flatten.new_zeros((1, )), spatial_flatten.prod(1).cumsum(0)[:-1])) # reference_points, attn_mask, mask_dict = self.prepare_for_dn(B, reference_points, img_metas, gt_bboxes_3d, gt_labels_3d) attn_mask, mask_dict = None, None # prepare for the tgt and query_pos using mln. 
reference_points, temp_history_track_instances, temp_reference_points, rec_ego_pose = self.instance_temporal_alignment() tgt = self.track_instances.query_feats query_pos = self.track_instances.query_pos # reference_points = self.track_instances.reference_points temp_pos = temp_history_track_instances.query_pos temp_memory = temp_history_track_instances.query_feats outs_dec, intermediate_reference_points = self.transformer(tgt, query_pos, feat_flatten, spatial_flatten, level_start_index, temp_memory, temp_pos, attn_mask, reference_points, self.pc_range, data, img_metas, reg_branches=self.reg_branches, return_intermediate_pts=True, query_embedding=self.query_embedding, temp_reference_points=temp_reference_points) outs_dec = torch.nan_to_num(outs_dec) outputs_classes = [] outputs_coords = [] for lvl in range(outs_dec.shape[0]): reference = inverse_sigmoid(intermediate_reference_points[lvl]) assert reference.shape[-1] == 3 outputs_class = self.cls_branches[lvl](outs_dec[lvl]) tmp = self.reg_branches[lvl](outs_dec[lvl]) tmp[..., 0:3] += reference[..., 0:3] tmp[..., 0:3] = tmp[..., 0:3].sigmoid() outputs_coord = tmp outputs_classes.append(outputs_class) outputs_coords.append(outputs_coord) all_cls_scores = torch.stack(outputs_classes) all_bbox_preds = torch.stack(outputs_coords) all_bbox_preds[..., 0:3] = (all_bbox_preds[..., 0:3] * (self.pc_range[3:6] - self.pc_range[0:3]) + self.pc_range[0:3]) # update the memory bank self.post_update_instances(data, rec_ego_pose, all_cls_scores, all_bbox_preds, outs_dec, mask_dict) if mask_dict and mask_dict['pad_size'] > 0: assert False output_known_class = all_cls_scores[:, :, :mask_dict['pad_size'], :] output_known_coord = all_bbox_preds[:, :, :mask_dict['pad_size'], :] outputs_class = all_cls_scores[:, :, mask_dict['pad_size']:, :] outputs_coord = all_bbox_preds[:, :, mask_dict['pad_size']:, :] mask_dict['output_known_lbs_bboxes']=(output_known_class, output_known_coord) outs = { 'all_cls_scores': outputs_class, 'all_bbox_preds': outputs_coord, 'dn_mask_dict':mask_dict, } else: outs = { 'agent_queries': self.track_instances.query_feats, 'all_cls_scores': all_cls_scores, 'all_bbox_preds': all_bbox_preds, 'dn_mask_dict':None, 'track_instances': self.track_instances, 'data': data } return outs @force_fp32(apply_to=('preds_dicts')) def loss(self, gt_bboxes_list, gt_labels_list, preds_dicts, img_metas=None, gt_bboxes_ignore=None): """"Loss function. Args: gt_bboxes_list (list[Tensor]): Ground truth bboxes for each image with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format. gt_labels_list (list[Tensor]): Ground truth class indexes for each image with shape (num_gts, ). preds_dicts: all_cls_scores (Tensor): Classification score of all decoder layers, has shape [nb_dec, bs, num_query, cls_out_channels]. all_bbox_preds (Tensor): Sigmoid regression outputs of all decode layers. Each is a 4D-tensor with normalized coordinate format (cx, cy, w, h) and shape [nb_dec, bs, num_query, 4]. enc_cls_scores (Tensor): Classification scores of points on encode feature map , has shape (N, h*w, num_classes). Only be passed when as_two_stage is True, otherwise is None. enc_bbox_preds (Tensor): Regression results of each points on the encode feature map, has shape (N, h*w, 4). Only be passed when as_two_stage is True, otherwise is None. gt_bboxes_ignore (list[Tensor], optional): Bounding boxes which can be ignored for each image. Default None. Returns: dict[str, Tensor]: A dictionary of loss components. 
""" instance_inds = [ single_img_metas['instance_inds'] for single_img_metas in img_metas] loss = self.criterion.loss_single_frame(0, gt_bboxes_list, gt_labels_list, instance_inds, preds_dicts, gt_bboxes_ignore) topk_instances = self.post_merge_instances(preds_dicts['data']) return loss, topk_instances def get_targets(self): pass def forward_tracking(self, input_dict, img_metas): pred_dicts = self.forward(input_dict, img_metas) # prev_active_track_instances = self.runtime_tracker.track_instances track_instances= pred_dicts['track_instances'] # assign ids # active_mask = (track_instances.scores > self.runtime_tracker.threshold) B = len(track_instances) appear_mask = (track_instances.obj_idxes< 0) & (track_instances.scores[..., 0] > self.runtime_tracker.threshold) kept_mask = (track_instances.obj_idxes>=0) & (track_instances.scores[..., 0] > self.runtime_tracker.threshold) disappear_mask = (track_instances.obj_idxes>=0) & (track_instances.scores[..., 0] <= self.runtime_tracker.threshold) non_mask = (track_instances.obj_idxes<0) & (track_instances.scores[..., 0] <= self.runtime_tracker.threshold) track_instances.matched_gt_idxes[appear_mask|kept_mask] = 1 track_instances.matched_gt_idxes[disappear_mask] -= 1 track_instances.matched_gt_idxes[non_mask] = -10000 track_instances.obj_idxes[appear_mask] = torch.arange(self.runtime_tracker.current_id, self.runtime_tracker.current_id+appear_mask.sum(), device=appear_mask.device)[None] self.runtime_tracker.current_id += appear_mask.sum() pred_dicts['track_instances'] = track_instances.clone() pred_dicts['track_instances'].scores = pred_dicts['track_instances'].scores.squeeze(-1) score_mask = (pred_dicts['track_instances'].scores > self.runtime_tracker.output_threshold) pred_dicts['all_masks'] = score_mask.clone() topk_instances = self.post_merge_instances(pred_dicts['data'], kept_indicator=0) return pred_dicts, topk_instances @force_fp32(apply_to=('preds_dicts')) def get_bboxes(self, preds_dicts, img_metas, rescale=False): """Generate bboxes from bbox head predictions. Args: preds_dicts (tuple[list[dict]]): Prediction results. img_metas (list[dict]): Point cloud and image's meta info. Returns: list[dict]: Decoded bbox, scores and labels after nms. 
""" preds_dicts = self.bbox_coder.decode(preds_dicts, layer_index=self.layer_index) num_samples = len(preds_dicts) ret_list = [] for i in range(num_samples): preds = preds_dicts[i] bboxes = preds['bboxes'] bboxes[:, 2] = bboxes[:, 2] - bboxes[:, 5] * 0.5 bboxes = img_metas[i]['box_type_3d'](bboxes, bboxes.size(-1)) scores = preds['scores'] labels = preds['labels'] bbox_results = bbox3d2result(bboxes, scores, labels) for key in ['track_scores', 'obj_idxes']: bbox_results[key] = preds[key].cpu() ret_list.append(bbox_results) return ret_list class MLN(nn.Module): ''' Args: c_dim (int): dimension of latent code c f_dim (int): feature dimension ''' def __init__(self, c_dim, f_dim=256, use_ln=True): super().__init__() self.c_dim = c_dim self.f_dim = f_dim self.use_ln = use_ln self.reduce = nn.Sequential( nn.Linear(c_dim, f_dim), nn.ReLU(), ) self.gamma = nn.Linear(f_dim, f_dim) self.beta = nn.Linear(f_dim, f_dim) if self.use_ln: self.ln = nn.LayerNorm(f_dim, elementwise_affine=False) self.init_weight() def init_weight(self): nn.init.zeros_(self.gamma.weight) nn.init.zeros_(self.beta.weight) nn.init.ones_(self.gamma.bias) nn.init.zeros_(self.beta.bias) def forward(self, x, c): if self.use_ln: x = self.ln(x) c = self.reduce(c) gamma = self.gamma(c) beta = self.beta(c) out = gamma * x + beta return out ================================================ FILE: mmdet3d/models/fbbev/track_head/utils.py ================================================ import torch import copy import math import torch import torch.nn as nn import numpy as np from mmcv.cnn import bias_init_with_prob, xavier_init class StreamTensorMemory(object): def __init__(self, batch_size): self.train_bs = batch_size self.training = True self.bs = self.train_bs self.train_memory_list = [None for i in range(self.bs)] self.train_img_metas_memory = [None for i in range(self.bs)] self.test_memory_list = [None] # bs = 1 when testing self.test_img_metas_memory = [None] @property def memory_list(self): if self.training: return self.train_memory_list else: return self.test_memory_list @property def img_metas_memory(self): if self.training: return self.train_img_metas_memory else: return self.test_img_metas_memory def update(self, memory, img_metas): for i in range(self.bs): self.memory_list[i] = memory[i].clone().detach() self.img_metas_memory[i] = copy.deepcopy(img_metas[i]) def reset_single(self, idx): self.memory_list[idx] = None self.img_metas_memory[idx] = None def get(self, img_metas): ''' img_metas: list[img_metas] ''' tensor_list = [] img_metas_list = [] is_first_frame_list = [] for i in range(self.bs): if not self.img_metas_memory[i]: is_first_frame = True else: is_first_frame = (img_metas[i]['scene_name'] != self.img_metas_memory[i]['scene_name']) if is_first_frame: self.reset_single(i) tensor_list.append(self.memory_list[i]) img_metas_list.append(self.img_metas_memory[i]) is_first_frame_list.append(is_first_frame) result = { 'tensor': tensor_list, 'img_metas': img_metas_list, 'is_first_frame': is_first_frame_list, } return result def train(self, mode=True): self.training = mode if mode: self.bs = self.train_bs else: self.bs = 1 def eval(self): self.train(False) class MotionMLP(nn.Module): ''' Args: c_dim (int): dimension of latent code c f_dim (int): feature dimension ''' def __init__(self, c_dim, f_dim=512, identity=True): super().__init__() self.c_dim = c_dim self.f_dim = f_dim self.identity = identity self.fc = nn.Sequential( nn.Linear(c_dim + f_dim, 2*f_dim), nn.LayerNorm(2*f_dim), nn.ReLU(), nn.Linear(2*f_dim, f_dim) ) 
self.init_weights() def init_weights(self): for m in self.fc: for param in m.parameters(): if param.dim() > 1: if self.identity: nn.init.zeros_(param) else: nn.init.xavier_uniform_(param) def forward(self, x, c): xc = torch.cat([x, c], dim=-1) out = self.fc(xc) if self.identity: out = out + x return out ================================================ FILE: mmdet3d/models/fbbev/utils/__init__.py ================================================ from .bricks import save_tensor, run_time from .wechat_logger import MyWechatLoggerHook from .draw_bbox import * from .eval_hook import CustomDistEvalHook from .timer_cp import TimerCP ================================================ FILE: mmdet3d/models/fbbev/utils/bricks.py ================================================ import torch from torchvision.utils import make_grid import torchvision import matplotlib.pyplot as plt import cv2 from array import array from collections.abc import Iterable, Mapping from sys import getsizeof from types import GeneratorType def compute_allocation(obj) -> int: my_ids = set([id(obj)]) # store the ids of previously seen objects to_compute = [obj] allocation_size = 0 container_allocation = 0 # return the memory spent in containers like list or dictionaryes while len(to_compute) > 0: obj_to_check = to_compute.pop() allocation_size += getsizeof(obj_to_check) if type(obj_to_check) == str: # string just return the actual size continue if type(obj_to_check) == array: # array just return the actual size continue # if we have other object that only return the actual size, use the same logic as above elif isinstance(obj_to_check, GeneratorType): # generator objet takes little memory continue elif isinstance(obj_to_check, Mapping): # for dic need to count the keys and values container_allocation += getsizeof(obj_to_check) for ikey, ivalue in obj_to_check.items(): if id(ikey) not in my_ids: my_ids.add(id(ikey)) to_compute.append(ikey) if id(ivalue) not in my_ids: my_ids.add(id(ivalue)) to_compute.append(ivalue) elif isinstance(obj_to_check, Iterable): # for iterable like object ,use the same logic above container_allocation += getsizeof(obj_to_check) for inner in obj_to_check: if id(inner) not in my_ids: my_ids.add(id(inner)) to_compute.append(inner) return allocation_size, allocation_size - container_allocation def convert_color(img_path): plt.figure() img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE) plt.imsave(img_path, img, cmap=plt.get_cmap('viridis')) plt.close() def save_tensor(tensor, path, pad_value=254.0,normalize=False): print('save_tensor', path) tensor = tensor.to(torch.float).detach().cpu() max_ = tensor.flatten(1).max(-1).values[:, None, None] min_ = tensor.flatten(1).min(-1).values[:, None, None] tensor = (tensor-min_)/(max_-min_) if tensor.type() == 'torch.BoolTensor': tensor = tensor*255 if len(tensor.shape) == 3: tensor = tensor.unsqueeze(1) tensor = make_grid(tensor, pad_value=pad_value, normalize=normalize).permute(1, 2, 0).numpy().copy() torchvision.utils.save_image(torch.tensor(tensor).permute(2, 0, 1), path) convert_color(path) import functools import time from collections import defaultdict time_maps = defaultdict(lambda :0.) count_maps = defaultdict(lambda :0.) 
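# NOTE (illustrative, hedged): `run_time` below is a decorator factory that accumulates
# wall-clock time per "<name> : <function>" key in the module-level `time_maps` /
# `count_maps` dicts and prints the running average after each call. It synchronizes
# CUDA before and after the call, so it assumes a CUDA-capable machine. Hypothetical usage:
#   @run_time('neck')
#   def fuse(x):
#       return x + 1
#   fuse(torch.randn(8, 256, 100, 100, device='cuda'))
#   # prints something like "neck : fuse takes up 0.000123"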
def run_time(name): def middle(fn): def wrapper(*args, **kwargs): torch.cuda.synchronize() start_time = time.perf_counter() res = fn(*args, **kwargs) torch.cuda.synchronize() elapsed = time.perf_counter() - start_time time_maps['%s : %s'%(name, fn.__name__) ] += elapsed count_maps['%s : %s'%(name, fn.__name__) ] +=1 print("%s : %s takes up %f "% (name, fn.__name__,time_maps['%s : %s'%(name, fn.__name__) ] /count_maps['%s : %s'%(name, fn.__name__) ] )) return res return wrapper return middle ================================================ FILE: mmdet3d/models/fbbev/utils/draw_bbox.py ================================================ # Copyright (c) 2022-2023, NVIDIA Corporation & Affiliates. All rights reserved. # # This work is made available under the Nvidia Source Code License-NC. # To view a copy of this license, visit # https://github.com/NVlabs/FB-BEV/blob/main/LICENSE ## copy-paste from mmdet3d. Used to debug import mmcv import numpy as np from mmdet3d.core.visualizer.image_vis import (draw_camera_bbox3d_on_img, draw_depth_bbox3d_on_img) import cv2 import torch import copy import os.path as osp from mmdet3d.core.bbox.iou_calculators import BboxOverlaps3D from IPython import embed c_iou = BboxOverlaps3D(coordinate='lidar') def plot_rect3d_on_img(img, num_rects, rect_corners, color=(0, 255, 0), thickness=1, img_metas=None, scores=None, types=None ): """Plot the boundary lines of 3D rectangular on 2D images. Args: img (numpy.array): The numpy array of image. num_rects (int): Number of 3D rectangulars. rect_corners (numpy.array): Coordinates of the corners of 3D rectangulars. Should be in the shape of [num_rect, 8, 2]. color (tuple[int]): The color to draw bboxes. Default: (0, 255, 0). thickness (int, optional): The thickness of bboxes. Default: 1. 
""" line_indices = [(0, 1), (0, 3), (0, 4), (1, 2), (1, 5), (3, 2), (3, 7), (4, 5), (4, 7), (2, 6), (5, 6), (6, 7), (0, 5), (1, 4)] for i in range(num_rects): corners = rect_corners[i].astype(np.int) try: color = [(255, 0, 0), (61, 102, 255), (241, 101, 72), (125, 125, 0), (61, 102, 255)][int(types[i])] except: color = (61, 102, 255) back_mid = ((corners[0, 0] + corners[3, 0])//2, (corners[0, 1] + corners[3, 1])//2) front_mid = ((corners[7, 0] + corners[4, 0]) // 2, (corners[7, 1] + corners[4, 1]) // 2) bottom_center = ((front_mid[0] + back_mid[0])//2, (front_mid[1] + back_mid[1])//2) try: cv2.line(img, front_mid, bottom_center, color, thickness+1, cv2.LINE_AA) except: pass for j, (start, end) in enumerate(line_indices): try: if j in [12, 13]: # front_thickness = thickness # cv2.line(img, (corners[start, 0], corners[start, 1]), # (corners[end, 0], corners[end, 1]), (0, 160, 0), front_thickness, # cv2.LINE_AA) pass else: cv2.line(img, (corners[start, 0], corners[start, 1]), (corners[end, 0], corners[end, 1]), color, thickness+1, cv2.LINE_AA) except: pass # for p in range(8): # try: # cv2.putText(img, str(p), corners[p,:2], cv2.FONT_HERSHEY_COMPLEX, 1.0, (0, 0, 255), 2) # except: # pass # if img_metas != 0 and j == 0: # text = img_metas[i] # try: # cv2.putText(img, '%.1f %.1f %.1f' % (text[0], text[1], text[2]), (corners[start, 0], corners[start, 1]), # cv2.FONT_HERSHEY_COMPLEX, 1.0, (0, 0, 255), 2) # except: # pass # print('bug in plot_rect3d_on_img') # print(str(scores[i])[1:4]) try: if scores[i] >= 1.0: scores[i] = str(01.0) # cv2.putText(img, str(scores[i])[1:4], (corners[6, 0], corners[6, 1]), cv2.FONT_HERSHEY_COMPLEX, 1.0, (0, 0, 255), 2) except: pass return img.astype(np.uint8) def draw_lidar_bbox3d_on_img(bboxes3d, raw_img, lidar2img_rt, img_metas, color=(0, 255, 0), camera_params=None, scores=None, types=None, thickness=1): """Project the 3D bbox on 2D plane and draw on input image. Args: bboxes3d (:obj:`LiDARInstance3DBoxes`): 3d bbox in lidar coordinate system to visualize. raw_img (numpy.array): The numpy array of image. lidar2img_rt (numpy.array, shape=[4, 4]): The projection matrix according to the camera intrinsic parameters. img_metas (dict): Useless here. color (tuple[int]): The color to draw bboxes. Default: (0, 255, 0). thickness (int, optional): The thickness of bboxes. Default: 1. 
""" img = raw_img.copy() corners_3d = bboxes3d.corners num_bbox = corners_3d.shape[0] if camera_params is None: pts_4d = np.concatenate( [corners_3d.reshape(-1, 3), np.ones((num_bbox * 8, 1))], axis=-1) lidar2img_rt = copy.deepcopy(lidar2img_rt).reshape(4, 4) if isinstance(lidar2img_rt, torch.Tensor): lidar2img_rt = lidar2img_rt.cpu().numpy() pts_2d = pts_4d @ lidar2img_rt.T pts_2d[:, 2] = np.clip(pts_2d[:, 2], a_min=1e-5, a_max=1e5) pts_2d[:, 0] /= pts_2d[:, 2] pts_2d[:, 1] /= pts_2d[:, 2] imgfov_pts_2d = pts_2d[..., :2].reshape(num_bbox, 8, 2) else: rots, trans, intrins, post_rots, post_trans, bda, i = camera_params B = 1 N = 6 num_frame=rots.size(0)//N extra = [ rots.view(B, num_frame, N, 3, 3), trans.view(B, num_frame, N, 3), intrins.view(B, num_frame, N, 3, 3), post_rots.view(B, num_frame, N, 3, 3), post_trans.view(B, num_frame, N, 3) ] extra = [torch.split(t, 1, 1) for t in extra] extra = [[p.squeeze(1) for p in t] for t in extra] rots, trans, intrins, post_rots, post_trans = extra cam_params = [rots[0], trans[0], intrins[0], post_rots[0], post_trans[0]] rots, trans, intrins, post_rots, post_trans = cam_params reference_points = bboxes3d.corners[None] eps = 1e-5 _, ogfH, ogfW = img.shape reference_points = reference_points[None, None].repeat(B, N, 1, 1, 1, 1) reference_points = torch.inverse(bda).view(B, 1, 1, 1, 1, 3, 3).matmul(reference_points.unsqueeze(-1)).squeeze(-1) reference_points -= trans.view(B, N, 1, 1, 1, 3) combine = rots.matmul(torch.inverse(intrins)).inverse() reference_points_cam = combine.view(B, N, 1, 1, 1, 3, 3).matmul(reference_points.unsqueeze(-1)).squeeze(-1) reference_points_cam = torch.cat([reference_points_cam[..., 0:2] / torch.maximum( reference_points_cam[..., 2:3], torch.ones_like(reference_points_cam[..., 2:3])*eps), reference_points_cam[..., 2:3]], 5 ) reference_points_cam = post_rots.view(B, N, 1, 1, 1, 3, 3).matmul(reference_points_cam.unsqueeze(-1)).squeeze(-1) reference_points_cam += post_trans.view(B, N, 1, 1, 1, 3) # reference_points_cam[..., 0] /= ogfW # reference_points_cam[..., 1] /= ogfH imgfov_pts_2d = reference_points_cam[0,i,0].cpu().numpy() return plot_rect3d_on_img(img, num_bbox, imgfov_pts_2d, color, thickness, img_metas,scores=scores, types=types) def show_multi_modality_result(img, gt_bboxes, pred_bboxes, proj_mat, out_dir, filename, box_mode='lidar', img_metas=None, show=True, scores=None, types=None, camera_params=None, gt_bbox_color=(61, 102, 255), pred_bbox_color=(241, 101, 72)): """Convert multi-modality detection results into 2D results. Project the predicted 3D bbox to 2D image plane and visualize them. Args: img (np.ndarray): The numpy array of image in cv2 fashion. gt_bboxes (:obj:`BaseInstance3DBoxes`): Ground truth boxes. pred_bboxes (:obj:`BaseInstance3DBoxes`): Predicted boxes. proj_mat (numpy.array, shape=[4, 4]): The projection matrix according to the camera intrinsic parameters. out_dir (str): Path of output directory. filename (str): Filename of the current frame. box_mode (str): Coordinate system the boxes are in. Should be one of 'depth', 'lidar' and 'camera'. Defaults to 'lidar'. img_metas (dict): Used in projecting depth bbox. show (bool): Visualize the results online. Defaults to False. gt_bbox_color (str or tuple(int)): Color of bbox lines. The tuple of color should be in BGR order. Default: (255, 102, 61) pred_bbox_color (str or tuple(int)): Color of bbox lines. The tuple of color should be in BGR order. 
Default: (72, 101, 241) """ if box_mode == 'depth': draw_bbox = draw_depth_bbox3d_on_img elif box_mode == 'lidar': draw_bbox = draw_lidar_bbox3d_on_img elif box_mode == 'camera': draw_bbox = draw_camera_bbox3d_on_img else: raise NotImplementedError(f'unsupported box mode {box_mode}') result_path = osp.join(out_dir, filename) # embed() # exit() # mmcv.mkdir_or_exist(out_dir) if scores is not None: keep = scores > 0.3 scores = scores[keep] pred_bboxes = pred_bboxes[keep] if show: show_img = img.copy() if gt_bboxes is not None: text = [[bbox[0], bbox[1], bbox[6]] for bbox in gt_bboxes.tensor.cpu().numpy()] #list(c_iou(gt_bboxes.tensor, pred_bboxes.tensor).max(1).values.cpu().numpy()) img_metas = text show_img = draw_bbox( gt_bboxes, show_img, proj_mat, img_metas, color=gt_bbox_color, camera_params=camera_params) if pred_bboxes is not None: show_img = draw_bbox( pred_bboxes, show_img, proj_mat, None, scores=scores, types=types, camera_params=camera_params, color=pred_bbox_color) # print('bug in show_multi_modality_result') mmcv.imwrite(show_img, result_path.replace('.png', '.jpg')) # mmcv.imshow(show_img, win_name='project_bbox3d_img', wait_time=0) # print() # embed() return ================================================ FILE: mmdet3d/models/fbbev/utils/eval_hook.py ================================================ # Copyright (c) 2022-2023, NVIDIA Corporation & Affiliates. All rights reserved. # # This work is made available under the Nvidia Source Code License-NC. # To view a copy of this license, visit # https://github.com/NVlabs/FB-BEV/blob/main/LICENSE import bisect import os.path as osp from mmdet3d.core.hook.utils import is_parallel import mmcv import torch.distributed as dist from mmcv.runner import DistEvalHook as BaseDistEvalHook from mmcv.runner import EvalHook as BaseEvalHook from torch.nn.modules.batchnorm import _BatchNorm from mmdet.core.evaluation.eval_hooks import DistEvalHook def _calc_dynamic_intervals(start_interval, dynamic_interval_list): assert mmcv.is_list_of(dynamic_interval_list, tuple) dynamic_milestones = [0] dynamic_milestones.extend( [dynamic_interval[0] for dynamic_interval in dynamic_interval_list]) dynamic_intervals = [start_interval] dynamic_intervals.extend( [dynamic_interval[1] for dynamic_interval in dynamic_interval_list]) return dynamic_milestones, dynamic_intervals class CustomDistEvalHook(BaseDistEvalHook): def __init__(self, *args, dynamic_intervals=None, work_dir='test', **kwargs): super(CustomDistEvalHook, self).__init__(*args, **kwargs) self.use_dynamic_intervals = dynamic_intervals is not None if self.use_dynamic_intervals: self.dynamic_milestones, self.dynamic_intervals = \ _calc_dynamic_intervals(self.interval, dynamic_intervals) self.work_dir = work_dir def _decide_interval(self, runner): if self.use_dynamic_intervals: progress = runner.epoch if self.by_epoch else runner.iter step = bisect.bisect(self.dynamic_milestones, (progress + 1)) # Dynamically modify the evaluation interval self.interval = self.dynamic_intervals[step - 1] def before_train_epoch(self, runner): """Evaluate the model only at the start of training by epoch.""" self._decide_interval(runner) super().before_train_epoch(runner) def before_train_iter(self, runner): self._decide_interval(runner) super().before_train_iter(runner) def _do_evaluate(self, runner): """perform evaluation and save ckpt.""" # Synchronization of BatchNorm's buffer (running_mean # and running_var) is not supported in the DDP of pytorch, # which may cause the inconsistent performance of models in # different 
ranks, so we broadcast BatchNorm's buffers # of rank 0 to other ranks to avoid this. if is_parallel(runner.model): if runner.model.module.history_bev is not None: history_bev = runner.model.module.history_bev.clone() history_seq_ids = runner.model.module.history_seq_ids.clone() history_forward_augs = runner.model.module.history_forward_augs.clone() history_sweep_time = runner.model.module.history_sweep_time.clone() else: history_bev = None runner.model.module.history_bev=None runner.ema_model.ema_model.module.history_bev=None else: runner.ema_model.ema_model.history_bev=None runner.model.history_bev = None if self.broadcast_bn_buffer: model = runner.model for name, module in model.named_modules(): if isinstance(module, _BatchNorm) and module.track_running_stats: dist.broadcast(module.running_var, 0) dist.broadcast(module.running_mean, 0) if not self._should_evaluate(runner): return tmpdir = self.tmpdir if tmpdir is None: tmpdir = osp.join(runner.work_dir, '.eval_hook') from mmdet3d.apis.test import custom_multi_gpu_test # to solve circlur import results = custom_multi_gpu_test( runner.ema_model.ema_model, self.dataloader, tmpdir=tmpdir, gpu_collect=self.gpu_collect) if runner.rank == 0: print('\n') runner.log_buffer.output['eval_iter_num'] = len(self.dataloader) key_score = self.evaluate(runner, results) if self.save_best: self._save_ckpt(runner, key_score) if is_parallel(runner.model): if history_bev is not None: runner.model.module.history_bev = history_bev.clone() runner.model.module.history_seq_ids = history_seq_ids.clone() runner.model.module.history_forward_augs = history_forward_augs.clone() runner.model.module.history_sweep_time = history_sweep_time.clone() else: runner.model.module.history_bev = None runner.ema_model.ema_model.module.history_bev = None else: runner.model.history_bev = None runner.ema_model.ema_model.history_bev = None # ema_model def evaluate(self, runner, results): """Evaluate the results. Args: runner (:obj:`mmcv.Runner`): The underlined training runner. results (list): Output results. """ if 'jsonfile_prefix' not in self.eval_kwargs: self.eval_kwargs['jsonfile_prefix'] = osp.join(self.work_dir, 'test') eval_res = self.dataloader.dataset.evaluate( results, logger=runner.logger , **self.eval_kwargs) for name, val in eval_res.items(): runner.log_buffer.output[name] = val runner.log_buffer.ready = True if self.save_best is not None: # If the performance of model is pool, the `eval_res` may be an # empty dict and it will raise exception when `self.save_best` is # not None. More details at # https://github.com/open-mmlab/mmdetection/issues/6265. 
if not eval_res: warnings.warn( 'Since `eval_res` is an empty dict, the behavior to save ' 'the best checkpoint will be skipped in this evaluation.') return None if self.key_indicator == 'auto': # infer from eval_results self._init_rule(self.rule, list(eval_res.keys())[0]) return eval_res[self.key_indicator] return None ================================================ FILE: mmdet3d/models/fbbev/utils/grid_mask.py ================================================ import torch import torch.nn as nn import numpy as np from PIL import Image class Grid(object): def __init__(self, use_h, use_w, rotate = 1, offset=False, ratio = 0.5, mode=0, prob = 1.): self.use_h = use_h self.use_w = use_w self.rotate = rotate self.offset = offset self.ratio = ratio self.mode=mode self.st_prob = prob self.prob = prob def set_prob(self, epoch, max_epoch): self.prob = self.st_prob * epoch / max_epoch def __call__(self, img, label): if np.random.rand() > self.prob: return img, label h = img.size(1) w = img.size(2) self.d1 = 2 self.d2 = min(h, w) hh = int(1.5*h) ww = int(1.5*w) d = np.random.randint(self.d1, self.d2) if self.ratio == 1: self.l = np.random.randint(1, d) else: self.l = min(max(int(d*self.ratio+0.5),1),d-1) mask = np.ones((hh, ww), np.float32) st_h = np.random.randint(d) st_w = np.random.randint(d) if self.use_h: for i in range(hh//d): s = d*i + st_h t = min(s+self.l, hh) mask[s:t,:] *= 0 if self.use_w: for i in range(ww//d): s = d*i + st_w t = min(s+self.l, ww) mask[:,s:t] *= 0 r = np.random.randint(self.rotate) mask = Image.fromarray(np.uint8(mask)) mask = mask.rotate(r) mask = np.asarray(mask) mask = mask[(hh-h)//2:(hh-h)//2+h, (ww-w)//2:(ww-w)//2+w] mask = torch.from_numpy(mask).float() if self.mode == 1: mask = 1-mask mask = mask.expand_as(img) if self.offset: offset = torch.from_numpy(2 * (np.random.rand(h,w) - 0.5)).float() offset = (1 - mask) * offset img = img * mask + offset else: img = img * mask return img, label class GridMask(nn.Module): def __init__(self, use_h, use_w, rotate = 1, offset=False, ratio = 0.5, mode=0, prob = 1.): super(GridMask, self).__init__() self.use_h = use_h self.use_w = use_w self.rotate = rotate self.offset = offset self.ratio = ratio self.mode = mode self.st_prob = prob self.prob = prob def set_prob(self, epoch, max_epoch): self.prob = self.st_prob * epoch / max_epoch #+ 1.#0.5 def forward(self, x): if np.random.rand() > self.prob or not self.training: return x n,c,h,w = x.size() x = x.view(-1,h,w) hh = int(1.5*h) ww = int(1.5*w) d = np.random.randint(2, h) self.l = min(max(int(d*self.ratio+0.5),1),d-1) mask = np.ones((hh, ww), np.float32) st_h = np.random.randint(d) st_w = np.random.randint(d) if self.use_h: for i in range(hh//d): s = d*i + st_h t = min(s+self.l, hh) mask[s:t,:] *= 0 if self.use_w: for i in range(ww//d): s = d*i + st_w t = min(s+self.l, ww) mask[:,s:t] *= 0 r = np.random.randint(self.rotate) mask = Image.fromarray(np.uint8(mask)) mask = mask.rotate(r) mask = np.asarray(mask) mask = mask[(hh-h)//2:(hh-h)//2+h, (ww-w)//2:(ww-w)//2+w] mask = torch.from_numpy(mask).float().cuda() if self.mode == 1: mask = 1-mask mask = mask.expand_as(x) if self.offset: offset = torch.from_numpy(2 * (np.random.rand(h,w) - 0.5)).float().cuda() x = x * mask + offset * (1 - mask) else: x = x * mask return x.view(n,c,h,w) ================================================ FILE: mmdet3d/models/fbbev/utils/timer_cp.py ================================================ from mmcv.utils import Registry, is_method_overridden from mmcv.runner.hooks import HOOKS, CheckpointHook, Hook 
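# NOTE (illustrative, hedged): `TimerCP` below extends mmcv's CheckpointHook so that a
# single checkpoint is written once a fixed wall-clock budget (default 14400 s = 4 h,
# minus a 3-minute safety margin) has elapsed. In an mmcv-style config it would
# typically be enabled through `custom_hooks`, e.g. (hypothetical snippet):
#   custom_hooks = [dict(type='TimerCP', period=4 * 3600)]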
from mmcv.runner.dist_utils import allreduce_params, master_only import time @HOOKS.register_module() class TimerCP(CheckpointHook): # designed for NVIDIA ORD, each job can only run for 4 hours. # period = 4h = 4 * 3600 def __init__(self, period=14400): super().__init__() self.period = period - 180 # 3 mins redundancy self.not_save = True def before_run(self, runner): super().before_run(runner) self.start_time = time.time() def after_train_epoch(self, runner): pass def before_train_iter(self, runner): running_time = (time.time() - self.start_time) if running_time > self.period and self.not_save: runner.logger.info( f'TimerCP: Saving checkpoint at {runner.iter + 1} iterations. Period: '+'%.1fh' % (self.period/3600) ) if self.sync_buffer: allreduce_params(runner.model.buffers()) self._save_checkpoint(runner) self.not_save = False @master_only def _save_checkpoint(self, runner): super()._save_checkpoint(runner) ================================================ FILE: mmdet3d/models/fbbev/utils/wechat_logger.py ================================================ # Copyright (c) 2022-2023, NVIDIA Corporation & Affiliates. All rights reserved. # # This work is made available under the Nvidia Source Code License-NC. # To view a copy of this license, visit # https://github.com/NVlabs/FB-BEV/blob/main/LICENSE from typing import Dict import numpy as np import os.path as osp from mmcv.runner.dist_utils import master_only from mmcv.runner.hooks import HOOKS, Hook from mmcv.runner.hooks.logger.base import LoggerHook from urllib import request, parse import json from urllib.error import HTTPError, URLError import socket @HOOKS.register_module() class GradChecker(Hook): def after_train_iter(self, runner): max_key = None max_val = -1e5 min_key = None min_val = 1e5 for key, val in runner.model.named_parameters(): # if val.grad.max() > max_val: # max_val = val.grad.max() # max_key = key # if val.grad.min() < min_val: # min_val = val.grad.min() # min_key = key if val.grad == None and val.requires_grad: print('WARNNING: {key}\'s parameters are not be used!!!!'.format(key=key)) # print('max grd', max_key, ' ', max_val) # print('min grad', min_key, ' ', min_val) @HOOKS.register_module() class MyWechatLoggerHook(LoggerHook): """Class to log metrics to Wechat. Get your latest training results immediately! Args: interval (int): Logging interval (every k iterations). Default 10. ignore_last (bool): Ignore the log of last iterations in each epoch if less than `interval`. Default: True. reset_flag (bool): Whether to clear the output buffer after logging. Default: False. by_epoch (bool): Whether EpochBasedRunner is used. Default: True. allowed_subkeys: No need to send all results to your phone. Catch the point! 
miao_code: Get your own code from https://www.showdoc.com.cn/miaotixing/9175237605891603 """ def __init__(self, interval: int = 10, ignore_last: bool = True, reset_flag: bool = False, commit: bool = True, by_epoch: bool = True, allowed_subkeys = ['NDS', 'mAP'], miao_code='xxxxx'): super().__init__(interval, ignore_last, reset_flag, by_epoch) self.miao_code = miao_code self.allowed_subkeys = allowed_subkeys self.notification = True @master_only def before_run(self, runner) -> None: super().before_run(runner) @master_only def get_table_text(self, runner, tags): row_lists = [] row_lists.append([runner.meta['exp_name'], '']) if self.by_epoch: row_lists.append(['Epoch', runner.epoch+1]) for key in tags.keys(): for allowed_subkey in self.allowed_subkeys: if allowed_subkey in key: row_lists.append([key, tags[key]]) table_txt = '' for each_row in row_lists: table_txt += '{key}: {value}\n'.format(key=each_row[0], value=str(each_row[1])) return table_txt @master_only def log(self, runner) -> None: if not self.notification: return mode=self.get_mode(runner) tags = self.get_loggable_tags(runner) text = None if mode == 'train': if np.isnan(tags['train/loss']) or np.isinf(tags['train/loss']): text = runner.meta['exp_name'] + 'got NaN/INF loss' runner.logger.info('got NaN/INF loss value, we will not send any notification to your phone later') self.notification = False elif mode == 'val': text = self.get_table_text(runner, tags) else: assert False, 'what is the running status?' self._send(runner, text) @master_only def _send(self, runner, text) -> None: if text is None: return page=None try: page = request.urlopen("http://miaotixing.com/trigger?" + parse.urlencode({"id":self.miao_code, "text":text, "type":"json"}), timeout=5) except HTTPError as error: runner.logger.info('Data not retrieved because %s\nURL: %s', error, url) except URLError as error: if isinstance(error.reason, socket.timeout): runner.logger.info('MiaoTiXing: socket timed out - URL %s', url) else: runner.logger.info('MiaoTiXing: some other error happened ') else: runner.logger.info('MiaoTiXing: Access successful.') if page is None: return result = page.read() jsonObj = json.loads(result) @master_only def after_run(self, runner) -> None: text = runner.meta['exp_name'] + ' Done!!!' self._send(runner, text) pass ================================================ FILE: mmdet3d/models/fbbev/view_transformation/__init__.py ================================================ from .forward_projection import * from .backward_projection import * ================================================ FILE: mmdet3d/models/fbbev/view_transformation/backward_projection/__init__.py ================================================ from .backward_projection import BackwardProjection from .bevformer_utils import * ================================================ FILE: mmdet3d/models/fbbev/view_transformation/backward_projection/backward_projection.py ================================================ # Copyright (c) 2022-2023, NVIDIA Corporation & Affiliates. All rights reserved. # # This work is made available under the Nvidia Source Code License-NC. 
# To view a copy of this license, visit # https://github.com/NVlabs/FB-BEV/blob/main/LICENSE # --------------------------------------------- # Modified by Zhiqi Li # --------------------------------------------- import copy import torch import torch.nn as nn import torch.nn.functional as F from mmcv.cnn import Linear, bias_init_with_prob from mmcv.utils import TORCH_VERSION, digit_version from mmcv.runner.base_module import BaseModule from mmdet.core import (multi_apply, multi_apply, reduce_mean) from mmdet.models.utils.transformer import inverse_sigmoid from mmdet.models import HEADS from mmdet.models.dense_heads import DETRHead from mmdet3d.core.bbox.coders import build_bbox_coder from mmcv.cnn.bricks.transformer import build_positional_encoding from mmcv.runner import force_fp32, auto_fp16 import numpy as np import mmcv import cv2 as cv from mmcv.cnn.bricks.transformer import FFN, build_positional_encoding from mmdet.models.utils import build_transformer @HEADS.register_module() class BackwardProjection(BaseModule): """Head of Detr3D. Args: with_box_refine (bool): Whether to refine the reference points in the decoder. Defaults to False. as_two_stage (bool) : Whether to generate the proposal from the outputs of encoder. transformer (obj:`ConfigDict`): ConfigDict is used for building the Encoder and Decoder. bev_h, bev_w (int): spatial shape of BEV queries. """ def __init__(self, *args, transformer=None, positional_encoding=None, pc_range=None, in_channels=64, out_channels=64, use_zero_embedding=False, bev_h=30, bev_w=30, **kwargs): super().__init__() self.bev_h = bev_h self.bev_w = bev_w self.fp16_enabled = False self.pc_range = pc_range self.use_zero_embedding = use_zero_embedding self.real_w = self.pc_range[3] - self.pc_range[0] self.real_h = self.pc_range[4] - self.pc_range[1] self.positional_encoding = build_positional_encoding( positional_encoding) self.transformer = build_transformer(transformer) self.embed_dims = self.transformer.embed_dims self._init_layers() def _init_layers(self): self.bev_embedding = nn.Embedding( self.bev_h * self.bev_w, self.embed_dims) def init_weights(self): """Initialize weights of the DeformDETR head.""" self.transformer.init_weights() @auto_fp16(apply_to=('mlvl_feats')) def forward(self, mlvl_feats, img_metas, lss_bev=None, gt_bboxes_3d=None, cam_params=None, pred_img_depth=None, bev_mask=None): """Forward function. Args: mlvl_feats (tuple[Tensor]): Features from the upstream network, each is a 5D-tensor with shape (B, N, C, H, W). Returns: all_cls_scores (Tensor): Outputs from the classification head, \ shape [nb_dec, bs, num_query, cls_out_channels]. Note \ cls_out_channels should includes background. all_bbox_preds (Tensor): Sigmoid outputs from the regression \ head with normalized coordinate format (cx, cy, w, l, cz, h, theta, vx, vy). \ Shape [nb_dec, bs, num_query, 9]. 
""" bs, num_cam, _, _, _ = mlvl_feats[0].shape dtype = mlvl_feats[0].dtype bev_queries = self.bev_embedding.weight.to(dtype) bev_queries = bev_queries.unsqueeze(1).repeat(1, bs, 1) if lss_bev is not None: lss_bev = lss_bev.flatten(2).permute(2, 0, 1) bev_queries = bev_queries + lss_bev if bev_mask is not None: bev_mask = bev_mask.reshape(bs, -1) bev_pos = self.positional_encoding(bs, self.bev_h, self.bev_w, bev_queries.device).to(dtype) bev = self.transformer( mlvl_feats, bev_queries, self.bev_h, self.bev_w, grid_length=(self.real_h / self.bev_h, self.real_w / self.bev_w), bev_pos=bev_pos, img_metas=img_metas, cam_params=cam_params, gt_bboxes_3d=gt_bboxes_3d, pred_img_depth=pred_img_depth, prev_bev=None, bev_mask=bev_mask, ) bev = bev.permute(0, 2, 1).view(bs, -1, self.bev_h, self.bev_w).contiguous() return bev ================================================ FILE: mmdet3d/models/fbbev/view_transformation/backward_projection/bevformer_utils/__init__.py ================================================ from .bevformer import BEVFormer from .bevformer_encoder import bevformer_encoder, BEVFormerEncoderLayer from .spatial_cross_attention_depth import DA_MSDeformableAttention, DA_SpatialCrossAttention from .positional_encoding import CustormLearnedPositionalEncoding ================================================ FILE: mmdet3d/models/fbbev/view_transformation/backward_projection/bevformer_utils/bevformer.py ================================================ # Copyright (c) 2022-2023, NVIDIA Corporation & Affiliates. All rights reserved. # # This work is made available under the Nvidia Source Code License-NC. # To view a copy of this license, visit # https://github.com/NVlabs/FB-BEV/blob/main/LICENSE import numpy as np import torch import torch.nn as nn from mmcv.cnn import xavier_init from mmcv.cnn.bricks.transformer import build_transformer_layer_sequence from mmcv.runner.base_module import BaseModule from mmdet.models.utils.builder import TRANSFORMER from torch.nn.init import normal_ from mmcv.runner.base_module import BaseModule from torchvision.transforms.functional import rotate from .spatial_cross_attention_depth import DA_MSDeformableAttention from mmcv.runner import force_fp32, auto_fp16 from mmdet.models import build_neck @TRANSFORMER.register_module() class BEVFormer(BaseModule): """Implements the Detr3D transformer. Args: as_two_stage (bool): Generate query from encoder features. Default: False. num_feature_levels (int): Number of feature maps from FPN: Default: 4. two_stage_num_proposals (int): Number of proposals when set `as_two_stage` as True. Default: 300. 
""" def __init__(self, num_cams=6, encoder=None, embed_dims=256, output_dims=256, use_cams_embeds=True, **kwargs): super(BEVFormer, self).__init__(**kwargs) self.encoder = build_transformer_layer_sequence(encoder) self.embed_dims = embed_dims self.num_cams = num_cams self.fp16_enabled = False self.output_dims = output_dims self.use_cams_embeds = use_cams_embeds self.init_layers() def init_layers(self): """Initialize layers of the Detr3DTransformer.""" self.cams_embeds = nn.Parameter( torch.Tensor(self.num_cams, self.embed_dims)) def init_weights(self): """Initialize the transformer weights.""" for p in self.parameters(): if p.dim() > 1: nn.init.xavier_uniform_(p) for m in self.modules(): if isinstance(m, DA_MSDeformableAttention): try: m.init_weight() except AttributeError: m.init_weights() normal_(self.cams_embeds) @auto_fp16(apply_to=('mlvl_feats', 'bev_queries', 'prev_bev', 'bev_pos')) def forward( self, mlvl_feats, bev_queries, bev_h, bev_w, # grid_length=[0.512, 0.512], bev_pos=None, cam_params=None, gt_bboxes_3d=None, pred_img_depth=None, prev_bev=None, bev_mask=None, **kwargs): """ obtain bev features. """ bs = mlvl_feats[0].size(0) bev_pos = bev_pos.flatten(2).permute(2, 0, 1) feat_flatten = [] spatial_shapes = [] for lvl, feat in enumerate(mlvl_feats): bs, num_cam, c, h, w = feat.shape spatial_shape = (h, w) feat = feat.flatten(3).permute(1, 0, 3, 2) if self.use_cams_embeds: feat = feat + self.cams_embeds[:, None, None, :].to(feat.dtype) else: feat = feat + self.cams_embeds[:, None, None, :].to(feat.dtype) * 0 spatial_shapes.append(spatial_shape) feat_flatten.append(feat) feat_flatten = torch.cat(feat_flatten, 2) spatial_shapes = torch.as_tensor( spatial_shapes, dtype=torch.long, device=bev_pos.device) level_start_index = torch.cat((spatial_shapes.new_zeros( (1,)), spatial_shapes.prod(1).cumsum(0)[:-1])) feat_flatten = feat_flatten.permute(0, 2, 1, 3) # (num_cam, H*W, bs, embed_dims) bev_embed = self.encoder( bev_queries, feat_flatten, feat_flatten, bev_h=bev_h, bev_w=bev_w, bev_pos=bev_pos, spatial_shapes=spatial_shapes, level_start_index=level_start_index, cam_params=cam_params, gt_bboxes_3d=gt_bboxes_3d, pred_img_depth=pred_img_depth, prev_bev=prev_bev, bev_mask=bev_mask, **kwargs ) return bev_embed ================================================ FILE: mmdet3d/models/fbbev/view_transformation/backward_projection/bevformer_utils/bevformer_encoder.py ================================================ # Copyright (c) 2022-2023, NVIDIA Corporation & Affiliates. All rights reserved. # # This work is made available under the Nvidia Source Code License-NC. # To view a copy of this license, visit # https://github.com/NVlabs/FB-BEV/blob/main/LICENSE from .custom_base_transformer_layer import MyCustomBaseTransformerLayer import copy import warnings from mmcv.cnn.bricks.registry import (ATTENTION, TRANSFORMER_LAYER, TRANSFORMER_LAYER_SEQUENCE) from mmcv.cnn.bricks.transformer import TransformerLayerSequence from mmcv.runner import force_fp32, auto_fp16 import numpy as np import torch import cv2 as cv import mmcv import time from mmcv.utils import TORCH_VERSION, digit_version from mmcv.utils import ext_loader ext_module = ext_loader.load_ext( '_ext', ['ms_deform_attn_backward', 'ms_deform_attn_forward']) @TRANSFORMER_LAYER_SEQUENCE.register_module() class bevformer_encoder(TransformerLayerSequence): """ Attention with both self and cross Implements the decoder in DETR transformer. Args: return_intermediate (bool): Whether to return intermediate outputs. 
coder_norm_cfg (dict): Config of last normalization layer. Default: `LN`. """ def __init__(self, *args, pc_range=None, grid_config=None, data_config=None, return_intermediate=False, dataset_type='nuscenes', fix_bug=False, **kwargs): super(bevformer_encoder, self).__init__(*args, **kwargs) self.return_intermediate = return_intermediate self.fix_bug = fix_bug self.x_bound = grid_config['x'] self.y_bound = grid_config['y'] self.z_bound = grid_config['z'] self.final_dim = data_config['input_size'] self.pc_range = pc_range self.fp16_enabled = False def get_reference_points(self,H, W, Z=8, dim='3d', bs=1, device='cuda', dtype=torch.float): """Get the reference points used in SCA and TSA. Args: H, W: spatial shape of bev. Z: hight of pillar. D: sample D points uniformly from each pillar. device (obj:`device`): The device where reference_points should be. Returns: Tensor: reference points used in decoder, has \ shape (bs, num_keys, num_levels, 2). """ # reference points in 3D space, used in spatial cross-attention (SCA) if dim == '3d': X = torch.arange(*self.x_bound, dtype=torch.float) + self.x_bound[-1]/2 Y = torch.arange(*self.y_bound, dtype=torch.float) + self.y_bound[-1]/2 Z = torch.arange(*self.z_bound, dtype=torch.float) + self.z_bound[-1]/2 Y, X, Z = torch.meshgrid([Y, X, Z]) coords = torch.stack([X, Y, Z], dim=-1) coords = coords.to(dtype).to(device) # frustum = torch.cat([coords, torch.ones_like(coords[...,0:1])], dim=-1) #(x, y, z, 4) return coords # reference points on 2D bev plane, used in temporal self-attention (TSA). elif dim == '2d': ref_y, ref_x = torch.meshgrid( torch.linspace( 0.5, H - 0.5, H, dtype=dtype, device=device), torch.linspace( 0.5, W - 0.5, W, dtype=dtype, device=device) ) ref_y = ref_y.reshape(-1)[None] / H ref_x = ref_x.reshape(-1)[None] / W ref_2d = torch.stack((ref_x, ref_y), -1) ref_2d = ref_2d.repeat(bs, 1, 1).unsqueeze(2) return ref_2d @force_fp32(apply_to=('reference_points', 'cam_params')) def point_sampling(self, reference_points, pc_range, img_metas, cam_params=None, gt_bboxes_3d=None): rots, trans, intrins, post_rots, post_trans, bda = cam_params B, N, _ = trans.shape eps = 1e-5 ogfH, ogfW = self.final_dim reference_points = reference_points[None, None].repeat(B, N, 1, 1, 1, 1) reference_points = torch.inverse(bda).view(B, 1, 1, 1, 1, 3, 3).matmul(reference_points.unsqueeze(-1)).squeeze(-1) reference_points -= trans.view(B, N, 1, 1, 1, 3) combine = rots.matmul(torch.inverse(intrins)).inverse() reference_points_cam = combine.view(B, N, 1, 1, 1, 3, 3).matmul(reference_points.unsqueeze(-1)).squeeze(-1) reference_points_cam = torch.cat([reference_points_cam[..., 0:2] / torch.maximum( reference_points_cam[..., 2:3], torch.ones_like(reference_points_cam[..., 2:3])*eps), reference_points_cam[..., 2:3]], 5 ) reference_points_cam = post_rots.view(B, N, 1, 1, 1, 3, 3).matmul(reference_points_cam.unsqueeze(-1)).squeeze(-1) reference_points_cam += post_trans.view(B, N, 1, 1, 1, 3) reference_points_cam[..., 0] /= ogfW reference_points_cam[..., 1] /= ogfH mask = (reference_points_cam[..., 2:3] > eps) mask = (mask & (reference_points_cam[..., 0:1] > eps) & (reference_points_cam[..., 0:1] < (1.0-eps)) & (reference_points_cam[..., 1:2] > eps) & (reference_points_cam[..., 1:2] < (1.0-eps))) B, N, H, W, D, _ = reference_points_cam.shape reference_points_cam = reference_points_cam.permute(1, 0, 2, 3, 4, 5).reshape(N, B, H*W, D, 3) mask = mask.permute(1, 0, 2, 3, 4, 5).reshape(N, B, H*W, D, 1).squeeze(-1) return reference_points, reference_points_cam[..., :2], mask, 
reference_points_cam[..., 2:3] @auto_fp16() def forward(self, bev_query, key, value, *args, bev_h=None, bev_w=None, bev_pos=None, spatial_shapes=None, level_start_index=None, valid_ratios=None, cam_params=None, gt_bboxes_3d=None, pred_img_depth=None, bev_mask=None, prev_bev=None, **kwargs): """Forward function for `TransformerDecoder`. Args: bev_query (Tensor): Input BEV query with shape `(num_query, bs, embed_dims)`. key & value (Tensor): Input multi-cameta features with shape (num_cam, num_value, bs, embed_dims) reference_points (Tensor): The reference points of offset. has shape (bs, num_query, 4) when as_two_stage, otherwise has shape ((bs, num_query, 2). valid_ratios (Tensor): The radios of valid points on the feature map, has shape (bs, num_levels, 2) Returns: Tensor: Results with shape [1, num_query, bs, embed_dims] when return_intermediate is `False`, otherwise it has shape [num_layers, num_query, bs, embed_dims]. """ output = bev_query intermediate = [] ref_3d = self.get_reference_points( bev_h, bev_w, self.pc_range[5]-self.pc_range[2], dim='3d', bs=bev_query.size(1), device=bev_query.device, dtype=bev_query.dtype) ref_2d = self.get_reference_points( bev_h, bev_w, dim='2d', bs=bev_query.size(1), device=bev_query.device, dtype=bev_query.dtype) ref_3d, reference_points_cam, per_cam_mask_list, bev_query_depth = self.point_sampling( ref_3d, self.pc_range, kwargs['img_metas'], cam_params=cam_params, gt_bboxes_3d=gt_bboxes_3d) bev_query = bev_query.permute(1, 0, 2) bev_pos = bev_pos.permute(1, 0, 2) bs, len_bev, num_bev_level, _ = ref_2d.shape for lid, layer in enumerate(self.layers): output = layer( bev_query, key, value, *args, bev_pos=bev_pos, ref_2d=ref_2d, ref_3d=ref_3d, bev_h=bev_h, bev_w=bev_w, prev_bev=prev_bev, spatial_shapes=spatial_shapes, level_start_index=level_start_index, reference_points_cam=reference_points_cam, per_cam_mask_list=per_cam_mask_list, bev_mask=bev_mask, bev_query_depth=bev_query_depth, pred_img_depth=pred_img_depth, **kwargs) bev_query = output if self.return_intermediate: intermediate.append(output) if self.return_intermediate: return torch.stack(intermediate) return output @TRANSFORMER_LAYER.register_module() class BEVFormerEncoderLayer(MyCustomBaseTransformerLayer): """Implements decoder layer in DETR transformer. Args: attn_cfgs (list[`mmcv.ConfigDict`] | list[dict] | dict )): Configs for self_attention or cross_attention, the order should be consistent with it in `operation_order`. If it is a dict, it would be expand to the number of attention in `operation_order`. feedforward_channels (int): The hidden dimension for FFNs. ffn_dropout (float): Probability of an element to be zeroed in ffn. Default 0.0. operation_order (tuple[str]): The execution order of operation in transformer. Such as ('self_attn', 'norm', 'ffn', 'norm'). Default:None act_cfg (dict): The activation config for FFNs. Default: `LN` norm_cfg (dict): Config dict for normalization layer. Default: `LN`. ffn_num_fcs (int): The number of fully-connected layers in FFNs. Default:2. 
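# Illustrative sketch (not part of the original file): the shape of a config that
# BEVFormerEncoderLayer expects. The attention type names below are assumptions made for
# illustration only ('SomeBEVSelfAttention' is hypothetical); the real choices live in the
# configs under configs/bev_next/.
encoder_layer_cfg = dict(
    type='BEVFormerEncoderLayer',
    attn_cfgs=[
        dict(type='SomeBEVSelfAttention', embed_dims=256),        # hypothetical BEV self-attention module
        dict(type='DA_SpatialCrossAttention', embed_dims=256,     # image -> BEV cross-attention
             deformable_attention=dict(type='DA_MSDeformableAttention',
                                       embed_dims=256, num_levels=1)),
    ],
    feedforward_channels=512,
    ffn_dropout=0.1,
    # one attention entry is consumed per 'self_attn'/'cross_attn' token;
    # the assertion in __init__ requires len(operation_order) in {2, 4, 6}
    operation_order=('self_attn', 'norm', 'cross_attn', 'norm', 'ffn', 'norm'),
)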
""" def __init__(self, attn_cfgs, feedforward_channels=512, ffn_dropout=0.0, operation_order=None, act_cfg=dict(type='ReLU', inplace=True), norm_cfg=dict(type='LN'), ffn_num_fcs=2, **kwargs): super(BEVFormerEncoderLayer, self).__init__( attn_cfgs=attn_cfgs, feedforward_channels=feedforward_channels, ffn_dropout=ffn_dropout, operation_order=operation_order, act_cfg=act_cfg, norm_cfg=norm_cfg, ffn_num_fcs=ffn_num_fcs, **kwargs) self.fp16_enabled = False assert len(operation_order) in {2, 4, 6} # assert set(operation_order) in set(['self_attn', 'norm', 'cross_attn', 'ffn']) @force_fp32() def forward(self, query, key=None, value=None, bev_pos=None, query_pos=None, key_pos=None, attn_masks=None, query_key_padding_mask=None, key_padding_mask=None, ref_2d=None, ref_3d=None, bev_h=None, bev_w=None, reference_points_cam=None, mask=None, spatial_shapes=None, level_start_index=None, prev_bev=None, debug=False, bev_mask=None, bev_query_depth=None, per_cam_mask_list=None, lidar_bev=None, pred_img_depth=None, **kwargs): """Forward function for `TransformerDecoderLayer`. **kwargs contains some specific arguments of attentions. Args: query (Tensor): The input query with shape [num_queries, bs, embed_dims] if self.batch_first is False, else [bs, num_queries embed_dims]. key (Tensor): The key tensor with shape [num_keys, bs, embed_dims] if self.batch_first is False, else [bs, num_keys, embed_dims] . value (Tensor): The value tensor with same shape as `key`. query_pos (Tensor): The positional encoding for `query`. Default: None. key_pos (Tensor): The positional encoding for `key`. Default: None. attn_masks (List[Tensor] | None): 2D Tensor used in calculation of corresponding attention. The length of it should equal to the number of `attention` in `operation_order`. Default: None. query_key_padding_mask (Tensor): ByteTensor for `query`, with shape [bs, num_queries]. Only used in `self_attn` layer. Defaults to None. key_padding_mask (Tensor): ByteTensor for `query`, with shape [bs, num_keys]. Default: None. Returns: Tensor: forwarded results with shape [num_queries, bs, embed_dims]. 
""" norm_index = 0 attn_index = 0 ffn_index = 0 identity = query if attn_masks is None: attn_masks = [None for _ in range(self.num_attn)] elif isinstance(attn_masks, torch.Tensor): attn_masks = [ copy.deepcopy(attn_masks) for _ in range(self.num_attn) ] warnings.warn(f'Use same attn_mask in all attentions in ' f'{self.__class__.__name__} ') else: assert len(attn_masks) == self.num_attn, f'The length of ' \ f'attn_masks {len(attn_masks)} must be equal ' \ f'to the number of attention in ' \ f'operation_order {self.num_attn}' for layer in self.operation_order: # temporal self attention if layer == 'self_attn': query = self.attentions[attn_index]( query, None, None, identity if self.pre_norm else None, query_pos=bev_pos, key_pos=bev_pos, attn_mask=attn_masks[attn_index], key_padding_mask=bev_mask, reference_points=ref_2d, spatial_shapes=torch.tensor( [[bev_h, bev_w]], device=query.device), level_start_index=torch.tensor([0], device=query.device), **kwargs) attn_index += 1 identity = query elif layer == 'norm': query = self.norms[norm_index](query) norm_index += 1 # spaital cross attention elif layer == 'cross_attn': query = self.attentions[attn_index]( query, key, value, identity if self.pre_norm else None, query_pos=bev_pos, key_pos=key_pos, reference_points=ref_3d, reference_points_cam=reference_points_cam, attn_mask=attn_masks[attn_index], key_padding_mask=key_padding_mask, spatial_shapes=spatial_shapes, level_start_index=level_start_index, bev_query_depth=bev_query_depth, pred_img_depth=pred_img_depth, bev_mask=bev_mask, per_cam_mask_list=per_cam_mask_list, **kwargs) attn_index += 1 identity = query elif layer == 'ffn': query = self.ffns[ffn_index]( query, identity if self.pre_norm else None) ffn_index += 1 return query ================================================ FILE: mmdet3d/models/fbbev/view_transformation/backward_projection/bevformer_utils/custom_base_transformer_layer.py ================================================ # --------------------------------------------- # Copyright (c) OpenMMLab. All rights reserved. # --------------------------------------------- # Modified by Zhiqi Li # --------------------------------------------- import copy import warnings import torch import torch.nn as nn from mmcv import ConfigDict, deprecated_api_warning from mmcv.cnn import Linear, build_activation_layer, build_norm_layer from mmcv.runner.base_module import BaseModule, ModuleList, Sequential from mmcv.cnn.bricks.registry import (ATTENTION, FEEDFORWARD_NETWORK, POSITIONAL_ENCODING, TRANSFORMER_LAYER, TRANSFORMER_LAYER_SEQUENCE) # Avoid BC-breaking of importing MultiScaleDeformableAttention from this file try: from mmcv.ops.multi_scale_deform_attn import MultiScaleDeformableAttention # noqa F401 warnings.warn( ImportWarning( '``MultiScaleDeformableAttention`` has been moved to ' '``mmcv.ops.multi_scale_deform_attn``, please change original path ' # noqa E501 '``from mmcv.cnn.bricks.transformer import MultiScaleDeformableAttention`` ' # noqa E501 'to ``from mmcv.ops.multi_scale_deform_attn import MultiScaleDeformableAttention`` ' # noqa E501 )) except ImportError: warnings.warn('Fail to import ``MultiScaleDeformableAttention`` from ' '``mmcv.ops.multi_scale_deform_attn``, ' 'You should install ``mmcv-full`` if you need this module. ') from mmcv.cnn.bricks.transformer import build_feedforward_network, build_attention # @TRANSFORMER_LAYER.register_module() class MyCustomBaseTransformerLayer(BaseModule): """Base `TransformerLayer` for vision transformer. 
It can be built from `mmcv.ConfigDict` and support more flexible customization, for example, using any number of `FFN or LN ` and use different kinds of `attention` by specifying a list of `ConfigDict` named `attn_cfgs`. It is worth mentioning that it supports `prenorm` when you specifying `norm` as the first element of `operation_order`. More details about the `prenorm`: `On Layer Normalization in the Transformer Architecture `_ . Args: attn_cfgs (list[`mmcv.ConfigDict`] | obj:`mmcv.ConfigDict` | None )): Configs for `self_attention` or `cross_attention` modules, The order of the configs in the list should be consistent with corresponding attentions in operation_order. If it is a dict, all of the attention modules in operation_order will be built with this config. Default: None. ffn_cfgs (list[`mmcv.ConfigDict`] | obj:`mmcv.ConfigDict` | None )): Configs for FFN, The order of the configs in the list should be consistent with corresponding ffn in operation_order. If it is a dict, all of the attention modules in operation_order will be built with this config. operation_order (tuple[str]): The execution order of operation in transformer. Such as ('self_attn', 'norm', 'ffn', 'norm'). Support `prenorm` when you specifying first element as `norm`. Default:None. norm_cfg (dict): Config dict for normalization layer. Default: dict(type='LN'). init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. Default: None. batch_first (bool): Key, Query and Value are shape of (batch, n, embed_dim) or (n, batch, embed_dim). Default to False. """ def __init__(self, attn_cfgs=None, ffn_cfgs=dict( type='FFN', embed_dims=256, feedforward_channels=1024, num_fcs=2, ffn_drop=0., act_cfg=dict(type='ReLU', inplace=True), ), operation_order=None, norm_cfg=dict(type='LN'), init_cfg=None, batch_first=True, **kwargs): deprecated_args = dict( feedforward_channels='feedforward_channels', ffn_dropout='ffn_drop', ffn_num_fcs='num_fcs') for ori_name, new_name in deprecated_args.items(): if ori_name in kwargs: warnings.warn( f'The arguments `{ori_name}` in BaseTransformerLayer ' f'has been deprecated, now you should set `{new_name}` ' f'and other FFN related arguments ' f'to a dict named `ffn_cfgs`. ') if ffn_cfgs: ffn_cfgs[new_name] = kwargs[ori_name] super(MyCustomBaseTransformerLayer, self).__init__(init_cfg) self.batch_first = batch_first # assert set(operation_order) & set( # ['self_attn', 'norm', 'ffn', 'cross_attn']) == \ # set(operation_order), f'The operation_order of' \ # f' {self.__class__.__name__} should ' \ # f'contains all four operation type ' \ # f"{['self_attn', 'norm', 'ffn', 'cross_attn']}" num_attn = operation_order.count('self_attn') + operation_order.count( 'cross_attn') if isinstance(attn_cfgs, dict): attn_cfgs = [copy.deepcopy(attn_cfgs) for _ in range(num_attn)] else: assert num_attn == len(attn_cfgs), f'The length ' \ f'of attn_cfg {num_attn} is ' \ f'not consistent with the number of attention' \ f'in operation_order {operation_order}.' self.num_attn = num_attn self.operation_order = operation_order self.norm_cfg = norm_cfg self.pre_norm = operation_order[0] == 'norm' self.attentions = ModuleList() index = 0 for operation_name in operation_order: if operation_name in ['self_attn', 'cross_attn']: if 'batch_first' in attn_cfgs[index]: assert self.batch_first == attn_cfgs[index]['batch_first'] else: attn_cfgs[index]['batch_first'] = self.batch_first attention = build_attention(attn_cfgs[index]) # Some custom attentions used as `self_attn` # or `cross_attn` can have different behavior. 
attention.operation_name = operation_name self.attentions.append(attention) index += 1 self.embed_dims = self.attentions[0].embed_dims if ffn_cfgs: self.ffns = ModuleList() num_ffns = operation_order.count('ffn') if isinstance(ffn_cfgs, dict): ffn_cfgs = ConfigDict(ffn_cfgs) if isinstance(ffn_cfgs, dict): ffn_cfgs = [copy.deepcopy(ffn_cfgs) for _ in range(num_ffns)] assert len(ffn_cfgs) == num_ffns for ffn_index in range(num_ffns): if 'embed_dims' not in ffn_cfgs[ffn_index]: ffn_cfgs['embed_dims'] = self.embed_dims else: assert ffn_cfgs[ffn_index]['embed_dims'] == self.embed_dims self.ffns.append( build_feedforward_network(ffn_cfgs[ffn_index])) self.norms = ModuleList() num_norms = operation_order.count('norm') for _ in range(num_norms): self.norms.append(build_norm_layer(norm_cfg, self.embed_dims)[1]) def forward(self, query, key=None, value=None, query_pos=None, key_pos=None, attn_masks=None, query_key_padding_mask=None, key_padding_mask=None, **kwargs): """Forward function for `TransformerDecoderLayer`. **kwargs contains some specific arguments of attentions. Args: query (Tensor): The input query with shape [num_queries, bs, embed_dims] if self.batch_first is False, else [bs, num_queries embed_dims]. key (Tensor): The key tensor with shape [num_keys, bs, embed_dims] if self.batch_first is False, else [bs, num_keys, embed_dims] . value (Tensor): The value tensor with same shape as `key`. query_pos (Tensor): The positional encoding for `query`. Default: None. key_pos (Tensor): The positional encoding for `key`. Default: None. attn_masks (List[Tensor] | None): 2D Tensor used in calculation of corresponding attention. The length of it should equal to the number of `attention` in `operation_order`. Default: None. query_key_padding_mask (Tensor): ByteTensor for `query`, with shape [bs, num_queries]. Only used in `self_attn` layer. Defaults to None. key_padding_mask (Tensor): ByteTensor for `query`, with shape [bs, num_keys]. Default: None. Returns: Tensor: forwarded results with shape [num_queries, bs, embed_dims]. 
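# Illustrative sketch (not part of the original file): how operation_order maps onto the
# ModuleLists built in __init__ above and consumed in forward below. Purely illustrative.
operation_order = ('self_attn', 'norm', 'cross_attn', 'norm', 'ffn', 'norm')
num_attn = operation_order.count('self_attn') + operation_order.count('cross_attn')  # 2 attention modules
num_ffns = operation_order.count('ffn')                                              # 1 FFN
num_norms = operation_order.count('norm')                                            # 3 norm layers
pre_norm = operation_order[0] == 'norm'                                              # False -> post-norm layer
print(num_attn, num_ffns, num_norms, pre_norm)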
""" norm_index = 0 attn_index = 0 ffn_index = 0 identity = query if attn_masks is None: attn_masks = [None for _ in range(self.num_attn)] elif isinstance(attn_masks, torch.Tensor): attn_masks = [ copy.deepcopy(attn_masks) for _ in range(self.num_attn) ] warnings.warn(f'Use same attn_mask in all attentions in ' f'{self.__class__.__name__} ') else: assert len(attn_masks) == self.num_attn, f'The length of ' \ f'attn_masks {len(attn_masks)} must be equal ' \ f'to the number of attention in ' \ f'operation_order {self.num_attn}' for layer in self.operation_order: if layer == 'self_attn': temp_key = temp_value = query query = self.attentions[attn_index]( query, temp_key, temp_value, identity if self.pre_norm else None, query_pos=query_pos, key_pos=query_pos, attn_mask=attn_masks[attn_index], key_padding_mask=query_key_padding_mask, **kwargs) attn_index += 1 identity = query elif layer == 'norm': query = self.norms[norm_index](query) norm_index += 1 elif layer == 'cross_attn': query = self.attentions[attn_index]( query, key, value, identity if self.pre_norm else None, query_pos=query_pos, key_pos=key_pos, attn_mask=attn_masks[attn_index], key_padding_mask=key_padding_mask, **kwargs) attn_index += 1 identity = query elif layer == 'ffn': query = self.ffns[ffn_index]( query, identity if self.pre_norm else None) ffn_index += 1 return query ================================================ FILE: mmdet3d/models/fbbev/view_transformation/backward_projection/bevformer_utils/multi_scale_deformable_attn_function.py ================================================ # --------------------------------------------- # Copyright (c) OpenMMLab. All rights reserved. # --------------------------------------------- # Modified by Zhiqi Li # --------------------------------------------- import torch from torch.cuda.amp import custom_bwd, custom_fwd from torch.autograd.function import Function, once_differentiable from mmcv.utils import ext_loader ext_module = ext_loader.load_ext( '_ext', ['ms_deform_attn_backward', 'ms_deform_attn_forward']) class MultiScaleDeformableAttnFunction_fp16(Function): @staticmethod @custom_fwd(cast_inputs=torch.float16) def forward(ctx, value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, im2col_step): """GPU version of multi-scale deformable attention. Args: value (Tensor): The value has shape (bs, num_keys, mum_heads, embed_dims//num_heads) value_spatial_shapes (Tensor): Spatial shape of each feature map, has shape (num_levels, 2), last dimension 2 represent (h, w) sampling_locations (Tensor): The location of sampling points, has shape (bs ,num_queries, num_heads, num_levels, num_points, 2), the last dimension 2 represent (x, y). attention_weights (Tensor): The weight of sampling points used when calculate the attention, has shape (bs ,num_queries, num_heads, num_levels, num_points), im2col_step (Tensor): The step used in image to column. Returns: Tensor: has shape (bs, num_queries, embed_dims) """ ctx.im2col_step = im2col_step output = ext_module.ms_deform_attn_forward( value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, im2col_step=ctx.im2col_step) ctx.save_for_backward(value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights) return output @staticmethod @once_differentiable @custom_bwd def backward(ctx, grad_output): """GPU version of backward function. Args: grad_output (Tensor): Gradient of output tensor of forward. 
Returns: Tuple[Tensor]: Gradient of input tensors in forward. """ value, value_spatial_shapes, value_level_start_index, \ sampling_locations, attention_weights = ctx.saved_tensors grad_value = torch.zeros_like(value) grad_sampling_loc = torch.zeros_like(sampling_locations) grad_attn_weight = torch.zeros_like(attention_weights) ext_module.ms_deform_attn_backward( value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, grad_output.contiguous(), grad_value, grad_sampling_loc, grad_attn_weight, im2col_step=ctx.im2col_step) return grad_value, None, None, \ grad_sampling_loc, grad_attn_weight, None class MultiScaleDeformableAttnFunction_fp32(Function): @staticmethod @custom_fwd(cast_inputs=torch.float32) def forward(ctx, value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, im2col_step): """GPU version of multi-scale deformable attention. Args: value (Tensor): The value has shape (bs, num_keys, mum_heads, embed_dims//num_heads) value_spatial_shapes (Tensor): Spatial shape of each feature map, has shape (num_levels, 2), last dimension 2 represent (h, w) sampling_locations (Tensor): The location of sampling points, has shape (bs ,num_queries, num_heads, num_levels, num_points, 2), the last dimension 2 represent (x, y). attention_weights (Tensor): The weight of sampling points used when calculate the attention, has shape (bs ,num_queries, num_heads, num_levels, num_points), im2col_step (Tensor): The step used in image to column. Returns: Tensor: has shape (bs, num_queries, embed_dims) """ ctx.im2col_step = im2col_step output = ext_module.ms_deform_attn_forward( value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, im2col_step=ctx.im2col_step) ctx.save_for_backward(value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights) return output @staticmethod @once_differentiable @custom_bwd def backward(ctx, grad_output): """GPU version of backward function. Args: grad_output (Tensor): Gradient of output tensor of forward. Returns: Tuple[Tensor]: Gradient of input tensors in forward. """ value, value_spatial_shapes, value_level_start_index, \ sampling_locations, attention_weights = ctx.saved_tensors grad_value = torch.zeros_like(value) grad_sampling_loc = torch.zeros_like(sampling_locations) grad_attn_weight = torch.zeros_like(attention_weights) ext_module.ms_deform_attn_backward( value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, grad_output.contiguous(), grad_value, grad_sampling_loc, grad_attn_weight, im2col_step=ctx.im2col_step) return grad_value, None, None, \ grad_sampling_loc, grad_attn_weight, None ================================================ FILE: mmdet3d/models/fbbev/view_transformation/backward_projection/bevformer_utils/positional_encoding.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import math import torch import torch.nn as nn from mmcv.cnn.bricks.transformer import POSITIONAL_ENCODING from mmcv.runner import BaseModule @POSITIONAL_ENCODING.register_module() class CustormLearnedPositionalEncoding(BaseModule): """Position embedding with learnable embedding weights. Args: num_feats (int): The feature dimension for each position along x-axis or y-axis. The final returned dimension for each position is 2 times of this value. row_num_embed (int, optional): The dictionary size of row embeddings. Default 50. 
col_num_embed (int, optional): The dictionary size of col embeddings. Default 50. init_cfg (dict or list[dict], optional): Initialization config dict. """ def __init__(self, num_feats, row_num_embed=50, col_num_embed=50, init_cfg=dict(type='Uniform', layer='Embedding')): super(CustormLearnedPositionalEncoding, self).__init__(init_cfg) self.row_embed = nn.Embedding(row_num_embed, num_feats) self.col_embed = nn.Embedding(col_num_embed, num_feats) self.num_feats = num_feats self.row_num_embed = row_num_embed self.col_num_embed = col_num_embed def forward(self, bs, h, w, device): """Forward function for `LearnedPositionalEncoding`. Args: mask (Tensor): ByteTensor mask. Non-zero values representing ignored positions, while zero values means valid positions for this image. Shape [bs, h, w]. Returns: pos (Tensor): Returned position embedding with shape [bs, num_feats*2, h, w]. """ # h, w = mask.shape[-2:] x = torch.arange(w, device=device) y = torch.arange(h, device=device) x_embed = self.col_embed(x) y_embed = self.row_embed(y) pos = torch.cat( (x_embed.unsqueeze(0).repeat(h, 1, 1), y_embed.unsqueeze(1).repeat( 1, w, 1)), dim=-1).permute(2, 0, 1).unsqueeze(0).repeat(bs, 1, 1, 1) return pos def __repr__(self): """str: a string that describes the module""" repr_str = self.__class__.__name__ repr_str += f'(num_feats={self.num_feats}, ' repr_str += f'row_num_embed={self.row_num_embed}, ' repr_str += f'col_num_embed={self.col_num_embed})' return repr_str ================================================ FILE: mmdet3d/models/fbbev/view_transformation/backward_projection/bevformer_utils/spatial_cross_attention_depth.py ================================================ # Copyright (c) 2022-2023, NVIDIA Corporation & Affiliates. All rights reserved. # # This work is made available under the Nvidia Source Code License-NC. # To view a copy of this license, visit # https://github.com/NVlabs/FB-BEV/blob/main/LICENSE from mmcv.ops.multi_scale_deform_attn import multi_scale_deformable_attn_pytorch import warnings import torch import torch.nn as nn import torch.nn.functional as F from mmcv.cnn import xavier_init, constant_init from mmcv.cnn.bricks.registry import (ATTENTION, TRANSFORMER_LAYER, TRANSFORMER_LAYER_SEQUENCE) from mmcv.cnn.bricks.transformer import build_attention import math from mmcv.runner import force_fp32, auto_fp16 from mmcv.runner.base_module import BaseModule, ModuleList, Sequential from mmcv.utils import ext_loader from .multi_scale_deformable_attn_function import MultiScaleDeformableAttnFunction_fp32, \ MultiScaleDeformableAttnFunction_fp16 ext_module = ext_loader.load_ext( '_ext', ['ms_deform_attn_backward', 'ms_deform_attn_forward']) @ATTENTION.register_module() class DA_SpatialCrossAttention(BaseModule): """An attention module used in BEVFormer. Args: embed_dims (int): The embedding dimension of Attention. Default: 256. num_cams (int): The number of cameras dropout (float): A Dropout layer on `inp_residual`. Default: 0.. init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. Default: None. deformable_attention: (dict): The config for the deformable attention used in SCA. 
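# Illustrative sketch (not part of the original file): what CustormLearnedPositionalEncoding
# above returns. num_feats is half of the final channel count; h/w index the row/col embedding
# tables. Values are hypothetical and run on CPU.
import torch
import torch.nn as nn
row_embed = nn.Embedding(50, 128)   # stand-ins for the module's row/col tables
col_embed = nn.Embedding(50, 128)
bs, h, w = 2, 25, 25
x_embed = col_embed(torch.arange(w))                      # (w, 128)
y_embed = row_embed(torch.arange(h))                      # (h, 128)
pos = torch.cat((x_embed.unsqueeze(0).repeat(h, 1, 1),    # broadcast col embedding over rows
                 y_embed.unsqueeze(1).repeat(1, w, 1)),   # broadcast row embedding over cols
                dim=-1).permute(2, 0, 1).unsqueeze(0).repeat(bs, 1, 1, 1)
print(pos.shape)  # torch.Size([2, 256, 25, 25]) == (bs, num_feats * 2, h, w)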
""" def __init__(self, embed_dims=256, num_cams=6, pc_range=None, dropout=0.1, init_cfg=None, batch_first=False, deformable_attention=dict( type='MSDeformableAttention3D', embed_dims=256, num_levels=4), layer_scale=None, dbound=None, **kwargs ): super(DA_SpatialCrossAttention, self).__init__(init_cfg) self.init_cfg = init_cfg self.dropout = nn.Dropout(dropout) self.pc_range = pc_range self.fp16_enabled = False self.deformable_attention = build_attention(deformable_attention) self.embed_dims = embed_dims self.num_cams = num_cams self.dbound = dbound self.output_proj = nn.Linear(embed_dims, embed_dims) self.batch_first = batch_first if layer_scale is not None: self.layer_scale = nn.Parameter( layer_scale * torch.ones(embed_dims), requires_grad=True) else: self.layer_scale = None self.init_weight() self.count = 0 def init_weight(self): """Default initialization for Parameters of Module.""" xavier_init(self.output_proj, distribution='uniform', bias=0.) @force_fp32(apply_to=('query', 'key', 'value', 'query_pos', 'reference_points_cam')) def forward(self, query, key, value, residual=None, query_pos=None, key_padding_mask=None, reference_points=None, spatial_shapes=None, reference_points_cam=None, level_start_index=None, flag='encoder', bev_query_depth=None, pred_img_depth=None, bev_mask=None, per_cam_mask_list=None, **kwargs): """Forward Function of Detr3DCrossAtten. Args: query (Tensor): Query of Transformer with shape (num_query, bs, embed_dims). key (Tensor): The key tensor with shape `(num_key, bs, embed_dims)`. value (Tensor): The value tensor with shape `(num_key, bs, embed_dims)`. (B, N, C, H, W) residual (Tensor): The tensor used for addition, with the same shape as `x`. Default None. If None, `x` will be used. query_pos (Tensor): The positional encoding for `query`. Default: None. key_pos (Tensor): The positional encoding for `key`. Default None. reference_points (Tensor): The normalized reference points with shape (bs, num_query, 4), all elements is range in [0, 1], top-left (0,0), bottom-right (1, 1), including padding area. or (N, Length_{query}, num_levels, 4), add additional two dimensions is (w, h) to form reference boxes. key_padding_mask (Tensor): ByteTensor for `query`, with shape [bs, num_key]. spatial_shapes (Tensor): Spatial shape of features in different level. With shape (num_levels, 2), last dimension represent (h, w). level_start_index (Tensor): The start index of each level. A tensor has shape (num_levels) and can be represented as [0, h_0*w_0, h_0*w_0+h_1*w_1, ...]. Returns: Tensor: forwarded results with shape [num_query, bs, embed_dims]. 
""" N, B, len_query, Z, _ = bev_query_depth.shape B, N, DC, H, W = pred_img_depth.shape bev_query_depth = bev_query_depth.permute(1, 0, 2, 3, 4) pred_img_depth = pred_img_depth.view(B*N, DC, H, W) pred_img_depth = pred_img_depth.flatten(2).permute(0, 2, 1) if key is None: key = query if value is None: value = key if residual is None: inp_residual = query slots = torch.zeros_like(query) if query_pos is not None: query = query + query_pos bs, num_query, _ = query.size() D = reference_points_cam.size(3) indexes = [[] for _ in range(bs)] if bev_mask is not None: per_cam_mask_list_ = per_cam_mask_list & bev_mask[None, :, :, None] else: per_cam_mask_list_ = per_cam_mask_list max_len = 0 for j in range(bs): for i, per_cam_mask in enumerate(per_cam_mask_list_): index_query_per_img = per_cam_mask[j].sum(-1).nonzero().squeeze(-1) if len(index_query_per_img) == 0: index_query_per_img = per_cam_mask_list[i][j].sum(-1).nonzero().squeeze(-1)[0:1] indexes[j].append(index_query_per_img) max_len = max(max_len, len(index_query_per_img)) # each camera only interacts with its corresponding BEV queries. This step can greatly save GPU memory. queries_rebatch = query.new_zeros( [bs, self.num_cams, max_len, self.embed_dims]) reference_points_rebatch = reference_points_cam.new_zeros( [bs, self.num_cams, max_len, D, 2]) bev_query_depth_rebatch = reference_points_cam.new_zeros( [bs, self.num_cams, max_len, D, 1]) for j in range(bs): for i, reference_points_per_img in enumerate(reference_points_cam): index_query_per_img = indexes[j][i] queries_rebatch[j, i, :len(index_query_per_img)] = query[j, index_query_per_img] bev_query_depth_rebatch[j, i, :len(index_query_per_img)] = bev_query_depth[j, i, index_query_per_img] reference_points_rebatch[j, i, :len(index_query_per_img)] = reference_points_per_img[j, index_query_per_img] num_cams, l, bs, embed_dims = key.shape key = key.permute(2, 0, 1, 3).reshape( bs * self.num_cams, l, self.embed_dims) value = value.permute(2, 0, 1, 3).reshape( bs * self.num_cams, l, self.embed_dims) bev_query_depth_rebatch = (bev_query_depth_rebatch- self.dbound[0])/ self.dbound[2] bev_query_depth_rebatch = torch.clip(torch.floor(bev_query_depth_rebatch), 0, DC-1).to(torch.long) bev_query_depth_rebatch = F.one_hot(bev_query_depth_rebatch.squeeze(-1), num_classes=DC) queries = self.deformable_attention(query=queries_rebatch.view(bs*self.num_cams, max_len, self.embed_dims), key=key, value=value,\ reference_points=reference_points_rebatch.view(bs*self.num_cams, max_len, D, 2), spatial_shapes=spatial_shapes,\ level_start_index=level_start_index,\ bev_query_depth=bev_query_depth_rebatch.view(bs*self.num_cams, max_len, D, DC),\ pred_img_depth=pred_img_depth, \ ).view(bs, self.num_cams, max_len, self.embed_dims) for j in range(bs): for i in range(num_cams): index_query_per_img = indexes[j][i] slots[j, index_query_per_img] += queries[j, i, :len(index_query_per_img)] count = per_cam_mask_list_.sum(-1) > 0 count = count.permute(1, 2, 0).sum(-1) count = torch.clamp(count, min=1.0) slots = slots / count[..., None] slots = self.output_proj(slots) if self.layer_scale is None: return self.dropout(slots) + inp_residual else: return self.dropout(self.layer_scale * slots) + inp_residual @ATTENTION.register_module() class DA_MSDeformableAttention(BaseModule): """An attention module used in BEVFormer based on Deformable-Detr. `Deformable DETR: Deformable Transformers for End-to-End Object Detection. `_. Args: embed_dims (int): The embedding dimension of Attention. Default: 256. 
num_heads (int): Parallel attention heads. Default: 64. num_levels (int): The number of feature map used in Attention. Default: 4. num_points (int): The number of sampling points for each query in each head. Default: 4. im2col_step (int): The step used in image_to_column. Default: 64. dropout (float): A Dropout layer on `inp_identity`. Default: 0.1. batch_first (bool): Key, Query and Value are shape of (batch, n, embed_dim) or (n, batch, embed_dim). Default to False. norm_cfg (dict): Config dict for normalization layer. Default: None. init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. Default: None. """ def __init__(self, embed_dims=256, num_heads=8, num_levels=4, num_points=8, num_Z_anchors=4, im2col_step=64, dropout=0.1, batch_first=True, disable_deformable=False, norm_cfg=None, init_cfg=None): super().__init__(init_cfg) if embed_dims % num_heads != 0: raise ValueError(f'embed_dims must be divisible by num_heads, ' f'but got {embed_dims} and {num_heads}') dim_per_head = embed_dims // num_heads self.norm_cfg = norm_cfg self.batch_first = batch_first self.output_proj = None self.fp16_enabled = False self.disable_deformable = disable_deformable self.num_Z_anchors = num_Z_anchors # you'd better set dim_per_head to a power of 2 # which is more efficient in the CUDA implementation def _is_power_of_2(n): if (not isinstance(n, int)) or (n < 0): raise ValueError( 'invalid input for _is_power_of_2: {} (type: {})'.format( n, type(n))) return (n & (n - 1) == 0) and n != 0 if not _is_power_of_2(dim_per_head): warnings.warn( "You'd better set embed_dims in " 'MultiScaleDeformAttention to make ' 'the dimension of each attention head a power of 2 ' 'which is more efficient in our CUDA implementation.') self.im2col_step = im2col_step self.embed_dims = embed_dims self.num_levels = num_levels self.num_heads = num_heads self.num_points = num_points self.sampling_offsets = nn.Linear( embed_dims, num_heads * num_levels * num_points * 2) self.attention_weights = nn.Linear(embed_dims, num_heads * num_levels * num_points) self.value_proj = nn.Linear(embed_dims, embed_dims) self.init_weights() def init_weights(self): """Default initialization for Parameters of Module.""" constant_init(self.sampling_offsets, 0.) thetas = torch.arange( self.num_heads, dtype=torch.float32) * (2.0 * math.pi / self.num_heads) self.each_anchor_points = self.num_points // self.num_Z_anchors grid_init = torch.stack([thetas.cos(), thetas.sin()], -1) grid_init = (grid_init / grid_init.abs().max(-1, keepdim=True)[0]).view( self.num_heads, 1, 1, 1, 2).repeat(1, self.num_levels, self.each_anchor_points, self.num_Z_anchors, 1) for i in range(self.each_anchor_points): for j in range(self.num_Z_anchors): grid_init[:, :, i, j, :] *= i + 1 self.sampling_offsets.bias.data = grid_init.view(-1) constant_init(self.attention_weights, val=0., bias=0.) xavier_init(self.value_proj, distribution='uniform', bias=0.) xavier_init(self.output_proj, distribution='uniform', bias=0.) self._is_init = True @force_fp32() def forward(self, query, key=None, value=None, identity=None, query_pos=None, key_padding_mask=None, reference_points=None, spatial_shapes=None, level_start_index=None, bev_query_depth=None, pred_img_depth=None, **kwargs): """Forward Function of MultiScaleDeformAttention. Args: query (Tensor): Query of Transformer with shape ( bs, num_query, embed_dims). key (Tensor): The key tensor with shape `(bs, num_key, embed_dims)`. value (Tensor): The value tensor with shape `(bs, num_key, embed_dims)`. 
identity (Tensor): The tensor used for addition, with the same shape as `query`. Default None. If None, `query` will be used. query_pos (Tensor): The positional encoding for `query`. Default: None. key_pos (Tensor): The positional encoding for `key`. Default None. reference_points (Tensor): The normalized reference points with shape (bs, num_query, num_levels, 2), all elements is range in [0, 1], top-left (0,0), bottom-right (1, 1), including padding area. or (N, Length_{query}, num_levels, 4), add additional two dimensions is (w, h) to form reference boxes. key_padding_mask (Tensor): ByteTensor for `query`, with shape [bs, num_key]. spatial_shapes (Tensor): Spatial shape of features in different levels. With shape (num_levels, 2), last dimension represents (h, w). level_start_index (Tensor): The start index of each level. A tensor has shape ``(num_levels, )`` and can be represented as [0, h_0*w_0, h_0*w_0+h_1*w_1, ...]. Returns: Tensor: forwarded results with shape [num_query, bs, embed_dims]. """ if value is None: value = query if identity is None: identity = query if query_pos is not None: query = query + query_pos if not self.batch_first: # change to (bs, num_query ,embed_dims) query = query.permute(1, 0, 2) value = value.permute(1, 0, 2) bs, num_query, _ = query.shape bs, num_value, _ = value.shape assert (spatial_shapes[:, 0] * spatial_shapes[:, 1]).sum() == num_value value = self.value_proj(value) if key_padding_mask is not None: value = value.masked_fill(key_padding_mask[..., None], 0.0) value = value.view(bs, num_value, self.num_heads, -1) sampling_offsets = self.sampling_offsets(query).view( bs, num_query, self.num_heads, self.num_levels, self.num_points, 2) attention_weights = self.attention_weights(query).view( bs, num_query, self.num_heads, self.num_levels * self.num_points) if self.disable_deformable: sampling_offsets = sampling_offsets * 0 attention_weights = attention_weights * 0 attention_weights = attention_weights.softmax(-1) attention_weights = attention_weights.view(bs, num_query, self.num_heads, self.num_levels, self.num_points) if reference_points.shape[-1] == 2: """ For each BEV query, it owns `num_Z_anchors` in 3D space that having different heights. After proejcting, each BEV query has `num_Z_anchors` reference points in each 2D image. For each referent point, we sample `num_points` sampling points. For `num_Z_anchors` reference points, it has overall `num_points * num_Z_anchors` sampling points. 
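# Illustrative sketch (not part of the original file): the bookkeeping described just above.
# With hypothetical sizes, the predicted offsets are split so each of the num_Z_anchors
# projected reference points receives its own num_points sampling offsets (the division by
# offset_normalizer is omitted here for brevity).
import torch
bs, num_query, num_heads, num_levels = 2, 8, 4, 1
num_Z_anchors, num_points_per_anchor = 4, 2
num_all_points = num_Z_anchors * num_points_per_anchor          # 8 offsets predicted per head/level
sampling_offsets = torch.randn(bs, num_query, num_heads, num_levels, num_all_points, 2)
sampling_offsets = sampling_offsets.view(bs, num_query, num_heads, num_levels,
                                         num_points_per_anchor, num_Z_anchors, 2)
reference_points = torch.rand(bs, num_query, num_Z_anchors, 2)  # normalised image coordinates
sampling_locations = reference_points[:, :, None, None, None, :, :] + sampling_offsets
print(sampling_locations.shape)  # (2, 8, 4, 1, 2, 4, 2) -> later flattened back to num_all_points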
""" offset_normalizer = torch.stack( [spatial_shapes[..., 1], spatial_shapes[..., 0]], -1) bs, num_query, num_Z_anchors, xy = reference_points.shape reference_points = reference_points[:, :, None, None, None, :, :] sampling_offsets = sampling_offsets / \ offset_normalizer[None, None, None, :, None, :] bs, num_query, num_heads, num_levels, num_all_points, xy = sampling_offsets.shape sampling_offsets = sampling_offsets.view( bs, num_query, num_heads, num_levels, num_all_points // num_Z_anchors, num_Z_anchors, xy) sampling_locations = reference_points + sampling_offsets bs, num_query, num_heads, num_levels, num_points, num_Z_anchors, xy = sampling_locations.shape assert num_all_points == num_points * num_Z_anchors sampling_locations = sampling_locations.view( bs, num_query, num_heads, num_levels, num_all_points, xy) elif reference_points.shape[-1] == 4: assert False else: raise ValueError( f'Last dim of reference_points must be' f' 2 or 4, but get {reference_points.shape[-1]} instead.') if torch.cuda.is_available() and value.is_cuda: if value.dtype == torch.float16: MultiScaleDeformableAttnFunction = MultiScaleDeformableAttnFunction_fp32 else: MultiScaleDeformableAttnFunction = MultiScaleDeformableAttnFunction_fp32 depth_reference_points = reference_points.reshape(bs, num_query * num_Z_anchors, 1, 1, 1, 2).contiguous() depth_attention_weights = torch.ones_like(depth_reference_points[...,0]).contiguous() depth_weights = MultiScaleDeformableAttnFunction.apply( pred_img_depth.unsqueeze(2).contiguous(), spatial_shapes[0:1], level_start_index[0:1], depth_reference_points, depth_attention_weights, self.im2col_step).reshape(bs, num_query, num_Z_anchors, -1) depth_weights = (depth_weights * bev_query_depth).sum(-1) depth_weights = depth_weights.unsqueeze(2).repeat(1,1, num_points, 1).reshape(bs, num_query, num_all_points) attention_weights = attention_weights * depth_weights[:, :, None, None, :] output = MultiScaleDeformableAttnFunction.apply( value, spatial_shapes, level_start_index, sampling_locations, attention_weights, self.im2col_step) else: output = multi_scale_deformable_attn_pytorch( value, spatial_shapes, sampling_locations, attention_weights) if not self.batch_first: output = output.permute(1, 0, 2) return output ================================================ FILE: mmdet3d/models/fbbev/view_transformation/forward_projection/__init__.py ================================================ from .view_transformer import LSSViewTransformerFunction, LSSViewTransformerFunction3D ================================================ FILE: mmdet3d/models/fbbev/view_transformation/forward_projection/view_transformer.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. 
import torch import torch.nn as nn import torch.nn.functional as F from mmcv.cnn import build_conv_layer from mmcv.runner import BaseModule, force_fp32 from torch.cuda.amp.autocast_mode import autocast from torch.utils.checkpoint import checkpoint from mmdet3d.ops.bev_pool_v2.bev_pool import bev_pool_v2 from mmdet.models.backbones.resnet import BasicBlock from mmdet3d.models.builder import NECKS import torch.utils.checkpoint as cp import time def gen_dx_bx(xbound, ybound, zbound): dx = torch.Tensor([row[2] for row in [xbound, ybound, zbound]]) bx = torch.Tensor([row[0] + row[2]/2.0 for row in [xbound, ybound, zbound]]) nx = torch.Tensor([(row[1] - row[0]) / row[2] for row in [xbound, ybound, zbound]]) return dx, bx, nx @NECKS.register_module() class LSSViewTransformerFunction(BaseModule): r"""Lift-Splat-Shoot view transformer. Please refer to the `paper `_ Args: grid_config (dict): Config of grid alone each axis in format of (lower_bound, upper_bound, interval). axis in {x,y,z,depth}. input_size (tuple(int)): Size of input images in format of (height, width). downsample (int): Down sample factor from the input size to the feature size. in_channels (int): Channels of input feature. out_channels (int): Channels of transformed feature. accelerate (bool): Whether the view transformation is conducted with acceleration. Note: the intrinsic and extrinsic of cameras should be constant when 'accelerate' is set true. """ def __init__( self, grid_config, input_size, downsample=16, accelerate=False, uniform=False, with_cp=False ): super(LSSViewTransformerFunction, self).__init__() self.uniform = uniform self.with_cp = with_cp self.grid_config = grid_config self.downsample = downsample self.create_grid_infos(**grid_config) dx, bx, nx = gen_dx_bx(self.grid_config['x'], self.grid_config['y'], self.grid_config['z'], ) self.dx = nn.Parameter(dx, requires_grad=False) self.bx = nn.Parameter(bx, requires_grad=False) self.nx = nn.Parameter(nx, requires_grad=False) self.create_frustum(grid_config['depth'], input_size, downsample) self.accelerate = accelerate self.initial_flag = True def create_grid_infos(self, x, y, z, **kwargs): """Generate the grid information including the lower bound, interval, and size. Args: x (tuple(float)): Config of grid alone x axis in format of (lower_bound, upper_bound, interval). y (tuple(float)): Config of grid alone y axis in format of (lower_bound, upper_bound, interval). z (tuple(float)): Config of grid alone z axis in format of (lower_bound, upper_bound, interval). **kwargs: Container for other potential parameters """ self.grid_lower_bound = torch.Tensor([cfg[0] for cfg in [x, y, z]]) self.grid_interval = torch.Tensor([cfg[2] for cfg in [x, y, z]]) self.grid_size = torch.Tensor([(cfg[1] - cfg[0]) / cfg[2] for cfg in [x, y, z]]) def create_frustum(self, depth_cfg, input_size, downsample): """Generate the frustum template for each image. Args: depth_cfg (tuple(float)): Config of grid alone depth axis in format of (lower_bound, upper_bound, interval). input_size (tuple(int)): Size of input images in format of (height, width). downsample (int): Down sample scale factor from the input size to the feature size. 
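# Illustrative sketch (not part of the original file): the size of the frustum template built
# by create_frustum below. All numbers are hypothetical but follow the documented format.
import torch
input_size = (256, 704)          # (H_in, W_in) of the network input
downsample = 16                  # feature stride -> 16 x 44 feature map
depth_cfg = (1.0, 60.0, 0.5)     # (lower, upper, interval) along the camera ray
H_feat, W_feat = input_size[0] // downsample, input_size[1] // downsample
D = torch.arange(*depth_cfg).shape[0]   # 118 depth candidates per pixel
print(D, H_feat, W_feat)                # frustum has shape (D, H_feat, W_feat, 3) = (118, 16, 44, 3)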
""" H_in, W_in = input_size H_feat, W_feat = H_in // downsample, W_in // downsample d = torch.arange(*depth_cfg, dtype=torch.float)\ .view(-1, 1, 1).expand(-1, H_feat, W_feat) self.D = d.shape[0] x = torch.linspace(0, W_in - 1, W_feat, dtype=torch.float)\ .view(1, 1, W_feat).expand(self.D, H_feat, W_feat) y = torch.linspace(0, H_in - 1, H_feat, dtype=torch.float)\ .view(1, H_feat, 1).expand(self.D, H_feat, W_feat) # D x H x W x 3 self.frustum = torch.stack((x, y, d), -1) def get_lidar_coor(self, rots, trans, cam2imgs, post_rots, post_trans, bda): """Calculate the locations of the frustum points in the lidar coordinate system. Args: rots (torch.Tensor): Rotation from camera coordinate system to lidar coordinate system in shape (B, N_cams, 3, 3). trans (torch.Tensor): Translation from camera coordinate system to lidar coordinate system in shape (B, N_cams, 3). cam2imgs (torch.Tensor): Camera intrinsic matrixes in shape (B, N_cams, 3, 3). post_rots (torch.Tensor): Rotation in camera coordinate system in shape (B, N_cams, 3, 3). It is derived from the image view augmentation. post_trans (torch.Tensor): Translation in camera coordinate system derived from image view augmentation in shape (B, N_cams, 3). Returns: torch.tensor: Point coordinates in shape (B, N_cams, D, ownsample, 3) """ B, N, _ = trans.shape # post-transformation # B x N x D x H x W x 3 points = self.frustum.to(rots) - post_trans.view(B, N, 1, 1, 1, 3) points = torch.inverse(post_rots).view(B, N, 1, 1, 1, 3, 3)\ .matmul(points.unsqueeze(-1)) # cam_to_ego points = torch.cat( (points[..., :2, :] * points[..., 2:3, :], points[..., 2:3, :]), 5) combine = rots.matmul(torch.inverse(cam2imgs)) points = combine.view(B, N, 1, 1, 1, 3, 3).matmul(points).squeeze(-1) points += trans.view(B, N, 1, 1, 1, 3) points = bda.view(B, 1, 1, 1, 1, 3, 3).matmul(points.unsqueeze(-1)).squeeze(-1) return points def init_acceleration_v2(self, coor): """Pre-compute the necessary information in acceleration including the index of points in the final feature. Args: coor (torch.tensor): Coordinate of points in lidar space in shape (B, N_cams, D, H, W, 3). x (torch.tensor): Feature of points in shape (B, N_cams, D, H, W, C). """ ranks_bev, ranks_depth, ranks_feat, \ interval_starts, interval_lengths = \ self.voxel_pooling_prepare_v2(coor) self.ranks_bev = ranks_bev.int().contiguous() self.ranks_feat = ranks_feat.int().contiguous() self.ranks_depth = ranks_depth.int().contiguous() self.interval_starts = interval_starts.int().contiguous() self.interval_lengths = interval_lengths.int().contiguous() def voxel_pooling_v2(self, coor, depth, feat): ranks_bev, ranks_depth, ranks_feat, \ interval_starts, interval_lengths = \ self.voxel_pooling_prepare_v2(coor) if ranks_feat is None: print('warning ---> no points within the predefined ' 'bev receptive field') dummy = torch.zeros(size=[ feat.shape[0], feat.shape[2], int(self.grid_size[2]), int(self.grid_size[0]), int(self.grid_size[1]) ]).to(feat) dummy = torch.cat(dummy.unbind(dim=2), 1) return dummy feat = feat.permute(0, 1, 3, 4, 2) bev_feat_shape = (depth.shape[0], int(self.grid_size[2]), int(self.grid_size[1]), int(self.grid_size[0]), feat.shape[-1]) # (B, Z, Y, X, C) bev_feat = bev_pool_v2(depth, feat, ranks_depth, ranks_feat, ranks_bev, bev_feat_shape, interval_starts, interval_lengths) # collapse Z bev_feat = torch.cat(bev_feat.unbind(dim=2), 1) return bev_feat def voxel_pooling_prepare_v2(self, coor): """Data preparation for voxel pooling. 
Args: coor (torch.tensor): Coordinate of points in the lidar space in shape (B, N, D, H, W, 3). Returns: tuple[torch.tensor]: Rank of the voxel that a point is belong to in shape (N_Points); Reserved index of points in the depth space in shape (N_Points). Reserved index of points in the feature space in shape (N_Points). """ B, N, D, H, W, _ = coor.shape num_points = B * N * D * H * W # record the index of selected points for acceleration purpose ranks_depth = torch.range( 0, num_points - 1, dtype=torch.int, device=coor.device) ranks_feat = torch.range( 0, num_points // D - 1, dtype=torch.int, device=coor.device) ranks_feat = ranks_feat.reshape(B, N, 1, H, W) ranks_feat = ranks_feat.expand(B, N, D, H, W).flatten() # convert coordinate into the voxel space coor = ((coor - self.grid_lower_bound.to(coor)) / self.grid_interval.to(coor)) coor = coor.long().view(num_points, 3) batch_idx = torch.range(0, B - 1).reshape(B, 1). \ expand(B, num_points // B).reshape(num_points, 1).to(coor) coor = torch.cat((coor, batch_idx), 1) # filter out points that are outside box kept = (coor[:, 0] >= 0) & (coor[:, 0] < self.grid_size[0]) & \ (coor[:, 1] >= 0) & (coor[:, 1] < self.grid_size[1]) & \ (coor[:, 2] >= 0) & (coor[:, 2] < self.grid_size[2]) if len(kept) == 0: return None, None, None, None, None coor, ranks_depth, ranks_feat = \ coor[kept], ranks_depth[kept], ranks_feat[kept] # get tensors from the same voxel next to each other ranks_bev = coor[:, 3] * ( self.grid_size[2] * self.grid_size[1] * self.grid_size[0]) ranks_bev += coor[:, 2] * (self.grid_size[1] * self.grid_size[0]) ranks_bev += coor[:, 1] * self.grid_size[0] + coor[:, 0] order = ranks_bev.argsort() ranks_bev, ranks_depth, ranks_feat = \ ranks_bev[order], ranks_depth[order], ranks_feat[order] kept = torch.ones( ranks_bev.shape[0], device=ranks_bev.device, dtype=torch.bool) kept[1:] = ranks_bev[1:] != ranks_bev[:-1] interval_starts = torch.where(kept)[0].int() if len(interval_starts) == 0: return None, None, None, None, None interval_lengths = torch.zeros_like(interval_starts) interval_lengths[:-1] = interval_starts[1:] - interval_starts[:-1] interval_lengths[-1] = ranks_bev.shape[0] - interval_starts[-1] return ranks_bev.int().contiguous(), ranks_depth.int().contiguous( ), ranks_feat.int().contiguous(), interval_starts.int().contiguous( ), interval_lengths.int().contiguous() def pre_compute(self, cam_params): if self.initial_flag: coor = self.get_lidar_coor(*cam_params) self.init_acceleration_v2(coor) self.initial_flag = False def view_transform_core(self, cam_params, depth, tran_feat): # Lift-Splat if self.accelerate: feat = tran_feat # tran_feat.view(B, N, self.out_channels, H, W) feat = feat.permute(0, 1, 3, 4, 2) depth = depth #.view(B, N, self.D, H, W) bev_feat_shape = (depth.shape[0], int(self.grid_size[2]), int(self.grid_size[1]), int(self.grid_size[0]), feat.shape[-1]) # (B, Z, Y, X, C) bev_feat = bev_pool_v2(depth, feat, self.ranks_depth, self.ranks_feat, self.ranks_bev, bev_feat_shape, self.interval_starts, self.interval_lengths) bev_feat = bev_feat.squeeze(2) else: coor = self.get_lidar_coor(*cam_params) bev_feat = self.voxel_pooling_v2( coor, depth, tran_feat) return bev_feat def view_transform(self, cam_params, depth, tran_feat): if self.accelerate: self.pre_compute(cam_params) return self.view_transform_core(cam_params, depth, tran_feat) def forward(self, cam_params, context, depth, **kwargs): """Transform image-view feature into bird-eye-view feature. 
Args: cam_params (list(torch.tensor)): of (rots, trans, intrins, post_rots, post_trans) Returns: torch.tensor: Bird-eye-view feature in shape (B, C, H_BEV, W_BEV) """ return self.view_transform(cam_params, depth, context) def get_mlp_input(self, rot, tran, intrin, post_rot, post_tran, bda): return None @NECKS.register_module() class LSSViewTransformerFunction3D(BaseModule): r"""Lift-Splat-Shoot view transformer. Please refer to the `paper `_ Args: grid_config (dict): Config of grid alone each axis in format of (lower_bound, upper_bound, interval). axis in {x,y,z,depth}. input_size (tuple(int)): Size of input images in format of (height, width). downsample (int): Down sample factor from the input size to the feature size. in_channels (int): Channels of input feature. out_channels (int): Channels of transformed feature. accelerate (bool): Whether the view transformation is conducted with acceleration. Note: the intrinsic and extrinsic of cameras should be constant when 'accelerate' is set true. """ def __init__( self, grid_config, input_size, downsample=16, # in_channels=512, # out_channels=64, accelerate=False, uniform=False, with_cp=False, extra_relu=False, ): super(LSSViewTransformerFunction3D, self).__init__() self.uniform = uniform self.with_cp = with_cp self.extra_relu=extra_relu self.grid_config = grid_config dx, bx, nx = gen_dx_bx(self.grid_config['x'], self.grid_config['y'], self.grid_config['z'], ) self.dx = nn.Parameter(dx, requires_grad=False) self.bx = nn.Parameter(bx, requires_grad=False) self.nx = nn.Parameter(nx, requires_grad=False) self.downsample = downsample self.create_grid_infos(**grid_config) self.input_size = input_size self.create_frustum(grid_config['depth'], input_size, downsample) # self.out_channels = out_channels # self.in_channels = in_channels # self.depth_net = nn.Conv2d( in_channels, self.D + self.out_channels, kernel_size=1, padding=0) self.accelerate = accelerate self.initial_flag = True def create_grid_infos(self, x, y, z, **kwargs): """Generate the grid information including the lower bound, interval, and size. Args: x (tuple(float)): Config of grid alone x axis in format of (lower_bound, upper_bound, interval). y (tuple(float)): Config of grid alone y axis in format of (lower_bound, upper_bound, interval). z (tuple(float)): Config of grid alone z axis in format of (lower_bound, upper_bound, interval). **kwargs: Container for other potential parameters """ self.grid_lower_bound = torch.Tensor([cfg[0] for cfg in [x, y, z]]) self.grid_interval = torch.Tensor([cfg[2] for cfg in [x, y, z]]) self.grid_size = torch.Tensor([(cfg[1] - cfg[0]) / cfg[2] for cfg in [x, y, z]]) def create_frustum(self, depth_cfg, input_size, downsample): """Generate the frustum template for each image. Args: depth_cfg (tuple(float)): Config of grid alone depth axis in format of (lower_bound, upper_bound, interval). input_size (tuple(int)): Size of input images in format of (height, width). downsample (int): Down sample scale factor from the input size to the feature size. 
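# --- Editor's illustrative sketch; not part of the repository files above ---
# create_grid_infos turns each (lower_bound, upper_bound, interval) triple into
# the BEV grid origin, cell size, and cell count. With assumed config values
# x = y = (-51.2, 51.2, 0.8) and z = (-5.0, 3.0, 8.0):
import torch

def grid_infos(x, y, z):
    lower = torch.Tensor([cfg[0] for cfg in (x, y, z)])
    interval = torch.Tensor([cfg[2] for cfg in (x, y, z)])
    size = torch.Tensor([(cfg[1] - cfg[0]) / cfg[2] for cfg in (x, y, z)])
    return lower, interval, size

lower, interval, size = grid_infos((-51.2, 51.2, 0.8), (-51.2, 51.2, 0.8), (-5.0, 3.0, 8.0))
# size -> tensor([128., 128., 1.]): a 128 x 128 BEV grid with one height bin.
# ---------------------------------------------------------------------------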
""" H_in, W_in = input_size H_feat, W_feat = H_in // downsample, W_in // downsample d = torch.arange(*depth_cfg, dtype=torch.float)\ .view(-1, 1, 1).expand(-1, H_feat, W_feat) self.D = d.shape[0] x = torch.linspace(0, W_in - 1, W_feat, dtype=torch.float)\ .view(1, 1, W_feat).expand(self.D, H_feat, W_feat) y = torch.linspace(0, H_in - 1, H_feat, dtype=torch.float)\ .view(1, H_feat, 1).expand(self.D, H_feat, W_feat) # D x H x W x 3 self.frustum = torch.stack((x, y, d), -1) def get_cam2ego_coor(self, input, downsample=1): depth_cfg = self.grid_config['depth'] H_in, W_in = self.input_size H_feat, W_feat = H_in // downsample, W_in // downsample d = torch.arange(*depth_cfg, dtype=torch.float)\ .view(-1, 1, 1).expand(-1, H_feat, W_feat) D = d.shape[0] x = torch.linspace(0, W_in - 1, W_feat, dtype=torch.float)\ .view(1, 1, W_feat).expand(self.D, H_feat, W_feat) y = torch.linspace(0, H_in - 1, H_feat, dtype=torch.float)\ .view(1, H_feat, 1).expand(self.D, H_feat, W_feat) # D x H x W x 3 frustum = torch.stack((x, y, d), -1) rots, trans, cam2imgs, post_rots, post_trans, bda = input B, N, _ = trans.shape # post-transformation # B x N x D x H x W x 3 points = frustum.to(rots) - post_trans.view(B, N, 1, 1, 1, 3) points = torch.inverse(post_rots).view(B, N, 1, 1, 1, 3, 3)\ .matmul(points.unsqueeze(-1)) # cam_to_ego points = torch.cat( (points[..., :2, :] * points[..., 2:3, :], points[..., 2:3, :]), 5) combine = rots.matmul(torch.inverse(cam2imgs)) points = combine.view(B, N, 1, 1, 1, 3, 3).matmul(points).squeeze(-1) points += trans.view(B, N, 1, 1, 1, 3) points = bda.view(B, 1, 1, 1, 1, 3, 3).matmul(points.unsqueeze(-1)).squeeze(-1) coor = points coor = ((coor - self.grid_lower_bound.to(coor)) / 0.4) coor = coor.long() # filter out points that are outside box kept = (coor[..., 0] >= 0) & (coor[..., 0] < 200) & \ (coor[..., 1] >= 0) & (coor[..., 1] < 200) & \ (coor[..., 2] >= 0) & (coor[..., 2] < 16) coor[~kept] = -999 return coor def get_lidar_coor(self, rots, trans, cam2imgs, post_rots, post_trans, bda): """Calculate the locations of the frustum points in the lidar coordinate system. Args: rots (torch.Tensor): Rotation from camera coordinate system to lidar coordinate system in shape (B, N_cams, 3, 3). trans (torch.Tensor): Translation from camera coordinate system to lidar coordinate system in shape (B, N_cams, 3). cam2imgs (torch.Tensor): Camera intrinsic matrixes in shape (B, N_cams, 3, 3). post_rots (torch.Tensor): Rotation in camera coordinate system in shape (B, N_cams, 3, 3). It is derived from the image view augmentation. post_trans (torch.Tensor): Translation in camera coordinate system derived from image view augmentation in shape (B, N_cams, 3). Returns: torch.tensor: Point coordinates in shape (B, N_cams, D, ownsample, 3) """ B, N, _ = trans.shape # post-transformation # B x N x D x H x W x 3 points = self.frustum.to(rots) - post_trans.view(B, N, 1, 1, 1, 3) points = torch.inverse(post_rots).view(B, N, 1, 1, 1, 3, 3)\ .matmul(points.unsqueeze(-1)) # cam_to_ego points = torch.cat( (points[..., :2, :] * points[..., 2:3, :], points[..., 2:3, :]), 5) combine = rots.matmul(torch.inverse(cam2imgs)) points = combine.view(B, N, 1, 1, 1, 3, 3).matmul(points).squeeze(-1) points += trans.view(B, N, 1, 1, 1, 3) points = bda.view(B, 1, 1, 1, 1, 3, 3).matmul(points.unsqueeze(-1)).squeeze(-1) return points def init_acceleration_v2(self, coor): """Pre-compute the necessary information in acceleration including the index of points in the final feature. 
Args: coor (torch.tensor): Coordinate of points in lidar space in shape (B, N_cams, D, H, W, 3). x (torch.tensor): Feature of points in shape (B, N_cams, D, H, W, C). """ ranks_bev, ranks_depth, ranks_feat, \ interval_starts, interval_lengths = \ self.voxel_pooling_prepare_v2(coor) self.ranks_bev = ranks_bev.int().contiguous() self.ranks_feat = ranks_feat.int().contiguous() self.ranks_depth = ranks_depth.int().contiguous() self.interval_starts = interval_starts.int().contiguous() self.interval_lengths = interval_lengths.int().contiguous() def voxel_pooling_v2(self, coor, depth, feat): ranks_bev, ranks_depth, ranks_feat, \ interval_starts, interval_lengths = \ self.voxel_pooling_prepare_v2(coor) if ranks_feat is None: print('warning ---> no points within the predefined ' 'bev receptive field') dummy = torch.zeros(size=[ feat.shape[0], feat.shape[2], int(self.grid_size[0]), int(self.grid_size[1]), int(self.grid_size[2]), ]).to(feat) # dummy = torch.cat(dummy.unbind(dim=2), 1) return dummy feat = feat.permute(0, 1, 3, 4, 2) bev_feat_shape = (depth.shape[0], int(self.grid_size[2]), int(self.grid_size[1]), int(self.grid_size[0]), feat.shape[-1]) # (B, Z, Y, X, C) bev_feat = bev_pool_v2(depth, feat, ranks_depth, ranks_feat, ranks_bev, bev_feat_shape, interval_starts, interval_lengths) bev_feat = bev_feat.permute(0, 1, 3, 4, 2) # B, C, Z, X, Y- > B, C, X, Y, Z # bev_feat = torch.cat(bev_feat.unbind(dim=2), 1) return bev_feat def voxel_pooling_prepare_v2(self, coor): """Data preparation for voxel pooling. Args: coor (torch.tensor): Coordinate of points in the lidar space in shape (B, N, D, H, W, 3). Returns: tuple[torch.tensor]: Rank of the voxel that a point is belong to in shape (N_Points); Reserved index of points in the depth space in shape (N_Points). Reserved index of points in the feature space in shape (N_Points). """ B, N, D, H, W, _ = coor.shape num_points = B * N * D * H * W # record the index of selected points for acceleration purpose ranks_depth = torch.arange( 0, num_points, dtype=torch.int, device=coor.device) ranks_feat = torch.arange( 0, num_points // D , dtype=torch.int, device=coor.device) ranks_feat = ranks_feat.reshape(B, N, 1, H, W) ranks_feat = ranks_feat.expand(B, N, D, H, W).flatten() # convert coordinate into the voxel space coor = ((coor - self.grid_lower_bound.to(coor)) / self.grid_interval.to(coor)) coor = coor.long().view(num_points, 3) batch_idx = torch.arange(0, B ).reshape(B, 1). 
\ expand(B, num_points // B).reshape(num_points, 1).to(coor) coor = torch.cat((coor, batch_idx), 1) # filter out points that are outside box kept = (coor[:, 0] >= 0) & (coor[:, 0] < self.grid_size[0]) & \ (coor[:, 1] >= 0) & (coor[:, 1] < self.grid_size[1]) & \ (coor[:, 2] >= 0) & (coor[:, 2] < self.grid_size[2]) if len(kept) == 0: return None, None, None, None, None coor, ranks_depth, ranks_feat = \ coor[kept], ranks_depth[kept], ranks_feat[kept] # get tensors from the same voxel next to each other ranks_bev = coor[:, 3] * ( self.grid_size[2] * self.grid_size[1] * self.grid_size[0]) ranks_bev += coor[:, 2] * (self.grid_size[1] * self.grid_size[0]) ranks_bev += coor[:, 1] * self.grid_size[0] + coor[:, 0] order = ranks_bev.argsort() ranks_bev, ranks_depth, ranks_feat = \ ranks_bev[order], ranks_depth[order], ranks_feat[order] kept = torch.ones( ranks_bev.shape[0], device=ranks_bev.device, dtype=torch.bool) kept[1:] = ranks_bev[1:] != ranks_bev[:-1] interval_starts = torch.where(kept)[0].int() if len(interval_starts) == 0: return None, None, None, None, None interval_lengths = torch.zeros_like(interval_starts) interval_lengths[:-1] = interval_starts[1:] - interval_starts[:-1] interval_lengths[-1] = ranks_bev.shape[0] - interval_starts[-1] return ranks_bev.int().contiguous(), ranks_depth.int().contiguous( ), ranks_feat.int().contiguous(), interval_starts.int().contiguous( ), interval_lengths.int().contiguous() def pre_compute(self, cam_params): if self.initial_flag: coor = self.get_lidar_coor(*cam_params) self.init_acceleration_v2(coor) self.initial_flag = False def view_transform_core(self, cam_params, depth, tran_feat): # B, N, C, H, W = input[0].shape # Lift-Splat if self.accelerate: feat = tran_feat # tran_feat.view(B, N, self.out_channels, H, W) feat = feat.permute(0, 1, 3, 4, 2) depth = depth #.view(B, N, self.D, H, W) bev_feat_shape = (depth.shape[0], int(self.grid_size[2]), int(self.grid_size[1]), int(self.grid_size[0]), feat.shape[-1]) # (B, Z, Y, X, C) bev_feat = bev_pool_v2(depth, feat, self.ranks_depth, self.ranks_feat, self.ranks_bev, bev_feat_shape, self.interval_starts, self.interval_lengths) assert False bev_feat = bev_feat.squeeze(2) else: coor = self.get_lidar_coor(*cam_params) bev_feat = self.voxel_pooling_v2( coor, depth, tran_feat) return bev_feat def view_transform(self, cam_params, depth, tran_feat): if self.accelerate: self.pre_compute(cam_params) return self.view_transform_core(cam_params, depth, tran_feat) # @run_time('lss3d') def forward(self, cam_params, context, depth, **kwargs): """Transform image-view feature into bird-eye-view feature. Args: input (list(torch.tensor)): of (image-view feature, rots, trans, intrins, post_rots, post_trans) Returns: torch.tensor: Bird-eye-view feature in shape (B, C, H_BEV, W_BEV) """ bev = self.view_transform(cam_params, depth, context) if self.extra_relu: return bev.relu() else: return bev def get_mlp_input(self, rot, tran, intrin, post_rot, post_tran, bda): return None ================================================ FILE: mmdet3d/models/fusion_layers/__init__.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. 
from .coord_transform import (apply_3d_transformation, bbox_2d_transform, coord_2d_transform) from .point_fusion import PointFusion from .vote_fusion import VoteFusion __all__ = [ 'PointFusion', 'VoteFusion', 'apply_3d_transformation', 'bbox_2d_transform', 'coord_2d_transform' ] ================================================ FILE: mmdet3d/models/fusion_layers/coord_transform.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. from functools import partial import torch from mmdet3d.core.points import get_points_type def apply_3d_transformation(pcd, coord_type, img_meta, reverse=False): """Apply transformation to input point cloud. Args: pcd (torch.Tensor): The point cloud to be transformed. coord_type (str): 'DEPTH' or 'CAMERA' or 'LIDAR'. img_meta(dict): Meta info regarding data transformation. reverse (bool): Reversed transformation or not. Note: The elements in img_meta['transformation_3d_flow']: "T" stands for translation; "S" stands for scale; "R" stands for rotation; "HF" stands for horizontal flip; "VF" stands for vertical flip. Returns: torch.Tensor: The transformed point cloud. """ dtype = pcd.dtype device = pcd.device pcd_rotate_mat = ( torch.tensor(img_meta['pcd_rotation'], dtype=dtype, device=device) if 'pcd_rotation' in img_meta else torch.eye( 3, dtype=dtype, device=device)) pcd_scale_factor = ( img_meta['pcd_scale_factor'] if 'pcd_scale_factor' in img_meta else 1.) pcd_trans_factor = ( torch.tensor(img_meta['pcd_trans'], dtype=dtype, device=device) if 'pcd_trans' in img_meta else torch.zeros( (3), dtype=dtype, device=device)) pcd_horizontal_flip = img_meta[ 'pcd_horizontal_flip'] if 'pcd_horizontal_flip' in \ img_meta else False pcd_vertical_flip = img_meta[ 'pcd_vertical_flip'] if 'pcd_vertical_flip' in \ img_meta else False flow = img_meta['transformation_3d_flow'] \ if 'transformation_3d_flow' in img_meta else [] pcd = pcd.clone() # prevent inplace modification pcd = get_points_type(coord_type)(pcd) horizontal_flip_func = partial(pcd.flip, bev_direction='horizontal') \ if pcd_horizontal_flip else lambda: None vertical_flip_func = partial(pcd.flip, bev_direction='vertical') \ if pcd_vertical_flip else lambda: None if reverse: scale_func = partial(pcd.scale, scale_factor=1.0 / pcd_scale_factor) translate_func = partial(pcd.translate, trans_vector=-pcd_trans_factor) # pcd_rotate_mat @ pcd_rotate_mat.inverse() is not # exactly an identity matrix # use angle to create the inverse rot matrix neither. rotate_func = partial(pcd.rotate, rotation=pcd_rotate_mat.inverse()) # reverse the pipeline flow = flow[::-1] else: scale_func = partial(pcd.scale, scale_factor=pcd_scale_factor) translate_func = partial(pcd.translate, trans_vector=pcd_trans_factor) rotate_func = partial(pcd.rotate, rotation=pcd_rotate_mat) flow_mapping = { 'T': translate_func, 'S': scale_func, 'R': rotate_func, 'HF': horizontal_flip_func, 'VF': vertical_flip_func } for op in flow: assert op in flow_mapping, f'This 3D data '\ f'transformation op ({op}) is not supported' func = flow_mapping[op] func() return pcd.coord def extract_2d_info(img_meta, tensor): """Extract image augmentation information from img_meta. Args: img_meta(dict): Meta info regarding data transformation. tensor(torch.Tensor): Input tensor used to create new ones. Returns: (int, int, int, int, torch.Tensor, bool, torch.Tensor): The extracted information. 
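# --- Editor's illustrative sketch; not part of the repository files above ---
# apply_3d_transformation replays (or, with reverse=True, undoes) the recorded
# point-cloud augmentations in the order stored in
# img_meta['transformation_3d_flow']. A pure-tensor analog for a flow of
# scale ('S') followed by translation ('T'):
import torch

points = torch.tensor([[1.0, 2.0, 0.5]])
scale, trans = 1.1, torch.tensor([0.3, -0.2, 0.0])

forward = points * scale + trans                   # flow ['S', 'T']
restored = (forward - trans) / scale               # reversed flow ['T', 'S']
assert torch.allclose(restored, points)
# reverse=True in the real helper walks the flow backwards and inverts each op
# (divide by the scale, subtract the translation, rotate by the inverse matrix).
# ---------------------------------------------------------------------------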
""" img_shape = img_meta['img_shape'] ori_shape = img_meta['ori_shape'] img_h, img_w, _ = img_shape ori_h, ori_w, _ = ori_shape img_scale_factor = ( tensor.new_tensor(img_meta['scale_factor'][:2]) if 'scale_factor' in img_meta else tensor.new_tensor([1.0, 1.0])) img_flip = img_meta['flip'] if 'flip' in img_meta else False img_crop_offset = ( tensor.new_tensor(img_meta['img_crop_offset']) if 'img_crop_offset' in img_meta else tensor.new_tensor([0.0, 0.0])) return (img_h, img_w, ori_h, ori_w, img_scale_factor, img_flip, img_crop_offset) def bbox_2d_transform(img_meta, bbox_2d, ori2new): """Transform 2d bbox according to img_meta. Args: img_meta(dict): Meta info regarding data transformation. bbox_2d (torch.Tensor): Shape (..., >4) The input 2d bboxes to transform. ori2new (bool): Origin img coord system to new or not. Returns: torch.Tensor: The transformed 2d bboxes. """ img_h, img_w, ori_h, ori_w, img_scale_factor, img_flip, \ img_crop_offset = extract_2d_info(img_meta, bbox_2d) bbox_2d_new = bbox_2d.clone() if ori2new: bbox_2d_new[:, 0] = bbox_2d_new[:, 0] * img_scale_factor[0] bbox_2d_new[:, 2] = bbox_2d_new[:, 2] * img_scale_factor[0] bbox_2d_new[:, 1] = bbox_2d_new[:, 1] * img_scale_factor[1] bbox_2d_new[:, 3] = bbox_2d_new[:, 3] * img_scale_factor[1] bbox_2d_new[:, 0] = bbox_2d_new[:, 0] + img_crop_offset[0] bbox_2d_new[:, 2] = bbox_2d_new[:, 2] + img_crop_offset[0] bbox_2d_new[:, 1] = bbox_2d_new[:, 1] + img_crop_offset[1] bbox_2d_new[:, 3] = bbox_2d_new[:, 3] + img_crop_offset[1] if img_flip: bbox_2d_r = img_w - bbox_2d_new[:, 0] bbox_2d_l = img_w - bbox_2d_new[:, 2] bbox_2d_new[:, 0] = bbox_2d_l bbox_2d_new[:, 2] = bbox_2d_r else: if img_flip: bbox_2d_r = img_w - bbox_2d_new[:, 0] bbox_2d_l = img_w - bbox_2d_new[:, 2] bbox_2d_new[:, 0] = bbox_2d_l bbox_2d_new[:, 2] = bbox_2d_r bbox_2d_new[:, 0] = bbox_2d_new[:, 0] - img_crop_offset[0] bbox_2d_new[:, 2] = bbox_2d_new[:, 2] - img_crop_offset[0] bbox_2d_new[:, 1] = bbox_2d_new[:, 1] - img_crop_offset[1] bbox_2d_new[:, 3] = bbox_2d_new[:, 3] - img_crop_offset[1] bbox_2d_new[:, 0] = bbox_2d_new[:, 0] / img_scale_factor[0] bbox_2d_new[:, 2] = bbox_2d_new[:, 2] / img_scale_factor[0] bbox_2d_new[:, 1] = bbox_2d_new[:, 1] / img_scale_factor[1] bbox_2d_new[:, 3] = bbox_2d_new[:, 3] / img_scale_factor[1] return bbox_2d_new def coord_2d_transform(img_meta, coord_2d, ori2new): """Transform 2d pixel coordinates according to img_meta. Args: img_meta(dict): Meta info regarding data transformation. coord_2d (torch.Tensor): Shape (..., 2) The input 2d coords to transform. ori2new (bool): Origin img coord system to new or not. Returns: torch.Tensor: The transformed 2d coordinates. 
""" img_h, img_w, ori_h, ori_w, img_scale_factor, img_flip, \ img_crop_offset = extract_2d_info(img_meta, coord_2d) coord_2d_new = coord_2d.clone() if ori2new: # TODO here we assume this order of transformation coord_2d_new[..., 0] = coord_2d_new[..., 0] * img_scale_factor[0] coord_2d_new[..., 1] = coord_2d_new[..., 1] * img_scale_factor[1] coord_2d_new[..., 0] += img_crop_offset[0] coord_2d_new[..., 1] += img_crop_offset[1] # flip uv coordinates and bbox if img_flip: coord_2d_new[..., 0] = img_w - coord_2d_new[..., 0] else: if img_flip: coord_2d_new[..., 0] = img_w - coord_2d_new[..., 0] coord_2d_new[..., 0] -= img_crop_offset[0] coord_2d_new[..., 1] -= img_crop_offset[1] coord_2d_new[..., 0] = coord_2d_new[..., 0] / img_scale_factor[0] coord_2d_new[..., 1] = coord_2d_new[..., 1] / img_scale_factor[1] return coord_2d_new ================================================ FILE: mmdet3d/models/fusion_layers/point_fusion.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import torch from mmcv.cnn import ConvModule from mmcv.runner import BaseModule from torch import nn as nn from torch.nn import functional as F from mmdet3d.core.bbox.structures import (get_proj_mat_by_coord_type, points_cam2img) from ..builder import FUSION_LAYERS from . import apply_3d_transformation def point_sample(img_meta, img_features, points, proj_mat, coord_type, img_scale_factor, img_crop_offset, img_flip, img_pad_shape, img_shape, aligned=True, padding_mode='zeros', align_corners=True): """Obtain image features using points. Args: img_meta (dict): Meta info. img_features (torch.Tensor): 1 x C x H x W image features. points (torch.Tensor): Nx3 point cloud in LiDAR coordinates. proj_mat (torch.Tensor): 4x4 transformation matrix. coord_type (str): 'DEPTH' or 'CAMERA' or 'LIDAR'. img_scale_factor (torch.Tensor): Scale factor with shape of (w_scale, h_scale). img_crop_offset (torch.Tensor): Crop offset used to crop image during data augmentation with shape of (w_offset, h_offset). img_flip (bool): Whether the image is flipped. img_pad_shape (tuple[int]): int tuple indicates the h & w after padding, this is necessary to obtain features in feature map. img_shape (tuple[int]): int tuple indicates the h & w before padding after scaling, this is necessary for flipping coordinates. aligned (bool, optional): Whether use bilinear interpolation when sampling image features for each point. Defaults to True. padding_mode (str, optional): Padding mode when padding values for features of out-of-image points. Defaults to 'zeros'. align_corners (bool, optional): Whether to align corners when sampling image features for each point. Defaults to True. Returns: torch.Tensor: NxC image features sampled by point coordinates. 
""" # apply transformation based on info in img_meta points = apply_3d_transformation( points, coord_type, img_meta, reverse=True) # project points to camera coordinate pts_2d = points_cam2img(points, proj_mat) # img transformation: scale -> crop -> flip # the image is resized by img_scale_factor img_coors = pts_2d[:, 0:2] * img_scale_factor # Nx2 img_coors -= img_crop_offset # grid sample, the valid grid range should be in [-1,1] coor_x, coor_y = torch.split(img_coors, 1, dim=1) # each is Nx1 if img_flip: # by default we take it as horizontal flip # use img_shape before padding for flip orig_h, orig_w = img_shape coor_x = orig_w - coor_x h, w = img_pad_shape coor_y = coor_y / h * 2 - 1 coor_x = coor_x / w * 2 - 1 grid = torch.cat([coor_x, coor_y], dim=1).unsqueeze(0).unsqueeze(0) # Nx2 -> 1x1xNx2 # align_corner=True provides higher performance mode = 'bilinear' if aligned else 'nearest' point_features = F.grid_sample( img_features, grid, mode=mode, padding_mode=padding_mode, align_corners=align_corners) # 1xCx1xN feats return point_features.squeeze().t() @FUSION_LAYERS.register_module() class PointFusion(BaseModule): """Fuse image features from multi-scale features. Args: img_channels (list[int] | int): Channels of image features. It could be a list if the input is multi-scale image features. pts_channels (int): Channels of point features mid_channels (int): Channels of middle layers out_channels (int): Channels of output fused features img_levels (int, optional): Number of image levels. Defaults to 3. coord_type (str): 'DEPTH' or 'CAMERA' or 'LIDAR'. Defaults to 'LIDAR'. conv_cfg (dict, optional): Dict config of conv layers of middle layers. Defaults to None. norm_cfg (dict, optional): Dict config of norm layers of middle layers. Defaults to None. act_cfg (dict, optional): Dict config of activatation layers. Defaults to None. activate_out (bool, optional): Whether to apply relu activation to output features. Defaults to True. fuse_out (bool, optional): Whether apply conv layer to the fused features. Defaults to False. dropout_ratio (int, float, optional): Dropout ratio of image features to prevent overfitting. Defaults to 0. aligned (bool, optional): Whether apply aligned feature fusion. Defaults to True. align_corners (bool, optional): Whether to align corner when sampling features according to points. Defaults to True. padding_mode (str, optional): Mode used to pad the features of points that do not have corresponding image features. Defaults to 'zeros'. lateral_conv (bool, optional): Whether to apply lateral convs to image features. Defaults to True. 
""" def __init__(self, img_channels, pts_channels, mid_channels, out_channels, img_levels=3, coord_type='LIDAR', conv_cfg=None, norm_cfg=None, act_cfg=None, init_cfg=None, activate_out=True, fuse_out=False, dropout_ratio=0, aligned=True, align_corners=True, padding_mode='zeros', lateral_conv=True): super(PointFusion, self).__init__(init_cfg=init_cfg) if isinstance(img_levels, int): img_levels = [img_levels] if isinstance(img_channels, int): img_channels = [img_channels] * len(img_levels) assert isinstance(img_levels, list) assert isinstance(img_channels, list) assert len(img_channels) == len(img_levels) self.img_levels = img_levels self.coord_type = coord_type self.act_cfg = act_cfg self.activate_out = activate_out self.fuse_out = fuse_out self.dropout_ratio = dropout_ratio self.img_channels = img_channels self.aligned = aligned self.align_corners = align_corners self.padding_mode = padding_mode self.lateral_convs = None if lateral_conv: self.lateral_convs = nn.ModuleList() for i in range(len(img_channels)): l_conv = ConvModule( img_channels[i], mid_channels, 3, padding=1, conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=self.act_cfg, inplace=False) self.lateral_convs.append(l_conv) self.img_transform = nn.Sequential( nn.Linear(mid_channels * len(img_channels), out_channels), nn.BatchNorm1d(out_channels, eps=1e-3, momentum=0.01), ) else: self.img_transform = nn.Sequential( nn.Linear(sum(img_channels), out_channels), nn.BatchNorm1d(out_channels, eps=1e-3, momentum=0.01), ) self.pts_transform = nn.Sequential( nn.Linear(pts_channels, out_channels), nn.BatchNorm1d(out_channels, eps=1e-3, momentum=0.01), ) if self.fuse_out: self.fuse_conv = nn.Sequential( nn.Linear(mid_channels, out_channels), # For pts the BN is initialized differently by default # TODO: check whether this is necessary nn.BatchNorm1d(out_channels, eps=1e-3, momentum=0.01), nn.ReLU(inplace=False)) if init_cfg is None: self.init_cfg = [ dict(type='Xavier', layer='Conv2d', distribution='uniform'), dict(type='Xavier', layer='Linear', distribution='uniform') ] def forward(self, img_feats, pts, pts_feats, img_metas): """Forward function. Args: img_feats (list[torch.Tensor]): Image features. pts: [list[torch.Tensor]]: A batch of points with shape N x 3. pts_feats (torch.Tensor): A tensor consist of point features of the total batch. img_metas (list[dict]): Meta information of images. Returns: torch.Tensor: Fused features of each point. """ img_pts = self.obtain_mlvl_feats(img_feats, pts, img_metas) img_pre_fuse = self.img_transform(img_pts) if self.training and self.dropout_ratio > 0: img_pre_fuse = F.dropout(img_pre_fuse, self.dropout_ratio) pts_pre_fuse = self.pts_transform(pts_feats) fuse_out = img_pre_fuse + pts_pre_fuse if self.activate_out: fuse_out = F.relu(fuse_out) if self.fuse_out: fuse_out = self.fuse_conv(fuse_out) return fuse_out def obtain_mlvl_feats(self, img_feats, pts, img_metas): """Obtain multi-level features for each point. Args: img_feats (list(torch.Tensor)): Multi-scale image features produced by image backbone in shape (N, C, H, W). pts (list[torch.Tensor]): Points of each sample. img_metas (list[dict]): Meta information for each sample. Returns: torch.Tensor: Corresponding image features of each point. 
""" if self.lateral_convs is not None: img_ins = [ lateral_conv(img_feats[i]) for i, lateral_conv in zip(self.img_levels, self.lateral_convs) ] else: img_ins = img_feats img_feats_per_point = [] # Sample multi-level features for i in range(len(img_metas)): mlvl_img_feats = [] for level in range(len(self.img_levels)): mlvl_img_feats.append( self.sample_single(img_ins[level][i:i + 1], pts[i][:, :3], img_metas[i])) mlvl_img_feats = torch.cat(mlvl_img_feats, dim=-1) img_feats_per_point.append(mlvl_img_feats) img_pts = torch.cat(img_feats_per_point, dim=0) return img_pts def sample_single(self, img_feats, pts, img_meta): """Sample features from single level image feature map. Args: img_feats (torch.Tensor): Image feature map in shape (1, C, H, W). pts (torch.Tensor): Points of a single sample. img_meta (dict): Meta information of the single sample. Returns: torch.Tensor: Single level image features of each point. """ # TODO: image transformation also extracted img_scale_factor = ( pts.new_tensor(img_meta['scale_factor'][:2]) if 'scale_factor' in img_meta.keys() else 1) img_flip = img_meta['flip'] if 'flip' in img_meta.keys() else False img_crop_offset = ( pts.new_tensor(img_meta['img_crop_offset']) if 'img_crop_offset' in img_meta.keys() else 0) proj_mat = get_proj_mat_by_coord_type(img_meta, self.coord_type) img_pts = point_sample( img_meta=img_meta, img_features=img_feats, points=pts, proj_mat=pts.new_tensor(proj_mat), coord_type=self.coord_type, img_scale_factor=img_scale_factor, img_crop_offset=img_crop_offset, img_flip=img_flip, img_pad_shape=img_meta['input_shape'][:2], img_shape=img_meta['img_shape'][:2], aligned=self.aligned, padding_mode=self.padding_mode, align_corners=self.align_corners, ) return img_pts ================================================ FILE: mmdet3d/models/fusion_layers/vote_fusion.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import torch from torch import nn as nn from mmdet3d.core.bbox import points_cam2img from ..builder import FUSION_LAYERS from . import apply_3d_transformation, bbox_2d_transform, coord_2d_transform EPS = 1e-6 @FUSION_LAYERS.register_module() class VoteFusion(nn.Module): """Fuse 2d features from 3d seeds. Args: num_classes (int): number of classes. max_imvote_per_pixel (int): max number of imvotes. """ def __init__(self, num_classes=10, max_imvote_per_pixel=3): super(VoteFusion, self).__init__() self.num_classes = num_classes self.max_imvote_per_pixel = max_imvote_per_pixel def forward(self, imgs, bboxes_2d_rescaled, seeds_3d_depth, img_metas): """Forward function. Args: imgs (list[torch.Tensor]): Image features. bboxes_2d_rescaled (list[torch.Tensor]): 2D bboxes. seeds_3d_depth (torch.Tensor): 3D seeds. img_metas (list[dict]): Meta information of images. Returns: torch.Tensor: Concatenated cues of each point. torch.Tensor: Validity mask of each feature. 
""" img_features = [] masks = [] for i, data in enumerate( zip(imgs, bboxes_2d_rescaled, seeds_3d_depth, img_metas)): img, bbox_2d_rescaled, seed_3d_depth, img_meta = data bbox_num = bbox_2d_rescaled.shape[0] seed_num = seed_3d_depth.shape[0] img_shape = img_meta['img_shape'] img_h, img_w, _ = img_shape # first reverse the data transformations xyz_depth = apply_3d_transformation( seed_3d_depth, 'DEPTH', img_meta, reverse=True) # project points from depth to image depth2img = xyz_depth.new_tensor(img_meta['depth2img']) uvz_origin = points_cam2img(xyz_depth, depth2img, True) z_cam = uvz_origin[..., 2] uv_origin = (uvz_origin[..., :2] - 1).round() # rescale 2d coordinates and bboxes uv_rescaled = coord_2d_transform(img_meta, uv_origin, True) bbox_2d_origin = bbox_2d_transform(img_meta, bbox_2d_rescaled, False) if bbox_num == 0: imvote_num = seed_num * self.max_imvote_per_pixel # use zero features two_cues = torch.zeros((15, imvote_num), device=seed_3d_depth.device) mask_zero = torch.zeros( imvote_num - seed_num, device=seed_3d_depth.device).bool() mask_one = torch.ones( seed_num, device=seed_3d_depth.device).bool() mask = torch.cat([mask_one, mask_zero], dim=0) else: # expand bboxes and seeds bbox_expanded = bbox_2d_origin.view(1, bbox_num, -1).expand( seed_num, -1, -1) seed_2d_expanded = uv_origin.view(seed_num, 1, -1).expand(-1, bbox_num, -1) seed_2d_expanded_x, seed_2d_expanded_y = \ seed_2d_expanded.split(1, dim=-1) bbox_expanded_l, bbox_expanded_t, bbox_expanded_r, \ bbox_expanded_b, bbox_expanded_conf, bbox_expanded_cls = \ bbox_expanded.split(1, dim=-1) bbox_expanded_midx = (bbox_expanded_l + bbox_expanded_r) / 2 bbox_expanded_midy = (bbox_expanded_t + bbox_expanded_b) / 2 seed_2d_in_bbox_x = (seed_2d_expanded_x > bbox_expanded_l) * \ (seed_2d_expanded_x < bbox_expanded_r) seed_2d_in_bbox_y = (seed_2d_expanded_y > bbox_expanded_t) * \ (seed_2d_expanded_y < bbox_expanded_b) seed_2d_in_bbox = seed_2d_in_bbox_x * seed_2d_in_bbox_y # semantic cues, dim=class_num sem_cue = torch.zeros_like(bbox_expanded_conf).expand( -1, -1, self.num_classes) sem_cue = sem_cue.scatter(-1, bbox_expanded_cls.long(), bbox_expanded_conf) # bbox center - uv delta_u = bbox_expanded_midx - seed_2d_expanded_x delta_v = bbox_expanded_midy - seed_2d_expanded_y seed_3d_expanded = seed_3d_depth.view(seed_num, 1, -1).expand( -1, bbox_num, -1) z_cam = z_cam.view(seed_num, 1, 1).expand(-1, bbox_num, -1) imvote = torch.cat( [delta_u, delta_v, torch.zeros_like(delta_v)], dim=-1).view(-1, 3) imvote = imvote * z_cam.reshape(-1, 1) imvote = imvote @ torch.inverse(depth2img.t()) # apply transformation to lifted imvotes imvote = apply_3d_transformation( imvote, 'DEPTH', img_meta, reverse=False) seed_3d_expanded = seed_3d_expanded.reshape(imvote.shape) # ray angle ray_angle = seed_3d_expanded + imvote ray_angle /= torch.sqrt(torch.sum(ray_angle**2, -1) + EPS).unsqueeze(-1) # imvote lifted to 3d xz = ray_angle[:, [0, 2]] / (ray_angle[:, [1]] + EPS) \ * seed_3d_expanded[:, [1]] - seed_3d_expanded[:, [0, 2]] # geometric cues, dim=5 geo_cue = torch.cat([xz, ray_angle], dim=-1).view(seed_num, -1, 5) two_cues = torch.cat([geo_cue, sem_cue], dim=-1) # mask to 0 if seed not in bbox two_cues = two_cues * seed_2d_in_bbox.float() feature_size = two_cues.shape[-1] # if bbox number is too small, append zeros if bbox_num < self.max_imvote_per_pixel: append_num = self.max_imvote_per_pixel - bbox_num append_zeros = torch.zeros( (seed_num, append_num, 1), device=seed_2d_in_bbox.device).bool() seed_2d_in_bbox = torch.cat( [seed_2d_in_bbox, 
append_zeros], dim=1) append_zeros = torch.zeros( (seed_num, append_num, feature_size), device=two_cues.device) two_cues = torch.cat([two_cues, append_zeros], dim=1) append_zeros = torch.zeros((seed_num, append_num, 1), device=two_cues.device) bbox_expanded_conf = torch.cat( [bbox_expanded_conf, append_zeros], dim=1) # sort the valid seed-bbox pair according to confidence pair_score = seed_2d_in_bbox.float() + bbox_expanded_conf # and find the largests mask, indices = pair_score.topk( self.max_imvote_per_pixel, dim=1, largest=True, sorted=True) indices_img = indices.expand(-1, -1, feature_size) two_cues = two_cues.gather(dim=1, index=indices_img) two_cues = two_cues.transpose(1, 0) two_cues = two_cues.reshape(-1, feature_size).transpose( 1, 0).contiguous() # since conf is ~ (0, 1), floor gives us validity mask = mask.floor().int() mask = mask.transpose(1, 0).reshape(-1).bool() # clear the padding img = img[:, :img_shape[0], :img_shape[1]] img_flatten = img.reshape(3, -1).float() img_flatten /= 255. # take the normalized pixel value as texture cue uv_rescaled[:, 0] = torch.clamp(uv_rescaled[:, 0].round(), 0, img_shape[1] - 1) uv_rescaled[:, 1] = torch.clamp(uv_rescaled[:, 1].round(), 0, img_shape[0] - 1) uv_flatten = uv_rescaled[:, 1].round() * \ img_shape[1] + uv_rescaled[:, 0].round() uv_expanded = uv_flatten.unsqueeze(0).expand(3, -1).long() txt_cue = torch.gather(img_flatten, dim=-1, index=uv_expanded) txt_cue = txt_cue.unsqueeze(1).expand(-1, self.max_imvote_per_pixel, -1).reshape(3, -1) # append texture cue img_feature = torch.cat([two_cues, txt_cue], dim=0) img_features.append(img_feature) masks.append(mask) return torch.stack(img_features, 0), torch.stack(masks, 0) ================================================ FILE: mmdet3d/models/losses/__init__.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. from mmdet.models.losses import FocalLoss, SmoothL1Loss, binary_cross_entropy from .axis_aligned_iou_loss import AxisAlignedIoULoss, axis_aligned_iou_loss from .chamfer_distance import ChamferDistance, chamfer_distance from .multibin_loss import MultiBinLoss from .paconv_regularization_loss import PAConvRegularizationLoss from .rotated_iou_loss import RotatedIoU3DLoss from .uncertain_smooth_l1_loss import UncertainL1Loss, UncertainSmoothL1Loss __all__ = [ 'FocalLoss', 'SmoothL1Loss', 'binary_cross_entropy', 'ChamferDistance', 'chamfer_distance', 'axis_aligned_iou_loss', 'AxisAlignedIoULoss', 'PAConvRegularizationLoss', 'UncertainL1Loss', 'UncertainSmoothL1Loss', 'MultiBinLoss', 'RotatedIoU3DLoss' ] ================================================ FILE: mmdet3d/models/losses/axis_aligned_iou_loss.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import torch from torch import nn as nn from mmdet.models.losses.utils import weighted_loss from ...core.bbox import AxisAlignedBboxOverlaps3D from ..builder import LOSSES @weighted_loss def axis_aligned_iou_loss(pred, target): """Calculate the IoU loss (1-IoU) of two sets of axis aligned bounding boxes. Note that predictions and targets are one-to-one corresponded. Args: pred (torch.Tensor): Bbox predictions with shape [..., 6] (x1, y1, z1, x2, y2, z2). target (torch.Tensor): Bbox targets (gt) with shape [..., 6] (x1, y1, z1, x2, y2, z2). Returns: torch.Tensor: IoU loss between predictions and targets. 
""" axis_aligned_iou = AxisAlignedBboxOverlaps3D()( pred, target, is_aligned=True) iou_loss = 1 - axis_aligned_iou return iou_loss @LOSSES.register_module() class AxisAlignedIoULoss(nn.Module): """Calculate the IoU loss (1-IoU) of axis aligned bounding boxes. Args: reduction (str): Method to reduce losses. The valid reduction method are none, sum or mean. loss_weight (float, optional): Weight of loss. Defaults to 1.0. """ def __init__(self, reduction='mean', loss_weight=1.0): super(AxisAlignedIoULoss, self).__init__() assert reduction in ['none', 'sum', 'mean'] self.reduction = reduction self.loss_weight = loss_weight def forward(self, pred, target, weight=None, avg_factor=None, reduction_override=None, **kwargs): """Forward function of loss calculation. Args: pred (torch.Tensor): Bbox predictions with shape [..., 6] (x1, y1, z1, x2, y2, z2). target (torch.Tensor): Bbox targets (gt) with shape [..., 6] (x1, y1, z1, x2, y2, z2). weight (torch.Tensor | float, optional): Weight of loss. Defaults to None. avg_factor (int, optional): Average factor that is used to average the loss. Defaults to None. reduction_override (str, optional): Method to reduce losses. The valid reduction method are 'none', 'sum' or 'mean'. Defaults to None. Returns: torch.Tensor: IoU loss between predictions and targets. """ assert reduction_override in (None, 'none', 'mean', 'sum') reduction = ( reduction_override if reduction_override else self.reduction) if (weight is not None) and (not torch.any(weight > 0)) and ( reduction != 'none'): return (pred * weight).sum() return axis_aligned_iou_loss( pred, target, weight=weight, avg_factor=avg_factor, reduction=reduction) * self.loss_weight ================================================ FILE: mmdet3d/models/losses/chamfer_distance.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import torch from torch import nn as nn from torch.nn.functional import l1_loss, mse_loss, smooth_l1_loss from ..builder import LOSSES def chamfer_distance(src, dst, src_weight=1.0, dst_weight=1.0, criterion_mode='l2', reduction='mean'): """Calculate Chamfer Distance of two sets. Args: src (torch.Tensor): Source set with shape [B, N, C] to calculate Chamfer Distance. dst (torch.Tensor): Destination set with shape [B, M, C] to calculate Chamfer Distance. src_weight (torch.Tensor or float): Weight of source loss. dst_weight (torch.Tensor or float): Weight of destination loss. criterion_mode (str): Criterion mode to calculate distance. The valid modes are smooth_l1, l1 or l2. reduction (str): Method to reduce losses. The valid reduction method are 'none', 'sum' or 'mean'. Returns: tuple: Source and Destination loss with the corresponding indices. - loss_src (torch.Tensor): The min distance from source to destination. - loss_dst (torch.Tensor): The min distance from destination to source. - indices1 (torch.Tensor): Index the min distance point for each point in source to destination. - indices2 (torch.Tensor): Index the min distance point for each point in destination to source. 
""" if criterion_mode == 'smooth_l1': criterion = smooth_l1_loss elif criterion_mode == 'l1': criterion = l1_loss elif criterion_mode == 'l2': criterion = mse_loss else: raise NotImplementedError src_expand = src.unsqueeze(2).repeat(1, 1, dst.shape[1], 1) dst_expand = dst.unsqueeze(1).repeat(1, src.shape[1], 1, 1) distance = criterion(src_expand, dst_expand, reduction='none').sum(-1) src2dst_distance, indices1 = torch.min(distance, dim=2) # (B,N) dst2src_distance, indices2 = torch.min(distance, dim=1) # (B,M) loss_src = (src2dst_distance * src_weight) loss_dst = (dst2src_distance * dst_weight) if reduction == 'sum': loss_src = torch.sum(loss_src) loss_dst = torch.sum(loss_dst) elif reduction == 'mean': loss_src = torch.mean(loss_src) loss_dst = torch.mean(loss_dst) elif reduction == 'none': pass else: raise NotImplementedError return loss_src, loss_dst, indices1, indices2 @LOSSES.register_module() class ChamferDistance(nn.Module): """Calculate Chamfer Distance of two sets. Args: mode (str): Criterion mode to calculate distance. The valid modes are smooth_l1, l1 or l2. reduction (str): Method to reduce losses. The valid reduction method are none, sum or mean. loss_src_weight (float): Weight of loss_source. loss_dst_weight (float): Weight of loss_target. """ def __init__(self, mode='l2', reduction='mean', loss_src_weight=1.0, loss_dst_weight=1.0): super(ChamferDistance, self).__init__() assert mode in ['smooth_l1', 'l1', 'l2'] assert reduction in ['none', 'sum', 'mean'] self.mode = mode self.reduction = reduction self.loss_src_weight = loss_src_weight self.loss_dst_weight = loss_dst_weight def forward(self, source, target, src_weight=1.0, dst_weight=1.0, reduction_override=None, return_indices=False, **kwargs): """Forward function of loss calculation. Args: source (torch.Tensor): Source set with shape [B, N, C] to calculate Chamfer Distance. target (torch.Tensor): Destination set with shape [B, M, C] to calculate Chamfer Distance. src_weight (torch.Tensor | float, optional): Weight of source loss. Defaults to 1.0. dst_weight (torch.Tensor | float, optional): Weight of destination loss. Defaults to 1.0. reduction_override (str, optional): Method to reduce losses. The valid reduction method are 'none', 'sum' or 'mean'. Defaults to None. return_indices (bool, optional): Whether to return indices. Defaults to False. Returns: tuple[torch.Tensor]: If ``return_indices=True``, return losses of source and target with their corresponding indices in the order of ``(loss_source, loss_target, indices1, indices2)``. If ``return_indices=False``, return ``(loss_source, loss_target)``. """ assert reduction_override in (None, 'none', 'mean', 'sum') reduction = ( reduction_override if reduction_override else self.reduction) loss_source, loss_target, indices1, indices2 = chamfer_distance( source, target, src_weight, dst_weight, self.mode, reduction) loss_source *= self.loss_src_weight loss_target *= self.loss_dst_weight if return_indices: return loss_source, loss_target, indices1, indices2 else: return loss_source, loss_target ================================================ FILE: mmdet3d/models/losses/multibin_loss.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import torch from torch import nn as nn from torch.nn import functional as F from mmdet.models.losses.utils import weighted_loss from ..builder import LOSSES @weighted_loss def multibin_loss(pred_orientations, gt_orientations, num_dir_bins=4): """Multi-Bin Loss. 
Args: pred_orientations(torch.Tensor): Predicted local vector orientation in [axis_cls, head_cls, sin, cos] format. shape (N, num_dir_bins * 4) gt_orientations(torch.Tensor): Corresponding gt bboxes, shape (N, num_dir_bins * 2). num_dir_bins(int, optional): Number of bins to encode direction angle. Defaults: 4. Return: torch.Tensor: Loss tensor. """ cls_losses = 0 reg_losses = 0 reg_cnt = 0 for i in range(num_dir_bins): # bin cls loss cls_ce_loss = F.cross_entropy( pred_orientations[:, (i * 2):(i * 2 + 2)], gt_orientations[:, i].long(), reduction='mean') # regression loss valid_mask_i = (gt_orientations[:, i] == 1) cls_losses += cls_ce_loss if valid_mask_i.sum() > 0: start = num_dir_bins * 2 + i * 2 end = start + 2 pred_offset = F.normalize(pred_orientations[valid_mask_i, start:end]) gt_offset_sin = torch.sin(gt_orientations[valid_mask_i, num_dir_bins + i]) gt_offset_cos = torch.cos(gt_orientations[valid_mask_i, num_dir_bins + i]) reg_loss = \ F.l1_loss(pred_offset[:, 0], gt_offset_sin, reduction='none') + \ F.l1_loss(pred_offset[:, 1], gt_offset_cos, reduction='none') reg_losses += reg_loss.sum() reg_cnt += valid_mask_i.sum() return cls_losses / num_dir_bins + reg_losses / reg_cnt @LOSSES.register_module() class MultiBinLoss(nn.Module): """Multi-Bin Loss for orientation. Args: reduction (str, optional): The method to reduce the loss. Options are 'none', 'mean' and 'sum'. Defaults to 'none'. loss_weight (float, optional): The weight of loss. Defaults to 1.0. """ def __init__(self, reduction='none', loss_weight=1.0): super(MultiBinLoss, self).__init__() assert reduction in ['none', 'sum', 'mean'] self.reduction = reduction self.loss_weight = loss_weight def forward(self, pred, target, num_dir_bins, reduction_override=None): """Forward function. Args: pred (torch.Tensor): The prediction. target (torch.Tensor): The learning target of the prediction. num_dir_bins (int): Number of bins to encode direction angle. reduction_override (str, optional): The reduction method used to override the original reduction method of the loss. Defaults to None. """ assert reduction_override in (None, 'none', 'mean', 'sum') reduction = ( reduction_override if reduction_override else self.reduction) loss = self.loss_weight * multibin_loss( pred, target, num_dir_bins=num_dir_bins, reduction=reduction) return loss ================================================ FILE: mmdet3d/models/losses/paconv_regularization_loss.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import torch from torch import nn as nn from mmdet3d.ops import PAConv, PAConvCUDA from mmdet.models.losses.utils import weight_reduce_loss from ..builder import LOSSES def weight_correlation(conv): """Calculate correlations between kernel weights in Conv's weight bank as regularization loss. The cosine similarity is used as metrics. Args: conv (nn.Module): A Conv modules to be regularized. Currently we only support `PAConv` and `PAConvCUDA`. Returns: torch.Tensor: Correlations between each kernel weights in weight bank. 
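# --- Editor's illustrative sketch; not part of the repository files above ---
# multibin_loss reads the prediction as num_dir_bins pairs of classification
# logits followed by num_dir_bins (sin, cos) pairs, and the target as
# num_dir_bins bin flags followed by num_dir_bins residual angles. For bins
# whose flag is 1, the normalized (sin, cos) prediction is regressed against
# (sin(residual), cos(residual)). Layout check for an assumed num_dir_bins=4:
import torch
import torch.nn.functional as F

num_dir_bins, n = 4, 8
pred = torch.randn(n, num_dir_bins * 4)                  # 4 x (2 logits) + 4 x (sin, cos)
flags = torch.randint(0, 2, (n, num_dir_bins))
residuals = torch.rand(n, num_dir_bins) * 6.2832
target = torch.cat([flags.float(), residuals], dim=1)    # mirrors gt_orientations

i = 0                                                    # regression term for bin 0
valid = flags[:, i] == 1
start = num_dir_bins * 2 + i * 2
offset = F.normalize(pred[valid, start:start + 2])       # unit-norm (sin, cos)
reg = (offset[:, 0] - residuals[valid, i].sin()).abs() + \
      (offset[:, 1] - residuals[valid, i].cos()).abs()
# ---------------------------------------------------------------------------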
""" assert isinstance(conv, (PAConv, PAConvCUDA)), \ f'unsupported module type {type(conv)}' kernels = conv.weight_bank # [C_in, num_kernels * C_out] in_channels = conv.in_channels out_channels = conv.out_channels num_kernels = conv.num_kernels # [num_kernels, Cin * Cout] flatten_kernels = kernels.view(in_channels, num_kernels, out_channels).\ permute(1, 0, 2).reshape(num_kernels, -1) # [num_kernels, num_kernels] inner_product = torch.matmul(flatten_kernels, flatten_kernels.T) # [num_kernels, 1] kernel_norms = torch.sum(flatten_kernels**2, dim=-1, keepdim=True)**0.5 # [num_kernels, num_kernels] kernel_norms = torch.matmul(kernel_norms, kernel_norms.T) cosine_sims = inner_product / kernel_norms # take upper triangular part excluding diagonal since we only compute # correlation between different kernels once # the square is to ensure positive loss, refer to: # https://github.com/CVMI-Lab/PAConv/blob/main/scene_seg/tool/train.py#L208 corr = torch.sum(torch.triu(cosine_sims, diagonal=1)**2) return corr def paconv_regularization_loss(modules, reduction): """Computes correlation loss of PAConv weight kernels as regularization. Args: modules (List[nn.Module] | :obj:`generator`): A list or a python generator of torch.nn.Modules. reduction (str): Method to reduce losses among PAConv modules. The valid reduction method are none, sum or mean. Returns: torch.Tensor: Correlation loss of kernel weights. """ corr_loss = [] for module in modules: if isinstance(module, (PAConv, PAConvCUDA)): corr_loss.append(weight_correlation(module)) corr_loss = torch.stack(corr_loss) # perform reduction corr_loss = weight_reduce_loss(corr_loss, reduction=reduction) return corr_loss @LOSSES.register_module() class PAConvRegularizationLoss(nn.Module): """Calculate correlation loss of kernel weights in PAConv's weight bank. This is used as a regularization term in PAConv model training. Args: reduction (str): Method to reduce losses. The reduction is performed among all PAConv modules instead of prediction tensors. The valid reduction method are none, sum or mean. loss_weight (float, optional): Weight of loss. Defaults to 1.0. """ def __init__(self, reduction='mean', loss_weight=1.0): super(PAConvRegularizationLoss, self).__init__() assert reduction in ['none', 'sum', 'mean'] self.reduction = reduction self.loss_weight = loss_weight def forward(self, modules, reduction_override=None, **kwargs): """Forward function of loss calculation. Args: modules (List[nn.Module] | :obj:`generator`): A list or a python generator of torch.nn.Modules. reduction_override (str, optional): Method to reduce losses. The valid reduction method are 'none', 'sum' or 'mean'. Defaults to None. Returns: torch.Tensor: Correlation loss of kernel weights. """ assert reduction_override in (None, 'none', 'mean', 'sum') reduction = ( reduction_override if reduction_override else self.reduction) return self.loss_weight * paconv_regularization_loss( modules, reduction=reduction) ================================================ FILE: mmdet3d/models/losses/rotated_iou_loss.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import torch from mmcv.ops import diff_iou_rotated_3d from torch import nn as nn from mmdet.models.losses.utils import weighted_loss from ..builder import LOSSES @weighted_loss def rotated_iou_3d_loss(pred, target): """Calculate the IoU loss (1-IoU) of two sets of rotated bounding boxes. Note that predictions and targets are one-to-one corresponded. 
Args: pred (torch.Tensor): Bbox predictions with shape [N, 7] (x, y, z, w, l, h, alpha). target (torch.Tensor): Bbox targets (gt) with shape [N, 7] (x, y, z, w, l, h, alpha). Returns: torch.Tensor: IoU loss between predictions and targets. """ iou_loss = 1 - diff_iou_rotated_3d(pred.unsqueeze(0), target.unsqueeze(0))[0] return iou_loss @LOSSES.register_module() class RotatedIoU3DLoss(nn.Module): """Calculate the IoU loss (1-IoU) of rotated bounding boxes. Args: reduction (str): Method to reduce losses. The valid reduction method are none, sum or mean. loss_weight (float, optional): Weight of loss. Defaults to 1.0. """ def __init__(self, reduction='mean', loss_weight=1.0): super().__init__() self.reduction = reduction self.loss_weight = loss_weight def forward(self, pred, target, weight=None, avg_factor=None, reduction_override=None, **kwargs): """Forward function of loss calculation. Args: pred (torch.Tensor): Bbox predictions with shape [..., 7] (x, y, z, w, l, h, alpha). target (torch.Tensor): Bbox targets (gt) with shape [..., 7] (x, y, z, w, l, h, alpha). weight (torch.Tensor | float, optional): Weight of loss. Defaults to None. avg_factor (int, optional): Average factor that is used to average the loss. Defaults to None. reduction_override (str, optional): Method to reduce losses. The valid reduction method are 'none', 'sum' or 'mean'. Defaults to None. Returns: torch.Tensor: IoU loss between predictions and targets. """ if weight is not None and not torch.any(weight > 0): return pred.sum() * weight.sum() # 0 assert reduction_override in (None, 'none', 'mean', 'sum') reduction = ( reduction_override if reduction_override else self.reduction) if weight is not None and weight.dim() > 1: weight = weight.mean(-1) loss = self.loss_weight * rotated_iou_3d_loss( pred, target, weight, reduction=reduction, avg_factor=avg_factor, **kwargs) return loss ================================================ FILE: mmdet3d/models/losses/uncertain_smooth_l1_loss.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import torch from torch import nn as nn from mmdet.models.losses.utils import weighted_loss from ..builder import LOSSES @weighted_loss def uncertain_smooth_l1_loss(pred, target, sigma, alpha=1.0, beta=1.0): """Smooth L1 loss with uncertainty. Args: pred (torch.Tensor): The prediction. target (torch.Tensor): The learning target of the prediction. sigma (torch.Tensor): The sigma for uncertainty. alpha (float, optional): The coefficient of log(sigma). Defaults to 1.0. beta (float, optional): The threshold in the piecewise function. Defaults to 1.0. Returns: torch.Tensor: Calculated loss """ assert beta > 0 assert target.numel() > 0 assert pred.size() == target.size() == sigma.size(), 'The size of pred ' \ f'{pred.size()}, target {target.size()}, and sigma {sigma.size()} ' \ 'are inconsistent.' diff = torch.abs(pred - target) loss = torch.where(diff < beta, 0.5 * diff * diff / beta, diff - 0.5 * beta) loss = torch.exp(-sigma) * loss + alpha * sigma return loss @weighted_loss def uncertain_l1_loss(pred, target, sigma, alpha=1.0): """L1 loss with uncertainty. Args: pred (torch.Tensor): The prediction. target (torch.Tensor): The learning target of the prediction. sigma (torch.Tensor): The sigma for uncertainty. alpha (float, optional): The coefficient of log(sigma). Defaults to 1.0. 
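# --- Editor's illustrative sketch; not part of the repository files above ---
# The uncertainty-aware losses scale the base error by exp(-sigma) and add
# alpha * sigma, so a large predicted sigma down-weights hard targets at the
# cost of a constant penalty. The element-wise rule, outside the mmdet
# @weighted_loss wrapper:
import torch

pred = torch.tensor([0.0, 0.0])
target = torch.tensor([0.5, 2.0])
sigma = torch.tensor([0.0, 1.0])
alpha, beta = 1.0, 1.0

diff = (pred - target).abs()
base = torch.where(diff < beta, 0.5 * diff * diff / beta, diff - 0.5 * beta)
loss = torch.exp(-sigma) * base + alpha * sigma
# base -> [0.125, 1.5]; loss -> [0.125, ~1.552]: the second element's error is
# discounted by exp(-1) but pays the +1.0 sigma penalty.
# ---------------------------------------------------------------------------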
Returns: torch.Tensor: Calculated loss """ assert target.numel() > 0 assert pred.size() == target.size() == sigma.size(), 'The size of pred ' \ f'{pred.size()}, target {target.size()}, and sigma {sigma.size()} ' \ 'are inconsistent.' loss = torch.abs(pred - target) loss = torch.exp(-sigma) * loss + alpha * sigma return loss @LOSSES.register_module() class UncertainSmoothL1Loss(nn.Module): r"""Smooth L1 loss with uncertainty. Please refer to `PGD `_ and `Multi-Task Learning Using Uncertainty to Weigh Losses for Scene Geometry and Semantics `_ for more details. Args: alpha (float, optional): The coefficient of log(sigma). Defaults to 1.0. beta (float, optional): The threshold in the piecewise function. Defaults to 1.0. reduction (str, optional): The method to reduce the loss. Options are 'none', 'mean' and 'sum'. Defaults to 'mean'. loss_weight (float, optional): The weight of loss. Defaults to 1.0 """ def __init__(self, alpha=1.0, beta=1.0, reduction='mean', loss_weight=1.0): super(UncertainSmoothL1Loss, self).__init__() assert reduction in ['none', 'sum', 'mean'] self.alpha = alpha self.beta = beta self.reduction = reduction self.loss_weight = loss_weight def forward(self, pred, target, sigma, weight=None, avg_factor=None, reduction_override=None, **kwargs): """Forward function. Args: pred (torch.Tensor): The prediction. target (torch.Tensor): The learning target of the prediction. sigma (torch.Tensor): The sigma for uncertainty. weight (torch.Tensor, optional): The weight of loss for each prediction. Defaults to None. avg_factor (int, optional): Average factor that is used to average the loss. Defaults to None. reduction_override (str, optional): The reduction method used to override the original reduction method of the loss. Defaults to None. """ assert reduction_override in (None, 'none', 'mean', 'sum') reduction = ( reduction_override if reduction_override else self.reduction) loss_bbox = self.loss_weight * uncertain_smooth_l1_loss( pred, target, weight, sigma=sigma, alpha=self.alpha, beta=self.beta, reduction=reduction, avg_factor=avg_factor, **kwargs) return loss_bbox @LOSSES.register_module() class UncertainL1Loss(nn.Module): """L1 loss with uncertainty. Args: alpha (float, optional): The coefficient of log(sigma). Defaults to 1.0. reduction (str, optional): The method to reduce the loss. Options are 'none', 'mean' and 'sum'. Defaults to 'mean'. loss_weight (float, optional): The weight of loss. Defaults to 1.0. """ def __init__(self, alpha=1.0, reduction='mean', loss_weight=1.0): super(UncertainL1Loss, self).__init__() assert reduction in ['none', 'sum', 'mean'] self.alpha = alpha self.reduction = reduction self.loss_weight = loss_weight def forward(self, pred, target, sigma, weight=None, avg_factor=None, reduction_override=None): """Forward function. Args: pred (torch.Tensor): The prediction. target (torch.Tensor): The learning target of the prediction. sigma (torch.Tensor): The sigma for uncertainty. weight (torch.Tensor, optional): The weight of loss for each prediction. Defaults to None. avg_factor (int, optional): Average factor that is used to average the loss. Defaults to None. reduction_override (str, optional): The reduction method used to override the original reduction method of the loss. Defaults to None. 
""" assert reduction_override in (None, 'none', 'mean', 'sum') reduction = ( reduction_override if reduction_override else self.reduction) loss_bbox = self.loss_weight * uncertain_l1_loss( pred, target, weight, sigma=sigma, alpha=self.alpha, reduction=reduction, avg_factor=avg_factor) return loss_bbox ================================================ FILE: mmdet3d/models/middle_encoders/__init__.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. from .pillar_scatter import PointPillarsScatter from .sparse_encoder import SparseEncoder, SparseEncoderSASSD from .sparse_unet import SparseUNet __all__ = [ 'PointPillarsScatter', 'SparseEncoder', 'SparseEncoderSASSD', 'SparseUNet' ] ================================================ FILE: mmdet3d/models/middle_encoders/pillar_scatter.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import torch from mmcv.runner import auto_fp16 from torch import nn from ..builder import MIDDLE_ENCODERS @MIDDLE_ENCODERS.register_module() class PointPillarsScatter(nn.Module): """Point Pillar's Scatter. Converts learned features from dense tensor to sparse pseudo image. Args: in_channels (int): Channels of input features. output_shape (list[int]): Required output shape of features. """ def __init__(self, in_channels, output_shape): super().__init__() self.output_shape = output_shape self.ny = output_shape[0] self.nx = output_shape[1] self.in_channels = in_channels self.fp16_enabled = False @auto_fp16(apply_to=('voxel_features', )) def forward(self, voxel_features, coors, batch_size=None): """Foraward function to scatter features.""" # TODO: rewrite the function in a batch manner # no need to deal with different batch cases if batch_size is not None: return self.forward_batch(voxel_features, coors, batch_size) else: return self.forward_single(voxel_features, coors) def forward_single(self, voxel_features, coors): """Scatter features of single sample. Args: voxel_features (torch.Tensor): Voxel features in shape (N, C). coors (torch.Tensor): Coordinates of each voxel. The first column indicates the sample ID. """ # Create the canvas for this sample canvas = torch.zeros( self.in_channels, self.nx * self.ny, dtype=voxel_features.dtype, device=voxel_features.device) indices = coors[:, 2] * self.nx + coors[:, 3] indices = indices.long() voxels = voxel_features.t() # Now scatter the blob back to the canvas. canvas[:, indices] = voxels # Undo the column stacking to final 4-dim tensor canvas = canvas.view(1, self.in_channels, self.ny, self.nx) return canvas def forward_batch(self, voxel_features, coors, batch_size): """Scatter features of single sample. Args: voxel_features (torch.Tensor): Voxel features in shape (N, C). coors (torch.Tensor): Coordinates of each voxel in shape (N, 4). The first column indicates the sample ID. batch_size (int): Number of samples in the current batch. """ # batch_canvas will be the final output. batch_canvas = [] for batch_itt in range(batch_size): # Create the canvas for this sample canvas = torch.zeros( self.in_channels, self.nx * self.ny, dtype=voxel_features.dtype, device=voxel_features.device) # Only include non-empty pillars batch_mask = coors[:, 0] == batch_itt this_coors = coors[batch_mask, :] indices = this_coors[:, 2] * self.nx + this_coors[:, 3] indices = indices.type(torch.long) voxels = voxel_features[batch_mask, :] voxels = voxels.t() # Now scatter the blob back to the canvas. 
canvas[:, indices] = voxels # Append to a list for later stacking. batch_canvas.append(canvas) # Stack to 3-dim tensor (batch-size, in_channels, nrows*ncols) batch_canvas = torch.stack(batch_canvas, 0) # Undo the column stacking to final 4-dim tensor batch_canvas = batch_canvas.view(batch_size, self.in_channels, self.ny, self.nx) return batch_canvas ================================================ FILE: mmdet3d/models/middle_encoders/sparse_encoder.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import torch from mmcv.ops import points_in_boxes_all, three_interpolate, three_nn from mmcv.runner import auto_fp16 from torch import nn as nn from mmdet3d.ops import SparseBasicBlock, make_sparse_convmodule from mmdet3d.ops.spconv import IS_SPCONV2_AVAILABLE from mmdet.models.losses import sigmoid_focal_loss, smooth_l1_loss from ..builder import MIDDLE_ENCODERS if IS_SPCONV2_AVAILABLE: from spconv.pytorch import SparseConvTensor, SparseSequential else: from mmcv.ops import SparseConvTensor, SparseSequential @MIDDLE_ENCODERS.register_module() class SparseEncoder(nn.Module): r"""Sparse encoder for SECOND and Part-A2. Args: in_channels (int): The number of input channels. sparse_shape (list[int]): The sparse shape of input tensor. order (list[str], optional): Order of conv module. Defaults to ('conv', 'norm', 'act'). norm_cfg (dict, optional): Config of normalization layer. Defaults to dict(type='BN1d', eps=1e-3, momentum=0.01). base_channels (int, optional): Out channels for conv_input layer. Defaults to 16. output_channels (int, optional): Out channels for conv_out layer. Defaults to 128. encoder_channels (tuple[tuple[int]], optional): Convolutional channels of each encode block. Defaults to ((16, ), (32, 32, 32), (64, 64, 64), (64, 64, 64)). encoder_paddings (tuple[tuple[int]], optional): Paddings of each encode block. Defaults to ((1, ), (1, 1, 1), (1, 1, 1), ((0, 1, 1), 1, 1)). block_type (str, optional): Type of the block to use. Defaults to 'conv_module'. 
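Example:
    A minimal construction sketch (the voxel-grid shape and input
    channel count below are illustrative assumptions)::

        encoder = SparseEncoder(in_channels=4, sparse_shape=[41, 1600, 1408])
        bev_feats = encoder(voxel_features, coors, batch_size)
        # bev_feats: dense BEV features of shape (N, C * D, H, W)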
""" def __init__(self, in_channels, sparse_shape, order=('conv', 'norm', 'act'), norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.01), base_channels=16, output_channels=128, encoder_channels=((16, ), (32, 32, 32), (64, 64, 64), (64, 64, 64)), encoder_paddings=((1, ), (1, 1, 1), (1, 1, 1), ((0, 1, 1), 1, 1)), block_type='conv_module'): super().__init__() assert block_type in ['conv_module', 'basicblock'] self.sparse_shape = sparse_shape self.in_channels = in_channels self.order = order self.base_channels = base_channels self.output_channels = output_channels self.encoder_channels = encoder_channels self.encoder_paddings = encoder_paddings self.stage_num = len(self.encoder_channels) self.fp16_enabled = False # Spconv init all weight on its own assert isinstance(order, tuple) and len(order) == 3 assert set(order) == {'conv', 'norm', 'act'} if self.order[0] != 'conv': # pre activate self.conv_input = make_sparse_convmodule( in_channels, self.base_channels, 3, norm_cfg=norm_cfg, padding=1, indice_key='subm1', conv_type='SubMConv3d', order=('conv', )) else: # post activate self.conv_input = make_sparse_convmodule( in_channels, self.base_channels, 3, norm_cfg=norm_cfg, padding=1, indice_key='subm1', conv_type='SubMConv3d') encoder_out_channels = self.make_encoder_layers( make_sparse_convmodule, norm_cfg, self.base_channels, block_type=block_type) self.conv_out = make_sparse_convmodule( encoder_out_channels, self.output_channels, kernel_size=(3, 1, 1), stride=(2, 1, 1), norm_cfg=norm_cfg, padding=0, indice_key='spconv_down2', conv_type='SparseConv3d') @auto_fp16(apply_to=('voxel_features', )) def forward(self, voxel_features, coors, batch_size): """Forward of SparseEncoder. Args: voxel_features (torch.Tensor): Voxel features in shape (N, C). coors (torch.Tensor): Coordinates in shape (N, 4), the columns in the order of (batch_idx, z_idx, y_idx, x_idx). batch_size (int): Batch size. Returns: dict: Backbone features. """ coors = coors.int() input_sp_tensor = SparseConvTensor(voxel_features, coors, self.sparse_shape, batch_size) x = self.conv_input(input_sp_tensor) encode_features = [] for encoder_layer in self.encoder_layers: x = encoder_layer(x) encode_features.append(x) # for detection head # [200, 176, 5] -> [200, 176, 2] out = self.conv_out(encode_features[-1]) spatial_features = out.dense() N, C, D, H, W = spatial_features.shape spatial_features = spatial_features.view(N, C * D, H, W) return spatial_features def make_encoder_layers(self, make_block, norm_cfg, in_channels, block_type='conv_module', conv_cfg=dict(type='SubMConv3d')): """make encoder layers using sparse convs. Args: make_block (method): A bounded function to build blocks. norm_cfg (dict[str]): Config of normalization layer. in_channels (int): The number of encoder input channels. block_type (str, optional): Type of the block to use. Defaults to 'conv_module'. conv_cfg (dict, optional): Config of conv layer. Defaults to dict(type='SubMConv3d'). Returns: int: The number of encoder output channels. 
""" assert block_type in ['conv_module', 'basicblock'] self.encoder_layers = SparseSequential() for i, blocks in enumerate(self.encoder_channels): blocks_list = [] for j, out_channels in enumerate(tuple(blocks)): padding = tuple(self.encoder_paddings[i])[j] # each stage started with a spconv layer # except the first stage if i != 0 and j == 0 and block_type == 'conv_module': blocks_list.append( make_block( in_channels, out_channels, 3, norm_cfg=norm_cfg, stride=2, padding=padding, indice_key=f'spconv{i + 1}', conv_type='SparseConv3d')) elif block_type == 'basicblock': if j == len(blocks) - 1 and i != len( self.encoder_channels) - 1: blocks_list.append( make_block( in_channels, out_channels, 3, norm_cfg=norm_cfg, stride=2, padding=padding, indice_key=f'spconv{i + 1}', conv_type='SparseConv3d')) else: blocks_list.append( SparseBasicBlock( out_channels, out_channels, norm_cfg=norm_cfg, conv_cfg=conv_cfg)) else: blocks_list.append( make_block( in_channels, out_channels, 3, norm_cfg=norm_cfg, padding=padding, indice_key=f'subm{i + 1}', conv_type='SubMConv3d')) in_channels = out_channels stage_name = f'encoder_layer{i + 1}' stage_layers = SparseSequential(*blocks_list) self.encoder_layers.add_module(stage_name, stage_layers) return out_channels @MIDDLE_ENCODERS.register_module() class MySparseEncoder(nn.Module): r"""Sparse encoder for SECOND and Part-A2. Args: in_channels (int): The number of input channels. sparse_shape (list[int]): The sparse shape of input tensor. order (list[str], optional): Order of conv module. Defaults to ('conv', 'norm', 'act'). norm_cfg (dict, optional): Config of normalization layer. Defaults to dict(type='BN1d', eps=1e-3, momentum=0.01). base_channels (int, optional): Out channels for conv_input layer. Defaults to 16. output_channels (int, optional): Out channels for conv_out layer. Defaults to 128. encoder_channels (tuple[tuple[int]], optional): Convolutional channels of each encode block. Defaults to ((16, ), (32, 32, 32), (64, 64, 64), (64, 64, 64)). encoder_paddings (tuple[tuple[int]], optional): Paddings of each encode block. Defaults to ((1, ), (1, 1, 1), (1, 1, 1), ((0, 1, 1), 1, 1)). block_type (str, optional): Type of the block to use. Defaults to 'conv_module'. 
""" def __init__(self, in_channels, sparse_shape, order=('conv', 'norm', 'act'), norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.01), base_channels=16, output_channels=128, encoder_channels=((16, ), (32, 32, 32), (64, 64, 64), (64, 64, 64)), encoder_paddings=((1, ), (1, 1, 1), (1, 1, 1), ((0, 1, 1), 1, 1)), block_type='conv_module'): super().__init__() assert block_type in ['conv_module', 'basicblock'] self.sparse_shape = sparse_shape self.in_channels = in_channels self.order = order self.base_channels = base_channels self.output_channels = output_channels self.encoder_channels = encoder_channels self.encoder_paddings = encoder_paddings self.stage_num = len(self.encoder_channels) self.fp16_enabled = False # Spconv init all weight on its own assert isinstance(order, tuple) and len(order) == 3 assert set(order) == {'conv', 'norm', 'act'} if self.order[0] != 'conv': # pre activate self.conv_input = make_sparse_convmodule( in_channels, self.base_channels, 3, norm_cfg=norm_cfg, padding=1, indice_key='subm1', conv_type='SubMConv3d', order=('conv', )) else: # post activate self.conv_input = make_sparse_convmodule( in_channels, self.base_channels, 3, norm_cfg=norm_cfg, padding=1, indice_key='subm1', conv_type='SubMConv3d') encoder_out_channels = self.make_encoder_layers( make_sparse_convmodule, norm_cfg, self.base_channels, block_type=block_type) self.conv_out = make_sparse_convmodule( encoder_out_channels, self.output_channels, kernel_size=(3, 1, 1), stride=(1, 1, 1), norm_cfg=norm_cfg, padding=(1, 0, 0), indice_key='spconv_down2', conv_type='SparseConv3d') @auto_fp16(apply_to=('voxel_features', )) def forward(self, voxel_features, coors, batch_size): """Forward of SparseEncoder. Args: voxel_features (torch.Tensor): Voxel features in shape (N, C). coors (torch.Tensor): Coordinates in shape (N, 4), the columns in the order of (batch_idx, z_idx, y_idx, x_idx). batch_size (int): Batch size. Returns: dict: Backbone features. """ coors = coors.int() input_sp_tensor = SparseConvTensor(voxel_features, coors, self.sparse_shape, batch_size) x = self.conv_input(input_sp_tensor) encode_features = [] for encoder_layer in self.encoder_layers: x = encoder_layer(x) encode_features.append(x) # for detection head # [200, 176, 5] -> [200, 176, 2] out = self.conv_out(encode_features[-1]) spatial_features = out.dense() N, C, D, H, W = spatial_features.shape spatial_features = spatial_features.permute(0, 1, 3, 4, 2) # spatial_features = spatial_features.view(N, C * D, H, W) return spatial_features def make_encoder_layers(self, make_block, norm_cfg, in_channels, block_type='conv_module', conv_cfg=dict(type='SubMConv3d')): """make encoder layers using sparse convs. Args: make_block (method): A bounded function to build blocks. norm_cfg (dict[str]): Config of normalization layer. in_channels (int): The number of encoder input channels. block_type (str, optional): Type of the block to use. Defaults to 'conv_module'. conv_cfg (dict, optional): Config of conv layer. Defaults to dict(type='SubMConv3d'). Returns: int: The number of encoder output channels. 
""" assert block_type in ['conv_module', 'basicblock'] self.encoder_layers = SparseSequential() for i, blocks in enumerate(self.encoder_channels): blocks_list = [] for j, out_channels in enumerate(tuple(blocks)): padding = tuple(self.encoder_paddings[i])[j] # each stage started with a spconv layer # except the first stage if i != 0 and j == 0 and block_type == 'conv_module': blocks_list.append( make_block( in_channels, out_channels, 3, norm_cfg=norm_cfg, stride=2, padding=padding, indice_key=f'spconv{i + 1}', conv_type='SparseConv3d')) elif block_type == 'basicblock': if j == len(blocks) - 1 and i != len( self.encoder_channels) - 1: blocks_list.append( make_block( in_channels, out_channels, 3, norm_cfg=norm_cfg, stride=2, padding=padding, indice_key=f'spconv{i + 1}', conv_type='SparseConv3d')) else: blocks_list.append( SparseBasicBlock( out_channels, out_channels, norm_cfg=norm_cfg, conv_cfg=conv_cfg)) else: blocks_list.append( make_block( in_channels, out_channels, 3, norm_cfg=norm_cfg, padding=padding, indice_key=f'subm{i + 1}', conv_type='SubMConv3d')) in_channels = out_channels stage_name = f'encoder_layer{i + 1}' stage_layers = SparseSequential(*blocks_list) self.encoder_layers.add_module(stage_name, stage_layers) return out_channels @MIDDLE_ENCODERS.register_module() class SparseEncoderSASSD(SparseEncoder): r"""Sparse encoder for `SASSD `_ Args: in_channels (int): The number of input channels. sparse_shape (list[int]): The sparse shape of input tensor. order (list[str], optional): Order of conv module. Defaults to ('conv', 'norm', 'act'). norm_cfg (dict, optional): Config of normalization layer. Defaults to dict(type='BN1d', eps=1e-3, momentum=0.01). base_channels (int, optional): Out channels for conv_input layer. Defaults to 16. output_channels (int, optional): Out channels for conv_out layer. Defaults to 128. encoder_channels (tuple[tuple[int]], optional): Convolutional channels of each encode block. Defaults to ((16, ), (32, 32, 32), (64, 64, 64), (64, 64, 64)). encoder_paddings (tuple[tuple[int]], optional): Paddings of each encode block. Defaults to ((1, ), (1, 1, 1), (1, 1, 1), ((0, 1, 1), 1, 1)). block_type (str, optional): Type of the block to use. Defaults to 'conv_module'. """ def __init__(self, in_channels, sparse_shape, order=('conv', 'norm', 'act'), norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.01), base_channels=16, output_channels=128, encoder_channels=((16, ), (32, 32, 32), (64, 64, 64), (64, 64, 64)), encoder_paddings=((1, ), (1, 1, 1), (1, 1, 1), ((0, 1, 1), 1, 1)), block_type='conv_module'): super(SparseEncoderSASSD, self).__init__( in_channels=in_channels, sparse_shape=sparse_shape, order=order, norm_cfg=norm_cfg, base_channels=base_channels, output_channels=output_channels, encoder_channels=encoder_channels, encoder_paddings=encoder_paddings, block_type=block_type) self.point_fc = nn.Linear(112, 64, bias=False) self.point_cls = nn.Linear(64, 1, bias=False) self.point_reg = nn.Linear(64, 3, bias=False) @auto_fp16(apply_to=('voxel_features', )) def forward(self, voxel_features, coors, batch_size, test_mode=False): """Forward of SparseEncoder. Args: voxel_features (torch.Tensor): Voxel features in shape (N, C). coors (torch.Tensor): Coordinates in shape (N, 4), the columns in the order of (batch_idx, z_idx, y_idx, x_idx). batch_size (int): Batch size. test_mode (bool, optional): Whether in test mode. Defaults to False. Returns: dict: Backbone features. 
tuple[torch.Tensor]: Mean feature value of the points, Classificaion result of the points, Regression offsets of the points. """ coors = coors.int() input_sp_tensor = SparseConvTensor(voxel_features, coors, self.sparse_shape, batch_size) x = self.conv_input(input_sp_tensor) encode_features = [] for encoder_layer in self.encoder_layers: x = encoder_layer(x) encode_features.append(x) # for detection head # [200, 176, 5] -> [200, 176, 2] out = self.conv_out(encode_features[-1]) spatial_features = out.dense() N, C, D, H, W = spatial_features.shape spatial_features = spatial_features.view(N, C * D, H, W) if test_mode: return spatial_features, None points_mean = torch.zeros_like(voxel_features) points_mean[:, 0] = coors[:, 0] points_mean[:, 1:] = voxel_features[:, :3] # auxiliary network p0 = self.make_auxiliary_points( encode_features[0], points_mean, offset=(0, -40., -3.), voxel_size=(.1, .1, .2)) p1 = self.make_auxiliary_points( encode_features[1], points_mean, offset=(0, -40., -3.), voxel_size=(.2, .2, .4)) p2 = self.make_auxiliary_points( encode_features[2], points_mean, offset=(0, -40., -3.), voxel_size=(.4, .4, .8)) pointwise = torch.cat([p0, p1, p2], dim=-1) pointwise = self.point_fc(pointwise) point_cls = self.point_cls(pointwise) point_reg = self.point_reg(pointwise) point_misc = (points_mean, point_cls, point_reg) return spatial_features, point_misc def get_auxiliary_targets(self, nxyz, gt_boxes3d, enlarge=1.0): """Get auxiliary target. Args: nxyz (torch.Tensor): Mean features of the points. gt_boxes3d (torch.Tensor): Coordinates in shape (N, 4), the columns in the order of (batch_idx, z_idx, y_idx, x_idx). enlarge (int, optional): Enlaged scale. Defaults to 1.0. Returns: tuple[torch.Tensor]: Label of the points and center offsets of the points. """ center_offsets = list() pts_labels = list() for i in range(len(gt_boxes3d)): boxes3d = gt_boxes3d[i].tensor.cpu() idx = torch.nonzero(nxyz[:, 0] == i).view(-1) new_xyz = nxyz[idx, 1:].cpu() boxes3d[:, 3:6] *= enlarge pts_in_flag, center_offset = self.calculate_pts_offsets( new_xyz, boxes3d) pts_label = pts_in_flag.max(0)[0].byte() pts_labels.append(pts_label) center_offsets.append(center_offset) center_offsets = torch.cat(center_offsets).cuda() pts_labels = torch.cat(pts_labels).to(center_offsets.device) return pts_labels, center_offsets def calculate_pts_offsets(self, points, boxes): """Find all boxes in which each point is, as well as the offsets from the box centers. Args: points (torch.Tensor): [M, 3], [x, y, z] in LiDAR/DEPTH coordinate boxes (torch.Tensor): [T, 7], num_valid_boxes <= T, [x, y, z, x_size, y_size, z_size, rz], (x, y, z) is the bottom center. Returns: tuple[torch.Tensor]: Point indices of boxes with the shape of (T, M). Default background = 0. And offsets from the box centers of points, if it belows to the box, with the shape of (M, 3). Default background = 0. 
""" boxes_num = len(boxes) pts_num = len(points) points = points.cuda() boxes = boxes.to(points.device) box_idxs_of_pts = points_in_boxes_all(points[None, ...], boxes[None, ...]) pts_indices = box_idxs_of_pts.squeeze(0).transpose(0, 1) center_offsets = torch.zeros_like(points).to(points.device) for i in range(boxes_num): for j in range(pts_num): if pts_indices[i][j] == 1: center_offsets[j][0] = points[j][0] - boxes[i][0] center_offsets[j][1] = points[j][1] - boxes[i][1] center_offsets[j][2] = ( points[j][2] - (boxes[i][2] + boxes[i][2] / 2.0)) return pts_indices.cpu(), center_offsets.cpu() def aux_loss(self, points, point_cls, point_reg, gt_bboxes): """Calculate auxiliary loss. Args: points (torch.Tensor): Mean feature value of the points. point_cls (torch.Tensor): Classificaion result of the points. point_reg (torch.Tensor): Regression offsets of the points. gt_bboxes (list[:obj:`BaseInstance3DBoxes`]): Ground truth boxes for each sample. Returns: dict: Backbone features. """ num_boxes = len(gt_bboxes) pts_labels, center_targets = self.get_auxiliary_targets( points, gt_bboxes) rpn_cls_target = pts_labels.long() pos = (pts_labels > 0).float() neg = (pts_labels == 0).float() pos_normalizer = pos.sum().clamp(min=1.0) cls_weights = pos + neg reg_weights = pos reg_weights = reg_weights / pos_normalizer aux_loss_cls = sigmoid_focal_loss( point_cls, rpn_cls_target, weight=cls_weights, avg_factor=pos_normalizer) aux_loss_cls /= num_boxes weight = reg_weights[..., None] aux_loss_reg = smooth_l1_loss(point_reg, center_targets, beta=1 / 9.) aux_loss_reg = torch.sum(aux_loss_reg * weight)[None] aux_loss_reg /= num_boxes aux_loss_cls, aux_loss_reg = [aux_loss_cls], [aux_loss_reg] return dict(aux_loss_cls=aux_loss_cls, aux_loss_reg=aux_loss_reg) def make_auxiliary_points(self, source_tensor, target, offset=(0., -40., -3.), voxel_size=(.05, .05, .1)): """Make auxiliary points for loss computation. Args: source_tensor (torch.Tensor): (M, C) features to be propigated. target (torch.Tensor): (N, 4) bxyz positions of the target features. offset (tuple[float], optional): Voxelization offset. Defaults to (0., -40., -3.) voxel_size (tuple[float], optional): Voxelization size. Defaults to (.05, .05, .1) Returns: torch.Tensor: (N, C) tensor of the features of the target features. """ # Tansfer tensor to points source = source_tensor.indices.float() offset = torch.Tensor(offset).to(source.device) voxel_size = torch.Tensor(voxel_size).to(source.device) source[:, 1:] = ( source[:, [3, 2, 1]] * voxel_size + offset + .5 * voxel_size) source_feats = source_tensor.features[None, ...].transpose(1, 2) # Interplate auxiliary points dist, idx = three_nn(target[None, ...], source[None, ...]) dist_recip = 1.0 / (dist + 1e-8) norm = torch.sum(dist_recip, dim=2, keepdim=True) weight = dist_recip / norm new_features = three_interpolate(source_feats.contiguous(), idx, weight) return new_features.squeeze(0).transpose(0, 1) ================================================ FILE: mmdet3d/models/middle_encoders/sparse_unet.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. 
import torch from mmdet3d.ops.spconv import IS_SPCONV2_AVAILABLE if IS_SPCONV2_AVAILABLE: from spconv.pytorch import SparseConvTensor, SparseSequential else: from mmcv.ops import SparseConvTensor, SparseSequential from mmcv.runner import BaseModule, auto_fp16 from mmdet3d.ops import SparseBasicBlock, make_sparse_convmodule from mmdet3d.ops.sparse_block import replace_feature from ..builder import MIDDLE_ENCODERS @MIDDLE_ENCODERS.register_module() class SparseUNet(BaseModule): r"""SparseUNet for PartA^2. See the `paper `_ for more details. Args: in_channels (int): The number of input channels. sparse_shape (list[int]): The sparse shape of input tensor. norm_cfg (dict): Config of normalization layer. base_channels (int): Out channels for conv_input layer. output_channels (int): Out channels for conv_out layer. encoder_channels (tuple[tuple[int]]): Convolutional channels of each encode block. encoder_paddings (tuple[tuple[int]]): Paddings of each encode block. decoder_channels (tuple[tuple[int]]): Convolutional channels of each decode block. decoder_paddings (tuple[tuple[int]]): Paddings of each decode block. """ def __init__(self, in_channels, sparse_shape, order=('conv', 'norm', 'act'), norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.01), base_channels=16, output_channels=128, encoder_channels=((16, ), (32, 32, 32), (64, 64, 64), (64, 64, 64)), encoder_paddings=((1, ), (1, 1, 1), (1, 1, 1), ((0, 1, 1), 1, 1)), decoder_channels=((64, 64, 64), (64, 64, 32), (32, 32, 16), (16, 16, 16)), decoder_paddings=((1, 0), (1, 0), (0, 0), (0, 1)), init_cfg=None): super().__init__(init_cfg=init_cfg) self.sparse_shape = sparse_shape self.in_channels = in_channels self.order = order self.base_channels = base_channels self.output_channels = output_channels self.encoder_channels = encoder_channels self.encoder_paddings = encoder_paddings self.decoder_channels = decoder_channels self.decoder_paddings = decoder_paddings self.stage_num = len(self.encoder_channels) self.fp16_enabled = False # Spconv init all weight on its own assert isinstance(order, tuple) and len(order) == 3 assert set(order) == {'conv', 'norm', 'act'} if self.order[0] != 'conv': # pre activate self.conv_input = make_sparse_convmodule( in_channels, self.base_channels, 3, norm_cfg=norm_cfg, padding=1, indice_key='subm1', conv_type='SubMConv3d', order=('conv', )) else: # post activate self.conv_input = make_sparse_convmodule( in_channels, self.base_channels, 3, norm_cfg=norm_cfg, padding=1, indice_key='subm1', conv_type='SubMConv3d') encoder_out_channels = self.make_encoder_layers( make_sparse_convmodule, norm_cfg, self.base_channels) self.make_decoder_layers(make_sparse_convmodule, norm_cfg, encoder_out_channels) self.conv_out = make_sparse_convmodule( encoder_out_channels, self.output_channels, kernel_size=(3, 1, 1), stride=(2, 1, 1), norm_cfg=norm_cfg, padding=0, indice_key='spconv_down2', conv_type='SparseConv3d') @auto_fp16(apply_to=('voxel_features', )) def forward(self, voxel_features, coors, batch_size): """Forward of SparseUNet. Args: voxel_features (torch.float32): Voxel features in shape [N, C]. coors (torch.int32): Coordinates in shape [N, 4], the columns in the order of (batch_idx, z_idx, y_idx, x_idx). batch_size (int): Batch size. Returns: dict[str, torch.Tensor]: Backbone features. 
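The returned dict contains:

    - spatial_features (torch.Tensor): Dense BEV features of shape
      (N, C * D, H, W) produced from the last encoder stage, consumed
      by the detection head.
    - seg_features (torch.Tensor): Features of the last decoder stage
      for every active voxel, consumed by the segmentation head.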
""" coors = coors.int() input_sp_tensor = SparseConvTensor(voxel_features, coors, self.sparse_shape, batch_size) x = self.conv_input(input_sp_tensor) encode_features = [] for encoder_layer in self.encoder_layers: x = encoder_layer(x) encode_features.append(x) # for detection head # [200, 176, 5] -> [200, 176, 2] out = self.conv_out(encode_features[-1]) spatial_features = out.dense() N, C, D, H, W = spatial_features.shape spatial_features = spatial_features.view(N, C * D, H, W) # for segmentation head, with output shape: # [400, 352, 11] <- [200, 176, 5] # [800, 704, 21] <- [400, 352, 11] # [1600, 1408, 41] <- [800, 704, 21] # [1600, 1408, 41] <- [1600, 1408, 41] decode_features = [] x = encode_features[-1] for i in range(self.stage_num, 0, -1): x = self.decoder_layer_forward(encode_features[i - 1], x, getattr(self, f'lateral_layer{i}'), getattr(self, f'merge_layer{i}'), getattr(self, f'upsample_layer{i}')) decode_features.append(x) seg_features = decode_features[-1].features ret = dict( spatial_features=spatial_features, seg_features=seg_features) return ret def decoder_layer_forward(self, x_lateral, x_bottom, lateral_layer, merge_layer, upsample_layer): """Forward of upsample and residual block. Args: x_lateral (:obj:`SparseConvTensor`): Lateral tensor. x_bottom (:obj:`SparseConvTensor`): Feature from bottom layer. lateral_layer (SparseBasicBlock): Convolution for lateral tensor. merge_layer (SparseSequential): Convolution for merging features. upsample_layer (SparseSequential): Convolution for upsampling. Returns: :obj:`SparseConvTensor`: Upsampled feature. """ x = lateral_layer(x_lateral) x = replace_feature(x, torch.cat((x_bottom.features, x.features), dim=1)) x_merge = merge_layer(x) x = self.reduce_channel(x, x_merge.features.shape[1]) x = replace_feature(x, x_merge.features + x.features) x = upsample_layer(x) return x @staticmethod def reduce_channel(x, out_channels): """reduce channel for element-wise addition. Args: x (:obj:`SparseConvTensor`): Sparse tensor, ``x.features`` are in shape (N, C1). out_channels (int): The number of channel after reduction. Returns: :obj:`SparseConvTensor`: Channel reduced feature. """ features = x.features n, in_channels = features.shape assert (in_channels % out_channels == 0) and (in_channels >= out_channels) x = replace_feature(x, features.view(n, out_channels, -1).sum(dim=2)) return x def make_encoder_layers(self, make_block, norm_cfg, in_channels): """make encoder layers using sparse convs. Args: make_block (method): A bounded function to build blocks. norm_cfg (dict[str]): Config of normalization layer. in_channels (int): The number of encoder input channels. Returns: int: The number of encoder output channels. 
""" self.encoder_layers = SparseSequential() for i, blocks in enumerate(self.encoder_channels): blocks_list = [] for j, out_channels in enumerate(tuple(blocks)): padding = tuple(self.encoder_paddings[i])[j] # each stage started with a spconv layer # except the first stage if i != 0 and j == 0: blocks_list.append( make_block( in_channels, out_channels, 3, norm_cfg=norm_cfg, stride=2, padding=padding, indice_key=f'spconv{i + 1}', conv_type='SparseConv3d')) else: blocks_list.append( make_block( in_channels, out_channels, 3, norm_cfg=norm_cfg, padding=padding, indice_key=f'subm{i + 1}', conv_type='SubMConv3d')) in_channels = out_channels stage_name = f'encoder_layer{i + 1}' stage_layers = SparseSequential(*blocks_list) self.encoder_layers.add_module(stage_name, stage_layers) return out_channels def make_decoder_layers(self, make_block, norm_cfg, in_channels): """make decoder layers using sparse convs. Args: make_block (method): A bounded function to build blocks. norm_cfg (dict[str]): Config of normalization layer. in_channels (int): The number of encoder input channels. Returns: int: The number of encoder output channels. """ block_num = len(self.decoder_channels) for i, block_channels in enumerate(self.decoder_channels): paddings = self.decoder_paddings[i] setattr( self, f'lateral_layer{block_num - i}', SparseBasicBlock( in_channels, block_channels[0], conv_cfg=dict( type='SubMConv3d', indice_key=f'subm{block_num - i}'), norm_cfg=norm_cfg)) setattr( self, f'merge_layer{block_num - i}', make_block( in_channels * 2, block_channels[1], 3, norm_cfg=norm_cfg, padding=paddings[0], indice_key=f'subm{block_num - i}', conv_type='SubMConv3d')) if block_num - i != 1: setattr( self, f'upsample_layer{block_num - i}', make_block( in_channels, block_channels[2], 3, norm_cfg=norm_cfg, indice_key=f'spconv{block_num - i}', conv_type='SparseInverseConv3d')) else: # use submanifold conv instead of inverse conv # in the last block setattr( self, f'upsample_layer{block_num - i}', make_block( in_channels, block_channels[2], 3, norm_cfg=norm_cfg, padding=paddings[1], indice_key='subm1', conv_type='SubMConv3d')) in_channels = block_channels[2] ================================================ FILE: mmdet3d/models/model_utils/__init__.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. from .edge_fusion_module import EdgeFusionModule from .transformer import GroupFree3DMHA from .vote_module import VoteModule __all__ = ['VoteModule', 'GroupFree3DMHA', 'EdgeFusionModule'] ================================================ FILE: mmdet3d/models/model_utils/edge_fusion_module.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. from mmcv.cnn import ConvModule from mmcv.runner import BaseModule from torch import nn as nn from torch.nn import functional as F class EdgeFusionModule(BaseModule): """Edge Fusion Module for feature map. Args: out_channels (int): The number of output channels. feat_channels (int): The number of channels in feature map during edge feature fusion. kernel_size (int, optional): Kernel size of convolution. Default: 3. act_cfg (dict, optional): Config of activation. Default: dict(type='ReLU'). norm_cfg (dict, optional): Config of normalization. Default: dict(type='BN1d')). 
""" def __init__(self, out_channels, feat_channels, kernel_size=3, act_cfg=dict(type='ReLU'), norm_cfg=dict(type='BN1d')): super().__init__() self.edge_convs = nn.Sequential( ConvModule( feat_channels, feat_channels, kernel_size=kernel_size, padding=kernel_size // 2, conv_cfg=dict(type='Conv1d'), norm_cfg=norm_cfg, act_cfg=act_cfg), nn.Conv1d(feat_channels, out_channels, kernel_size=1)) self.feat_channels = feat_channels def forward(self, features, fused_features, edge_indices, edge_lens, output_h, output_w): """Forward pass. Args: features (torch.Tensor): Different representative features for fusion. fused_features (torch.Tensor): Different representative features to be fused. edge_indices (torch.Tensor): Batch image edge indices. edge_lens (list[int]): List of edge length of each image. output_h (int): Height of output feature map. output_w (int): Width of output feature map. Returns: torch.Tensor: Fused feature maps. """ batch_size = features.shape[0] # normalize grid_edge_indices = edge_indices.view(batch_size, -1, 1, 2).float() grid_edge_indices[..., 0] = \ grid_edge_indices[..., 0] / (output_w - 1) * 2 - 1 grid_edge_indices[..., 1] = \ grid_edge_indices[..., 1] / (output_h - 1) * 2 - 1 # apply edge fusion edge_features = F.grid_sample( features, grid_edge_indices, align_corners=True).squeeze(-1) edge_output = self.edge_convs(edge_features) for k in range(batch_size): edge_indice_k = edge_indices[k, :edge_lens[k]] fused_features[k, :, edge_indice_k[:, 1], edge_indice_k[:, 0]] += edge_output[ k, :, :edge_lens[k]] return fused_features ================================================ FILE: mmdet3d/models/model_utils/transformer.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. from mmcv.cnn.bricks.registry import ATTENTION from mmcv.cnn.bricks.transformer import POSITIONAL_ENCODING, MultiheadAttention from torch import nn as nn @ATTENTION.register_module() class GroupFree3DMHA(MultiheadAttention): """A warpper for torch.nn.MultiheadAttention for GroupFree3D. This module implements MultiheadAttention with identity connection, and positional encoding used in DETR is also passed as input. Args: embed_dims (int): The embedding dimension. num_heads (int): Parallel attention heads. Same as `nn.MultiheadAttention`. attn_drop (float, optional): A Dropout layer on attn_output_weights. Defaults to 0.0. proj_drop (float, optional): A Dropout layer. Defaults to 0.0. dropout_layer (obj:`ConfigDict`, optional): The dropout_layer used when adding the shortcut. init_cfg (obj:`mmcv.ConfigDict`, optional): The Config for initialization. Default: None. batch_first (bool, optional): Key, Query and Value are shape of (batch, n, embed_dim) or (n, batch, embed_dim). Defaults to False. """ def __init__(self, embed_dims, num_heads, attn_drop=0., proj_drop=0., dropout_layer=dict(type='DropOut', drop_prob=0.), init_cfg=None, batch_first=False, **kwargs): super().__init__(embed_dims, num_heads, attn_drop, proj_drop, dropout_layer, init_cfg, batch_first, **kwargs) def forward(self, query, key, value, identity, query_pos=None, key_pos=None, attn_mask=None, key_padding_mask=None, **kwargs): """Forward function for `GroupFree3DMHA`. **kwargs allow passing a more general data flow when combining with other operations in `transformerlayer`. Args: query (Tensor): The input query with shape [num_queries, bs, embed_dims]. Same in `nn.MultiheadAttention.forward`. key (Tensor): The key tensor with shape [num_keys, bs, embed_dims]. Same in `nn.MultiheadAttention.forward`. 
If None, the ``query`` will be used. value (Tensor): The value tensor with same shape as `key`. Same in `nn.MultiheadAttention.forward`. If None, the `key` will be used. identity (Tensor): This tensor, with the same shape as x, will be used for the identity link. If None, `x` will be used. query_pos (Tensor, optional): The positional encoding for query, with the same shape as `x`. Defaults to None. If not None, it will be added to `x` before forward function. key_pos (Tensor, optional): The positional encoding for `key`, with the same shape as `key`. Defaults to None. If not None, it will be added to `key` before forward function. If None, and `query_pos` has the same shape as `key`, then `query_pos` will be used for `key_pos`. Defaults to None. attn_mask (Tensor, optional): ByteTensor mask with shape [num_queries, num_keys]. Same in `nn.MultiheadAttention.forward`. Defaults to None. key_padding_mask (Tensor, optional): ByteTensor with shape [bs, num_keys]. Same in `nn.MultiheadAttention.forward`. Defaults to None. Returns: Tensor: forwarded results with shape [num_queries, bs, embed_dims]. """ if hasattr(self, 'operation_name'): if self.operation_name == 'self_attn': value = value + query_pos elif self.operation_name == 'cross_attn': value = value + key_pos else: raise NotImplementedError( f'{self.__class__.name} ' f"can't be used as {self.operation_name}") else: value = value + query_pos return super(GroupFree3DMHA, self).forward( query=query, key=key, value=value, identity=identity, query_pos=query_pos, key_pos=key_pos, attn_mask=attn_mask, key_padding_mask=key_padding_mask, **kwargs) @POSITIONAL_ENCODING.register_module() class ConvBNPositionalEncoding(nn.Module): """Absolute position embedding with Conv learning. Args: input_channel (int): input features dim. num_pos_feats (int, optional): output position features dim. Defaults to 288 to be consistent with seed features dim. """ def __init__(self, input_channel, num_pos_feats=288): super().__init__() self.position_embedding_head = nn.Sequential( nn.Conv1d(input_channel, num_pos_feats, kernel_size=1), nn.BatchNorm1d(num_pos_feats), nn.ReLU(inplace=True), nn.Conv1d(num_pos_feats, num_pos_feats, kernel_size=1)) def forward(self, xyz): """Forward pass. Args: xyz (Tensor): (B, N, 3) the coordinates to embed. Returns: Tensor: (B, num_pos_feats, N) the embedded position features. """ xyz = xyz.permute(0, 2, 1) position_embedding = self.position_embedding_head(xyz) return position_embedding ================================================ FILE: mmdet3d/models/model_utils/vote_module.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import torch from mmcv import is_tuple_of from mmcv.cnn import ConvModule from torch import nn as nn from mmdet3d.models.builder import build_loss class VoteModule(nn.Module): """Vote module. Generate votes from seed point features. Args: in_channels (int): Number of channels of seed point features. vote_per_seed (int, optional): Number of votes generated from each seed point. Default: 1. gt_per_seed (int, optional): Number of ground truth votes generated from each seed point. Default: 3. num_points (int, optional): Number of points to be used for voting. Default: 1. conv_channels (tuple[int], optional): Out channels of vote generating convolution. Default: (16, 16). conv_cfg (dict, optional): Config of convolution. Default: dict(type='Conv1d'). norm_cfg (dict, optional): Config of normalization. Default: dict(type='BN1d'). 
norm_feats (bool, optional): Whether to normalize features. Default: True. with_res_feat (bool, optional): Whether to predict residual features. Default: True. vote_xyz_range (list[float], optional): The range of points translation. Default: None. vote_loss (dict, optional): Config of vote loss. Default: None. """ def __init__(self, in_channels, vote_per_seed=1, gt_per_seed=3, num_points=-1, conv_channels=(16, 16), conv_cfg=dict(type='Conv1d'), norm_cfg=dict(type='BN1d'), act_cfg=dict(type='ReLU'), norm_feats=True, with_res_feat=True, vote_xyz_range=None, vote_loss=None): super().__init__() self.in_channels = in_channels self.vote_per_seed = vote_per_seed self.gt_per_seed = gt_per_seed self.num_points = num_points self.norm_feats = norm_feats self.with_res_feat = with_res_feat assert vote_xyz_range is None or is_tuple_of(vote_xyz_range, float) self.vote_xyz_range = vote_xyz_range if vote_loss is not None: self.vote_loss = build_loss(vote_loss) prev_channels = in_channels vote_conv_list = list() for k in range(len(conv_channels)): vote_conv_list.append( ConvModule( prev_channels, conv_channels[k], 1, padding=0, conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=act_cfg, bias=True, inplace=True)) prev_channels = conv_channels[k] self.vote_conv = nn.Sequential(*vote_conv_list) # conv_out predicts coordinate and residual features if with_res_feat: out_channel = (3 + in_channels) * self.vote_per_seed else: out_channel = 3 * self.vote_per_seed self.conv_out = nn.Conv1d(prev_channels, out_channel, 1) def forward(self, seed_points, seed_feats): """forward. Args: seed_points (torch.Tensor): Coordinate of the seed points in shape (B, N, 3). seed_feats (torch.Tensor): Features of the seed points in shape (B, C, N). Returns: tuple[torch.Tensor]: - vote_points: Voted xyz based on the seed points with shape (B, M, 3), ``M=num_seed*vote_per_seed``. - vote_features: Voted features based on the seed points with shape (B, C, M) where ``M=num_seed*vote_per_seed``, ``C=vote_feature_dim``. 
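- offset: Predicted translation offsets of the votes with shape
  (B, 3, M); the implementation below returns this as a third item
  alongside ``vote_points`` and ``vote_features``.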
""" if self.num_points != -1: assert self.num_points < seed_points.shape[1], \ f'Number of vote points ({self.num_points}) should be '\ f'smaller than seed points size ({seed_points.shape[1]})' seed_points = seed_points[:, :self.num_points] seed_feats = seed_feats[..., :self.num_points] batch_size, feat_channels, num_seed = seed_feats.shape num_vote = num_seed * self.vote_per_seed x = self.vote_conv(seed_feats) # (batch_size, (3+out_dim)*vote_per_seed, num_seed) votes = self.conv_out(x) votes = votes.transpose(2, 1).view(batch_size, num_seed, self.vote_per_seed, -1) offset = votes[:, :, :, 0:3] if self.vote_xyz_range is not None: limited_offset_list = [] for axis in range(len(self.vote_xyz_range)): limited_offset_list.append(offset[..., axis].clamp( min=-self.vote_xyz_range[axis], max=self.vote_xyz_range[axis])) limited_offset = torch.stack(limited_offset_list, -1) vote_points = (seed_points.unsqueeze(2) + limited_offset).contiguous() else: vote_points = (seed_points.unsqueeze(2) + offset).contiguous() vote_points = vote_points.view(batch_size, num_vote, 3) offset = offset.reshape(batch_size, num_vote, 3).transpose(2, 1) if self.with_res_feat: res_feats = votes[:, :, :, 3:] vote_feats = (seed_feats.transpose(2, 1).unsqueeze(2) + res_feats).contiguous() vote_feats = vote_feats.view(batch_size, num_vote, feat_channels).transpose( 2, 1).contiguous() if self.norm_feats: features_norm = torch.norm(vote_feats, p=2, dim=1) vote_feats = vote_feats.div(features_norm.unsqueeze(1)) else: vote_feats = seed_feats return vote_points, vote_feats, offset def get_loss(self, seed_points, vote_points, seed_indices, vote_targets_mask, vote_targets): """Calculate loss of voting module. Args: seed_points (torch.Tensor): Coordinate of the seed points. vote_points (torch.Tensor): Coordinate of the vote points. seed_indices (torch.Tensor): Indices of seed points in raw points. vote_targets_mask (torch.Tensor): Mask of valid vote targets. vote_targets (torch.Tensor): Targets of votes. Returns: torch.Tensor: Weighted vote loss. """ batch_size, num_seed = seed_points.shape[:2] seed_gt_votes_mask = torch.gather(vote_targets_mask, 1, seed_indices).float() seed_indices_expand = seed_indices.unsqueeze(-1).repeat( 1, 1, 3 * self.gt_per_seed) seed_gt_votes = torch.gather(vote_targets, 1, seed_indices_expand) seed_gt_votes += seed_points.repeat(1, 1, self.gt_per_seed) weight = seed_gt_votes_mask / (torch.sum(seed_gt_votes_mask) + 1e-6) distance = self.vote_loss( vote_points.view(batch_size * num_seed, -1, 3), seed_gt_votes.view(batch_size * num_seed, -1, 3), dst_weight=weight.view(batch_size * num_seed, 1))[1] vote_loss = torch.sum(torch.min(distance, dim=1)[0]) return vote_loss ================================================ FILE: mmdet3d/models/necks/__init__.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. from mmdet.models.necks.fpn import FPN from .dla_neck import DLANeck from .fpn import CustomFPN from .imvoxel_neck import OutdoorImVoxelNeck from .lss_fpn import FPN_LSS from .pointnet2_fp_neck import PointNetFPNeck from .second_fpn import SECONDFPN from .view_transformer import LSSViewTransformer, LSSViewTransformerBEVDepth __all__ = [ 'FPN', 'SECONDFPN', 'OutdoorImVoxelNeck', 'PointNetFPNeck', 'DLANeck', 'LSSViewTransformer', 'CustomFPN', 'FPN_LSS', 'LSSViewTransformerBEVDepth' ] ================================================ FILE: mmdet3d/models/necks/dla_neck.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. 
import math import numpy as np from mmcv.cnn import ConvModule, build_conv_layer from mmcv.runner import BaseModule from torch import nn as nn from ..builder import NECKS def fill_up_weights(up): """Simulated bilinear upsampling kernel. Args: up (nn.Module): ConvTranspose2d module. """ w = up.weight.data f = math.ceil(w.size(2) / 2) c = (2 * f - 1 - f % 2) / (2. * f) for i in range(w.size(2)): for j in range(w.size(3)): w[0, 0, i, j] = \ (1 - math.fabs(i / f - c)) * (1 - math.fabs(j / f - c)) for c in range(1, w.size(0)): w[c, 0, :, :] = w[0, 0, :, :] class IDAUpsample(BaseModule): """Iterative Deep Aggregation (IDA) Upsampling module to upsample features of different scales to a similar scale. Args: out_channels (int): Number of output channels for DeformConv. in_channels (List[int]): List of input channels of multi-scale feature maps. kernel_sizes (List[int]): List of size of the convolving kernel of different scales. norm_cfg (dict, optional): Config dict for normalization layer. Default: None. use_dcn (bool, optional): If True, use DCNv2. Default: True. """ def __init__( self, out_channels, in_channels, kernel_sizes, norm_cfg=None, use_dcn=True, init_cfg=None, ): super(IDAUpsample, self).__init__(init_cfg) self.use_dcn = use_dcn self.projs = nn.ModuleList() self.ups = nn.ModuleList() self.nodes = nn.ModuleList() for i in range(1, len(in_channels)): in_channel = in_channels[i] up_kernel_size = int(kernel_sizes[i]) proj = ConvModule( in_channel, out_channels, 3, padding=1, bias=True, conv_cfg=dict(type='DCNv2') if self.use_dcn else None, norm_cfg=norm_cfg) node = ConvModule( out_channels, out_channels, 3, padding=1, bias=True, conv_cfg=dict(type='DCNv2') if self.use_dcn else None, norm_cfg=norm_cfg) up = build_conv_layer( dict(type='deconv'), out_channels, out_channels, up_kernel_size * 2, stride=up_kernel_size, padding=up_kernel_size // 2, output_padding=0, groups=out_channels, bias=False) self.projs.append(proj) self.ups.append(up) self.nodes.append(node) def forward(self, mlvl_features, start_level, end_level): """Forward function. Args: mlvl_features (list[torch.Tensor]): Features from multiple layers. start_level (int): Start layer for feature upsampling. end_level (int): End layer for feature upsampling. """ for i in range(start_level, end_level - 1): upsample = self.ups[i - start_level] project = self.projs[i - start_level] mlvl_features[i + 1] = upsample(project(mlvl_features[i + 1])) node = self.nodes[i - start_level] mlvl_features[i + 1] = node(mlvl_features[i + 1] + mlvl_features[i]) class DLAUpsample(BaseModule): """Deep Layer Aggregation (DLA) Upsampling module for different scales feature extraction, upsampling and fusion, It consists of groups of IDAupsample modules. Args: start_level (int): The start layer. channels (List[int]): List of input channels of multi-scale feature maps. scales(List[int]): List of scale of different layers' feature. in_channels (NoneType, optional): List of input channels of different scales. Default: None. norm_cfg (dict, optional): Config dict for normalization layer. Default: None. use_dcn (bool, optional): Whether to use dcn in IDAup module. Default: True. 
""" def __init__(self, start_level, channels, scales, in_channels=None, norm_cfg=None, use_dcn=True, init_cfg=None): super(DLAUpsample, self).__init__(init_cfg) self.start_level = start_level if in_channels is None: in_channels = channels self.channels = channels channels = list(channels) scales = np.array(scales, dtype=int) for i in range(len(channels) - 1): j = -i - 2 setattr( self, 'ida_{}'.format(i), IDAUpsample(channels[j], in_channels[j:], scales[j:] // scales[j], norm_cfg, use_dcn)) scales[j + 1:] = scales[j] in_channels[j + 1:] = [channels[j] for _ in channels[j + 1:]] def forward(self, mlvl_features): """Forward function. Args: mlvl_features(list[torch.Tensor]): Features from multi-scale layers. Returns: tuple[torch.Tensor]: Up-sampled features of different layers. """ outs = [mlvl_features[-1]] for i in range(len(mlvl_features) - self.start_level - 1): ida = getattr(self, 'ida_{}'.format(i)) ida(mlvl_features, len(mlvl_features) - i - 2, len(mlvl_features)) outs.insert(0, mlvl_features[-1]) return outs @NECKS.register_module() class DLANeck(BaseModule): """DLA Neck. Args: in_channels (list[int], optional): List of input channels of multi-scale feature map. start_level (int, optional): The scale level where upsampling starts. Default: 2. end_level (int, optional): The scale level where upsampling ends. Default: 5. norm_cfg (dict, optional): Config dict for normalization layer. Default: None. use_dcn (bool, optional): Whether to use dcn in IDAup module. Default: True. """ def __init__(self, in_channels=[16, 32, 64, 128, 256, 512], start_level=2, end_level=5, norm_cfg=None, use_dcn=True, init_cfg=None): super(DLANeck, self).__init__(init_cfg) self.start_level = start_level self.end_level = end_level scales = [2**i for i in range(len(in_channels[self.start_level:]))] self.dla_up = DLAUpsample( start_level=self.start_level, channels=in_channels[self.start_level:], scales=scales, norm_cfg=norm_cfg, use_dcn=use_dcn) self.ida_up = IDAUpsample( in_channels[self.start_level], in_channels[self.start_level:self.end_level], [2**i for i in range(self.end_level - self.start_level)], norm_cfg, use_dcn) def forward(self, x): mlvl_features = [x[i] for i in range(len(x))] mlvl_features = self.dla_up(mlvl_features) outs = [] for i in range(self.end_level - self.start_level): outs.append(mlvl_features[i].clone()) self.ida_up(outs, 0, len(outs)) return [outs[-1]] def init_weights(self): for m in self.modules(): if isinstance(m, nn.ConvTranspose2d): # In order to be consistent with the source code, # reset the ConvTranspose2d initialization parameters m.reset_parameters() # Simulated bilinear upsampling kernel fill_up_weights(m) elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)): nn.init.constant_(m.weight, 1) nn.init.constant_(m.bias, 0) elif isinstance(m, nn.Conv2d): # In order to be consistent with the source code, # reset the Conv2d initialization parameters m.reset_parameters() ================================================ FILE: mmdet3d/models/necks/fpn.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import torch.nn as nn import torch.nn.functional as F from mmcv.cnn import ConvModule from mmcv.runner import BaseModule, auto_fp16 import torch.utils.checkpoint as cp from ..builder import NECKS @NECKS.register_module() class CustomFPN(BaseModule): r"""Feature Pyramid Network. This is an implementation of paper `Feature Pyramid Networks for Object Detection `_. Args: in_channels (List[int]): Number of input channels per scale. 
out_channels (int): Number of output channels (used at each scale) num_outs (int): Number of output scales. start_level (int): Index of the start input backbone level used to build the feature pyramid. Default: 0. end_level (int): Index of the end input backbone level (exclusive) to build the feature pyramid. Default: -1, which means the last level. add_extra_convs (bool | str): If bool, it decides whether to add conv layers on top of the original feature maps. Default to False. If True, it is equivalent to `add_extra_convs='on_input'`. If str, it specifies the source feature map of the extra convs. Only the following options are allowed - 'on_input': Last feat map of neck inputs (i.e. backbone feature). - 'on_lateral': Last feature map after lateral convs. - 'on_output': The last output feature map after fpn convs. relu_before_extra_convs (bool): Whether to apply relu before the extra conv. Default: False. no_norm_on_lateral (bool): Whether to apply norm on lateral. Default: False. conv_cfg (dict): Config dict for convolution layer. Default: None. norm_cfg (dict): Config dict for normalization layer. Default: None. act_cfg (str): Config dict for activation layer in ConvModule. Default: None. upsample_cfg (dict): Config dict for interpolate layer. Default: `dict(mode='nearest')` init_cfg (dict or list[dict], optional): Initialization config dict. Example: >>> import torch >>> in_channels = [2, 3, 5, 7] >>> scales = [340, 170, 84, 43] >>> inputs = [torch.rand(1, c, s, s) ... for c, s in zip(in_channels, scales)] >>> self = FPN(in_channels, 11, len(in_channels)).eval() >>> outputs = self.forward(inputs) >>> for i in range(len(outputs)): ... print(f'outputs[{i}].shape = {outputs[i].shape}') outputs[0].shape = torch.Size([1, 11, 340, 340]) outputs[1].shape = torch.Size([1, 11, 170, 170]) outputs[2].shape = torch.Size([1, 11, 84, 84]) outputs[3].shape = torch.Size([1, 11, 43, 43]) """ def __init__(self, in_channels, out_channels, num_outs, start_level=0, end_level=-1, out_ids=[], add_extra_convs=False, relu_before_extra_convs=False, no_norm_on_lateral=False, conv_cfg=None, norm_cfg=None, with_cp=False, act_cfg=None, upsample_cfg=dict(mode='nearest'), init_cfg=dict( type='Xavier', layer='Conv2d', distribution='uniform')): super(CustomFPN, self).__init__(init_cfg) assert isinstance(in_channels, list) self.in_channels = in_channels self.out_channels = out_channels self.num_ins = len(in_channels) self.num_outs = num_outs self.relu_before_extra_convs = relu_before_extra_convs self.no_norm_on_lateral = no_norm_on_lateral self.fp16_enabled = False self.with_cp = with_cp self.upsample_cfg = upsample_cfg.copy() self.out_ids = out_ids if end_level == -1: self.backbone_end_level = self.num_ins # assert num_outs >= self.num_ins - start_level else: # if end_level < inputs, no extra level is allowed self.backbone_end_level = end_level assert end_level <= len(in_channels) assert num_outs == end_level - start_level self.start_level = start_level self.end_level = end_level self.add_extra_convs = add_extra_convs assert isinstance(add_extra_convs, (str, bool)) if isinstance(add_extra_convs, str): # Extra_convs_source choices: 'on_input', 'on_lateral', 'on_output' assert add_extra_convs in ('on_input', 'on_lateral', 'on_output') elif add_extra_convs: # True self.add_extra_convs = 'on_input' self.lateral_convs = nn.ModuleList() self.fpn_convs = nn.ModuleList() for i in range(self.start_level, self.backbone_end_level): l_conv = ConvModule( in_channels[i], out_channels, 1, conv_cfg=conv_cfg, norm_cfg=norm_cfg if not 
self.no_norm_on_lateral else None, act_cfg=act_cfg, inplace=False) self.lateral_convs.append(l_conv) if i in self.out_ids: fpn_conv = ConvModule( out_channels, out_channels, 3, padding=1, conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=act_cfg, inplace=False) self.fpn_convs.append(fpn_conv) # add extra conv layers (e.g., RetinaNet) extra_levels = num_outs - self.backbone_end_level + self.start_level if self.add_extra_convs and extra_levels >= 1: for i in range(extra_levels): if i == 0 and self.add_extra_convs == 'on_input': in_channels = self.in_channels[self.backbone_end_level - 1] else: in_channels = out_channels extra_fpn_conv = ConvModule( in_channels, out_channels, 3, stride=2, padding=1, conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=act_cfg, inplace=False) self.fpn_convs.append(extra_fpn_conv) @auto_fp16() def forward(self, inputs): """Forward function.""" assert len(inputs) == len(self.in_channels) # build laterals laterals = [ lateral_conv(inputs[i + self.start_level]) for i, lateral_conv in enumerate(self.lateral_convs) ] # build top-down path used_backbone_levels = len(laterals) for i in range(used_backbone_levels - 1, 0, -1): # In some cases, fixing `scale factor` (e.g. 2) is preferred, but # it cannot co-exist with `size` in `F.interpolate`. if 'scale_factor' in self.upsample_cfg: laterals[i - 1] += F.interpolate(laterals[i], **self.upsample_cfg) else: prev_shape = laterals[i - 1].shape[2:] laterals[i - 1] += F.interpolate( laterals[i], size=prev_shape, **self.upsample_cfg) # build outputs # part 1: from original levels outs = [self.fpn_convs[i](laterals[i]) for i in self.out_ids] # part 2: add extra levels if self.num_outs > len(outs): # use max pool to get more levels on top of outputs # (e.g., Faster R-CNN, Mask R-CNN) if not self.add_extra_convs: for i in range(self.num_outs - used_backbone_levels): outs.append(F.max_pool2d(outs[-1], 1, stride=2)) # add conv layers on top of original feature maps (RetinaNet) else: if self.add_extra_convs == 'on_input': extra_source = inputs[self.backbone_end_level - 1] elif self.add_extra_convs == 'on_lateral': extra_source = laterals[-1] elif self.add_extra_convs == 'on_output': extra_source = outs[-1] else: raise NotImplementedError outs.append(self.fpn_convs[used_backbone_levels](extra_source)) for i in range(used_backbone_levels + 1, self.num_outs): if self.relu_before_extra_convs: outs.append(self.fpn_convs[i](F.relu(outs[-1]))) else: outs.append(self.fpn_convs[i](outs[-1])) return outs[0] ================================================ FILE: mmdet3d/models/necks/imvoxel_neck.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. from mmcv.cnn import ConvModule from torch import nn from ..builder import NECKS @NECKS.register_module() class OutdoorImVoxelNeck(nn.Module): """Neck for ImVoxelNet outdoor scenario. Args: in_channels (int): Input channels of multi-scale feature map. out_channels (int): Output channels of multi-scale feature map. 
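Example:
    A shape sketch (sizes are illustrative assumptions); the neck
    collapses the vertical axis of the voxel volume into a BEV map:

    >>> import torch
    >>> neck = OutdoorImVoxelNeck(in_channels=64, out_channels=256)
    >>> x = torch.rand(1, 64, 200, 176, 12)
    >>> outs = neck(x)  # list with one tensor of shape (1, 256, 176, 200)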
""" def __init__(self, in_channels, out_channels): super().__init__() self.model = nn.Sequential( ResModule(in_channels), ConvModule( in_channels=in_channels, out_channels=in_channels * 2, kernel_size=3, stride=(1, 1, 2), padding=1, conv_cfg=dict(type='Conv3d'), norm_cfg=dict(type='BN3d'), act_cfg=dict(type='ReLU', inplace=True)), ResModule(in_channels * 2), ConvModule( in_channels=in_channels * 2, out_channels=in_channels * 4, kernel_size=3, stride=(1, 1, 2), padding=1, conv_cfg=dict(type='Conv3d'), norm_cfg=dict(type='BN3d'), act_cfg=dict(type='ReLU', inplace=True)), ResModule(in_channels * 4), ConvModule( in_channels=in_channels * 4, out_channels=out_channels, kernel_size=3, padding=(1, 1, 0), conv_cfg=dict(type='Conv3d'), norm_cfg=dict(type='BN3d'), act_cfg=dict(type='ReLU', inplace=True))) def forward(self, x): """Forward function. Args: x (torch.Tensor): of shape (N, C_in, N_x, N_y, N_z). Returns: list[torch.Tensor]: of shape (N, C_out, N_y, N_x). """ x = self.model.forward(x) assert x.shape[-1] == 1 # Anchor3DHead axis order is (y, x). return [x[..., 0].transpose(-1, -2)] def init_weights(self): """Initialize weights of neck.""" pass class ResModule(nn.Module): """3d residual block for ImVoxelNeck. Args: n_channels (int): Input channels of a feature map. """ def __init__(self, n_channels): super().__init__() self.conv0 = ConvModule( in_channels=n_channels, out_channels=n_channels, kernel_size=3, padding=1, conv_cfg=dict(type='Conv3d'), norm_cfg=dict(type='BN3d'), act_cfg=dict(type='ReLU', inplace=True)) self.conv1 = ConvModule( in_channels=n_channels, out_channels=n_channels, kernel_size=3, padding=1, conv_cfg=dict(type='Conv3d'), norm_cfg=dict(type='BN3d'), act_cfg=None) self.activation = nn.ReLU(inplace=True) def forward(self, x): """Forward function. Args: x (torch.Tensor): of shape (N, C, N_x, N_y, N_z). Returns: torch.Tensor: 5d feature map. """ identity = x x = self.conv0(x) x = self.conv1(x) x = identity + x x = self.activation(x) return x ================================================ FILE: mmdet3d/models/necks/lss_fpn.py ================================================ # Copyright (c) Phigent Robotics. All rights reserved. 
import torch import torch.nn as nn from mmcv.cnn import build_norm_layer import torch.nn.functional as F from mmdet.models import NECKS import torch.utils.checkpoint as cp @NECKS.register_module() class FPN_LSS(nn.Module): def __init__(self, in_channels, out_channels, scale_factor=4, input_feature_index=(0, 2), norm_cfg=dict(type='BN'), extra_upsample=2, lateral=None, with_cp=False, use_input_conv=False): super().__init__() self.input_feature_index = input_feature_index self.extra_upsample = extra_upsample is not None self.with_cp = with_cp # self.up = nn.Upsample( # scale_factor=scale_factor, mode='bilinear', align_corners=True) # assert norm_cfg['type'] in ['BN', 'SyncBN'] channels_factor = 2 if self.extra_upsample else 1 self.input_conv = nn.Sequential( nn.Conv2d( in_channels, out_channels * channels_factor, kernel_size=1, padding=0, bias=False), build_norm_layer( norm_cfg, out_channels * channels_factor, postfix=0)[1], nn.ReLU(inplace=True), ) if use_input_conv else None if use_input_conv: in_channels = out_channels * channels_factor self.conv = nn.Sequential( nn.Conv2d( in_channels, out_channels * channels_factor, kernel_size=3, padding=1, bias=False), build_norm_layer( norm_cfg, out_channels * channels_factor, postfix=0)[1], nn.ReLU(inplace=True), nn.Conv2d( out_channels * channels_factor, out_channels * channels_factor, kernel_size=3, padding=1, bias=False), build_norm_layer( norm_cfg, out_channels * channels_factor, postfix=0)[1], nn.ReLU(inplace=True), ) if self.extra_upsample: self.up2 = nn.Sequential( nn.Upsample( scale_factor=extra_upsample, mode='bilinear', align_corners=True), nn.Conv2d( out_channels * channels_factor, out_channels, kernel_size=3, padding=1, bias=False), build_norm_layer(norm_cfg, out_channels, postfix=0)[1], nn.ReLU(inplace=True), nn.Conv2d( out_channels, out_channels, kernel_size=1, padding=0), ) self.lateral = lateral is not None if self.lateral: self.lateral_conv = nn.Sequential( nn.Conv2d( lateral, lateral, kernel_size=1, padding=0, bias=False), build_norm_layer(norm_cfg, lateral, postfix=0)[1], nn.ReLU(inplace=True), ) def forward(self, feats): x2, x1 = feats[self.input_feature_index[0]], \ feats[self.input_feature_index[1]] if self.with_cp: if self.lateral: x2 = cp.checkpoint(self.lateral_conv, x2) x1 = F.interpolate(x1, size=x2.shape[2:]) x = torch.cat([x2, x1], dim=1) if self.input_conv is not None: x = cp.checkpoint(self.input_conv, x) x = cp.checkpoint(self.conv, x) if self.extra_upsample: x = cp.checkpoint(self.up2, x) else: if self.lateral: x2 = self.lateral_conv(x2) x1 = F.interpolate(x1, size=x2.shape[2:]) x = torch.cat([x2, x1], dim=1) if self.input_conv is not None: x = self.input_conv(x) x = self.conv(x) if self.extra_upsample: x = self.up2(x) return x ================================================ FILE: mmdet3d/models/necks/pointnet2_fp_neck.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. from mmcv.runner import BaseModule from torch import nn as nn from mmdet3d.ops import PointFPModule from ..builder import NECKS @NECKS.register_module() class PointNetFPNeck(BaseModule): r"""PointNet FP Module used in PointRCNN. Refer to the `official code `_. .. code-block:: none sa_n ---------------------------------------- | ... --------------------------------- | | | sa_1 ------------- | | | | | sa_0 -> fp_0 -> fp_module ->fp_1 -> ... 
-> fp_module -> fp_n sa_n including sa_xyz (torch.Tensor) and sa_features (torch.Tensor) fp_n including fp_xyz (torch.Tensor) and fp_features (torch.Tensor) Args: fp_channels (tuple[tuple[int]]): Tuple of mlp channels in FP modules. init_cfg (dict or list[dict], optional): Initialization config dict. Default: None """ def __init__(self, fp_channels, init_cfg=None): super(PointNetFPNeck, self).__init__(init_cfg=init_cfg) self.num_fp = len(fp_channels) self.FP_modules = nn.ModuleList() for cur_fp_mlps in fp_channels: self.FP_modules.append(PointFPModule(mlp_channels=cur_fp_mlps)) def _extract_input(self, feat_dict): """Extract inputs from features dictionary. Args: feat_dict (dict): Feature dict from backbone, which may contain the following keys and values: - sa_xyz (list[torch.Tensor]): Points of each sa module in shape (N, 3). - sa_features (list[torch.Tensor]): Output features of each sa module in shape (N, M). Returns: list[torch.Tensor]: Coordinates of multiple levels of points. list[torch.Tensor]: Features of multiple levels of points. """ sa_xyz = feat_dict['sa_xyz'] sa_features = feat_dict['sa_features'] assert len(sa_xyz) == len(sa_features) return sa_xyz, sa_features def forward(self, feat_dict): """Forward pass. Args: feat_dict (dict): Feature dict from backbone. Returns: dict[str, torch.Tensor]: Outputs of the Neck. - fp_xyz (torch.Tensor): The coordinates of fp features. - fp_features (torch.Tensor): The features from the last feature propagation layers. """ sa_xyz, sa_features = self._extract_input(feat_dict) fp_feature = sa_features[-1] fp_xyz = sa_xyz[-1] for i in range(self.num_fp): # consume the points in a bottom-up manner fp_feature = self.FP_modules[i](sa_xyz[-(i + 2)], sa_xyz[-(i + 1)], sa_features[-(i + 2)], fp_feature) fp_xyz = sa_xyz[-(i + 2)] ret = dict(fp_xyz=fp_xyz, fp_features=fp_feature) return ret ================================================ FILE: mmdet3d/models/necks/second_fpn.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import numpy as np import torch from mmcv.cnn import build_conv_layer, build_norm_layer, build_upsample_layer from mmcv.runner import BaseModule, auto_fp16 from torch import nn as nn from ..builder import NECKS @NECKS.register_module() class SECONDFPN(BaseModule): """FPN used in SECOND/PointPillars/PartA2/MVXNet. Args: in_channels (list[int]): Input channels of multi-scale feature maps. out_channels (list[int]): Output channels of feature maps. upsample_strides (list[int]): Strides used to upsample the feature maps. norm_cfg (dict): Config dict of normalization layers. upsample_cfg (dict): Config dict of upsample layers. conv_cfg (dict): Config dict of conv layers. use_conv_for_no_stride (bool): Whether to use conv when stride is 1. 
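    Example:
        Editorial illustration (shapes are made up, not from the original
        source). Each input level is upsampled by its stride and the results
        are concatenated along the channel axis:

        .. code-block:: python

            import torch

            neck = SECONDFPN()  # defaults: in [128, 128, 256], out [256, 256, 256]
            feats = [torch.rand(1, 128, 200, 176),
                     torch.rand(1, 128, 100, 88),
                     torch.rand(1, 256, 50, 44)]
            out = neck(feats)[0]  # (1, 768, 200, 176)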
""" def __init__(self, in_channels=[128, 128, 256], out_channels=[256, 256, 256], upsample_strides=[1, 2, 4], norm_cfg=dict(type='BN', eps=1e-3, momentum=0.01), upsample_cfg=dict(type='deconv', bias=False), conv_cfg=dict(type='Conv2d', bias=False), use_conv_for_no_stride=False, init_cfg=None): # if for GroupNorm, # cfg is dict(type='GN', num_groups=num_groups, eps=1e-3, affine=True) super(SECONDFPN, self).__init__(init_cfg=init_cfg) assert len(out_channels) == len(upsample_strides) == len(in_channels) self.in_channels = in_channels self.out_channels = out_channels self.fp16_enabled = False deblocks = [] for i, out_channel in enumerate(out_channels): stride = upsample_strides[i] if stride > 1 or (stride == 1 and not use_conv_for_no_stride): upsample_layer = build_upsample_layer( upsample_cfg, in_channels=in_channels[i], out_channels=out_channel, kernel_size=upsample_strides[i], stride=upsample_strides[i]) else: stride = np.round(1 / stride).astype(np.int64) upsample_layer = build_conv_layer( conv_cfg, in_channels=in_channels[i], out_channels=out_channel, kernel_size=stride, stride=stride) deblock = nn.Sequential(upsample_layer, build_norm_layer(norm_cfg, out_channel)[1], nn.ReLU(inplace=True)) deblocks.append(deblock) self.deblocks = nn.ModuleList(deblocks) if init_cfg is None: self.init_cfg = [ dict(type='Kaiming', layer='ConvTranspose2d'), dict(type='Constant', layer='NaiveSyncBatchNorm2d', val=1.0) ] @auto_fp16() def forward(self, x): """Forward function. Args: x (torch.Tensor): 4D Tensor in (N, C, H, W) shape. Returns: list[torch.Tensor]: Multi-level feature maps. """ assert len(x) == len(self.in_channels) ups = [deblock(x[i]) for i, deblock in enumerate(self.deblocks)] if len(ups) > 1: out = torch.cat(ups, dim=1) else: out = ups[0] return [out] ================================================ FILE: mmdet3d/models/necks/view_transformer.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import torch import torch.nn as nn import torch.nn.functional as F from mmcv.cnn import build_conv_layer from mmcv.runner import BaseModule, force_fp32 from torch.cuda.amp.autocast_mode import autocast from torch.utils.checkpoint import checkpoint from mmdet3d.ops.bev_pool_v2.bev_pool import bev_pool_v2 from mmdet.models.backbones.resnet import BasicBlock from ..builder import NECKS import torch.utils.checkpoint as cp @NECKS.register_module() class LSSViewTransformer(BaseModule): r"""Lift-Splat-Shoot view transformer. Please refer to the `paper `_ Args: grid_config (dict): Config of grid alone each axis in format of (lower_bound, upper_bound, interval). axis in {x,y,z,depth}. input_size (tuple(int)): Size of input images in format of (height, width). downsample (int): Down sample factor from the input size to the feature size. in_channels (int): Channels of input feature. out_channels (int): Channels of transformed feature. accelerate (bool): Whether the view transformation is conducted with acceleration. Note: the intrinsic and extrinsic of cameras should be constant when 'accelerate' is set true. 
""" def __init__( self, grid_config, input_size, downsample=16, in_channels=512, out_channels=64, accelerate=False, uniform=False, with_cp=False ): super(LSSViewTransformer, self).__init__() self.uniform = uniform self.with_cp = with_cp self.grid_config = grid_config self.downsample = downsample self.create_grid_infos(**grid_config) self.create_frustum(grid_config['depth'], input_size, downsample) self.out_channels = out_channels self.in_channels = in_channels self.depth_net = nn.Conv2d( in_channels, self.D + self.out_channels, kernel_size=1, padding=0) self.accelerate = accelerate self.initial_flag = True def create_grid_infos(self, x, y, z, **kwargs): """Generate the grid information including the lower bound, interval, and size. Args: x (tuple(float)): Config of grid alone x axis in format of (lower_bound, upper_bound, interval). y (tuple(float)): Config of grid alone y axis in format of (lower_bound, upper_bound, interval). z (tuple(float)): Config of grid alone z axis in format of (lower_bound, upper_bound, interval). **kwargs: Container for other potential parameters """ self.grid_lower_bound = torch.Tensor([cfg[0] for cfg in [x, y, z]]) self.grid_interval = torch.Tensor([cfg[2] for cfg in [x, y, z]]) self.grid_size = torch.Tensor([(cfg[1] - cfg[0]) / cfg[2] for cfg in [x, y, z]]) def create_frustum(self, depth_cfg, input_size, downsample): """Generate the frustum template for each image. Args: depth_cfg (tuple(float)): Config of grid alone depth axis in format of (lower_bound, upper_bound, interval). input_size (tuple(int)): Size of input images in format of (height, width). downsample (int): Down sample scale factor from the input size to the feature size. """ H_in, W_in = input_size H_feat, W_feat = H_in // downsample, W_in // downsample d = torch.arange(*depth_cfg, dtype=torch.float)\ .view(-1, 1, 1).expand(-1, H_feat, W_feat) self.D = d.shape[0] x = torch.linspace(0, W_in - 1, W_feat, dtype=torch.float)\ .view(1, 1, W_feat).expand(self.D, H_feat, W_feat) y = torch.linspace(0, H_in - 1, H_feat, dtype=torch.float)\ .view(1, H_feat, 1).expand(self.D, H_feat, W_feat) # D x H x W x 3 self.frustum = torch.stack((x, y, d), -1) def get_lidar_coor(self, rots, trans, cam2imgs, post_rots, post_trans, bda): """Calculate the locations of the frustum points in the lidar coordinate system. Args: rots (torch.Tensor): Rotation from camera coordinate system to lidar coordinate system in shape (B, N_cams, 3, 3). trans (torch.Tensor): Translation from camera coordinate system to lidar coordinate system in shape (B, N_cams, 3). cam2imgs (torch.Tensor): Camera intrinsic matrixes in shape (B, N_cams, 3, 3). post_rots (torch.Tensor): Rotation in camera coordinate system in shape (B, N_cams, 3, 3). It is derived from the image view augmentation. post_trans (torch.Tensor): Translation in camera coordinate system derived from image view augmentation in shape (B, N_cams, 3). 
Returns: torch.tensor: Point coordinates in shape (B, N_cams, D, ownsample, 3) """ B, N, _ = trans.shape # post-transformation # B x N x D x H x W x 3 points = self.frustum.to(rots) - post_trans.view(B, N, 1, 1, 1, 3) points = torch.inverse(post_rots).view(B, N, 1, 1, 1, 3, 3)\ .matmul(points.unsqueeze(-1)) # cam_to_ego points = torch.cat( (points[..., :2, :] * points[..., 2:3, :], points[..., 2:3, :]), 5) combine = rots.matmul(torch.inverse(cam2imgs)) points = combine.view(B, N, 1, 1, 1, 3, 3).matmul(points).squeeze(-1) points += trans.view(B, N, 1, 1, 1, 3) points = bda.view(B, 1, 1, 1, 1, 3, 3).matmul(points.unsqueeze(-1)).squeeze(-1) return points def init_acceleration_v2(self, coor): """Pre-compute the necessary information in acceleration including the index of points in the final feature. Args: coor (torch.tensor): Coordinate of points in lidar space in shape (B, N_cams, D, H, W, 3). x (torch.tensor): Feature of points in shape (B, N_cams, D, H, W, C). """ ranks_bev, ranks_depth, ranks_feat, \ interval_starts, interval_lengths = \ self.voxel_pooling_prepare_v2(coor) self.ranks_bev = ranks_bev.int().contiguous() self.ranks_feat = ranks_feat.int().contiguous() self.ranks_depth = ranks_depth.int().contiguous() self.interval_starts = interval_starts.int().contiguous() self.interval_lengths = interval_lengths.int().contiguous() def voxel_pooling_v2(self, coor, depth, feat): ranks_bev, ranks_depth, ranks_feat, \ interval_starts, interval_lengths = \ self.voxel_pooling_prepare_v2(coor) if ranks_feat is None: print('warning ---> no points within the predefined ' 'bev receptive field') dummy = torch.zeros(size=[ feat.shape[0], feat.shape[2], int(self.grid_size[2]), int(self.grid_size[0]), int(self.grid_size[1]) ]).to(feat) dummy = torch.cat(dummy.unbind(dim=2), 1) return dummy feat = feat.permute(0, 1, 3, 4, 2) bev_feat_shape = (depth.shape[0], int(self.grid_size[2]), int(self.grid_size[1]), int(self.grid_size[0]), feat.shape[-1]) # (B, Z, Y, X, C) bev_feat = bev_pool_v2(depth, feat, ranks_depth, ranks_feat, ranks_bev, bev_feat_shape, interval_starts, interval_lengths) # collapse Z # from IPython import embed # embed() # exit() bev_feat = torch.cat(bev_feat.unbind(dim=2), 1) return bev_feat def voxel_pooling_prepare_v2(self, coor): """Data preparation for voxel pooling. Args: coor (torch.tensor): Coordinate of points in the lidar space in shape (B, N, D, H, W, 3). Returns: tuple[torch.tensor]: Rank of the voxel that a point is belong to in shape (N_Points); Reserved index of points in the depth space in shape (N_Points). Reserved index of points in the feature space in shape (N_Points). """ B, N, D, H, W, _ = coor.shape num_points = B * N * D * H * W # record the index of selected points for acceleration purpose ranks_depth = torch.range( 0, num_points - 1, dtype=torch.int, device=coor.device) ranks_feat = torch.range( 0, num_points // D - 1, dtype=torch.int, device=coor.device) ranks_feat = ranks_feat.reshape(B, N, 1, H, W) ranks_feat = ranks_feat.expand(B, N, D, H, W).flatten() # convert coordinate into the voxel space coor = ((coor - self.grid_lower_bound.to(coor)) / self.grid_interval.to(coor)) coor = coor.long().view(num_points, 3) batch_idx = torch.range(0, B - 1).reshape(B, 1). 
\ expand(B, num_points // B).reshape(num_points, 1).to(coor) coor = torch.cat((coor, batch_idx), 1) # filter out points that are outside box kept = (coor[:, 0] >= 0) & (coor[:, 0] < self.grid_size[0]) & \ (coor[:, 1] >= 0) & (coor[:, 1] < self.grid_size[1]) & \ (coor[:, 2] >= 0) & (coor[:, 2] < self.grid_size[2]) if len(kept) == 0: return None, None, None, None, None coor, ranks_depth, ranks_feat = \ coor[kept], ranks_depth[kept], ranks_feat[kept] # get tensors from the same voxel next to each other ranks_bev = coor[:, 3] * ( self.grid_size[2] * self.grid_size[1] * self.grid_size[0]) ranks_bev += coor[:, 2] * (self.grid_size[1] * self.grid_size[0]) ranks_bev += coor[:, 1] * self.grid_size[0] + coor[:, 0] order = ranks_bev.argsort() ranks_bev, ranks_depth, ranks_feat = \ ranks_bev[order], ranks_depth[order], ranks_feat[order] kept = torch.ones( ranks_bev.shape[0], device=ranks_bev.device, dtype=torch.bool) kept[1:] = ranks_bev[1:] != ranks_bev[:-1] interval_starts = torch.where(kept)[0].int() if len(interval_starts) == 0: return None, None, None, None, None interval_lengths = torch.zeros_like(interval_starts) interval_lengths[:-1] = interval_starts[1:] - interval_starts[:-1] interval_lengths[-1] = ranks_bev.shape[0] - interval_starts[-1] return ranks_bev.int().contiguous(), \ ranks_depth.int().contiguous(),\ ranks_feat.int().contiguous(), \ interval_starts.int().contiguous(),\ interval_lengths.int().contiguous() def pre_compute(self, input): if self.initial_flag: coor = self.get_lidar_coor(*input[1:7]) self.init_acceleration_v2( coor) self.initial_flag = False def view_transform_core(self, input, depth, tran_feat): B, N, C, H, W = input[0].shape # Lift-Splat if self.accelerate: feat = tran_feat.view(B, N, self.out_channels, H, W) feat = feat.permute(0, 1, 3, 4, 2) depth = depth.view(B, N, self.D, H, W) bev_feat_shape = (depth.shape[0], int(self.grid_size[2]), int(self.grid_size[1]), int(self.grid_size[0]), feat.shape[-1]) # (B, Z, Y, X, C) bev_feat = bev_pool_v2(depth, feat, self.ranks_depth, self.ranks_feat, self.ranks_bev, bev_feat_shape, self.interval_starts, self.interval_lengths) bev_feat = bev_feat.squeeze(2) else: coor = self.get_lidar_coor(*input[1:7]) bev_feat = self.voxel_pooling_v2( coor, depth.view(B, N, self.D, H, W), tran_feat.view(B, N, self.out_channels, H, W)) return bev_feat, depth def view_transform(self, input, depth, tran_feat): if self.accelerate: self.pre_compute(input) return self.view_transform_core(input, depth, tran_feat) def forward(self, input, return_depth_digit=False): """Transform image-view feature into bird-eye-view feature. Args: input (list(torch.tensor)): of (image-view feature, rots, trans, intrins, post_rots, post_trans) Returns: torch.tensor: Bird-eye-view feature in shape (B, C, H_BEV, W_BEV) """ x = input[0] B, N, C, H, W = x.shape x = x.view(B * N, C, H, W) if self.with_cp: x = cp.checkpoint(self.depth_net, x) else: x = self.depth_net(x) depth_digit = x[:, :self.D, ...] tran_feat = x[:, self.D:self.D + self.out_channels, ...] if self.uniform: depth_digit = depth_digit * 0 depth = depth_digit.softmax(dim=1) else: depth = depth_digit.softmax(dim=1) if not return_depth_digit: return self.view_transform(input, depth, tran_feat) else: return self.view_transform(input, depth, tran_feat) + (depth_digit, ) def get_mlp_input(self, rot, tran, intrin, post_rot, post_tran, bda): return None @NECKS.register_module() class LSSViewTransformer2(BaseModule): r"""Lift-Splat-Shoot view transformer. 
Please refer to the `paper `_ Args: grid_config (dict): Config of grid alone each axis in format of (lower_bound, upper_bound, interval). axis in {x,y,z,depth}. input_size (tuple(int)): Size of input images in format of (height, width). downsample (int): Down sample factor from the input size to the feature size. in_channels (int): Channels of input feature. out_channels (int): Channels of transformed feature. accelerate (bool): Whether the view transformation is conducted with acceleration. Note: the intrinsic and extrinsic of cameras should be constant when 'accelerate' is set true. """ def __init__( self, grid_config, input_size, downsample=16, in_channels=512, out_channels=64, accelerate=False, uniform=False, with_cp=False ): super(LSSViewTransformer2, self).__init__() self.uniform = uniform self.with_cp = with_cp self.grid_config = grid_config self.downsample = downsample self.create_grid_infos(**grid_config) self.create_frustum(grid_config['depth'], input_size, downsample) self.out_channels = out_channels self.in_channels = in_channels self.depth_net = nn.Conv2d( in_channels, self.D + self.out_channels, kernel_size=1, padding=0) self.accelerate = accelerate self.initial_flag = True def create_grid_infos(self, x, y, z, **kwargs): """Generate the grid information including the lower bound, interval, and size. Args: x (tuple(float)): Config of grid alone x axis in format of (lower_bound, upper_bound, interval). y (tuple(float)): Config of grid alone y axis in format of (lower_bound, upper_bound, interval). z (tuple(float)): Config of grid alone z axis in format of (lower_bound, upper_bound, interval). **kwargs: Container for other potential parameters """ self.grid_lower_bound = torch.Tensor([cfg[0] for cfg in [x, y, z]]) self.grid_interval = torch.Tensor([cfg[2] for cfg in [x, y, z]]) self.grid_size = torch.Tensor([(cfg[1] - cfg[0]) / cfg[2] for cfg in [x, y, z]]) def create_frustum(self, depth_cfg, input_size, downsample): """Generate the frustum template for each image. Args: depth_cfg (tuple(float)): Config of grid alone depth axis in format of (lower_bound, upper_bound, interval). input_size (tuple(int)): Size of input images in format of (height, width). downsample (int): Down sample scale factor from the input size to the feature size. """ H_in, W_in = input_size H_feat, W_feat = H_in // downsample, W_in // downsample d = torch.arange(*depth_cfg, dtype=torch.float)\ .view(-1, 1, 1).expand(-1, H_feat, W_feat) self.D = d.shape[0] x = torch.linspace(0, W_in - 1, W_feat, dtype=torch.float)\ .view(1, 1, W_feat).expand(self.D, H_feat, W_feat) y = torch.linspace(0, H_in - 1, H_feat, dtype=torch.float)\ .view(1, H_feat, 1).expand(self.D, H_feat, W_feat) # D x H x W x 3 self.frustum = torch.stack((x, y, d), -1) def get_lidar_coor(self, rots, trans, cam2imgs, post_rots, post_trans, bda): """Calculate the locations of the frustum points in the lidar coordinate system. Args: rots (torch.Tensor): Rotation from camera coordinate system to lidar coordinate system in shape (B, N_cams, 3, 3). trans (torch.Tensor): Translation from camera coordinate system to lidar coordinate system in shape (B, N_cams, 3). cam2imgs (torch.Tensor): Camera intrinsic matrixes in shape (B, N_cams, 3, 3). post_rots (torch.Tensor): Rotation in camera coordinate system in shape (B, N_cams, 3, 3). It is derived from the image view augmentation. post_trans (torch.Tensor): Translation in camera coordinate system derived from image view augmentation in shape (B, N_cams, 3). 
Returns: torch.tensor: Point coordinates in shape (B, N_cams, D, ownsample, 3) """ B, N, _ = trans.shape # post-transformation # B x N x D x H x W x 3 points = self.frustum.to(rots) - post_trans.view(B, N, 1, 1, 1, 3) points = torch.inverse(post_rots).view(B, N, 1, 1, 1, 3, 3)\ .matmul(points.unsqueeze(-1)) # cam_to_ego points = torch.cat( (points[..., :2, :] * points[..., 2:3, :], points[..., 2:3, :]), 5) combine = rots.matmul(torch.inverse(cam2imgs)) points = combine.view(B, N, 1, 1, 1, 3, 3).matmul(points).squeeze(-1) points += trans.view(B, N, 1, 1, 1, 3) points = bda.view(B, 1, 1, 1, 1, 3, 3).matmul(points.unsqueeze(-1)).squeeze(-1) return points def init_acceleration_v2(self, coor): """Pre-compute the necessary information in acceleration including the index of points in the final feature. Args: coor (torch.tensor): Coordinate of points in lidar space in shape (B, N_cams, D, H, W, 3). x (torch.tensor): Feature of points in shape (B, N_cams, D, H, W, C). """ kept, ranks_bev, ranks_depth, ranks_feat = \ self.voxel_pooling_prepare_v2_inf(coor) self.kept = kept self.ranks_bev = ranks_bev.int().contiguous() self.ranks_feat = ranks_feat.int().contiguous() self.ranks_depth = ranks_depth.int().contiguous() # self.interval_starts = interval_starts.int().contiguous() # self.interval_lengths = interval_lengths.int().contiguous() def voxel_pooling_v2(self, coor, depth, feat): ranks_bev, ranks_depth, ranks_feat = \ self.voxel_pooling_prepare_v2(depth, coor) kept = torch.ones( ranks_bev.shape[0], device=ranks_bev.device, dtype=torch.bool) kept[1:] = ranks_bev[1:] != ranks_bev[:-1] interval_starts = torch.where(kept)[0].int() if len(interval_starts) == 0: return None, None, None, None, None interval_lengths = torch.zeros_like(interval_starts) interval_lengths[:-1] = interval_starts[1:] - interval_starts[:-1] interval_lengths[-1] = ranks_bev.shape[0] - interval_starts[-1] if ranks_feat is None: print('warning ---> no points within the predefined ' 'bev receptive field') dummy = torch.zeros(size=[ feat.shape[0], feat.shape[2], int(self.grid_size[2]), int(self.grid_size[0]), int(self.grid_size[1]) ]).to(feat) dummy = torch.cat(dummy.unbind(dim=2), 1) return dummy feat = feat.permute(0, 1, 3, 4, 2) bev_feat_shape = (depth.shape[0], int(self.grid_size[2]), int(self.grid_size[1]), int(self.grid_size[0]), feat.shape[-1]) # (B, Z, Y, X, C) bev_feat = bev_pool_v2(depth, feat, ranks_depth, ranks_feat, ranks_bev, bev_feat_shape, interval_starts, interval_lengths) # collapse Z bev_feat = torch.cat(bev_feat.unbind(dim=2), 1) return bev_feat def voxel_pooling_prepare_v2(self, depth, coor): """Data preparation for voxel pooling. Args: coor (torch.tensor): Coordinate of points in the lidar space in shape (B, N, D, H, W, 3). Returns: tuple[torch.tensor]: Rank of the voxel that a point is belong to in shape (N_Points); Reserved index of points in the depth space in shape (N_Points). Reserved index of points in the feature space in shape (N_Points). 
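        Example:
            Editorial illustration of the rank computation (numbers are made
            up). For a BEV grid of size (X, Y, Z) = (128, 128, 1), a point
            falling into voxel (x=3, y=5, z=0) of batch sample b=1 gets

                rank_bev = 1 * (1 * 128 * 128) + 0 * (128 * 128) + 5 * 128 + 3
                         = 17027

            Sorting all points by this rank places points that land in the
            same BEV cell next to each other, which is what bev_pool_v2 needs
            when accumulating them via interval_starts/interval_lengths.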
""" B, N, D, H, W, _ = coor.shape num_points = B * N * D * H * W # record the index of selected points for acceleration purpose ranks_depth = torch.range( 0, num_points - 1, dtype=torch.int, device=coor.device) ranks_feat = torch.range( 0, num_points // D - 1, dtype=torch.int, device=coor.device) ranks_feat = ranks_feat.reshape(B, N, 1, H, W) ranks_feat = ranks_feat.expand(B, N, D, H, W).flatten() # convert coordinate into the voxel space coor = ((coor - self.grid_lower_bound.to(coor)) / self.grid_interval.to(coor)) coor = coor.long().view(num_points, 3) batch_idx = torch.range(0, B - 1).reshape(B, 1). \ expand(B, num_points // B).reshape(num_points, 1).to(coor) coor = torch.cat((coor, batch_idx), 1) # filter out points that are outside box kept = (coor[:, 0] >= 0) & (coor[:, 0] < self.grid_size[0]) & \ (coor[:, 1] >= 0) & (coor[:, 1] < self.grid_size[1]) & \ (coor[:, 2] >= 0) & (coor[:, 2] < self.grid_size[2]) kept_depth = depth.view(-1) > 0.01 kept = kept & kept_depth if len(kept) == 0: return None, None, None, None, None coor, ranks_depth, ranks_feat = \ coor[kept], ranks_depth[kept], ranks_feat[kept] # get tensors from the same voxel next to each other ranks_bev = coor[:, 3] * ( self.grid_size[2] * self.grid_size[1] * self.grid_size[0]) ranks_bev += coor[:, 2] * (self.grid_size[1] * self.grid_size[0]) ranks_bev += coor[:, 1] * self.grid_size[0] + coor[:, 0] order = ranks_bev.argsort() ranks_bev, ranks_depth, ranks_feat = \ ranks_bev[order], ranks_depth[order], ranks_feat[order] return ranks_bev.int().contiguous(),\ ranks_depth.int().contiguous(),\ ranks_feat.int().contiguous(), # interval_starts.int().contiguous(),/ # 每个voxel的其实point坐标 # interval_lengths.int().contiguous() # 每个voxel中累加的point长度 def voxel_pooling_prepare_v2_inf(self, coor): """Data preparation for voxel pooling. Args: coor (torch.tensor): Coordinate of points in the lidar space in shape (B, N, D, H, W, 3). Returns: tuple[torch.tensor]: Rank of the voxel that a point is belong to in shape (N_Points); Reserved index of points in the depth space in shape (N_Points). Reserved index of points in the feature space in shape (N_Points). """ B, N, D, H, W, _ = coor.shape num_points = B * N * D * H * W # record the index of selected points for acceleration purpose ranks_depth = torch.range( 0, num_points - 1, dtype=torch.int, device=coor.device) ranks_feat = torch.range( 0, num_points // D - 1, dtype=torch.int, device=coor.device) ranks_feat = ranks_feat.reshape(B, N, 1, H, W) ranks_feat = ranks_feat.expand(B, N, D, H, W).flatten() # convert coordinate into the voxel space coor = ((coor - self.grid_lower_bound.to(coor)) / self.grid_interval.to(coor)) coor = coor.long().view(num_points, 3) batch_idx = torch.range(0, B - 1).reshape(B, 1). 
\ expand(B, num_points // B).reshape(num_points, 1).to(coor) coor = torch.cat((coor, batch_idx), 1) # filter out points that are outside box kept = (coor[:, 0] >= 0) & (coor[:, 0] < self.grid_size[0]) & \ (coor[:, 1] >= 0) & (coor[:, 1] < self.grid_size[1]) & \ (coor[:, 2] >= 0) & (coor[:, 2] < self.grid_size[2]) if len(kept) == 0: return None, None, None, None, None coor, ranks_depth, ranks_feat = \ coor[kept], ranks_depth[kept], ranks_feat[kept] # get tensors from the same voxel next to each other ranks_bev = coor[:, 3] * ( self.grid_size[2] * self.grid_size[1] * self.grid_size[0]) ranks_bev += coor[:, 2] * (self.grid_size[1] * self.grid_size[0]) ranks_bev += coor[:, 1] * self.grid_size[0] + coor[:, 0] order = ranks_bev.argsort() ranks_bev, ranks_depth, ranks_feat = \ ranks_bev[order], ranks_depth[order], ranks_feat[order] return kept,\ ranks_bev.int().contiguous(),\ ranks_depth.int().contiguous(),\ ranks_feat.int().contiguous(), # interval_starts.int().contiguous(),/ # 每个voxel的其实point坐标 # interval_lengths.int().contiguous() # 每个voxel中累加的point长度 def pre_compute(self, input): if self.initial_flag: coor = self.get_lidar_coor(*input[1:7]) self.init_acceleration_v2(coor) self.initial_flag = False def view_transform_core(self, input, depth, tran_feat): B, N, C, H, W = input[0].shape # Lift-Splat if self.accelerate: feat = tran_feat.view(B, N, self.out_channels, H, W) feat = feat.permute(0, 1, 3, 4, 2) depth = depth.view(B, N, self.D, H, W) bev_feat_shape = (depth.shape[0], int(self.grid_size[2]), int(self.grid_size[1]), int(self.grid_size[0]), feat.shape[-1]) # (B, Z, Y, X, C) depth_kept = (depth.view(-1) > 0.01)[self.kept] # print(depth_kept.sum()/self.kept.sum()) # from IPython import embed # embed() # exit() new_ranks_bev = self.ranks_bev[depth_kept].contiguous() new_ranks_feat = self.ranks_feat[depth_kept].contiguous() new_ranks_depth = self.ranks_depth[depth_kept].contiguous() kept = torch.ones( new_ranks_bev.shape[0], device=new_ranks_bev.device, dtype=torch.bool) kept[1:] = new_ranks_bev[1:] != new_ranks_bev[:-1] interval_starts = torch.where(kept)[0].int() interval_lengths = torch.zeros_like(interval_starts) interval_lengths[:-1] = interval_starts[1:] - interval_starts[:-1] interval_lengths[-1] = new_ranks_bev.shape[0] - interval_starts[-1] bev_feat = bev_pool_v2(depth, feat, new_ranks_depth, new_ranks_feat, new_ranks_bev, bev_feat_shape, interval_starts.int().contiguous(), interval_lengths.int().contiguous()) bev_feat = bev_feat.squeeze(2) else: coor = self.get_lidar_coor(*input[1:7]) bev_feat = self.voxel_pooling_v2( coor, depth.view(B, N, self.D, H, W), tran_feat.view(B, N, self.out_channels, H, W)) return bev_feat, depth def view_transform(self, input, depth, tran_feat): if self.accelerate: self.pre_compute(input) return self.view_transform_core(input, depth, tran_feat) def forward(self, input, return_depth_digit=False): """Transform image-view feature into bird-eye-view feature. Args: input (list(torch.tensor)): of (image-view feature, rots, trans, intrins, post_rots, post_trans) Returns: torch.tensor: Bird-eye-view feature in shape (B, C, H_BEV, W_BEV) """ x = input[0] B, N, C, H, W = x.shape x = x.view(B * N, C, H, W) if self.with_cp: x = cp.checkpoint(self.depth_net, x) else: x = self.depth_net(x) depth_digit = x[:, :self.D, ...] tran_feat = x[:, self.D:self.D + self.out_channels, ...] 
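        # Descriptive note: the depth logits are turned into a per-pixel
        # categorical distribution over the D depth bins. When `uniform` is
        # set, the logits are zeroed first, so the softmax yields a uniform
        # distribution and every depth bin receives the same weight during
        # splatting.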
if self.uniform: depth_digit = depth_digit * 0 depth = depth_digit.softmax(dim=1) else: depth = depth_digit.softmax(dim=1) if not return_depth_digit: return self.view_transform(input, depth, tran_feat) else: return self.view_transform(input, depth, tran_feat) + (depth_digit, ) def get_mlp_input(self, rot, tran, intrin, post_rot, post_tran, bda): return None class _ASPPModule(nn.Module): def __init__(self, inplanes, planes, kernel_size, padding, dilation, BatchNorm): super(_ASPPModule, self).__init__() self.atrous_conv = nn.Conv2d( inplanes, planes, kernel_size=kernel_size, stride=1, padding=padding, dilation=dilation, bias=False) self.bn = BatchNorm(planes) self.relu = nn.ReLU() self._init_weight() def forward(self, x): x = self.atrous_conv(x) x = self.bn(x) return self.relu(x) def _init_weight(self): for m in self.modules(): if isinstance(m, nn.Conv2d): torch.nn.init.kaiming_normal_(m.weight) elif isinstance(m, nn.BatchNorm2d): m.weight.data.fill_(1) m.bias.data.zero_() class ASPP(nn.Module): def __init__(self, inplanes, mid_channels=256, BatchNorm=nn.BatchNorm2d): super(ASPP, self).__init__() dilations = [1, 6, 12, 18] self.aspp1 = _ASPPModule( inplanes, mid_channels, 1, padding=0, dilation=dilations[0], BatchNorm=BatchNorm) self.aspp2 = _ASPPModule( inplanes, mid_channels, 3, padding=dilations[1], dilation=dilations[1], BatchNorm=BatchNorm) self.aspp3 = _ASPPModule( inplanes, mid_channels, 3, padding=dilations[2], dilation=dilations[2], BatchNorm=BatchNorm) self.aspp4 = _ASPPModule( inplanes, mid_channels, 3, padding=dilations[3], dilation=dilations[3], BatchNorm=BatchNorm) self.global_avg_pool = nn.Sequential( nn.AdaptiveAvgPool2d((1, 1)), nn.Conv2d(inplanes, mid_channels, 1, stride=1, bias=False), BatchNorm(mid_channels), nn.ReLU(), ) self.conv1 = nn.Conv2d( int(mid_channels * 5), inplanes, 1, bias=False) self.bn1 = BatchNorm(inplanes) self.relu = nn.ReLU() self.dropout = nn.Dropout(0.5) self._init_weight() def forward(self, x): x1 = self.aspp1(x) x2 = self.aspp2(x) x3 = self.aspp3(x) x4 = self.aspp4(x) x5 = self.global_avg_pool(x) x5 = F.interpolate( x5, size=x4.size()[2:], mode='bilinear', align_corners=True) x = torch.cat((x1, x2, x3, x4, x5), dim=1) x = self.conv1(x) x = self.bn1(x) x = self.relu(x) return self.dropout(x) def _init_weight(self): for m in self.modules(): if isinstance(m, nn.Conv2d): torch.nn.init.kaiming_normal_(m.weight) elif isinstance(m, nn.BatchNorm2d): m.weight.data.fill_(1) m.bias.data.zero_() class Mlp(nn.Module): def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.ReLU, drop=0.0): super().__init__() out_features = out_features or in_features hidden_features = hidden_features or in_features self.fc1 = nn.Linear(in_features, hidden_features) self.act = act_layer() self.drop1 = nn.Dropout(drop) self.fc2 = nn.Linear(hidden_features, out_features) self.drop2 = nn.Dropout(drop) def forward(self, x): x = self.fc1(x) x = self.act(x) x = self.drop1(x) x = self.fc2(x) x = self.drop2(x) return x class SELayer(nn.Module): def __init__(self, channels, act_layer=nn.ReLU, gate_layer=nn.Sigmoid): super().__init__() self.conv_reduce = nn.Conv2d(channels, channels, 1, bias=True) self.act1 = act_layer() self.conv_expand = nn.Conv2d(channels, channels, 1, bias=True) self.gate = gate_layer() def forward(self, x, x_se): x_se = self.conv_reduce(x_se) x_se = self.act1(x_se) x_se = self.conv_expand(x_se) return x * self.gate(x_se) class DepthNet(nn.Module): def __init__(self, in_channels, mid_channels, context_channels, depth_channels, 
use_dcn=True, use_aspp=True, aspp_mid_channels=-1 ): super(DepthNet, self).__init__() self.reduce_conv = nn.Sequential( nn.Conv2d( in_channels, mid_channels, kernel_size=3, stride=1, padding=1), nn.BatchNorm2d(mid_channels), nn.ReLU(inplace=True), ) self.context_conv = nn.Conv2d( mid_channels, context_channels, kernel_size=1, stride=1, padding=0) self.bn = nn.BatchNorm1d(27) self.depth_mlp = Mlp(27, mid_channels, mid_channels) self.depth_se = SELayer(mid_channels) # NOTE: add camera-aware self.context_mlp = Mlp(27, mid_channels, mid_channels) self.context_se = SELayer(mid_channels) # NOTE: add camera-aware depth_conv_list = [ BasicBlock(mid_channels, mid_channels), BasicBlock(mid_channels, mid_channels), BasicBlock(mid_channels, mid_channels), ] if use_aspp: if aspp_mid_channels<0: aspp_mid_channels = mid_channels depth_conv_list.append(ASPP(mid_channels, aspp_mid_channels)) if use_dcn: depth_conv_list.append( build_conv_layer( cfg=dict( type='DCN', in_channels=mid_channels, out_channels=mid_channels, kernel_size=3, padding=1, groups=4, im2col_step=128, ))) depth_conv_list.append( nn.Conv2d( mid_channels, depth_channels, kernel_size=1, stride=1, padding=0)) self.depth_conv = nn.Sequential(*depth_conv_list) def forward(self, x, mlp_input): mlp_input = self.bn(mlp_input.reshape(-1, mlp_input.shape[-1])) x = self.reduce_conv(x) context_se = self.context_mlp(mlp_input)[..., None, None] context = self.context_se(x, context_se) context = self.context_conv(context) depth_se = self.depth_mlp(mlp_input)[..., None, None] depth = self.depth_se(x, depth_se) depth = self.depth_conv(depth) return torch.cat([depth, context], dim=1) class DepthAggregation(nn.Module): """pixel cloud feature extraction.""" def __init__(self, in_channels, mid_channels, out_channels): super(DepthAggregation, self).__init__() self.reduce_conv = nn.Sequential( nn.Conv2d( in_channels, mid_channels, kernel_size=3, stride=1, padding=1, bias=False), nn.BatchNorm2d(mid_channels), nn.ReLU(inplace=True), ) self.conv = nn.Sequential( nn.Conv2d( mid_channels, mid_channels, kernel_size=3, stride=1, padding=1, bias=False), nn.BatchNorm2d(mid_channels), nn.ReLU(inplace=True), nn.Conv2d( mid_channels, mid_channels, kernel_size=3, stride=1, padding=1, bias=False), nn.BatchNorm2d(mid_channels), nn.ReLU(inplace=True), ) self.out_conv = nn.Sequential( nn.Conv2d( mid_channels, out_channels, kernel_size=3, stride=1, padding=1, bias=True), # nn.BatchNorm3d(out_channels), # nn.ReLU(inplace=True), ) @autocast(False) def forward(self, x): x = checkpoint(self.reduce_conv, x) short_cut = x x = checkpoint(self.conv, x) x = short_cut + x x = self.out_conv(x) return x import numpy as np @NECKS.register_module() class LSSViewTransformerBEVDepth(LSSViewTransformer2): def __init__(self, loss_depth_weight=3.0, depthnet_cfg=dict(), with_cp=False, **kwargs): super(LSSViewTransformerBEVDepth, self).__init__(**kwargs) self.with_cp = with_cp self.loss_depth_weight = loss_depth_weight self.depth_net = DepthNet(self.in_channels, self.in_channels, self.out_channels, self.D, **depthnet_cfg) def get_mlp_input(self, rot, tran, intrin, post_rot, post_tran, bda): B, N, _, _ = rot.shape bda = bda.view(B, 1, 3, 3).repeat(1, N, 1, 1) mlp_input = torch.stack([ intrin[:, :, 0, 0], intrin[:, :, 1, 1], intrin[:, :, 0, 2], intrin[:, :, 1, 2], post_rot[:, :, 0, 0], post_rot[:, :, 0, 1], post_tran[:, :, 0], post_rot[:, :, 1, 0], post_rot[:, :, 1, 1], post_tran[:, :, 1], bda[:, :, 0, 0], bda[:, :, 0, 1], bda[:, :, 1, 0], bda[:, :, 1, 1], bda[:, :, 2, 2], ], dim=-1) sensor2ego = 
torch.cat([rot, tran.reshape(B, N, 3, 1)], dim=-1).reshape(B, N, -1) mlp_input = torch.cat([mlp_input, sensor2ego], dim=-1) return mlp_input def get_downsampled_gt_depth(self, gt_depths): """ Input: gt_depths: [B, N, H, W] Output: gt_depths: [B*N*h*w, d] """ B, N, H, W = gt_depths.shape gt_depths = gt_depths.view(B * N, H // self.downsample, self.downsample, W // self.downsample, self.downsample, 1) gt_depths = gt_depths.permute(0, 1, 3, 5, 2, 4).contiguous() gt_depths = gt_depths.view(-1, self.downsample * self.downsample) gt_depths_tmp = torch.where(gt_depths == 0.0, 1e5 * torch.ones_like(gt_depths), gt_depths) gt_depths = torch.min(gt_depths_tmp, dim=-1).values gt_depths = gt_depths.view(B * N, H // self.downsample, W // self.downsample) gt_depths = ( gt_depths - (self.grid_config['depth'][0] - self.grid_config['depth'][2])) / self.grid_config['depth'][2] gt_depths = torch.where((gt_depths < self.D + 1) & (gt_depths >= 0.0), gt_depths, torch.zeros_like(gt_depths)) gt_depths = F.one_hot( gt_depths.long(), num_classes=self.D + 1).view(-1, self.D + 1)[:, 1:] return gt_depths.float() @force_fp32() def get_depth_loss(self, depth_labels, depth_preds): depth_labels = self.get_downsampled_gt_depth(depth_labels) depth_preds = depth_preds.permute(0, 2, 3, 1).contiguous().view(-1, self.D) fg_mask = torch.max(depth_labels, dim=1).values > 0.0 depth_labels = depth_labels[fg_mask] depth_preds = depth_preds[fg_mask] with autocast(enabled=False): depth_loss = F.binary_cross_entropy( depth_preds, depth_labels, reduction='none', ).sum() / max(1.0, fg_mask.sum()) return self.loss_depth_weight * depth_loss def forward(self, input, return_depth_digit=False): (x, rots, trans, intrins, post_rots, post_trans, bda, mlp_input) = input[:8] B, N, C, H, W = x.shape x = x.view(B * N, C, H, W) if self.with_cp: x = cp.checkpoint(self.depth_net, x, mlp_input) else: x = self.depth_net(x, mlp_input) depth_digit = x[:, :self.D, ...] tran_feat = x[:, self.D:self.D + self.out_channels, ...] depth = depth_digit.softmax(dim=1) # from IPython import embed # embed() # exit() # depth[depth<0.01] = 0 # self.counter.append(((depth<0.01).sum()/depth.numel()).item()) if return_depth_digit: return self.view_transform(input, depth, tran_feat) + (depth_digit, ) else: return self.view_transform(input, depth, tran_feat) ================================================ FILE: mmdet3d/models/roi_heads/__init__.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. from .base_3droi_head import Base3DRoIHead from .bbox_heads import H3DBboxHead, PartA2BboxHead, PointRCNNBboxHead from .h3d_roi_head import H3DRoIHead from .mask_heads import PointwiseSemanticHead, PrimitiveHead from .part_aggregation_roi_head import PartAggregationROIHead from .point_rcnn_roi_head import PointRCNNRoIHead from .roi_extractors import (Single3DRoIAwareExtractor, Single3DRoIPointExtractor, SingleRoIExtractor) __all__ = [ 'Base3DRoIHead', 'PartAggregationROIHead', 'PointwiseSemanticHead', 'Single3DRoIAwareExtractor', 'PartA2BboxHead', 'SingleRoIExtractor', 'H3DRoIHead', 'PrimitiveHead', 'PointRCNNRoIHead', 'H3DBboxHead', 'PointRCNNBboxHead', 'Single3DRoIPointExtractor' ] ================================================ FILE: mmdet3d/models/roi_heads/base_3droi_head.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. 
from abc import ABCMeta, abstractmethod from mmcv.runner import BaseModule class Base3DRoIHead(BaseModule, metaclass=ABCMeta): """Base class for 3d RoIHeads.""" def __init__(self, bbox_head=None, mask_roi_extractor=None, mask_head=None, train_cfg=None, test_cfg=None, pretrained=None, init_cfg=None): super(Base3DRoIHead, self).__init__(init_cfg=init_cfg) self.train_cfg = train_cfg self.test_cfg = test_cfg if bbox_head is not None: self.init_bbox_head(bbox_head) if mask_head is not None: self.init_mask_head(mask_roi_extractor, mask_head) self.init_assigner_sampler() @property def with_bbox(self): """bool: whether the RoIHead has box head""" return hasattr(self, 'bbox_head') and self.bbox_head is not None @property def with_mask(self): """bool: whether the RoIHead has mask head""" return hasattr(self, 'mask_head') and self.mask_head is not None @abstractmethod def init_bbox_head(self): """Initialize the box head.""" pass @abstractmethod def init_mask_head(self): """Initialize maek head.""" pass @abstractmethod def init_assigner_sampler(self): """Initialize assigner and sampler.""" pass @abstractmethod def forward_train(self, x, img_metas, proposal_list, gt_bboxes, gt_labels, gt_bboxes_ignore=None, **kwargs): """Forward function during training. Args: x (dict): Contains features from the first stage. img_metas (list[dict]): Meta info of each image. proposal_list (list[dict]): Proposal information from rpn. gt_bboxes (list[:obj:`BaseInstance3DBoxes`]): GT bboxes of each sample. The bboxes are encapsulated by 3D box structures. gt_labels (list[torch.LongTensor]): GT labels of each sample. gt_bboxes_ignore (list[torch.Tensor], optional): Ground truth boxes to be ignored. Returns: dict[str, torch.Tensor]: Losses from each head. """ pass def simple_test(self, x, proposal_list, img_metas, proposals=None, rescale=False, **kwargs): """Test without augmentation.""" pass def aug_test(self, x, proposal_list, img_metas, rescale=False, **kwargs): """Test with augmentations. If rescale is False, then returned bboxes and masks will fit the scale of imgs[0]. """ pass ================================================ FILE: mmdet3d/models/roi_heads/bbox_heads/__init__.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. from mmdet.models.roi_heads.bbox_heads import (BBoxHead, ConvFCBBoxHead, DoubleConvFCBBoxHead, Shared2FCBBoxHead, Shared4Conv1FCBBoxHead) from .h3d_bbox_head import H3DBboxHead from .parta2_bbox_head import PartA2BboxHead from .point_rcnn_bbox_head import PointRCNNBboxHead __all__ = [ 'BBoxHead', 'ConvFCBBoxHead', 'Shared2FCBBoxHead', 'Shared4Conv1FCBBoxHead', 'DoubleConvFCBBoxHead', 'PartA2BboxHead', 'H3DBboxHead', 'PointRCNNBboxHead' ] ================================================ FILE: mmdet3d/models/roi_heads/bbox_heads/h3d_bbox_head.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import torch from mmcv.cnn import ConvModule from mmcv.runner import BaseModule from torch import nn as nn from torch.nn import functional as F from mmdet3d.core.bbox import DepthInstance3DBoxes from mmdet3d.core.post_processing import aligned_3d_nms from mmdet3d.models.builder import HEADS, build_loss from mmdet3d.models.losses import chamfer_distance from mmdet3d.ops import build_sa_module from mmdet.core import build_bbox_coder, multi_apply @HEADS.register_module() class H3DBboxHead(BaseModule): r"""Bbox head of `H3DNet `_. Args: num_classes (int): The number of classes. 
surface_matching_cfg (dict): Config for surface primitive matching. line_matching_cfg (dict): Config for line primitive matching. bbox_coder (:obj:`BaseBBoxCoder`): Bbox coder for encoding and decoding boxes. train_cfg (dict): Config for training. test_cfg (dict): Config for testing. gt_per_seed (int): Number of ground truth votes generated from each seed point. num_proposal (int): Number of proposal votes generated. feat_channels (tuple[int]): Convolution channels of prediction layer. primitive_feat_refine_streams (int): The number of mlps to refine primitive feature. primitive_refine_channels (tuple[int]): Convolution channels of prediction layer. upper_thresh (float): Threshold for line matching. surface_thresh (float): Threshold for surface matching. line_thresh (float): Threshold for line matching. conv_cfg (dict): Config of convolution in prediction layer. norm_cfg (dict): Config of BN in prediction layer. objectness_loss (dict): Config of objectness loss. center_loss (dict): Config of center loss. dir_class_loss (dict): Config of direction classification loss. dir_res_loss (dict): Config of direction residual regression loss. size_class_loss (dict): Config of size classification loss. size_res_loss (dict): Config of size residual regression loss. semantic_loss (dict): Config of point-wise semantic segmentation loss. cues_objectness_loss (dict): Config of cues objectness loss. cues_semantic_loss (dict): Config of cues semantic loss. proposal_objectness_loss (dict): Config of proposal objectness loss. primitive_center_loss (dict): Config of primitive center regression loss. """ def __init__(self, num_classes, suface_matching_cfg, line_matching_cfg, bbox_coder, train_cfg=None, test_cfg=None, gt_per_seed=1, num_proposal=256, feat_channels=(128, 128), primitive_feat_refine_streams=2, primitive_refine_channels=[128, 128, 128], upper_thresh=100.0, surface_thresh=0.5, line_thresh=0.5, conv_cfg=dict(type='Conv1d'), norm_cfg=dict(type='BN1d'), objectness_loss=None, center_loss=None, dir_class_loss=None, dir_res_loss=None, size_class_loss=None, size_res_loss=None, semantic_loss=None, cues_objectness_loss=None, cues_semantic_loss=None, proposal_objectness_loss=None, primitive_center_loss=None, init_cfg=None): super(H3DBboxHead, self).__init__(init_cfg=init_cfg) self.num_classes = num_classes self.train_cfg = train_cfg self.test_cfg = test_cfg self.gt_per_seed = gt_per_seed self.num_proposal = num_proposal self.with_angle = bbox_coder['with_rot'] self.upper_thresh = upper_thresh self.surface_thresh = surface_thresh self.line_thresh = line_thresh self.objectness_loss = build_loss(objectness_loss) self.center_loss = build_loss(center_loss) self.dir_class_loss = build_loss(dir_class_loss) self.dir_res_loss = build_loss(dir_res_loss) self.size_class_loss = build_loss(size_class_loss) self.size_res_loss = build_loss(size_res_loss) self.semantic_loss = build_loss(semantic_loss) self.bbox_coder = build_bbox_coder(bbox_coder) self.num_sizes = self.bbox_coder.num_sizes self.num_dir_bins = self.bbox_coder.num_dir_bins self.cues_objectness_loss = build_loss(cues_objectness_loss) self.cues_semantic_loss = build_loss(cues_semantic_loss) self.proposal_objectness_loss = build_loss(proposal_objectness_loss) self.primitive_center_loss = build_loss(primitive_center_loss) assert suface_matching_cfg['mlp_channels'][-1] == \ line_matching_cfg['mlp_channels'][-1] # surface center matching self.surface_center_matcher = build_sa_module(suface_matching_cfg) # line center matching self.line_center_matcher = 
build_sa_module(line_matching_cfg) # Compute the matching scores matching_feat_dims = suface_matching_cfg['mlp_channels'][-1] self.matching_conv = ConvModule( matching_feat_dims, matching_feat_dims, 1, padding=0, conv_cfg=conv_cfg, norm_cfg=norm_cfg, bias=True, inplace=True) self.matching_pred = nn.Conv1d(matching_feat_dims, 2, 1) # Compute the semantic matching scores self.semantic_matching_conv = ConvModule( matching_feat_dims, matching_feat_dims, 1, padding=0, conv_cfg=conv_cfg, norm_cfg=norm_cfg, bias=True, inplace=True) self.semantic_matching_pred = nn.Conv1d(matching_feat_dims, 2, 1) # Surface feature aggregation self.surface_feats_aggregation = list() for k in range(primitive_feat_refine_streams): self.surface_feats_aggregation.append( ConvModule( matching_feat_dims, matching_feat_dims, 1, padding=0, conv_cfg=conv_cfg, norm_cfg=norm_cfg, bias=True, inplace=True)) self.surface_feats_aggregation = nn.Sequential( *self.surface_feats_aggregation) # Line feature aggregation self.line_feats_aggregation = list() for k in range(primitive_feat_refine_streams): self.line_feats_aggregation.append( ConvModule( matching_feat_dims, matching_feat_dims, 1, padding=0, conv_cfg=conv_cfg, norm_cfg=norm_cfg, bias=True, inplace=True)) self.line_feats_aggregation = nn.Sequential( *self.line_feats_aggregation) # surface center(6) + line center(12) prev_channel = 18 * matching_feat_dims self.bbox_pred = nn.ModuleList() for k in range(len(primitive_refine_channels)): self.bbox_pred.append( ConvModule( prev_channel, primitive_refine_channels[k], 1, padding=0, conv_cfg=conv_cfg, norm_cfg=norm_cfg, bias=True, inplace=False)) prev_channel = primitive_refine_channels[k] # Final object detection # Objectness scores (2), center residual (3), # heading class+residual (num_heading_bin*2), size class + # residual(num_size_cluster*4) conv_out_channel = (2 + 3 + bbox_coder['num_dir_bins'] * 2 + bbox_coder['num_sizes'] * 4 + self.num_classes) self.bbox_pred.append(nn.Conv1d(prev_channel, conv_out_channel, 1)) def forward(self, feats_dict, sample_mod): """Forward pass. Args: feats_dict (dict): Feature dict from backbone. sample_mod (str): Sample mode for vote aggregation layer. valid modes are "vote", "seed" and "random". Returns: dict: Predictions of vote head. 
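        Note:
            As used in the implementation below, ``feats_dict`` is expected
            to provide the RPN outputs ``aggregated_points``,
            ``aggregated_features`` and ``proposal_list`` together with the
            primitive predictions ``pred_z_center``, ``pred_xy_center``,
            ``pred_line_center``, ``sem_cls_scores_z``, ``sem_cls_scores_xy``
            and the matching features ``aggregated_features_z``,
            ``aggregated_features_xy`` and ``aggregated_features_line``.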
""" ret_dict = {} aggregated_points = feats_dict['aggregated_points'] original_feature = feats_dict['aggregated_features'] batch_size = original_feature.shape[0] object_proposal = original_feature.shape[2] # Extract surface center, features and semantic predictions z_center = feats_dict['pred_z_center'] xy_center = feats_dict['pred_xy_center'] z_semantic = feats_dict['sem_cls_scores_z'] xy_semantic = feats_dict['sem_cls_scores_xy'] z_feature = feats_dict['aggregated_features_z'] xy_feature = feats_dict['aggregated_features_xy'] # Extract line points and features line_center = feats_dict['pred_line_center'] line_feature = feats_dict['aggregated_features_line'] surface_center_pred = torch.cat((z_center, xy_center), dim=1) ret_dict['surface_center_pred'] = surface_center_pred ret_dict['surface_sem_pred'] = torch.cat((z_semantic, xy_semantic), dim=1) # Extract the surface and line centers of rpn proposals rpn_proposals = feats_dict['proposal_list'] rpn_proposals_bbox = DepthInstance3DBoxes( rpn_proposals.reshape(-1, 7).clone(), box_dim=rpn_proposals.shape[-1], with_yaw=self.with_angle, origin=(0.5, 0.5, 0.5)) obj_surface_center, obj_line_center = \ rpn_proposals_bbox.get_surface_line_center() obj_surface_center = obj_surface_center.reshape( batch_size, -1, 6, 3).transpose(1, 2).reshape(batch_size, -1, 3) obj_line_center = obj_line_center.reshape(batch_size, -1, 12, 3).transpose(1, 2).reshape( batch_size, -1, 3) ret_dict['surface_center_object'] = obj_surface_center ret_dict['line_center_object'] = obj_line_center # aggregate primitive z and xy features to rpn proposals surface_center_feature_pred = torch.cat((z_feature, xy_feature), dim=2) surface_center_feature_pred = torch.cat( (surface_center_feature_pred.new_zeros( (batch_size, 6, surface_center_feature_pred.shape[2])), surface_center_feature_pred), dim=1) surface_xyz, surface_features, _ = self.surface_center_matcher( surface_center_pred, surface_center_feature_pred, target_xyz=obj_surface_center) # aggregate primitive line features to rpn proposals line_feature = torch.cat((line_feature.new_zeros( (batch_size, 12, line_feature.shape[2])), line_feature), dim=1) line_xyz, line_features, _ = self.line_center_matcher( line_center, line_feature, target_xyz=obj_line_center) # combine the surface and line features combine_features = torch.cat((surface_features, line_features), dim=2) matching_features = self.matching_conv(combine_features) matching_score = self.matching_pred(matching_features) ret_dict['matching_score'] = matching_score.transpose(2, 1) semantic_matching_features = self.semantic_matching_conv( combine_features) semantic_matching_score = self.semantic_matching_pred( semantic_matching_features) ret_dict['semantic_matching_score'] = \ semantic_matching_score.transpose(2, 1) surface_features = self.surface_feats_aggregation(surface_features) line_features = self.line_feats_aggregation(line_features) # Combine all surface and line features surface_features = surface_features.view(batch_size, -1, object_proposal) line_features = line_features.view(batch_size, -1, object_proposal) combine_feature = torch.cat((surface_features, line_features), dim=1) # Final bbox predictions bbox_predictions = self.bbox_pred[0](combine_feature) bbox_predictions += original_feature for conv_module in self.bbox_pred[1:]: bbox_predictions = conv_module(bbox_predictions) refine_decode_res = self.bbox_coder.split_pred( bbox_predictions[:, :self.num_classes + 2], bbox_predictions[:, self.num_classes + 2:], aggregated_points) for key in 
refine_decode_res.keys(): ret_dict[key + '_optimized'] = refine_decode_res[key] return ret_dict def loss(self, bbox_preds, points, gt_bboxes_3d, gt_labels_3d, pts_semantic_mask=None, pts_instance_mask=None, img_metas=None, rpn_targets=None, gt_bboxes_ignore=None): """Compute loss. Args: bbox_preds (dict): Predictions from forward of h3d bbox head. points (list[torch.Tensor]): Input points. gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth bboxes of each sample. gt_labels_3d (list[torch.Tensor]): Labels of each sample. pts_semantic_mask (list[torch.Tensor]): Point-wise semantic mask. pts_instance_mask (list[torch.Tensor]): Point-wise instance mask. img_metas (list[dict]): Contain pcd and img's meta info. rpn_targets (Tuple) : Targets generated by rpn head. gt_bboxes_ignore (list[torch.Tensor]): Specify which bounding. Returns: dict: Losses of H3dnet. """ (vote_targets, vote_target_masks, size_class_targets, size_res_targets, dir_class_targets, dir_res_targets, center_targets, _, mask_targets, valid_gt_masks, objectness_targets, objectness_weights, box_loss_weights, valid_gt_weights) = rpn_targets losses = {} # calculate refined proposal loss refined_proposal_loss = self.get_proposal_stage_loss( bbox_preds, size_class_targets, size_res_targets, dir_class_targets, dir_res_targets, center_targets, mask_targets, objectness_targets, objectness_weights, box_loss_weights, valid_gt_weights, suffix='_optimized') for key in refined_proposal_loss.keys(): losses[key + '_optimized'] = refined_proposal_loss[key] bbox3d_optimized = self.bbox_coder.decode( bbox_preds, suffix='_optimized') targets = self.get_targets(points, gt_bboxes_3d, gt_labels_3d, pts_semantic_mask, pts_instance_mask, bbox_preds) (cues_objectness_label, cues_sem_label, proposal_objectness_label, cues_mask, cues_match_mask, proposal_objectness_mask, cues_matching_label, obj_surface_line_center) = targets # match scores for each geometric primitive objectness_scores = bbox_preds['matching_score'] # match scores for the semantics of primitives objectness_scores_sem = bbox_preds['semantic_matching_score'] primitive_objectness_loss = self.cues_objectness_loss( objectness_scores.transpose(2, 1), cues_objectness_label, weight=cues_mask, avg_factor=cues_mask.sum() + 1e-6) primitive_sem_loss = self.cues_semantic_loss( objectness_scores_sem.transpose(2, 1), cues_sem_label, weight=cues_mask, avg_factor=cues_mask.sum() + 1e-6) objectness_scores = bbox_preds['obj_scores_optimized'] objectness_loss_refine = self.proposal_objectness_loss( objectness_scores.transpose(2, 1), proposal_objectness_label) primitive_matching_loss = (objectness_loss_refine * cues_match_mask).sum() / ( cues_match_mask.sum() + 1e-6) * 0.5 primitive_sem_matching_loss = ( objectness_loss_refine * proposal_objectness_mask).sum() / ( proposal_objectness_mask.sum() + 1e-6) * 0.5 # Get the object surface center here batch_size, object_proposal = bbox3d_optimized.shape[:2] refined_bbox = DepthInstance3DBoxes( bbox3d_optimized.reshape(-1, 7).clone(), box_dim=bbox3d_optimized.shape[-1], with_yaw=self.with_angle, origin=(0.5, 0.5, 0.5)) pred_obj_surface_center, pred_obj_line_center = \ refined_bbox.get_surface_line_center() pred_obj_surface_center = pred_obj_surface_center.reshape( batch_size, -1, 6, 3).transpose(1, 2).reshape(batch_size, -1, 3) pred_obj_line_center = pred_obj_line_center.reshape( batch_size, -1, 12, 3).transpose(1, 2).reshape(batch_size, -1, 3) pred_surface_line_center = torch.cat( (pred_obj_surface_center, pred_obj_line_center), 1) square_dist = 
self.primitive_center_loss(pred_surface_line_center, obj_surface_line_center) match_dist = torch.sqrt(square_dist.sum(dim=-1) + 1e-6) primitive_centroid_reg_loss = torch.sum( match_dist * cues_matching_label) / ( cues_matching_label.sum() + 1e-6) refined_loss = dict( primitive_objectness_loss=primitive_objectness_loss, primitive_sem_loss=primitive_sem_loss, primitive_matching_loss=primitive_matching_loss, primitive_sem_matching_loss=primitive_sem_matching_loss, primitive_centroid_reg_loss=primitive_centroid_reg_loss) losses.update(refined_loss) return losses def get_bboxes(self, points, bbox_preds, input_metas, rescale=False, suffix=''): """Generate bboxes from vote head predictions. Args: points (torch.Tensor): Input points. bbox_preds (dict): Predictions from vote head. input_metas (list[dict]): Point cloud and image's meta info. rescale (bool): Whether to rescale bboxes. Returns: list[tuple[torch.Tensor]]: Bounding boxes, scores and labels. """ # decode boxes obj_scores = F.softmax( bbox_preds['obj_scores' + suffix], dim=-1)[..., -1] sem_scores = F.softmax(bbox_preds['sem_scores'], dim=-1) prediction_collection = {} prediction_collection['center'] = bbox_preds['center' + suffix] prediction_collection['dir_class'] = bbox_preds['dir_class'] prediction_collection['dir_res'] = bbox_preds['dir_res' + suffix] prediction_collection['size_class'] = bbox_preds['size_class'] prediction_collection['size_res'] = bbox_preds['size_res' + suffix] bbox3d = self.bbox_coder.decode(prediction_collection) batch_size = bbox3d.shape[0] results = list() for b in range(batch_size): bbox_selected, score_selected, labels = self.multiclass_nms_single( obj_scores[b], sem_scores[b], bbox3d[b], points[b, ..., :3], input_metas[b]) bbox = input_metas[b]['box_type_3d']( bbox_selected, box_dim=bbox_selected.shape[-1], with_yaw=self.bbox_coder.with_rot) results.append((bbox, score_selected, labels)) return results def multiclass_nms_single(self, obj_scores, sem_scores, bbox, points, input_meta): """Multi-class nms in single batch. Args: obj_scores (torch.Tensor): Objectness score of bounding boxes. sem_scores (torch.Tensor): semantic class score of bounding boxes. bbox (torch.Tensor): Predicted bounding boxes. points (torch.Tensor): Input points. input_meta (dict): Point cloud and image's meta info. Returns: tuple[torch.Tensor]: Bounding boxes, scores and labels. 
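Example:
    An illustrative sketch of how the axis-aligned boxes fed to ``aligned_3d_nms`` are
    derived from box corners in the body below; the corner tensor here is a random
    stand-in and its shape is assumed::

        import torch

        corner3d = torch.rand(16, 8, 3)        # (num_boxes, 8 corners, xyz)
        minmax_box3d = torch.cat(
            (corner3d.min(dim=1)[0], corner3d.max(dim=1)[0]), dim=-1)
        # (num_boxes, 6): x_min, y_min, z_min, x_max, y_max, z_max
        assert minmax_box3d.shape == (16, 6)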
""" bbox = input_meta['box_type_3d']( bbox, box_dim=bbox.shape[-1], with_yaw=self.bbox_coder.with_rot, origin=(0.5, 0.5, 0.5)) box_indices = bbox.points_in_boxes_all(points) corner3d = bbox.corners minmax_box3d = corner3d.new(torch.Size((corner3d.shape[0], 6))) minmax_box3d[:, :3] = torch.min(corner3d, dim=1)[0] minmax_box3d[:, 3:] = torch.max(corner3d, dim=1)[0] nonempty_box_mask = box_indices.T.sum(1) > 5 bbox_classes = torch.argmax(sem_scores, -1) nms_selected = aligned_3d_nms(minmax_box3d[nonempty_box_mask], obj_scores[nonempty_box_mask], bbox_classes[nonempty_box_mask], self.test_cfg.nms_thr) # filter empty boxes and boxes with low score scores_mask = (obj_scores > self.test_cfg.score_thr) nonempty_box_inds = torch.nonzero( nonempty_box_mask, as_tuple=False).flatten() nonempty_mask = torch.zeros_like(bbox_classes).scatter( 0, nonempty_box_inds[nms_selected], 1) selected = (nonempty_mask.bool() & scores_mask.bool()) if self.test_cfg.per_class_proposal: bbox_selected, score_selected, labels = [], [], [] for k in range(sem_scores.shape[-1]): bbox_selected.append(bbox[selected].tensor) score_selected.append(obj_scores[selected] * sem_scores[selected][:, k]) labels.append( torch.zeros_like(bbox_classes[selected]).fill_(k)) bbox_selected = torch.cat(bbox_selected, 0) score_selected = torch.cat(score_selected, 0) labels = torch.cat(labels, 0) else: bbox_selected = bbox[selected].tensor score_selected = obj_scores[selected] labels = bbox_classes[selected] return bbox_selected, score_selected, labels def get_proposal_stage_loss(self, bbox_preds, size_class_targets, size_res_targets, dir_class_targets, dir_res_targets, center_targets, mask_targets, objectness_targets, objectness_weights, box_loss_weights, valid_gt_weights, suffix=''): """Compute loss for the aggregation module. Args: bbox_preds (dict): Predictions from forward of vote head. size_class_targets (torch.Tensor): Ground truth size class of each prediction bounding box. size_res_targets (torch.Tensor): Ground truth size residual of each prediction bounding box. dir_class_targets (torch.Tensor): Ground truth direction class of each prediction bounding box. dir_res_targets (torch.Tensor): Ground truth direction residual of each prediction bounding box. center_targets (torch.Tensor): Ground truth center of each prediction bounding box. mask_targets (torch.Tensor): Validation of each prediction bounding box. objectness_targets (torch.Tensor): Ground truth objectness label of each prediction bounding box. objectness_weights (torch.Tensor): Weights of objectness loss for each prediction bounding box. box_loss_weights (torch.Tensor): Weights of regression loss for each prediction bounding box. valid_gt_weights (torch.Tensor): Validation of each ground truth bounding box. Returns: dict: Losses of aggregation module. 
""" # calculate objectness loss objectness_loss = self.objectness_loss( bbox_preds['obj_scores' + suffix].transpose(2, 1), objectness_targets, weight=objectness_weights) # calculate center loss source2target_loss, target2source_loss = self.center_loss( bbox_preds['center' + suffix], center_targets, src_weight=box_loss_weights, dst_weight=valid_gt_weights) center_loss = source2target_loss + target2source_loss # calculate direction class loss dir_class_loss = self.dir_class_loss( bbox_preds['dir_class' + suffix].transpose(2, 1), dir_class_targets, weight=box_loss_weights) # calculate direction residual loss batch_size, proposal_num = size_class_targets.shape[:2] heading_label_one_hot = dir_class_targets.new_zeros( (batch_size, proposal_num, self.num_dir_bins)) heading_label_one_hot.scatter_(2, dir_class_targets.unsqueeze(-1), 1) dir_res_norm = (bbox_preds['dir_res_norm' + suffix] * heading_label_one_hot).sum(dim=-1) dir_res_loss = self.dir_res_loss( dir_res_norm, dir_res_targets, weight=box_loss_weights) # calculate size class loss size_class_loss = self.size_class_loss( bbox_preds['size_class' + suffix].transpose(2, 1), size_class_targets, weight=box_loss_weights) # calculate size residual loss one_hot_size_targets = box_loss_weights.new_zeros( (batch_size, proposal_num, self.num_sizes)) one_hot_size_targets.scatter_(2, size_class_targets.unsqueeze(-1), 1) one_hot_size_targets_expand = one_hot_size_targets.unsqueeze( -1).repeat(1, 1, 1, 3) size_residual_norm = (bbox_preds['size_res_norm' + suffix] * one_hot_size_targets_expand).sum(dim=2) box_loss_weights_expand = box_loss_weights.unsqueeze(-1).repeat( 1, 1, 3) size_res_loss = self.size_res_loss( size_residual_norm, size_res_targets, weight=box_loss_weights_expand) # calculate semantic loss semantic_loss = self.semantic_loss( bbox_preds['sem_scores' + suffix].transpose(2, 1), mask_targets, weight=box_loss_weights) losses = dict( objectness_loss=objectness_loss, semantic_loss=semantic_loss, center_loss=center_loss, dir_class_loss=dir_class_loss, dir_res_loss=dir_res_loss, size_class_loss=size_class_loss, size_res_loss=size_res_loss) return losses def get_targets(self, points, gt_bboxes_3d, gt_labels_3d, pts_semantic_mask=None, pts_instance_mask=None, bbox_preds=None): """Generate targets of proposal module. Args: points (list[torch.Tensor]): Points of each batch. gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth bboxes of each batch. gt_labels_3d (list[torch.Tensor]): Labels of each batch. pts_semantic_mask (list[torch.Tensor]): Point-wise semantic label of each batch. pts_instance_mask (list[torch.Tensor]): Point-wise instance label of each batch. bbox_preds (torch.Tensor): Bounding box predictions of vote head. Returns: tuple[torch.Tensor]: Targets of proposal module. 
""" # find empty example valid_gt_masks = list() gt_num = list() for index in range(len(gt_labels_3d)): if len(gt_labels_3d[index]) == 0: fake_box = gt_bboxes_3d[index].tensor.new_zeros( 1, gt_bboxes_3d[index].tensor.shape[-1]) gt_bboxes_3d[index] = gt_bboxes_3d[index].new_box(fake_box) gt_labels_3d[index] = gt_labels_3d[index].new_zeros(1) valid_gt_masks.append(gt_labels_3d[index].new_zeros(1)) gt_num.append(1) else: valid_gt_masks.append(gt_labels_3d[index].new_ones( gt_labels_3d[index].shape)) gt_num.append(gt_labels_3d[index].shape[0]) if pts_semantic_mask is None: pts_semantic_mask = [None for i in range(len(gt_labels_3d))] pts_instance_mask = [None for i in range(len(gt_labels_3d))] aggregated_points = [ bbox_preds['aggregated_points'][i] for i in range(len(gt_labels_3d)) ] surface_center_pred = [ bbox_preds['surface_center_pred'][i] for i in range(len(gt_labels_3d)) ] line_center_pred = [ bbox_preds['pred_line_center'][i] for i in range(len(gt_labels_3d)) ] surface_center_object = [ bbox_preds['surface_center_object'][i] for i in range(len(gt_labels_3d)) ] line_center_object = [ bbox_preds['line_center_object'][i] for i in range(len(gt_labels_3d)) ] surface_sem_pred = [ bbox_preds['surface_sem_pred'][i] for i in range(len(gt_labels_3d)) ] line_sem_pred = [ bbox_preds['sem_cls_scores_line'][i] for i in range(len(gt_labels_3d)) ] (cues_objectness_label, cues_sem_label, proposal_objectness_label, cues_mask, cues_match_mask, proposal_objectness_mask, cues_matching_label, obj_surface_line_center) = multi_apply( self.get_targets_single, points, gt_bboxes_3d, gt_labels_3d, pts_semantic_mask, pts_instance_mask, aggregated_points, surface_center_pred, line_center_pred, surface_center_object, line_center_object, surface_sem_pred, line_sem_pred) cues_objectness_label = torch.stack(cues_objectness_label) cues_sem_label = torch.stack(cues_sem_label) proposal_objectness_label = torch.stack(proposal_objectness_label) cues_mask = torch.stack(cues_mask) cues_match_mask = torch.stack(cues_match_mask) proposal_objectness_mask = torch.stack(proposal_objectness_mask) cues_matching_label = torch.stack(cues_matching_label) obj_surface_line_center = torch.stack(obj_surface_line_center) return (cues_objectness_label, cues_sem_label, proposal_objectness_label, cues_mask, cues_match_mask, proposal_objectness_mask, cues_matching_label, obj_surface_line_center) def get_targets_single(self, points, gt_bboxes_3d, gt_labels_3d, pts_semantic_mask=None, pts_instance_mask=None, aggregated_points=None, pred_surface_center=None, pred_line_center=None, pred_obj_surface_center=None, pred_obj_line_center=None, pred_surface_sem=None, pred_line_sem=None): """Generate targets for primitive cues for single batch. Args: points (torch.Tensor): Points of each batch. gt_bboxes_3d (:obj:`BaseInstance3DBoxes`): Ground truth boxes of each batch. gt_labels_3d (torch.Tensor): Labels of each batch. pts_semantic_mask (torch.Tensor): Point-wise semantic label of each batch. pts_instance_mask (torch.Tensor): Point-wise instance label of each batch. aggregated_points (torch.Tensor): Aggregated points from vote aggregation layer. pred_surface_center (torch.Tensor): Prediction of surface center. pred_line_center (torch.Tensor): Prediction of line center. pred_obj_surface_center (torch.Tensor): Objectness prediction of surface center. pred_obj_line_center (torch.Tensor): Objectness prediction of line center. pred_surface_sem (torch.Tensor): Semantic prediction of surface center. 
pred_line_sem (torch.Tensor): Semantic prediction of line center. Returns: tuple[torch.Tensor]: Targets for primitive cues. """ device = points.device gt_bboxes_3d = gt_bboxes_3d.to(device) num_proposals = aggregated_points.shape[0] gt_center = gt_bboxes_3d.gravity_center dist1, dist2, ind1, _ = chamfer_distance( aggregated_points.unsqueeze(0), gt_center.unsqueeze(0), reduction='none') # Set assignment object_assignment = ind1.squeeze(0) # Generate objectness label and mask # objectness_label: 1 if pred object center is within # self.train_cfg['near_threshold'] of any GT object # objectness_mask: 0 if pred object center is in gray # zone (DONOTCARE), 1 otherwise euclidean_dist1 = torch.sqrt(dist1.squeeze(0) + 1e-6) proposal_objectness_label = euclidean_dist1.new_zeros( num_proposals, dtype=torch.long) proposal_objectness_mask = euclidean_dist1.new_zeros(num_proposals) gt_sem = gt_labels_3d[object_assignment] obj_surface_center, obj_line_center = \ gt_bboxes_3d.get_surface_line_center() obj_surface_center = obj_surface_center.reshape(-1, 6, 3).transpose(0, 1) obj_line_center = obj_line_center.reshape(-1, 12, 3).transpose(0, 1) obj_surface_center = obj_surface_center[:, object_assignment].reshape( 1, -1, 3) obj_line_center = obj_line_center[:, object_assignment].reshape(1, -1, 3) surface_sem = torch.argmax(pred_surface_sem, dim=1).float() line_sem = torch.argmax(pred_line_sem, dim=1).float() dist_surface, _, surface_ind, _ = chamfer_distance( obj_surface_center, pred_surface_center.unsqueeze(0), reduction='none') dist_line, _, line_ind, _ = chamfer_distance( obj_line_center, pred_line_center.unsqueeze(0), reduction='none') surface_sel = pred_surface_center[surface_ind.squeeze(0)] line_sel = pred_line_center[line_ind.squeeze(0)] surface_sel_sem = surface_sem[surface_ind.squeeze(0)] line_sel_sem = line_sem[line_ind.squeeze(0)] surface_sel_sem_gt = gt_sem.repeat(6).float() line_sel_sem_gt = gt_sem.repeat(12).float() euclidean_dist_surface = torch.sqrt(dist_surface.squeeze(0) + 1e-6) euclidean_dist_line = torch.sqrt(dist_line.squeeze(0) + 1e-6) objectness_label_surface = euclidean_dist_line.new_zeros( num_proposals * 6, dtype=torch.long) objectness_mask_surface = euclidean_dist_line.new_zeros(num_proposals * 6) objectness_label_line = euclidean_dist_line.new_zeros( num_proposals * 12, dtype=torch.long) objectness_mask_line = euclidean_dist_line.new_zeros(num_proposals * 12) objectness_label_surface_sem = euclidean_dist_line.new_zeros( num_proposals * 6, dtype=torch.long) objectness_label_line_sem = euclidean_dist_line.new_zeros( num_proposals * 12, dtype=torch.long) euclidean_dist_obj_surface = torch.sqrt(( (pred_obj_surface_center - surface_sel)**2).sum(dim=-1) + 1e-6) euclidean_dist_obj_line = torch.sqrt( torch.sum((pred_obj_line_center - line_sel)**2, dim=-1) + 1e-6) # Objectness score just with centers proposal_objectness_label[ euclidean_dist1 < self.train_cfg['near_threshold']] = 1 proposal_objectness_mask[ euclidean_dist1 < self.train_cfg['near_threshold']] = 1 proposal_objectness_mask[ euclidean_dist1 > self.train_cfg['far_threshold']] = 1 objectness_label_surface[ (euclidean_dist_obj_surface < self.train_cfg['label_surface_threshold']) * (euclidean_dist_surface < self.train_cfg['mask_surface_threshold'])] = 1 objectness_label_surface_sem[ (euclidean_dist_obj_surface < self.train_cfg['label_surface_threshold']) * (euclidean_dist_surface < self.train_cfg['mask_surface_threshold']) * (surface_sel_sem == surface_sel_sem_gt)] = 1 objectness_label_line[ (euclidean_dist_obj_line < 
self.train_cfg['label_line_threshold']) * (euclidean_dist_line < self.train_cfg['mask_line_threshold'])] = 1 objectness_label_line_sem[ (euclidean_dist_obj_line < self.train_cfg['label_line_threshold']) * (euclidean_dist_line < self.train_cfg['mask_line_threshold']) * (line_sel_sem == line_sel_sem_gt)] = 1 objectness_label_surface_obj = proposal_objectness_label.repeat(6) objectness_mask_surface_obj = proposal_objectness_mask.repeat(6) objectness_label_line_obj = proposal_objectness_label.repeat(12) objectness_mask_line_obj = proposal_objectness_mask.repeat(12) objectness_mask_surface = objectness_mask_surface_obj objectness_mask_line = objectness_mask_line_obj cues_objectness_label = torch.cat( (objectness_label_surface, objectness_label_line), 0) cues_sem_label = torch.cat( (objectness_label_surface_sem, objectness_label_line_sem), 0) cues_mask = torch.cat((objectness_mask_surface, objectness_mask_line), 0) objectness_label_surface *= objectness_label_surface_obj objectness_label_line *= objectness_label_line_obj cues_matching_label = torch.cat( (objectness_label_surface, objectness_label_line), 0) objectness_label_surface_sem *= objectness_label_surface_obj objectness_label_line_sem *= objectness_label_line_obj cues_match_mask = (torch.sum( cues_objectness_label.view(18, num_proposals), dim=0) >= 1).float() obj_surface_line_center = torch.cat( (obj_surface_center, obj_line_center), 1).squeeze(0) return (cues_objectness_label, cues_sem_label, proposal_objectness_label, cues_mask, cues_match_mask, proposal_objectness_mask, cues_matching_label, obj_surface_line_center) ================================================ FILE: mmdet3d/models/roi_heads/bbox_heads/parta2_bbox_head.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import numpy as np import torch from mmcv.cnn import ConvModule, normal_init from mmdet3d.ops.spconv import IS_SPCONV2_AVAILABLE if IS_SPCONV2_AVAILABLE: from spconv.pytorch import (SparseConvTensor, SparseMaxPool3d, SparseSequential) else: from mmcv.ops import SparseConvTensor, SparseMaxPool3d, SparseSequential from mmcv.runner import BaseModule from torch import nn as nn from mmdet3d.core.bbox.structures import (LiDARInstance3DBoxes, rotation_3d_in_axis, xywhr2xyxyr) from mmdet3d.core.post_processing import nms_bev, nms_normal_bev from mmdet3d.models.builder import HEADS, build_loss from mmdet3d.ops import make_sparse_convmodule from mmdet.core import build_bbox_coder, multi_apply @HEADS.register_module() class PartA2BboxHead(BaseModule): """PartA2 RoI head. Args: num_classes (int): The number of classes to prediction. seg_in_channels (int): Input channels of segmentation convolution layer. part_in_channels (int): Input channels of part convolution layer. seg_conv_channels (list(int)): Out channels of each segmentation convolution layer. part_conv_channels (list(int)): Out channels of each part convolution layer. merge_conv_channels (list(int)): Out channels of each feature merged convolution layer. down_conv_channels (list(int)): Out channels of each downsampled convolution layer. shared_fc_channels (list(int)): Out channels of each shared fc layer. cls_channels (list(int)): Out channels of each classification layer. reg_channels (list(int)): Out channels of each regression layer. dropout_ratio (float): Dropout ratio of classification and regression layers. roi_feat_size (int): The size of pooled roi features. with_corner_loss (bool): Whether to use corner loss or not. 
bbox_coder (:obj:`BaseBBoxCoder`): Bbox coder for box head. conv_cfg (dict): Config dict of convolutional layers norm_cfg (dict): Config dict of normalization layers loss_bbox (dict): Config dict of box regression loss. loss_cls (dict): Config dict of classifacation loss. """ def __init__(self, num_classes, seg_in_channels, part_in_channels, seg_conv_channels=None, part_conv_channels=None, merge_conv_channels=None, down_conv_channels=None, shared_fc_channels=None, cls_channels=None, reg_channels=None, dropout_ratio=0.1, roi_feat_size=14, with_corner_loss=True, bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder'), conv_cfg=dict(type='Conv1d'), norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.01), loss_bbox=dict( type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=2.0), loss_cls=dict( type='CrossEntropyLoss', use_sigmoid=True, reduction='none', loss_weight=1.0), init_cfg=None): super(PartA2BboxHead, self).__init__(init_cfg=init_cfg) self.num_classes = num_classes self.with_corner_loss = with_corner_loss self.bbox_coder = build_bbox_coder(bbox_coder) self.loss_bbox = build_loss(loss_bbox) self.loss_cls = build_loss(loss_cls) self.use_sigmoid_cls = loss_cls.get('use_sigmoid', False) assert down_conv_channels[-1] == shared_fc_channels[0] # init layers part_channel_last = part_in_channels part_conv = [] for i, channel in enumerate(part_conv_channels): part_conv.append( make_sparse_convmodule( part_channel_last, channel, 3, padding=1, norm_cfg=norm_cfg, indice_key=f'rcnn_part{i}', conv_type='SubMConv3d')) part_channel_last = channel self.part_conv = SparseSequential(*part_conv) seg_channel_last = seg_in_channels seg_conv = [] for i, channel in enumerate(seg_conv_channels): seg_conv.append( make_sparse_convmodule( seg_channel_last, channel, 3, padding=1, norm_cfg=norm_cfg, indice_key=f'rcnn_seg{i}', conv_type='SubMConv3d')) seg_channel_last = channel self.seg_conv = SparseSequential(*seg_conv) self.conv_down = SparseSequential() merge_conv_channel_last = part_channel_last + seg_channel_last merge_conv = [] for i, channel in enumerate(merge_conv_channels): merge_conv.append( make_sparse_convmodule( merge_conv_channel_last, channel, 3, padding=1, norm_cfg=norm_cfg, indice_key='rcnn_down0')) merge_conv_channel_last = channel down_conv_channel_last = merge_conv_channel_last conv_down = [] for i, channel in enumerate(down_conv_channels): conv_down.append( make_sparse_convmodule( down_conv_channel_last, channel, 3, padding=1, norm_cfg=norm_cfg, indice_key='rcnn_down1')) down_conv_channel_last = channel self.conv_down.add_module('merge_conv', SparseSequential(*merge_conv)) self.conv_down.add_module('max_pool3d', SparseMaxPool3d(kernel_size=2, stride=2)) self.conv_down.add_module('down_conv', SparseSequential(*conv_down)) shared_fc_list = [] pool_size = roi_feat_size // 2 pre_channel = shared_fc_channels[0] * pool_size**3 for k in range(1, len(shared_fc_channels)): shared_fc_list.append( ConvModule( pre_channel, shared_fc_channels[k], 1, padding=0, conv_cfg=conv_cfg, norm_cfg=norm_cfg, inplace=True)) pre_channel = shared_fc_channels[k] if k != len(shared_fc_channels) - 1 and dropout_ratio > 0: shared_fc_list.append(nn.Dropout(dropout_ratio)) self.shared_fc = nn.Sequential(*shared_fc_list) # Classification layer channel_in = shared_fc_channels[-1] cls_channel = 1 cls_layers = [] pre_channel = channel_in for k in range(0, len(cls_channels)): cls_layers.append( ConvModule( pre_channel, cls_channels[k], 1, padding=0, conv_cfg=conv_cfg, norm_cfg=norm_cfg, inplace=True)) pre_channel = cls_channels[k] cls_layers.append( 
ConvModule( pre_channel, cls_channel, 1, padding=0, conv_cfg=conv_cfg, act_cfg=None)) if dropout_ratio >= 0: cls_layers.insert(1, nn.Dropout(dropout_ratio)) self.conv_cls = nn.Sequential(*cls_layers) # Regression layer reg_layers = [] pre_channel = channel_in for k in range(0, len(reg_channels)): reg_layers.append( ConvModule( pre_channel, reg_channels[k], 1, padding=0, conv_cfg=conv_cfg, norm_cfg=norm_cfg, inplace=True)) pre_channel = reg_channels[k] reg_layers.append( ConvModule( pre_channel, self.bbox_coder.code_size, 1, padding=0, conv_cfg=conv_cfg, act_cfg=None)) if dropout_ratio >= 0: reg_layers.insert(1, nn.Dropout(dropout_ratio)) self.conv_reg = nn.Sequential(*reg_layers) if init_cfg is None: self.init_cfg = dict( type='Xavier', layer=['Conv2d', 'Conv1d'], distribution='uniform') def init_weights(self): super().init_weights() normal_init(self.conv_reg[-1].conv, mean=0, std=0.001) def forward(self, seg_feats, part_feats): """Forward pass. Args: seg_feats (torch.Tensor): Point-wise semantic features. part_feats (torch.Tensor): Point-wise part prediction features. Returns: tuple[torch.Tensor]: Score of class and bbox predictions. """ # (B * N, out_x, out_y, out_z, 4) rcnn_batch_size = part_feats.shape[0] # transform to sparse tensors sparse_shape = part_feats.shape[1:4] # (non_empty_num, 4) ==> [bs_idx, x_idx, y_idx, z_idx] sparse_idx = part_feats.sum(dim=-1).nonzero(as_tuple=False) part_features = part_feats[sparse_idx[:, 0], sparse_idx[:, 1], sparse_idx[:, 2], sparse_idx[:, 3]] seg_features = seg_feats[sparse_idx[:, 0], sparse_idx[:, 1], sparse_idx[:, 2], sparse_idx[:, 3]] coords = sparse_idx.int().contiguous() part_features = SparseConvTensor(part_features, coords, sparse_shape, rcnn_batch_size) seg_features = SparseConvTensor(seg_features, coords, sparse_shape, rcnn_batch_size) # forward rcnn network x_part = self.part_conv(part_features) x_rpn = self.seg_conv(seg_features) merged_feature = torch.cat((x_rpn.features, x_part.features), dim=1) # (N, C) shared_feature = SparseConvTensor(merged_feature, coords, sparse_shape, rcnn_batch_size) x = self.conv_down(shared_feature) shared_feature = x.dense().view(rcnn_batch_size, -1, 1) shared_feature = self.shared_fc(shared_feature) cls_score = self.conv_cls(shared_feature).transpose( 1, 2).contiguous().squeeze(dim=1) # (B, 1) bbox_pred = self.conv_reg(shared_feature).transpose( 1, 2).contiguous().squeeze(dim=1) # (B, C) return cls_score, bbox_pred def loss(self, cls_score, bbox_pred, rois, labels, bbox_targets, pos_gt_bboxes, reg_mask, label_weights, bbox_weights): """Computing losses. Args: cls_score (torch.Tensor): Scores of each roi. bbox_pred (torch.Tensor): Predictions of bboxes. rois (torch.Tensor): Roi bboxes. labels (torch.Tensor): Labels of class. bbox_targets (torch.Tensor): Target of positive bboxes. pos_gt_bboxes (torch.Tensor): Ground truths of positive bboxes. reg_mask (torch.Tensor): Mask for positive bboxes. label_weights (torch.Tensor): Weights of class loss. bbox_weights (torch.Tensor): Weights of bbox loss. Returns: dict: Computed losses. - loss_cls (torch.Tensor): Loss of classes. - loss_bbox (torch.Tensor): Loss of bboxes. - loss_corner (torch.Tensor): Loss of corners. 
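Example:
    An illustrative sketch of the canonical-to-LiDAR transform applied to decoded boxes
    before the corner loss below; ``rotate_z`` is a hypothetical stand-in for
    ``rotation_3d_in_axis`` and all tensors are random placeholders::

        import torch

        def rotate_z(points, angle):
            # rotate (N, 3) points around the z axis by per-point angles (N,)
            cos, sin = torch.cos(angle), torch.sin(angle)
            x = points[:, 0] * cos - points[:, 1] * sin
            y = points[:, 0] * sin + points[:, 1] * cos
            return torch.stack((x, y, points[:, 2]), dim=-1)

        roi_xyz = torch.rand(5, 3)      # RoI centers in the LiDAR frame
        roi_ry = torch.rand(5)          # RoI yaw angles
        decoded_xyz = torch.rand(5, 3)  # centers decoded in the RoI frame

        # rotate back by the RoI yaw, then translate by the RoI center
        pred_xyz = rotate_z(decoded_xyz, roi_ry) + roi_xyz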
""" losses = dict() rcnn_batch_size = cls_score.shape[0] # calculate class loss cls_flat = cls_score.view(-1) loss_cls = self.loss_cls(cls_flat, labels, label_weights) losses['loss_cls'] = loss_cls # calculate regression loss code_size = self.bbox_coder.code_size pos_inds = (reg_mask > 0) if pos_inds.any() == 0: # fake a part loss losses['loss_bbox'] = loss_cls.new_tensor(0) if self.with_corner_loss: losses['loss_corner'] = loss_cls.new_tensor(0) else: pos_bbox_pred = bbox_pred.view(rcnn_batch_size, -1)[pos_inds] bbox_weights_flat = bbox_weights[pos_inds].view(-1, 1).repeat( 1, pos_bbox_pred.shape[-1]) loss_bbox = self.loss_bbox( pos_bbox_pred.unsqueeze(dim=0), bbox_targets.unsqueeze(dim=0), bbox_weights_flat.unsqueeze(dim=0)) losses['loss_bbox'] = loss_bbox if self.with_corner_loss: pos_roi_boxes3d = rois[..., 1:].view(-1, code_size)[pos_inds] pos_roi_boxes3d = pos_roi_boxes3d.view(-1, code_size) batch_anchors = pos_roi_boxes3d.clone().detach() pos_rois_rotation = pos_roi_boxes3d[..., 6].view(-1) roi_xyz = pos_roi_boxes3d[..., 0:3].view(-1, 3) batch_anchors[..., 0:3] = 0 # decode boxes pred_boxes3d = self.bbox_coder.decode( batch_anchors, pos_bbox_pred.view(-1, code_size)).view(-1, code_size) pred_boxes3d[..., 0:3] = rotation_3d_in_axis( pred_boxes3d[..., 0:3].unsqueeze(1), pos_rois_rotation, axis=2).squeeze(1) pred_boxes3d[:, 0:3] += roi_xyz # calculate corner loss loss_corner = self.get_corner_loss_lidar( pred_boxes3d, pos_gt_bboxes) losses['loss_corner'] = loss_corner return losses def get_targets(self, sampling_results, rcnn_train_cfg, concat=True): """Generate targets. Args: sampling_results (list[:obj:`SamplingResult`]): Sampled results from rois. rcnn_train_cfg (:obj:`ConfigDict`): Training config of rcnn. concat (bool): Whether to concatenate targets between batches. Returns: tuple[torch.Tensor]: Targets of boxes and class prediction. """ pos_bboxes_list = [res.pos_bboxes for res in sampling_results] pos_gt_bboxes_list = [res.pos_gt_bboxes for res in sampling_results] iou_list = [res.iou for res in sampling_results] targets = multi_apply( self._get_target_single, pos_bboxes_list, pos_gt_bboxes_list, iou_list, cfg=rcnn_train_cfg) (label, bbox_targets, pos_gt_bboxes, reg_mask, label_weights, bbox_weights) = targets if concat: label = torch.cat(label, 0) bbox_targets = torch.cat(bbox_targets, 0) pos_gt_bboxes = torch.cat(pos_gt_bboxes, 0) reg_mask = torch.cat(reg_mask, 0) label_weights = torch.cat(label_weights, 0) label_weights /= torch.clamp(label_weights.sum(), min=1.0) bbox_weights = torch.cat(bbox_weights, 0) bbox_weights /= torch.clamp(bbox_weights.sum(), min=1.0) return (label, bbox_targets, pos_gt_bboxes, reg_mask, label_weights, bbox_weights) def _get_target_single(self, pos_bboxes, pos_gt_bboxes, ious, cfg): """Generate training targets for a single sample. Args: pos_bboxes (torch.Tensor): Positive boxes with shape (N, 7). pos_gt_bboxes (torch.Tensor): Ground truth boxes with shape (M, 7). ious (torch.Tensor): IoU between `pos_bboxes` and `pos_gt_bboxes` in shape (N, M). cfg (dict): Training configs. Returns: tuple[torch.Tensor]: Target for positive boxes. 
(label, bbox_targets, pos_gt_bboxes, reg_mask, label_weights, bbox_weights) """ cls_pos_mask = ious > cfg.cls_pos_thr cls_neg_mask = ious < cfg.cls_neg_thr interval_mask = (cls_pos_mask == 0) & (cls_neg_mask == 0) # iou regression target label = (cls_pos_mask > 0).float() label[interval_mask] = ious[interval_mask] * 2 - 0.5 # label weights label_weights = (label >= 0).float() # box regression target reg_mask = pos_bboxes.new_zeros(ious.size(0)).long() reg_mask[0:pos_gt_bboxes.size(0)] = 1 bbox_weights = (reg_mask > 0).float() if reg_mask.bool().any(): pos_gt_bboxes_ct = pos_gt_bboxes.clone().detach() roi_center = pos_bboxes[..., 0:3] roi_ry = pos_bboxes[..., 6] % (2 * np.pi) # canonical transformation pos_gt_bboxes_ct[..., 0:3] -= roi_center pos_gt_bboxes_ct[..., 6] -= roi_ry pos_gt_bboxes_ct[..., 0:3] = rotation_3d_in_axis( pos_gt_bboxes_ct[..., 0:3].unsqueeze(1), -roi_ry, axis=2).squeeze(1) # flip orientation if rois have opposite orientation ry_label = pos_gt_bboxes_ct[..., 6] % (2 * np.pi) # 0 ~ 2pi opposite_flag = (ry_label > np.pi * 0.5) & (ry_label < np.pi * 1.5) ry_label[opposite_flag] = (ry_label[opposite_flag] + np.pi) % ( 2 * np.pi) # (0 ~ pi/2, 3pi/2 ~ 2pi) flag = ry_label > np.pi ry_label[flag] = ry_label[flag] - np.pi * 2 # (-pi/2, pi/2) ry_label = torch.clamp(ry_label, min=-np.pi / 2, max=np.pi / 2) pos_gt_bboxes_ct[..., 6] = ry_label rois_anchor = pos_bboxes.clone().detach() rois_anchor[:, 0:3] = 0 rois_anchor[:, 6] = 0 bbox_targets = self.bbox_coder.encode(rois_anchor, pos_gt_bboxes_ct) else: # no fg bbox bbox_targets = pos_gt_bboxes.new_empty((0, 7)) return (label, bbox_targets, pos_gt_bboxes, reg_mask, label_weights, bbox_weights) def get_corner_loss_lidar(self, pred_bbox3d, gt_bbox3d, delta=1.0): """Calculate corner loss of given boxes. Args: pred_bbox3d (torch.FloatTensor): Predicted boxes in shape (N, 7). gt_bbox3d (torch.FloatTensor): Ground truth boxes in shape (N, 7). delta (float, optional): huber loss threshold. Defaults to 1.0 Returns: torch.FloatTensor: Calculated corner loss in shape (N). """ assert pred_bbox3d.shape[0] == gt_bbox3d.shape[0] # This is a little bit hack here because we assume the box for # Part-A2 is in LiDAR coordinates gt_boxes_structure = LiDARInstance3DBoxes(gt_bbox3d) pred_box_corners = LiDARInstance3DBoxes(pred_bbox3d).corners gt_box_corners = gt_boxes_structure.corners # This flip only changes the heading direction of GT boxes gt_bbox3d_flip = gt_boxes_structure.clone() gt_bbox3d_flip.tensor[:, 6] += np.pi gt_box_corners_flip = gt_bbox3d_flip.corners corner_dist = torch.min( torch.norm(pred_box_corners - gt_box_corners, dim=2), torch.norm(pred_box_corners - gt_box_corners_flip, dim=2)) # (N, 8) # huber loss abs_error = corner_dist.abs() quadratic = abs_error.clamp(max=delta) linear = (abs_error - quadratic) corner_loss = 0.5 * quadratic**2 + delta * linear return corner_loss.mean(dim=1) def get_bboxes(self, rois, cls_score, bbox_pred, class_labels, class_pred, img_metas, cfg=None): """Generate bboxes from bbox head predictions. Args: rois (torch.Tensor): Roi bounding boxes. cls_score (torch.Tensor): Scores of bounding boxes. bbox_pred (torch.Tensor): Bounding boxes predictions class_labels (torch.Tensor): Label of classes class_pred (torch.Tensor): Score for nms. img_metas (list[dict]): Point cloud and image's meta info. cfg (:obj:`ConfigDict`): Testing config. Returns: list[tuple]: Decoded bbox, scores and labels after nms. 
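Example:
    An illustrative per-class NMS loop in the spirit of ``multi_class_nms`` below;
    ``torchvision.ops.nms`` on axis-aligned 2D boxes is only a stand-in for the rotated
    BEV NMS, and the shapes and thresholds are assumed::

        import torch
        from torchvision.ops import nms

        num_classes, score_thr, nms_thr = 3, 0.3, 0.5
        box_probs = torch.rand(20, num_classes)   # per-class scores
        boxes = torch.rand(20, 4)                 # (x1, y1, x2, y2)
        boxes[:, 2:] += boxes[:, :2]              # ensure x2 > x1, y2 > y1

        keep, labels = [], []
        for k in range(num_classes):
            mask = box_probs[:, k] >= score_thr
            if mask.any():
                idx = mask.nonzero(as_tuple=False).view(-1)
                kept = nms(boxes[mask], box_probs[mask, k], nms_thr)
                keep.append(idx[kept])
                labels.append(torch.full((kept.numel(),), k, dtype=torch.long))
        keep = torch.cat(keep) if keep else torch.zeros(0, dtype=torch.long)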
""" roi_batch_id = rois[..., 0] roi_boxes = rois[..., 1:] # boxes without batch id batch_size = int(roi_batch_id.max().item() + 1) # decode boxes roi_ry = roi_boxes[..., 6].view(-1) roi_xyz = roi_boxes[..., 0:3].view(-1, 3) local_roi_boxes = roi_boxes.clone().detach() local_roi_boxes[..., 0:3] = 0 rcnn_boxes3d = self.bbox_coder.decode(local_roi_boxes, bbox_pred) rcnn_boxes3d[..., 0:3] = rotation_3d_in_axis( rcnn_boxes3d[..., 0:3].unsqueeze(1), roi_ry, axis=2).squeeze(1) rcnn_boxes3d[:, 0:3] += roi_xyz # post processing result_list = [] for batch_id in range(batch_size): cur_class_labels = class_labels[batch_id] cur_cls_score = cls_score[roi_batch_id == batch_id].view(-1) cur_box_prob = class_pred[batch_id] cur_rcnn_boxes3d = rcnn_boxes3d[roi_batch_id == batch_id] keep = self.multi_class_nms(cur_box_prob, cur_rcnn_boxes3d, cfg.score_thr, cfg.nms_thr, img_metas[batch_id], cfg.use_rotate_nms) selected_bboxes = cur_rcnn_boxes3d[keep] selected_label_preds = cur_class_labels[keep] selected_scores = cur_cls_score[keep] result_list.append( (img_metas[batch_id]['box_type_3d'](selected_bboxes, self.bbox_coder.code_size), selected_scores, selected_label_preds)) return result_list def multi_class_nms(self, box_probs, box_preds, score_thr, nms_thr, input_meta, use_rotate_nms=True): """Multi-class NMS for box head. Note: This function has large overlap with the `box3d_multiclass_nms` implemented in `mmdet3d.core.post_processing`. We are considering merging these two functions in the future. Args: box_probs (torch.Tensor): Predicted boxes probabitilies in shape (N,). box_preds (torch.Tensor): Predicted boxes in shape (N, 7+C). score_thr (float): Threshold of scores. nms_thr (float): Threshold for NMS. input_meta (dict): Meta information of the current sample. use_rotate_nms (bool, optional): Whether to use rotated nms. Defaults to True. Returns: torch.Tensor: Selected indices. """ if use_rotate_nms: nms_func = nms_bev else: nms_func = nms_normal_bev assert box_probs.shape[ 1] == self.num_classes, f'box_probs shape: {str(box_probs.shape)}' selected_list = [] selected_labels = [] boxes_for_nms = xywhr2xyxyr(input_meta['box_type_3d']( box_preds, self.bbox_coder.code_size).bev) score_thresh = score_thr if isinstance( score_thr, list) else [score_thr for x in range(self.num_classes)] nms_thresh = nms_thr if isinstance( nms_thr, list) else [nms_thr for x in range(self.num_classes)] for k in range(0, self.num_classes): class_scores_keep = box_probs[:, k] >= score_thresh[k] if class_scores_keep.int().sum() > 0: original_idxs = class_scores_keep.nonzero( as_tuple=False).view(-1) cur_boxes_for_nms = boxes_for_nms[class_scores_keep] cur_rank_scores = box_probs[class_scores_keep, k] cur_selected = nms_func(cur_boxes_for_nms, cur_rank_scores, nms_thresh[k]) if cur_selected.shape[0] == 0: continue selected_list.append(original_idxs[cur_selected]) selected_labels.append( torch.full([cur_selected.shape[0]], k + 1, dtype=torch.int64, device=box_preds.device)) keep = torch.cat( selected_list, dim=0) if len(selected_list) > 0 else [] return keep ================================================ FILE: mmdet3d/models/roi_heads/bbox_heads/point_rcnn_bbox_head.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. 
import numpy as np import torch from mmcv.cnn import ConvModule, normal_init from mmcv.cnn.bricks import build_conv_layer from mmcv.runner import BaseModule from torch import nn as nn from mmdet3d.core.bbox.structures import (LiDARInstance3DBoxes, rotation_3d_in_axis, xywhr2xyxyr) from mmdet3d.core.post_processing import nms_bev, nms_normal_bev from mmdet3d.models.builder import HEADS, build_loss from mmdet3d.ops import build_sa_module from mmdet.core import build_bbox_coder, multi_apply @HEADS.register_module() class PointRCNNBboxHead(BaseModule): """PointRCNN RoI Bbox head. Args: num_classes (int): The number of classes to prediction. in_channels (int): Input channels of point features. mlp_channels (list[int]): the number of mlp channels pred_layer_cfg (dict, optional): Config of classfication and regression prediction layers. Defaults to None. num_points (tuple, optional): The number of points which each SA module samples. Defaults to (128, 32, -1). radius (tuple, optional): Sampling radius of each SA module. Defaults to (0.2, 0.4, 100). num_samples (tuple, optional): The number of samples for ball query in each SA module. Defaults to (64, 64, 64). sa_channels (tuple, optional): Out channels of each mlp in SA module. Defaults to ((128, 128, 128), (128, 128, 256), (256, 256, 512)). bbox_coder (dict, optional): Config dict of box coders. Defaults to dict(type='DeltaXYZWLHRBBoxCoder'). sa_cfg (dict, optional): Config of set abstraction module, which may contain the following keys and values: - pool_mod (str): Pool method ('max' or 'avg') for SA modules. - use_xyz (bool): Whether to use xyz as a part of features. - normalize_xyz (bool): Whether to normalize xyz with radii in each SA module. Defaults to dict(type='PointSAModule', pool_mod='max', use_xyz=True). conv_cfg (dict, optional): Config dict of convolutional layers. Defaults to dict(type='Conv1d'). norm_cfg (dict, optional): Config dict of normalization layers. Defaults to dict(type='BN1d'). act_cfg (dict, optional): Config dict of activation layers. Defaults to dict(type='ReLU'). bias (str, optional): Type of bias. Defaults to 'auto'. loss_bbox (dict, optional): Config of regression loss function. Defaults to dict(type='SmoothL1Loss', beta=1.0 / 9.0, reduction='sum', loss_weight=1.0). loss_cls (dict, optional): Config of classification loss function. Defaults to dict(type='CrossEntropyLoss', use_sigmoid=True, reduction='sum', loss_weight=1.0). with_corner_loss (bool, optional): Whether using corner loss. Defaults to True. init_cfg (dict, optional): Config of initialization. Defaults to None. 
""" def __init__( self, num_classes, in_channels, mlp_channels, pred_layer_cfg=None, num_points=(128, 32, -1), radius=(0.2, 0.4, 100), num_samples=(64, 64, 64), sa_channels=((128, 128, 128), (128, 128, 256), (256, 256, 512)), bbox_coder=dict(type='DeltaXYZWLHRBBoxCoder'), sa_cfg=dict(type='PointSAModule', pool_mod='max', use_xyz=True), conv_cfg=dict(type='Conv1d'), norm_cfg=dict(type='BN1d'), act_cfg=dict(type='ReLU'), bias='auto', loss_bbox=dict( type='SmoothL1Loss', beta=1.0 / 9.0, reduction='sum', loss_weight=1.0), loss_cls=dict( type='CrossEntropyLoss', use_sigmoid=True, reduction='sum', loss_weight=1.0), with_corner_loss=True, init_cfg=None): super(PointRCNNBboxHead, self).__init__(init_cfg=init_cfg) self.num_classes = num_classes self.num_sa = len(sa_channels) self.with_corner_loss = with_corner_loss self.conv_cfg = conv_cfg self.norm_cfg = norm_cfg self.act_cfg = act_cfg self.bias = bias self.loss_bbox = build_loss(loss_bbox) self.loss_cls = build_loss(loss_cls) self.bbox_coder = build_bbox_coder(bbox_coder) self.use_sigmoid_cls = loss_cls.get('use_sigmoid', False) self.in_channels = in_channels mlp_channels = [self.in_channels] + mlp_channels shared_mlps = nn.Sequential() for i in range(len(mlp_channels) - 1): shared_mlps.add_module( f'layer{i}', ConvModule( mlp_channels[i], mlp_channels[i + 1], kernel_size=(1, 1), stride=(1, 1), inplace=False, conv_cfg=dict(type='Conv2d'))) self.xyz_up_layer = nn.Sequential(*shared_mlps) c_out = mlp_channels[-1] self.merge_down_layer = ConvModule( c_out * 2, c_out, kernel_size=(1, 1), stride=(1, 1), inplace=False, conv_cfg=dict(type='Conv2d')) pre_channels = c_out self.SA_modules = nn.ModuleList() sa_in_channel = pre_channels for sa_index in range(self.num_sa): cur_sa_mlps = list(sa_channels[sa_index]) cur_sa_mlps = [sa_in_channel] + cur_sa_mlps sa_out_channel = cur_sa_mlps[-1] cur_num_points = num_points[sa_index] if cur_num_points <= 0: cur_num_points = None self.SA_modules.append( build_sa_module( num_point=cur_num_points, radius=radius[sa_index], num_sample=num_samples[sa_index], mlp_channels=cur_sa_mlps, cfg=sa_cfg)) sa_in_channel = sa_out_channel self.cls_convs = self._add_conv_branch( pred_layer_cfg.in_channels, pred_layer_cfg.cls_conv_channels) self.reg_convs = self._add_conv_branch( pred_layer_cfg.in_channels, pred_layer_cfg.reg_conv_channels) prev_channel = pred_layer_cfg.cls_conv_channels[-1] self.conv_cls = build_conv_layer( self.conv_cfg, in_channels=prev_channel, out_channels=self.num_classes, kernel_size=1) prev_channel = pred_layer_cfg.reg_conv_channels[-1] self.conv_reg = build_conv_layer( self.conv_cfg, in_channels=prev_channel, out_channels=self.bbox_coder.code_size * self.num_classes, kernel_size=1) if init_cfg is None: self.init_cfg = dict(type='Xavier', layer=['Conv2d', 'Conv1d']) def _add_conv_branch(self, in_channels, conv_channels): """Add shared or separable branch. Args: in_channels (int): Input feature channel. conv_channels (tuple): Middle feature channels. 
""" conv_spec = [in_channels] + list(conv_channels) # add branch specific conv layers conv_layers = nn.Sequential() for i in range(len(conv_spec) - 1): conv_layers.add_module( f'layer{i}', ConvModule( conv_spec[i], conv_spec[i + 1], kernel_size=1, padding=0, conv_cfg=self.conv_cfg, norm_cfg=self.norm_cfg, act_cfg=self.act_cfg, bias=self.bias, inplace=True)) return conv_layers def init_weights(self): """Initialize weights of the head.""" super().init_weights() for m in self.modules(): if isinstance(m, nn.Conv2d) or isinstance(m, nn.Conv1d): if m.bias is not None: nn.init.constant_(m.bias, 0) normal_init(self.conv_reg.weight, mean=0, std=0.001) def forward(self, feats): """Forward pass. Args: feats (torch.Torch): Features from RCNN modules. Returns: tuple[torch.Tensor]: Score of class and bbox predictions. """ input_data = feats.clone().detach() xyz_input = input_data[..., 0:self.in_channels].transpose( 1, 2).unsqueeze(dim=3).contiguous().clone().detach() xyz_features = self.xyz_up_layer(xyz_input) rpn_features = input_data[..., self.in_channels:].transpose( 1, 2).unsqueeze(dim=3) merged_features = torch.cat((xyz_features, rpn_features), dim=1) merged_features = self.merge_down_layer(merged_features) l_xyz, l_features = [input_data[..., 0:3].contiguous()], \ [merged_features.squeeze(dim=3)] for i in range(len(self.SA_modules)): li_xyz, li_features, cur_indices = \ self.SA_modules[i](l_xyz[i], l_features[i]) l_xyz.append(li_xyz) l_features.append(li_features) shared_features = l_features[-1] x_cls = shared_features x_reg = shared_features x_cls = self.cls_convs(x_cls) rcnn_cls = self.conv_cls(x_cls) x_reg = self.reg_convs(x_reg) rcnn_reg = self.conv_reg(x_reg) rcnn_cls = rcnn_cls.transpose(1, 2).contiguous().squeeze(dim=1) rcnn_reg = rcnn_reg.transpose(1, 2).contiguous().squeeze(dim=1) return rcnn_cls, rcnn_reg def loss(self, cls_score, bbox_pred, rois, labels, bbox_targets, pos_gt_bboxes, reg_mask, label_weights, bbox_weights): """Computing losses. Args: cls_score (torch.Tensor): Scores of each RoI. bbox_pred (torch.Tensor): Predictions of bboxes. rois (torch.Tensor): RoI bboxes. labels (torch.Tensor): Labels of class. bbox_targets (torch.Tensor): Target of positive bboxes. pos_gt_bboxes (torch.Tensor): Ground truths of positive bboxes. reg_mask (torch.Tensor): Mask for positive bboxes. label_weights (torch.Tensor): Weights of class loss. bbox_weights (torch.Tensor): Weights of bbox loss. Returns: dict: Computed losses. - loss_cls (torch.Tensor): Loss of classes. - loss_bbox (torch.Tensor): Loss of bboxes. - loss_corner (torch.Tensor): Loss of corners. 
""" losses = dict() rcnn_batch_size = cls_score.shape[0] # calculate class loss cls_flat = cls_score.view(-1) loss_cls = self.loss_cls(cls_flat, labels, label_weights) losses['loss_cls'] = loss_cls # calculate regression loss code_size = self.bbox_coder.code_size pos_inds = (reg_mask > 0) pos_bbox_pred = bbox_pred.view(rcnn_batch_size, -1)[pos_inds].clone() bbox_weights_flat = bbox_weights[pos_inds].view(-1, 1).repeat( 1, pos_bbox_pred.shape[-1]) loss_bbox = self.loss_bbox( pos_bbox_pred.unsqueeze(dim=0), bbox_targets.unsqueeze(dim=0).detach(), bbox_weights_flat.unsqueeze(dim=0)) losses['loss_bbox'] = loss_bbox if pos_inds.any() != 0 and self.with_corner_loss: rois = rois.detach() pos_roi_boxes3d = rois[..., 1:].view(-1, code_size)[pos_inds] pos_roi_boxes3d = pos_roi_boxes3d.view(-1, code_size) batch_anchors = pos_roi_boxes3d.clone().detach() pos_rois_rotation = pos_roi_boxes3d[..., 6].view(-1) roi_xyz = pos_roi_boxes3d[..., 0:3].view(-1, 3) batch_anchors[..., 0:3] = 0 # decode boxes pred_boxes3d = self.bbox_coder.decode( batch_anchors, pos_bbox_pred.view(-1, code_size)).view(-1, code_size) pred_boxes3d[..., 0:3] = rotation_3d_in_axis( pred_boxes3d[..., 0:3].unsqueeze(1), (pos_rois_rotation), axis=2).squeeze(1) pred_boxes3d[:, 0:3] += roi_xyz # calculate corner loss loss_corner = self.get_corner_loss_lidar(pred_boxes3d, pos_gt_bboxes) losses['loss_corner'] = loss_corner else: losses['loss_corner'] = loss_cls.new_tensor(0) return losses def get_corner_loss_lidar(self, pred_bbox3d, gt_bbox3d, delta=1.0): """Calculate corner loss of given boxes. Args: pred_bbox3d (torch.FloatTensor): Predicted boxes in shape (N, 7). gt_bbox3d (torch.FloatTensor): Ground truth boxes in shape (N, 7). delta (float, optional): huber loss threshold. Defaults to 1.0 Returns: torch.FloatTensor: Calculated corner loss in shape (N). """ assert pred_bbox3d.shape[0] == gt_bbox3d.shape[0] # This is a little bit hack here because we assume the box for # PointRCNN is in LiDAR coordinates gt_boxes_structure = LiDARInstance3DBoxes(gt_bbox3d) pred_box_corners = LiDARInstance3DBoxes(pred_bbox3d).corners gt_box_corners = gt_boxes_structure.corners # This flip only changes the heading direction of GT boxes gt_bbox3d_flip = gt_boxes_structure.clone() gt_bbox3d_flip.tensor[:, 6] += np.pi gt_box_corners_flip = gt_bbox3d_flip.corners corner_dist = torch.min( torch.norm(pred_box_corners - gt_box_corners, dim=2), torch.norm(pred_box_corners - gt_box_corners_flip, dim=2)) # huber loss abs_error = corner_dist.abs() quadratic = abs_error.clamp(max=delta) linear = (abs_error - quadratic) corner_loss = 0.5 * quadratic**2 + delta * linear return corner_loss.mean(dim=1) def get_targets(self, sampling_results, rcnn_train_cfg, concat=True): """Generate targets. Args: sampling_results (list[:obj:`SamplingResult`]): Sampled results from rois. rcnn_train_cfg (:obj:`ConfigDict`): Training config of rcnn. concat (bool, optional): Whether to concatenate targets between batches. Defaults to True. Returns: tuple[torch.Tensor]: Targets of boxes and class prediction. 
""" pos_bboxes_list = [res.pos_bboxes for res in sampling_results] pos_gt_bboxes_list = [res.pos_gt_bboxes for res in sampling_results] iou_list = [res.iou for res in sampling_results] targets = multi_apply( self._get_target_single, pos_bboxes_list, pos_gt_bboxes_list, iou_list, cfg=rcnn_train_cfg) (label, bbox_targets, pos_gt_bboxes, reg_mask, label_weights, bbox_weights) = targets if concat: label = torch.cat(label, 0) bbox_targets = torch.cat(bbox_targets, 0) pos_gt_bboxes = torch.cat(pos_gt_bboxes, 0) reg_mask = torch.cat(reg_mask, 0) label_weights = torch.cat(label_weights, 0) label_weights /= torch.clamp(label_weights.sum(), min=1.0) bbox_weights = torch.cat(bbox_weights, 0) bbox_weights /= torch.clamp(bbox_weights.sum(), min=1.0) return (label, bbox_targets, pos_gt_bboxes, reg_mask, label_weights, bbox_weights) def _get_target_single(self, pos_bboxes, pos_gt_bboxes, ious, cfg): """Generate training targets for a single sample. Args: pos_bboxes (torch.Tensor): Positive boxes with shape (N, 7). pos_gt_bboxes (torch.Tensor): Ground truth boxes with shape (M, 7). ious (torch.Tensor): IoU between `pos_bboxes` and `pos_gt_bboxes` in shape (N, M). cfg (dict): Training configs. Returns: tuple[torch.Tensor]: Target for positive boxes. (label, bbox_targets, pos_gt_bboxes, reg_mask, label_weights, bbox_weights) """ cls_pos_mask = ious > cfg.cls_pos_thr cls_neg_mask = ious < cfg.cls_neg_thr interval_mask = (cls_pos_mask == 0) & (cls_neg_mask == 0) # iou regression target label = (cls_pos_mask > 0).float() label[interval_mask] = (ious[interval_mask] - cfg.cls_neg_thr) / \ (cfg.cls_pos_thr - cfg.cls_neg_thr) # label weights label_weights = (label >= 0).float() # box regression target reg_mask = pos_bboxes.new_zeros(ious.size(0)).long() reg_mask[0:pos_gt_bboxes.size(0)] = 1 bbox_weights = (reg_mask > 0).float() if reg_mask.bool().any(): pos_gt_bboxes_ct = pos_gt_bboxes.clone().detach() roi_center = pos_bboxes[..., 0:3] roi_ry = pos_bboxes[..., 6] % (2 * np.pi) # canonical transformation pos_gt_bboxes_ct[..., 0:3] -= roi_center pos_gt_bboxes_ct[..., 6] -= roi_ry pos_gt_bboxes_ct[..., 0:3] = rotation_3d_in_axis( pos_gt_bboxes_ct[..., 0:3].unsqueeze(1), -(roi_ry), axis=2).squeeze(1) # flip orientation if gt have opposite orientation ry_label = pos_gt_bboxes_ct[..., 6] % (2 * np.pi) # 0 ~ 2pi is_opposite = (ry_label > np.pi * 0.5) & (ry_label < np.pi * 1.5) ry_label[is_opposite] = (ry_label[is_opposite] + np.pi) % ( 2 * np.pi) # (0 ~ pi/2, 3pi/2 ~ 2pi) flag = ry_label > np.pi ry_label[flag] = ry_label[flag] - np.pi * 2 # (-pi/2, pi/2) ry_label = torch.clamp(ry_label, min=-np.pi / 2, max=np.pi / 2) pos_gt_bboxes_ct[..., 6] = ry_label rois_anchor = pos_bboxes.clone().detach() rois_anchor[:, 0:3] = 0 rois_anchor[:, 6] = 0 bbox_targets = self.bbox_coder.encode(rois_anchor, pos_gt_bboxes_ct) else: # no fg bbox bbox_targets = pos_gt_bboxes.new_empty((0, 7)) return (label, bbox_targets, pos_gt_bboxes, reg_mask, label_weights, bbox_weights) def get_bboxes(self, rois, cls_score, bbox_pred, class_labels, img_metas, cfg=None): """Generate bboxes from bbox head predictions. Args: rois (torch.Tensor): RoI bounding boxes. cls_score (torch.Tensor): Scores of bounding boxes. bbox_pred (torch.Tensor): Bounding boxes predictions class_labels (torch.Tensor): Label of classes img_metas (list[dict]): Point cloud and image's meta info. cfg (:obj:`ConfigDict`, optional): Testing config. Defaults to None. Returns: list[tuple]: Decoded bbox, scores and labels after nms. 
""" roi_batch_id = rois[..., 0] roi_boxes = rois[..., 1:] # boxes without batch id batch_size = int(roi_batch_id.max().item() + 1) # decode boxes roi_ry = roi_boxes[..., 6].view(-1) roi_xyz = roi_boxes[..., 0:3].view(-1, 3) local_roi_boxes = roi_boxes.clone().detach() local_roi_boxes[..., 0:3] = 0 rcnn_boxes3d = self.bbox_coder.decode(local_roi_boxes, bbox_pred) rcnn_boxes3d[..., 0:3] = rotation_3d_in_axis( rcnn_boxes3d[..., 0:3].unsqueeze(1), roi_ry, axis=2).squeeze(1) rcnn_boxes3d[:, 0:3] += roi_xyz # post processing result_list = [] for batch_id in range(batch_size): cur_class_labels = class_labels[batch_id] cur_cls_score = cls_score[roi_batch_id == batch_id].view(-1) cur_box_prob = cur_cls_score.unsqueeze(1) cur_rcnn_boxes3d = rcnn_boxes3d[roi_batch_id == batch_id] keep = self.multi_class_nms(cur_box_prob, cur_rcnn_boxes3d, cfg.score_thr, cfg.nms_thr, img_metas[batch_id], cfg.use_rotate_nms) selected_bboxes = cur_rcnn_boxes3d[keep] selected_label_preds = cur_class_labels[keep] selected_scores = cur_cls_score[keep] result_list.append( (img_metas[batch_id]['box_type_3d'](selected_bboxes, self.bbox_coder.code_size), selected_scores, selected_label_preds)) return result_list def multi_class_nms(self, box_probs, box_preds, score_thr, nms_thr, input_meta, use_rotate_nms=True): """Multi-class NMS for box head. Note: This function has large overlap with the `box3d_multiclass_nms` implemented in `mmdet3d.core.post_processing`. We are considering merging these two functions in the future. Args: box_probs (torch.Tensor): Predicted boxes probabilities in shape (N,). box_preds (torch.Tensor): Predicted boxes in shape (N, 7+C). score_thr (float): Threshold of scores. nms_thr (float): Threshold for NMS. input_meta (dict): Meta information of the current sample. use_rotate_nms (bool, optional): Whether to use rotated nms. Defaults to True. Returns: torch.Tensor: Selected indices. """ if use_rotate_nms: nms_func = nms_bev else: nms_func = nms_normal_bev assert box_probs.shape[ 1] == self.num_classes, f'box_probs shape: {str(box_probs.shape)}' selected_list = [] selected_labels = [] boxes_for_nms = xywhr2xyxyr(input_meta['box_type_3d']( box_preds, self.bbox_coder.code_size).bev) score_thresh = score_thr if isinstance( score_thr, list) else [score_thr for x in range(self.num_classes)] nms_thresh = nms_thr if isinstance( nms_thr, list) else [nms_thr for x in range(self.num_classes)] for k in range(0, self.num_classes): class_scores_keep = box_probs[:, k] >= score_thresh[k] if class_scores_keep.int().sum() > 0: original_idxs = class_scores_keep.nonzero( as_tuple=False).view(-1) cur_boxes_for_nms = boxes_for_nms[class_scores_keep] cur_rank_scores = box_probs[class_scores_keep, k] cur_selected = nms_func(cur_boxes_for_nms, cur_rank_scores, nms_thresh[k]) if cur_selected.shape[0] == 0: continue selected_list.append(original_idxs[cur_selected]) selected_labels.append( torch.full([cur_selected.shape[0]], k + 1, dtype=torch.int64, device=box_preds.device)) keep = torch.cat( selected_list, dim=0) if len(selected_list) > 0 else [] return keep ================================================ FILE: mmdet3d/models/roi_heads/h3d_roi_head.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. from mmdet3d.core.bbox import bbox3d2result from ..builder import HEADS, build_head from .base_3droi_head import Base3DRoIHead @HEADS.register_module() class H3DRoIHead(Base3DRoIHead): """H3D roi head for H3DNet. Args: primitive_list (List): Configs of primitive heads. 
bbox_head (ConfigDict): Config of bbox_head. train_cfg (ConfigDict): Training config. test_cfg (ConfigDict): Testing config. """ def __init__(self, primitive_list, bbox_head=None, train_cfg=None, test_cfg=None, pretrained=None, init_cfg=None): super(H3DRoIHead, self).__init__( bbox_head=bbox_head, train_cfg=train_cfg, test_cfg=test_cfg, pretrained=pretrained, init_cfg=init_cfg) # Primitive module assert len(primitive_list) == 3 self.primitive_z = build_head(primitive_list[0]) self.primitive_xy = build_head(primitive_list[1]) self.primitive_line = build_head(primitive_list[2]) def init_mask_head(self): """Initialize mask head, skip since ``H3DROIHead`` does not have one.""" pass def init_bbox_head(self, bbox_head): """Initialize box head.""" bbox_head['train_cfg'] = self.train_cfg bbox_head['test_cfg'] = self.test_cfg self.bbox_head = build_head(bbox_head) def init_assigner_sampler(self): """Initialize assigner and sampler.""" pass def forward_train(self, feats_dict, img_metas, points, gt_bboxes_3d, gt_labels_3d, pts_semantic_mask, pts_instance_mask, gt_bboxes_ignore=None): """Training forward function of PartAggregationROIHead. Args: feats_dict (dict): Contains features from the first stage. img_metas (list[dict]): Contain pcd and img's meta info. points (list[torch.Tensor]): Input points. gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth bboxes of each sample. gt_labels_3d (list[torch.Tensor]): Labels of each sample. pts_semantic_mask (list[torch.Tensor]): Point-wise semantic mask. pts_instance_mask (list[torch.Tensor]): Point-wise instance mask. gt_bboxes_ignore (list[torch.Tensor]): Specify which bounding boxes to ignore. Returns: dict: losses from each head. """ losses = dict() sample_mod = self.train_cfg.sample_mod assert sample_mod in ['vote', 'seed', 'random'] result_z = self.primitive_z(feats_dict, sample_mod) feats_dict.update(result_z) result_xy = self.primitive_xy(feats_dict, sample_mod) feats_dict.update(result_xy) result_line = self.primitive_line(feats_dict, sample_mod) feats_dict.update(result_line) primitive_loss_inputs = (feats_dict, points, gt_bboxes_3d, gt_labels_3d, pts_semantic_mask, pts_instance_mask, img_metas, gt_bboxes_ignore) loss_z = self.primitive_z.loss(*primitive_loss_inputs) losses.update(loss_z) loss_xy = self.primitive_xy.loss(*primitive_loss_inputs) losses.update(loss_xy) loss_line = self.primitive_line.loss(*primitive_loss_inputs) losses.update(loss_line) targets = feats_dict.pop('targets') bbox_results = self.bbox_head(feats_dict, sample_mod) feats_dict.update(bbox_results) bbox_loss = self.bbox_head.loss(feats_dict, points, gt_bboxes_3d, gt_labels_3d, pts_semantic_mask, pts_instance_mask, img_metas, targets, gt_bboxes_ignore) losses.update(bbox_loss) return losses def simple_test(self, feats_dict, img_metas, points, rescale=False): """Simple testing forward function of PartAggregationROIHead. Note: This function assumes that the batch size is 1 Args: feats_dict (dict): Contains features from the first stage. img_metas (list[dict]): Contain pcd and img's meta info. points (torch.Tensor): Input points. rescale (bool): Whether to rescale results. Returns: dict: Bbox results of one frame. 
""" sample_mod = self.test_cfg.sample_mod assert sample_mod in ['vote', 'seed', 'random'] result_z = self.primitive_z(feats_dict, sample_mod) feats_dict.update(result_z) result_xy = self.primitive_xy(feats_dict, sample_mod) feats_dict.update(result_xy) result_line = self.primitive_line(feats_dict, sample_mod) feats_dict.update(result_line) bbox_preds = self.bbox_head(feats_dict, sample_mod) feats_dict.update(bbox_preds) bbox_list = self.bbox_head.get_bboxes( points, feats_dict, img_metas, rescale=rescale, suffix='_optimized') bbox_results = [ bbox3d2result(bboxes, scores, labels) for bboxes, scores, labels in bbox_list ] return bbox_results ================================================ FILE: mmdet3d/models/roi_heads/mask_heads/__init__.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. from .pointwise_semantic_head import PointwiseSemanticHead from .primitive_head import PrimitiveHead __all__ = ['PointwiseSemanticHead', 'PrimitiveHead'] ================================================ FILE: mmdet3d/models/roi_heads/mask_heads/pointwise_semantic_head.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import torch from mmcv.runner import BaseModule from torch import nn as nn from torch.nn import functional as F from mmdet3d.core.bbox.structures import rotation_3d_in_axis from mmdet3d.models.builder import HEADS, build_loss from mmdet.core import multi_apply @HEADS.register_module() class PointwiseSemanticHead(BaseModule): """Semantic segmentation head for point-wise segmentation. Predict point-wise segmentation and part regression results for PartA2. See `paper `_ for more details. Args: in_channels (int): The number of input channel. num_classes (int): The number of class. extra_width (float): Boxes enlarge width. loss_seg (dict): Config of segmentation loss. loss_part (dict): Config of part prediction loss. """ def __init__(self, in_channels, num_classes=3, extra_width=0.2, seg_score_thr=0.3, init_cfg=None, loss_seg=dict( type='FocalLoss', use_sigmoid=True, reduction='sum', gamma=2.0, alpha=0.25, loss_weight=1.0), loss_part=dict( type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0)): super(PointwiseSemanticHead, self).__init__(init_cfg=init_cfg) self.extra_width = extra_width self.num_classes = num_classes self.seg_score_thr = seg_score_thr self.seg_cls_layer = nn.Linear(in_channels, 1, bias=True) self.seg_reg_layer = nn.Linear(in_channels, 3, bias=True) self.loss_seg = build_loss(loss_seg) self.loss_part = build_loss(loss_part) def forward(self, x): """Forward pass. Args: x (torch.Tensor): Features from the first stage. Returns: dict: Part features, segmentation and part predictions. - seg_preds (torch.Tensor): Segment predictions. - part_preds (torch.Tensor): Part predictions. - part_feats (torch.Tensor): Feature predictions. """ seg_preds = self.seg_cls_layer(x) # (N, 1) part_preds = self.seg_reg_layer(x) # (N, 3) seg_scores = torch.sigmoid(seg_preds).detach() seg_mask = (seg_scores > self.seg_score_thr) part_offsets = torch.sigmoid(part_preds).clone().detach() part_offsets[seg_mask.view(-1) == 0] = 0 part_feats = torch.cat((part_offsets, seg_scores), dim=-1) # shape (npoints, 4) return dict( seg_preds=seg_preds, part_preds=part_preds, part_feats=part_feats) def get_targets_single(self, voxel_centers, gt_bboxes_3d, gt_labels_3d): """generate segmentation and part prediction targets for a single sample. 
Args: voxel_centers (torch.Tensor): The center of voxels in shape (voxel_num, 3). gt_bboxes_3d (:obj:`BaseInstance3DBoxes`): Ground truth boxes in shape (box_num, 7). gt_labels_3d (torch.Tensor): Class labels of ground truths in shape (box_num). Returns: tuple[torch.Tensor]: Segmentation targets with shape [voxel_num] part prediction targets with shape [voxel_num, 3] """ gt_bboxes_3d = gt_bboxes_3d.to(voxel_centers.device) enlarged_gt_boxes = gt_bboxes_3d.enlarged_box(self.extra_width) part_targets = voxel_centers.new_zeros((voxel_centers.shape[0], 3), dtype=torch.float32) box_idx = gt_bboxes_3d.points_in_boxes_part(voxel_centers) enlarge_box_idx = enlarged_gt_boxes.points_in_boxes_part( voxel_centers).long() gt_labels_pad = F.pad( gt_labels_3d, (1, 0), mode='constant', value=self.num_classes) seg_targets = gt_labels_pad[(box_idx.long() + 1)] fg_pt_flag = box_idx > -1 ignore_flag = fg_pt_flag ^ (enlarge_box_idx > -1) seg_targets[ignore_flag] = -1 for k in range(len(gt_bboxes_3d)): k_box_flag = box_idx == k # no point in current box (caused by velodyne reduce) if not k_box_flag.any(): continue fg_voxels = voxel_centers[k_box_flag] transformed_voxels = fg_voxels - gt_bboxes_3d.bottom_center[k] transformed_voxels = rotation_3d_in_axis( transformed_voxels.unsqueeze(0), -gt_bboxes_3d.yaw[k].view(1), axis=2) part_targets[k_box_flag] = transformed_voxels / gt_bboxes_3d.dims[ k] + voxel_centers.new_tensor([0.5, 0.5, 0]) part_targets = torch.clamp(part_targets, min=0) return seg_targets, part_targets def get_targets(self, voxels_dict, gt_bboxes_3d, gt_labels_3d): """generate segmentation and part prediction targets. Args: voxel_centers (torch.Tensor): The center of voxels in shape (voxel_num, 3). gt_bboxes_3d (:obj:`BaseInstance3DBoxes`): Ground truth boxes in shape (box_num, 7). gt_labels_3d (torch.Tensor): Class labels of ground truths in shape (box_num). Returns: dict: Prediction targets - seg_targets (torch.Tensor): Segmentation targets with shape [voxel_num]. - part_targets (torch.Tensor): Part prediction targets with shape [voxel_num, 3]. """ batch_size = len(gt_labels_3d) voxel_center_list = [] for idx in range(batch_size): coords_idx = voxels_dict['coors'][:, 0] == idx voxel_center_list.append(voxels_dict['voxel_centers'][coords_idx]) seg_targets, part_targets = multi_apply(self.get_targets_single, voxel_center_list, gt_bboxes_3d, gt_labels_3d) seg_targets = torch.cat(seg_targets, dim=0) part_targets = torch.cat(part_targets, dim=0) return dict(seg_targets=seg_targets, part_targets=part_targets) def loss(self, semantic_results, semantic_targets): """Calculate point-wise segmentation and part prediction losses. Args: semantic_results (dict): Results from semantic head. - seg_preds: Segmentation predictions. - part_preds: Part predictions. semantic_targets (dict): Targets of semantic results. - seg_preds: Segmentation targets. - part_preds: Part targets. Returns: dict: Loss of segmentation and part prediction. - loss_seg (torch.Tensor): Segmentation prediction loss. - loss_part (torch.Tensor): Part prediction loss. 
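Example:
    A tiny numeric sketch of how the per-point segmentation weights are
    built below (``num_classes=3`` and the target values are made-up
    assumptions; -1 marks ignored points, ``num_classes`` marks
    background)::

        import torch

        num_classes = 3
        seg_targets = torch.tensor([-1, 0, 3, 2])

        pos_mask = (seg_targets > -1) & (seg_targets < num_classes)
        binary_seg_target = pos_mask.long()          # tensor([0, 1, 0, 1])
        pos = pos_mask.float()
        neg = (seg_targets == num_classes).float()
        seg_weights = (pos + neg) / torch.clamp(pos.sum(), min=1.0)
        # ignored point gets weight 0, fg/bg points get 1 / num_fg = 0.5
        # seg_weights -> tensor([0.0000, 0.5000, 0.5000, 0.5000])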
""" seg_preds = semantic_results['seg_preds'] part_preds = semantic_results['part_preds'] seg_targets = semantic_targets['seg_targets'] part_targets = semantic_targets['part_targets'] pos_mask = (seg_targets > -1) & (seg_targets < self.num_classes) binary_seg_target = pos_mask.long() pos = pos_mask.float() neg = (seg_targets == self.num_classes).float() seg_weights = pos + neg pos_normalizer = pos.sum() seg_weights = seg_weights / torch.clamp(pos_normalizer, min=1.0) loss_seg = self.loss_seg(seg_preds, binary_seg_target, seg_weights) if pos_normalizer > 0: loss_part = self.loss_part(part_preds[pos_mask], part_targets[pos_mask]) else: # fake a part loss loss_part = loss_seg.new_tensor(0) return dict(loss_seg=loss_seg, loss_part=loss_part) ================================================ FILE: mmdet3d/models/roi_heads/mask_heads/primitive_head.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import torch from mmcv.cnn import ConvModule from mmcv.ops import furthest_point_sample from mmcv.runner import BaseModule from torch import nn as nn from torch.nn import functional as F from mmdet3d.models.builder import HEADS, build_loss from mmdet3d.models.model_utils import VoteModule from mmdet3d.ops import build_sa_module from mmdet.core import multi_apply @HEADS.register_module() class PrimitiveHead(BaseModule): r"""Primitive head of `H3DNet `_. Args: num_dims (int): The dimension of primitive semantic information. num_classes (int): The number of class. primitive_mode (str): The mode of primitive module, available mode ['z', 'xy', 'line']. bbox_coder (:obj:`BaseBBoxCoder`): Bbox coder for encoding and decoding boxes. train_cfg (dict): Config for training. test_cfg (dict): Config for testing. vote_module_cfg (dict): Config of VoteModule for point-wise votes. vote_aggregation_cfg (dict): Config of vote aggregation layer. feat_channels (tuple[int]): Convolution channels of prediction layer. upper_thresh (float): Threshold for line matching. surface_thresh (float): Threshold for surface matching. conv_cfg (dict): Config of convolution in prediction layer. norm_cfg (dict): Config of BN in prediction layer. objectness_loss (dict): Config of objectness loss. center_loss (dict): Config of center loss. semantic_loss (dict): Config of point-wise semantic segmentation loss. """ def __init__(self, num_dims, num_classes, primitive_mode, train_cfg=None, test_cfg=None, vote_module_cfg=None, vote_aggregation_cfg=None, feat_channels=(128, 128), upper_thresh=100.0, surface_thresh=0.5, conv_cfg=dict(type='Conv1d'), norm_cfg=dict(type='BN1d'), objectness_loss=None, center_loss=None, semantic_reg_loss=None, semantic_cls_loss=None, init_cfg=None): super(PrimitiveHead, self).__init__(init_cfg=init_cfg) assert primitive_mode in ['z', 'xy', 'line'] # The dimension of primitive semantic information. 
self.num_dims = num_dims self.num_classes = num_classes self.primitive_mode = primitive_mode self.train_cfg = train_cfg self.test_cfg = test_cfg self.gt_per_seed = vote_module_cfg['gt_per_seed'] self.num_proposal = vote_aggregation_cfg['num_point'] self.upper_thresh = upper_thresh self.surface_thresh = surface_thresh self.objectness_loss = build_loss(objectness_loss) self.center_loss = build_loss(center_loss) self.semantic_reg_loss = build_loss(semantic_reg_loss) self.semantic_cls_loss = build_loss(semantic_cls_loss) assert vote_aggregation_cfg['mlp_channels'][0] == vote_module_cfg[ 'in_channels'] # Primitive existence flag prediction self.flag_conv = ConvModule( vote_module_cfg['conv_channels'][-1], vote_module_cfg['conv_channels'][-1] // 2, 1, padding=0, conv_cfg=conv_cfg, norm_cfg=norm_cfg, bias=True, inplace=True) self.flag_pred = torch.nn.Conv1d( vote_module_cfg['conv_channels'][-1] // 2, 2, 1) self.vote_module = VoteModule(**vote_module_cfg) self.vote_aggregation = build_sa_module(vote_aggregation_cfg) prev_channel = vote_aggregation_cfg['mlp_channels'][-1] conv_pred_list = list() for k in range(len(feat_channels)): conv_pred_list.append( ConvModule( prev_channel, feat_channels[k], 1, padding=0, conv_cfg=conv_cfg, norm_cfg=norm_cfg, bias=True, inplace=True)) prev_channel = feat_channels[k] self.conv_pred = nn.Sequential(*conv_pred_list) conv_out_channel = 3 + num_dims + num_classes self.conv_pred.add_module('conv_out', nn.Conv1d(prev_channel, conv_out_channel, 1)) def forward(self, feats_dict, sample_mod): """Forward pass. Args: feats_dict (dict): Feature dict from backbone. sample_mod (str): Sample mode for vote aggregation layer. valid modes are "vote", "seed" and "random". Returns: dict: Predictions of primitive head. """ assert sample_mod in ['vote', 'seed', 'random'] seed_points = feats_dict['fp_xyz_net0'][-1] seed_features = feats_dict['hd_feature'] results = {} primitive_flag = self.flag_conv(seed_features) primitive_flag = self.flag_pred(primitive_flag) results['pred_flag_' + self.primitive_mode] = primitive_flag # 1. generate vote_points from seed_points vote_points, vote_features, _ = self.vote_module( seed_points, seed_features) results['vote_' + self.primitive_mode] = vote_points results['vote_features_' + self.primitive_mode] = vote_features # 2. aggregate vote_points if sample_mod == 'vote': # use fps in vote_aggregation sample_indices = None elif sample_mod == 'seed': # FPS on seed and choose the votes corresponding to the seeds sample_indices = furthest_point_sample(seed_points, self.num_proposal) elif sample_mod == 'random': # Random sampling from the votes batch_size, num_seed = seed_points.shape[:2] sample_indices = torch.randint( 0, num_seed, (batch_size, self.num_proposal), dtype=torch.int32, device=seed_points.device) else: raise NotImplementedError('Unsupported sample mod!') vote_aggregation_ret = self.vote_aggregation(vote_points, vote_features, sample_indices) aggregated_points, features, aggregated_indices = vote_aggregation_ret results['aggregated_points_' + self.primitive_mode] = aggregated_points results['aggregated_features_' + self.primitive_mode] = features results['aggregated_indices_' + self.primitive_mode] = aggregated_indices # 3. predict primitive offsets and semantic information predictions = self.conv_pred(features) # 4. 
decode predictions decode_ret = self.primitive_decode_scores(predictions, aggregated_points) results.update(decode_ret) center, pred_ind = self.get_primitive_center( primitive_flag, decode_ret['center_' + self.primitive_mode]) results['pred_' + self.primitive_mode + '_ind'] = pred_ind results['pred_' + self.primitive_mode + '_center'] = center return results def loss(self, bbox_preds, points, gt_bboxes_3d, gt_labels_3d, pts_semantic_mask=None, pts_instance_mask=None, img_metas=None, gt_bboxes_ignore=None): """Compute loss. Args: bbox_preds (dict): Predictions from forward of primitive head. points (list[torch.Tensor]): Input points. gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth bboxes of each sample. gt_labels_3d (list[torch.Tensor]): Labels of each sample. pts_semantic_mask (list[torch.Tensor]): Point-wise semantic mask. pts_instance_mask (list[torch.Tensor]): Point-wise instance mask. img_metas (list[dict]): Contain pcd and img's meta info. gt_bboxes_ignore (list[torch.Tensor]): Specify which bounding. Returns: dict: Losses of Primitive Head. """ targets = self.get_targets(points, gt_bboxes_3d, gt_labels_3d, pts_semantic_mask, pts_instance_mask, bbox_preds) (point_mask, point_offset, gt_primitive_center, gt_primitive_semantic, gt_sem_cls_label, gt_primitive_mask) = targets losses = {} # Compute the loss of primitive existence flag pred_flag = bbox_preds['pred_flag_' + self.primitive_mode] flag_loss = self.objectness_loss(pred_flag, gt_primitive_mask.long()) losses['flag_loss_' + self.primitive_mode] = flag_loss # calculate vote loss vote_loss = self.vote_module.get_loss( bbox_preds['seed_points'], bbox_preds['vote_' + self.primitive_mode], bbox_preds['seed_indices'], point_mask, point_offset) losses['vote_loss_' + self.primitive_mode] = vote_loss num_proposal = bbox_preds['aggregated_points_' + self.primitive_mode].shape[1] primitive_center = bbox_preds['center_' + self.primitive_mode] if self.primitive_mode != 'line': primitive_semantic = bbox_preds['size_residuals_' + self.primitive_mode].contiguous() else: primitive_semantic = None semancitc_scores = bbox_preds['sem_cls_scores_' + self.primitive_mode].transpose(2, 1) gt_primitive_mask = gt_primitive_mask / \ (gt_primitive_mask.sum() + 1e-6) center_loss, size_loss, sem_cls_loss = self.compute_primitive_loss( primitive_center, primitive_semantic, semancitc_scores, num_proposal, gt_primitive_center, gt_primitive_semantic, gt_sem_cls_label, gt_primitive_mask) losses['center_loss_' + self.primitive_mode] = center_loss losses['size_loss_' + self.primitive_mode] = size_loss losses['sem_loss_' + self.primitive_mode] = sem_cls_loss return losses def get_targets(self, points, gt_bboxes_3d, gt_labels_3d, pts_semantic_mask=None, pts_instance_mask=None, bbox_preds=None): """Generate targets of primitive head. Args: points (list[torch.Tensor]): Points of each batch. gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth bboxes of each batch. gt_labels_3d (list[torch.Tensor]): Labels of each batch. pts_semantic_mask (list[torch.Tensor]): Point-wise semantic label of each batch. pts_instance_mask (list[torch.Tensor]): Point-wise instance label of each batch. bbox_preds (dict): Predictions from forward of primitive head. Returns: tuple[torch.Tensor]: Targets of primitive head. 
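Example:
    A minimal sketch of the gather pattern used below to pick per-seed
    targets out of per-point targets (shapes and values are illustrative
    assumptions)::

        import torch

        batch_size, num_points, num_seed = 2, 6, 3
        point_offset = torch.randn(batch_size, num_points, 3)
        seed_inds = torch.randint(0, num_points, (batch_size, num_seed))

        inds_expand = seed_inds.view(batch_size, num_seed, 1).repeat(1, 1, 3)
        seed_gt_votes = torch.gather(point_offset, 1, inds_expand)
        # seed_gt_votes[b, s] == point_offset[b, seed_inds[b, s]]
        assert seed_gt_votes.shape == (batch_size, num_seed, 3)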
""" for index in range(len(gt_labels_3d)): if len(gt_labels_3d[index]) == 0: fake_box = gt_bboxes_3d[index].tensor.new_zeros( 1, gt_bboxes_3d[index].tensor.shape[-1]) gt_bboxes_3d[index] = gt_bboxes_3d[index].new_box(fake_box) gt_labels_3d[index] = gt_labels_3d[index].new_zeros(1) if pts_semantic_mask is None: pts_semantic_mask = [None for i in range(len(gt_labels_3d))] pts_instance_mask = [None for i in range(len(gt_labels_3d))] (point_mask, point_sem, point_offset) = multi_apply(self.get_targets_single, points, gt_bboxes_3d, gt_labels_3d, pts_semantic_mask, pts_instance_mask) point_mask = torch.stack(point_mask) point_sem = torch.stack(point_sem) point_offset = torch.stack(point_offset) batch_size = point_mask.shape[0] num_proposal = bbox_preds['aggregated_points_' + self.primitive_mode].shape[1] num_seed = bbox_preds['seed_points'].shape[1] seed_inds = bbox_preds['seed_indices'].long() seed_inds_expand = seed_inds.view(batch_size, num_seed, 1).repeat(1, 1, 3) seed_gt_votes = torch.gather(point_offset, 1, seed_inds_expand) seed_gt_votes += bbox_preds['seed_points'] gt_primitive_center = seed_gt_votes.view(batch_size * num_proposal, 1, 3) seed_inds_expand_sem = seed_inds.view(batch_size, num_seed, 1).repeat( 1, 1, 4 + self.num_dims) seed_gt_sem = torch.gather(point_sem, 1, seed_inds_expand_sem) gt_primitive_semantic = seed_gt_sem[:, :, 3:3 + self.num_dims].view( batch_size * num_proposal, 1, self.num_dims).contiguous() gt_sem_cls_label = seed_gt_sem[:, :, -1].long() gt_votes_mask = torch.gather(point_mask, 1, seed_inds) return (point_mask, point_offset, gt_primitive_center, gt_primitive_semantic, gt_sem_cls_label, gt_votes_mask) def get_targets_single(self, points, gt_bboxes_3d, gt_labels_3d, pts_semantic_mask=None, pts_instance_mask=None): """Generate targets of primitive head for single batch. Args: points (torch.Tensor): Points of each batch. gt_bboxes_3d (:obj:`BaseInstance3DBoxes`): Ground truth boxes of each batch. gt_labels_3d (torch.Tensor): Labels of each batch. pts_semantic_mask (torch.Tensor): Point-wise semantic label of each batch. pts_instance_mask (torch.Tensor): Point-wise instance label of each batch. Returns: tuple[torch.Tensor]: Targets of primitive head. 
""" gt_bboxes_3d = gt_bboxes_3d.to(points.device) num_points = points.shape[0] point_mask = points.new_zeros(num_points) # Offset to the primitive center point_offset = points.new_zeros([num_points, 3]) # Semantic information of primitive center point_sem = points.new_zeros([num_points, 3 + self.num_dims + 1]) # Generate pts_semantic_mask and pts_instance_mask when they are None if pts_semantic_mask is None or pts_instance_mask is None: points2box_mask = gt_bboxes_3d.points_in_boxes_all(points) assignment = points2box_mask.argmax(1) background_mask = points2box_mask.max(1)[0] == 0 if pts_semantic_mask is None: pts_semantic_mask = gt_labels_3d[assignment] pts_semantic_mask[background_mask] = self.num_classes if pts_instance_mask is None: pts_instance_mask = assignment pts_instance_mask[background_mask] = gt_labels_3d.shape[0] instance_flag = torch.nonzero( pts_semantic_mask != self.num_classes, as_tuple=False).squeeze(1) instance_labels = pts_instance_mask[instance_flag].unique() with_yaw = gt_bboxes_3d.with_yaw for i, i_instance in enumerate(instance_labels): indices = instance_flag[pts_instance_mask[instance_flag] == i_instance] coords = points[indices, :3] cur_cls_label = pts_semantic_mask[indices][0] # Bbox Corners cur_corners = gt_bboxes_3d.corners[i] plane_lower_temp = points.new_tensor( [0, 0, 1, -cur_corners[7, -1]]) upper_points = cur_corners[[1, 2, 5, 6]] refined_distance = (upper_points * plane_lower_temp[:3]).sum(dim=1) if self.check_horizon(upper_points) and \ plane_lower_temp[0] + plane_lower_temp[1] < \ self.train_cfg['lower_thresh']: plane_lower = points.new_tensor( [0, 0, 1, plane_lower_temp[-1]]) plane_upper = points.new_tensor( [0, 0, 1, -torch.mean(refined_distance)]) else: raise NotImplementedError('Only horizontal plane is support!') if self.check_dist(plane_upper, upper_points) is False: raise NotImplementedError( 'Mean distance to plane should be lower than thresh!') # Get the boundary points here point2plane_dist, selected = self.match_point2plane( plane_lower, coords) # Get bottom four lines if self.primitive_mode == 'line': point2line_matching = self.match_point2line( coords[selected], cur_corners, with_yaw, mode='bottom') point_mask, point_offset, point_sem = \ self._assign_primitive_line_targets(point_mask, point_offset, point_sem, coords[selected], indices[selected], cur_cls_label, point2line_matching, cur_corners, [1, 1, 0, 0], with_yaw, mode='bottom') # Set the surface labels here if self.primitive_mode == 'z' and \ selected.sum() > self.train_cfg['num_point'] and \ point2plane_dist[selected].var() < \ self.train_cfg['var_thresh']: point_mask, point_offset, point_sem = \ self._assign_primitive_surface_targets(point_mask, point_offset, point_sem, coords[selected], indices[selected], cur_cls_label, cur_corners, with_yaw, mode='bottom') # Get the boundary points here point2plane_dist, selected = self.match_point2plane( plane_upper, coords) # Get top four lines if self.primitive_mode == 'line': point2line_matching = self.match_point2line( coords[selected], cur_corners, with_yaw, mode='top') point_mask, point_offset, point_sem = \ self._assign_primitive_line_targets(point_mask, point_offset, point_sem, coords[selected], indices[selected], cur_cls_label, point2line_matching, cur_corners, [1, 1, 0, 0], with_yaw, mode='top') if self.primitive_mode == 'z' and \ selected.sum() > self.train_cfg['num_point'] and \ point2plane_dist[selected].var() < \ self.train_cfg['var_thresh']: point_mask, point_offset, point_sem = \ self._assign_primitive_surface_targets(point_mask, 
point_offset, point_sem, coords[selected], indices[selected], cur_cls_label, cur_corners, with_yaw, mode='top') # Get left two lines plane_left_temp = self._get_plane_fomulation( cur_corners[2] - cur_corners[3], cur_corners[3] - cur_corners[0], cur_corners[0]) right_points = cur_corners[[4, 5, 7, 6]] plane_left_temp /= torch.norm(plane_left_temp[:3]) refined_distance = (right_points * plane_left_temp[:3]).sum(dim=1) if plane_left_temp[2] < self.train_cfg['lower_thresh']: plane_left = plane_left_temp plane_right = points.new_tensor([ plane_left_temp[0], plane_left_temp[1], plane_left_temp[2], -refined_distance.mean() ]) else: raise NotImplementedError( 'Normal vector of the plane should be horizontal!') # Get the boundary points here point2plane_dist, selected = self.match_point2plane( plane_left, coords) # Get left four lines if self.primitive_mode == 'line': point2line_matching = self.match_point2line( coords[selected], cur_corners, with_yaw, mode='left') point_mask, point_offset, point_sem = \ self._assign_primitive_line_targets( point_mask, point_offset, point_sem, coords[selected], indices[selected], cur_cls_label, point2line_matching[2:], cur_corners, [2, 2], with_yaw, mode='left') if self.primitive_mode == 'xy' and \ selected.sum() > self.train_cfg['num_point'] and \ point2plane_dist[selected].var() < \ self.train_cfg['var_thresh']: point_mask, point_offset, point_sem = \ self._assign_primitive_surface_targets( point_mask, point_offset, point_sem, coords[selected], indices[selected], cur_cls_label, cur_corners, with_yaw, mode='left') # Get the boundary points here point2plane_dist, selected = self.match_point2plane( plane_right, coords) # Get right four lines if self.primitive_mode == 'line': point2line_matching = self.match_point2line( coords[selected], cur_corners, with_yaw, mode='right') point_mask, point_offset, point_sem = \ self._assign_primitive_line_targets( point_mask, point_offset, point_sem, coords[selected], indices[selected], cur_cls_label, point2line_matching[2:], cur_corners, [2, 2], with_yaw, mode='right') if self.primitive_mode == 'xy' and \ selected.sum() > self.train_cfg['num_point'] and \ point2plane_dist[selected].var() < \ self.train_cfg['var_thresh']: point_mask, point_offset, point_sem = \ self._assign_primitive_surface_targets( point_mask, point_offset, point_sem, coords[selected], indices[selected], cur_cls_label, cur_corners, with_yaw, mode='right') plane_front_temp = self._get_plane_fomulation( cur_corners[0] - cur_corners[4], cur_corners[4] - cur_corners[5], cur_corners[5]) back_points = cur_corners[[3, 2, 7, 6]] plane_front_temp /= torch.norm(plane_front_temp[:3]) refined_distance = (back_points * plane_front_temp[:3]).sum(dim=1) if plane_front_temp[2] < self.train_cfg['lower_thresh']: plane_front = plane_front_temp plane_back = points.new_tensor([ plane_front_temp[0], plane_front_temp[1], plane_front_temp[2], -torch.mean(refined_distance) ]) else: raise NotImplementedError( 'Normal vector of the plane should be horizontal!') # Get the boundary points here point2plane_dist, selected = self.match_point2plane( plane_front, coords) if self.primitive_mode == 'xy' and \ selected.sum() > self.train_cfg['num_point'] and \ (point2plane_dist[selected]).var() < \ self.train_cfg['var_thresh']: point_mask, point_offset, point_sem = \ self._assign_primitive_surface_targets( point_mask, point_offset, point_sem, coords[selected], indices[selected], cur_cls_label, cur_corners, with_yaw, mode='front') # Get the boundary points here point2plane_dist, selected = 
self.match_point2plane( plane_back, coords) if self.primitive_mode == 'xy' and \ selected.sum() > self.train_cfg['num_point'] and \ point2plane_dist[selected].var() < \ self.train_cfg['var_thresh']: point_mask, point_offset, point_sem = \ self._assign_primitive_surface_targets( point_mask, point_offset, point_sem, coords[selected], indices[selected], cur_cls_label, cur_corners, with_yaw, mode='back') return (point_mask, point_sem, point_offset) def primitive_decode_scores(self, predictions, aggregated_points): """Decode predicted parts to primitive head. Args: predictions (torch.Tensor): primitive pridictions of each batch. aggregated_points (torch.Tensor): The aggregated points of vote stage. Returns: Dict: Predictions of primitive head, including center, semantic size and semantic scores. """ ret_dict = {} pred_transposed = predictions.transpose(2, 1) center = aggregated_points + pred_transposed[:, :, 0:3] ret_dict['center_' + self.primitive_mode] = center if self.primitive_mode in ['z', 'xy']: ret_dict['size_residuals_' + self.primitive_mode] = \ pred_transposed[:, :, 3:3 + self.num_dims] ret_dict['sem_cls_scores_' + self.primitive_mode] = \ pred_transposed[:, :, 3 + self.num_dims:] return ret_dict def check_horizon(self, points): """Check whether is a horizontal plane. Args: points (torch.Tensor): Points of input. Returns: Bool: Flag of result. """ return (points[0][-1] == points[1][-1]) and \ (points[1][-1] == points[2][-1]) and \ (points[2][-1] == points[3][-1]) def check_dist(self, plane_equ, points): """Whether the mean of points to plane distance is lower than thresh. Args: plane_equ (torch.Tensor): Plane to be checked. points (torch.Tensor): Points to be checked. Returns: Tuple: Flag of result. """ return (points[:, 2] + plane_equ[-1]).sum() / 4.0 < self.train_cfg['lower_thresh'] def point2line_dist(self, points, pts_a, pts_b): """Calculate the distance from point to line. Args: points (torch.Tensor): Points of input. pts_a (torch.Tensor): Point on the specific line. pts_b (torch.Tensor): Point on the specific line. Returns: torch.Tensor: Distance between each point to line. """ line_a2b = pts_b - pts_a line_a2pts = points - pts_a length = (line_a2pts * line_a2b.view(1, 3)).sum(1) / \ line_a2b.norm() dist = (line_a2pts.norm(dim=1)**2 - length**2).sqrt() return dist def match_point2line(self, points, corners, with_yaw, mode='bottom'): """Match points to corresponding line. Args: points (torch.Tensor): Points of input. corners (torch.Tensor): Eight corners of a bounding box. with_yaw (Bool): Whether the boundind box is with rotation. mode (str, optional): Specify which line should be matched, available mode are ('bottom', 'top', 'left', 'right'). Defaults to 'bottom'. Returns: Tuple: Flag of matching correspondence. 
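Example:
    A tiny numeric sketch of the axis-aligned (``with_yaw=False``) branch
    below, which simply thresholds the distance to each vertical box face
    in x/y (corner values and ``line_thresh`` are made-up, and only the
    min/max corners are spelled out)::

        import torch

        corners = torch.tensor([[0., 0., 0.], [2., 1., 1.]])
        points = torch.tensor([[0.05, 0.5, 0.], [1.0, 0.5, 0.]])
        line_thresh = 0.1

        xmin, ymin, _ = corners.min(0)[0]
        near_xmin = torch.abs(points[:, 0] - xmin) < line_thresh
        # near_xmin -> tensor([ True, False])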
""" if with_yaw: corners_pair = { 'bottom': [[0, 3], [4, 7], [0, 4], [3, 7]], 'top': [[1, 2], [5, 6], [1, 5], [2, 6]], 'left': [[0, 1], [3, 2], [0, 1], [3, 2]], 'right': [[4, 5], [7, 6], [4, 5], [7, 6]] } selected_list = [] for pair_index in corners_pair[mode]: selected = self.point2line_dist( points, corners[pair_index[0]], corners[pair_index[1]]) \ < self.train_cfg['line_thresh'] selected_list.append(selected) else: xmin, ymin, _ = corners.min(0)[0] xmax, ymax, _ = corners.max(0)[0] sel1 = torch.abs(points[:, 0] - xmin) < self.train_cfg['line_thresh'] sel2 = torch.abs(points[:, 0] - xmax) < self.train_cfg['line_thresh'] sel3 = torch.abs(points[:, 1] - ymin) < self.train_cfg['line_thresh'] sel4 = torch.abs(points[:, 1] - ymax) < self.train_cfg['line_thresh'] selected_list = [sel1, sel2, sel3, sel4] return selected_list def match_point2plane(self, plane, points): """Match points to plane. Args: plane (torch.Tensor): Equation of the plane. points (torch.Tensor): Points of input. Returns: Tuple: Distance of each point to the plane and flag of matching correspondence. """ point2plane_dist = torch.abs((points * plane[:3]).sum(dim=1) + plane[-1]) min_dist = point2plane_dist.min() selected = torch.abs(point2plane_dist - min_dist) < self.train_cfg['dist_thresh'] return point2plane_dist, selected def compute_primitive_loss(self, primitive_center, primitive_semantic, semantic_scores, num_proposal, gt_primitive_center, gt_primitive_semantic, gt_sem_cls_label, gt_primitive_mask): """Compute loss of primitive module. Args: primitive_center (torch.Tensor): Pridictions of primitive center. primitive_semantic (torch.Tensor): Pridictions of primitive semantic. semantic_scores (torch.Tensor): Pridictions of primitive semantic scores. num_proposal (int): The number of primitive proposal. gt_primitive_center (torch.Tensor): Ground truth of primitive center. gt_votes_sem (torch.Tensor): Ground truth of primitive semantic. gt_sem_cls_label (torch.Tensor): Ground truth of primitive semantic class. gt_primitive_mask (torch.Tensor): Ground truth of primitive mask. Returns: Tuple: Loss of primitive module. """ batch_size = primitive_center.shape[0] vote_xyz_reshape = primitive_center.view(batch_size * num_proposal, -1, 3) center_loss = self.center_loss( vote_xyz_reshape, gt_primitive_center, dst_weight=gt_primitive_mask.view(batch_size * num_proposal, 1))[1] if self.primitive_mode != 'line': size_xyz_reshape = primitive_semantic.view( batch_size * num_proposal, -1, self.num_dims).contiguous() size_loss = self.semantic_reg_loss( size_xyz_reshape, gt_primitive_semantic, dst_weight=gt_primitive_mask.view(batch_size * num_proposal, 1))[1] else: size_loss = center_loss.new_tensor(0.0) # Semantic cls loss sem_cls_loss = self.semantic_cls_loss( semantic_scores, gt_sem_cls_label, weight=gt_primitive_mask) return center_loss, size_loss, sem_cls_loss def get_primitive_center(self, pred_flag, center): """Generate primitive center from predictions. Args: pred_flag (torch.Tensor): Scores of primitive center. center (torch.Tensor): Pridictions of primitive center. Returns: Tuple: Primitive center and the prediction indices. 
""" ind_normal = F.softmax(pred_flag, dim=1) pred_indices = (ind_normal[:, 1, :] > self.surface_thresh).detach().float() selected = (ind_normal[:, 1, :] <= self.surface_thresh).detach().float() offset = torch.ones_like(center) * self.upper_thresh center = center + offset * selected.unsqueeze(-1) return center, pred_indices def _assign_primitive_line_targets(self, point_mask, point_offset, point_sem, coords, indices, cls_label, point2line_matching, corners, center_axises, with_yaw, mode='bottom'): """Generate targets of line primitive. Args: point_mask (torch.Tensor): Tensor to store the ground truth of mask. point_offset (torch.Tensor): Tensor to store the ground truth of offset. point_sem (torch.Tensor): Tensor to store the ground truth of semantic. coords (torch.Tensor): The selected points. indices (torch.Tensor): Indices of the selected points. cls_label (int): Class label of the ground truth bounding box. point2line_matching (torch.Tensor): Flag indicate that matching line of each point. corners (torch.Tensor): Corners of the ground truth bounding box. center_axises (list[int]): Indicate in which axis the line center should be refined. with_yaw (Bool): Whether the boundind box is with rotation. mode (str, optional): Specify which line should be matched, available mode are ('bottom', 'top', 'left', 'right'). Defaults to 'bottom'. Returns: Tuple: Targets of the line primitive. """ corners_pair = { 'bottom': [[0, 3], [4, 7], [0, 4], [3, 7]], 'top': [[1, 2], [5, 6], [1, 5], [2, 6]], 'left': [[0, 1], [3, 2]], 'right': [[4, 5], [7, 6]] } corners_pair = corners_pair[mode] assert len(corners_pair) == len(point2line_matching) == len( center_axises) for line_select, center_axis, pair_index in zip( point2line_matching, center_axises, corners_pair): if line_select.sum() > self.train_cfg['num_point_line']: point_mask[indices[line_select]] = 1.0 if with_yaw: line_center = (corners[pair_index[0]] + corners[pair_index[1]]) / 2 else: line_center = coords[line_select].mean(dim=0) line_center[center_axis] = corners[:, center_axis].mean() point_offset[indices[line_select]] = \ line_center - coords[line_select] point_sem[indices[line_select]] = \ point_sem.new_tensor([line_center[0], line_center[1], line_center[2], cls_label]) return point_mask, point_offset, point_sem def _assign_primitive_surface_targets(self, point_mask, point_offset, point_sem, coords, indices, cls_label, corners, with_yaw, mode='bottom'): """Generate targets for primitive z and primitive xy. Args: point_mask (torch.Tensor): Tensor to store the ground truth of mask. point_offset (torch.Tensor): Tensor to store the ground truth of offset. point_sem (torch.Tensor): Tensor to store the ground truth of semantic. coords (torch.Tensor): The selected points. indices (torch.Tensor): Indices of the selected points. cls_label (int): Class label of the ground truth bounding box. corners (torch.Tensor): Corners of the ground truth bounding box. with_yaw (Bool): Whether the boundind box is with rotation. mode (str, optional): Specify which line should be matched, available mode are ('bottom', 'top', 'left', 'right', 'front', 'back'). Defaults to 'bottom'. Returns: Tuple: Targets of the center primitive. 
""" point_mask[indices] = 1.0 corners_pair = { 'bottom': [0, 7], 'top': [1, 6], 'left': [0, 1], 'right': [4, 5], 'front': [0, 1], 'back': [3, 2] } pair_index = corners_pair[mode] if self.primitive_mode == 'z': if with_yaw: center = (corners[pair_index[0]] + corners[pair_index[1]]) / 2.0 center[2] = coords[:, 2].mean() point_sem[indices] = point_sem.new_tensor([ center[0], center[1], center[2], (corners[4] - corners[0]).norm(), (corners[3] - corners[0]).norm(), cls_label ]) else: center = point_mask.new_tensor([ corners[:, 0].mean(), corners[:, 1].mean(), coords[:, 2].mean() ]) point_sem[indices] = point_sem.new_tensor([ center[0], center[1], center[2], corners[:, 0].max() - corners[:, 0].min(), corners[:, 1].max() - corners[:, 1].min(), cls_label ]) elif self.primitive_mode == 'xy': if with_yaw: center = coords.mean(0) center[2] = (corners[pair_index[0], 2] + corners[pair_index[1], 2]) / 2.0 point_sem[indices] = point_sem.new_tensor([ center[0], center[1], center[2], corners[pair_index[1], 2] - corners[pair_index[0], 2], cls_label ]) else: center = point_mask.new_tensor([ coords[:, 0].mean(), coords[:, 1].mean(), corners[:, 2].mean() ]) point_sem[indices] = point_sem.new_tensor([ center[0], center[1], center[2], corners[:, 2].max() - corners[:, 2].min(), cls_label ]) point_offset[indices] = center - coords return point_mask, point_offset, point_sem def _get_plane_fomulation(self, vector1, vector2, point): """Compute the equation of the plane. Args: vector1 (torch.Tensor): Parallel vector of the plane. vector2 (torch.Tensor): Parallel vector of the plane. point (torch.Tensor): Point on the plane. Returns: torch.Tensor: Equation of the plane. """ surface_norm = torch.cross(vector1, vector2) surface_dis = -torch.dot(surface_norm, point) plane = point.new_tensor( [surface_norm[0], surface_norm[1], surface_norm[2], surface_dis]) return plane ================================================ FILE: mmdet3d/models/roi_heads/part_aggregation_roi_head.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import warnings from torch.nn import functional as F from mmdet3d.core import AssignResult from mmdet3d.core.bbox import bbox3d2result, bbox3d2roi from mmdet.core import build_assigner, build_sampler from ..builder import HEADS, build_head, build_roi_extractor from .base_3droi_head import Base3DRoIHead @HEADS.register_module() class PartAggregationROIHead(Base3DRoIHead): """Part aggregation roi head for PartA2. Args: semantic_head (ConfigDict): Config of semantic head. num_classes (int): The number of classes. seg_roi_extractor (ConfigDict): Config of seg_roi_extractor. part_roi_extractor (ConfigDict): Config of part_roi_extractor. bbox_head (ConfigDict): Config of bbox_head. train_cfg (ConfigDict): Training config. test_cfg (ConfigDict): Testing config. 
""" def __init__(self, semantic_head, num_classes=3, seg_roi_extractor=None, part_roi_extractor=None, bbox_head=None, train_cfg=None, test_cfg=None, pretrained=None, init_cfg=None): super(PartAggregationROIHead, self).__init__( bbox_head=bbox_head, train_cfg=train_cfg, test_cfg=test_cfg, init_cfg=init_cfg) self.num_classes = num_classes assert semantic_head is not None self.semantic_head = build_head(semantic_head) if seg_roi_extractor is not None: self.seg_roi_extractor = build_roi_extractor(seg_roi_extractor) if part_roi_extractor is not None: self.part_roi_extractor = build_roi_extractor(part_roi_extractor) self.init_assigner_sampler() assert not (init_cfg and pretrained), \ 'init_cfg and pretrained cannot be setting at the same time' if isinstance(pretrained, str): warnings.warn('DeprecationWarning: pretrained is a deprecated, ' 'please use "init_cfg" instead') self.init_cfg = dict(type='Pretrained', checkpoint=pretrained) def init_mask_head(self): """Initialize mask head, skip since ``PartAggregationROIHead`` does not have one.""" pass def init_bbox_head(self, bbox_head): """Initialize box head.""" self.bbox_head = build_head(bbox_head) def init_assigner_sampler(self): """Initialize assigner and sampler.""" self.bbox_assigner = None self.bbox_sampler = None if self.train_cfg: if isinstance(self.train_cfg.assigner, dict): self.bbox_assigner = build_assigner(self.train_cfg.assigner) elif isinstance(self.train_cfg.assigner, list): self.bbox_assigner = [ build_assigner(res) for res in self.train_cfg.assigner ] self.bbox_sampler = build_sampler(self.train_cfg.sampler) @property def with_semantic(self): """bool: whether the head has semantic branch""" return hasattr(self, 'semantic_head') and self.semantic_head is not None def forward_train(self, feats_dict, voxels_dict, img_metas, proposal_list, gt_bboxes_3d, gt_labels_3d): """Training forward function of PartAggregationROIHead. Args: feats_dict (dict): Contains features from the first stage. voxels_dict (dict): Contains information of voxels. img_metas (list[dict]): Meta info of each image. proposal_list (list[dict]): Proposal information from rpn. The dictionary should contain the following keys: - boxes_3d (:obj:`BaseInstance3DBoxes`): Proposal bboxes - labels_3d (torch.Tensor): Labels of proposals - cls_preds (torch.Tensor): Original scores of proposals gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): GT bboxes of each sample. The bboxes are encapsulated by 3D box structures. gt_labels_3d (list[LongTensor]): GT labels of each sample. Returns: dict: losses from each head. - loss_semantic (torch.Tensor): loss of semantic head - loss_bbox (torch.Tensor): loss of bboxes """ losses = dict() if self.with_semantic: semantic_results = self._semantic_forward_train( feats_dict['seg_features'], voxels_dict, gt_bboxes_3d, gt_labels_3d) losses.update(semantic_results['loss_semantic']) sample_results = self._assign_and_sample(proposal_list, gt_bboxes_3d, gt_labels_3d) if self.with_bbox: bbox_results = self._bbox_forward_train( feats_dict['seg_features'], semantic_results['part_feats'], voxels_dict, sample_results) losses.update(bbox_results['loss_bbox']) return losses def simple_test(self, feats_dict, voxels_dict, img_metas, proposal_list, **kwargs): """Simple testing forward function of PartAggregationROIHead. Note: This function assumes that the batch size is 1 Args: feats_dict (dict): Contains features from the first stage. voxels_dict (dict): Contains information of voxels. img_metas (list[dict]): Meta info of each image. 
proposal_list (list[dict]): Proposal information from rpn. Returns: dict: Bbox results of one frame. """ assert self.with_bbox, 'Bbox head must be implemented.' assert self.with_semantic semantic_results = self.semantic_head(feats_dict['seg_features']) rois = bbox3d2roi([res['boxes_3d'].tensor for res in proposal_list]) labels_3d = [res['labels_3d'] for res in proposal_list] cls_preds = [res['cls_preds'] for res in proposal_list] bbox_results = self._bbox_forward(feats_dict['seg_features'], semantic_results['part_feats'], voxels_dict, rois) bbox_list = self.bbox_head.get_bboxes( rois, bbox_results['cls_score'], bbox_results['bbox_pred'], labels_3d, cls_preds, img_metas, cfg=self.test_cfg) bbox_results = [ bbox3d2result(bboxes, scores, labels) for bboxes, scores, labels in bbox_list ] return bbox_results def _bbox_forward_train(self, seg_feats, part_feats, voxels_dict, sampling_results): """Forward training function of roi_extractor and bbox_head. Args: seg_feats (torch.Tensor): Point-wise semantic features. part_feats (torch.Tensor): Point-wise part prediction features. voxels_dict (dict): Contains information of voxels. sampling_results (:obj:`SamplingResult`): Sampled results used for training. Returns: dict: Forward results including losses and predictions. """ rois = bbox3d2roi([res.bboxes for res in sampling_results]) bbox_results = self._bbox_forward(seg_feats, part_feats, voxels_dict, rois) bbox_targets = self.bbox_head.get_targets(sampling_results, self.train_cfg) loss_bbox = self.bbox_head.loss(bbox_results['cls_score'], bbox_results['bbox_pred'], rois, *bbox_targets) bbox_results.update(loss_bbox=loss_bbox) return bbox_results def _bbox_forward(self, seg_feats, part_feats, voxels_dict, rois): """Forward function of roi_extractor and bbox_head used in both training and testing. Args: seg_feats (torch.Tensor): Point-wise semantic features. part_feats (torch.Tensor): Point-wise part prediction features. voxels_dict (dict): Contains information of voxels. rois (Tensor): Roi boxes. Returns: dict: Contains predictions of bbox_head and features of roi_extractor. """ pooled_seg_feats = self.seg_roi_extractor(seg_feats, voxels_dict['voxel_centers'], voxels_dict['coors'][..., 0], rois) pooled_part_feats = self.part_roi_extractor( part_feats, voxels_dict['voxel_centers'], voxels_dict['coors'][..., 0], rois) cls_score, bbox_pred = self.bbox_head(pooled_seg_feats, pooled_part_feats) bbox_results = dict( cls_score=cls_score, bbox_pred=bbox_pred, pooled_seg_feats=pooled_seg_feats, pooled_part_feats=pooled_part_feats) return bbox_results def _assign_and_sample(self, proposal_list, gt_bboxes_3d, gt_labels_3d): """Assign and sample proposals for training. Args: proposal_list (list[dict]): Proposals produced by RPN. gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth boxes. gt_labels_3d (list[torch.Tensor]): Ground truth labels Returns: list[:obj:`SamplingResult`]: Sampled results of each training sample. 
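Example:
    A worked numeric sketch of the per-class index remapping performed
    below, which converts assigner results computed on a single-class
    subset of GT boxes back to batch-wide GT indices (all values are
    made-up)::

        import torch
        from torch.nn import functional as F

        cur_gt_labels = torch.tensor([0, 1, 0, 1])   # 4 GT boxes in the frame
        gt_per_cls = cur_gt_labels == 1              # assigning class 1 only

        # per-class assigner output: -1 ignore, 0 bg, >0 fg (1-based within
        # the class subset)
        cls_gt_inds = torch.tensor([-1, 0, 1, 2])

        pad_inds = gt_per_cls.nonzero(as_tuple=False).view(-1) + 1  # tensor([2, 4])
        pad_inds = F.pad(pad_inds, (1, 0), mode='constant', value=0)
        pad_inds = F.pad(pad_inds, (1, 0), mode='constant', value=-1)
        pad_inds = pad_inds + 1                                      # tensor([0, 1, 3, 5])
        global_gt_inds = pad_inds[cls_gt_inds + 1] - 1
        # global_gt_inds -> tensor([-1,  0,  2,  4]); 2 and 4 are the 1-based
        # batch-wide indices of the two class-1 GT boxes (positions 1 and 3)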
""" sampling_results = [] # bbox assign for batch_idx in range(len(proposal_list)): cur_proposal_list = proposal_list[batch_idx] cur_boxes = cur_proposal_list['boxes_3d'] cur_labels_3d = cur_proposal_list['labels_3d'] cur_gt_bboxes = gt_bboxes_3d[batch_idx].to(cur_boxes.device) cur_gt_labels = gt_labels_3d[batch_idx] batch_num_gts = 0 # 0 is bg batch_gt_indis = cur_gt_labels.new_full((len(cur_boxes), ), 0) batch_max_overlaps = cur_boxes.tensor.new_zeros(len(cur_boxes)) # -1 is bg batch_gt_labels = cur_gt_labels.new_full((len(cur_boxes), ), -1) # each class may have its own assigner if isinstance(self.bbox_assigner, list): for i, assigner in enumerate(self.bbox_assigner): gt_per_cls = (cur_gt_labels == i) pred_per_cls = (cur_labels_3d == i) cur_assign_res = assigner.assign( cur_boxes.tensor[pred_per_cls], cur_gt_bboxes.tensor[gt_per_cls], gt_labels=cur_gt_labels[gt_per_cls]) # gather assign_results in different class into one result batch_num_gts += cur_assign_res.num_gts # gt inds (1-based) gt_inds_arange_pad = gt_per_cls.nonzero( as_tuple=False).view(-1) + 1 # pad 0 for indice unassigned gt_inds_arange_pad = F.pad( gt_inds_arange_pad, (1, 0), mode='constant', value=0) # pad -1 for indice ignore gt_inds_arange_pad = F.pad( gt_inds_arange_pad, (1, 0), mode='constant', value=-1) # convert to 0~gt_num+2 for indices gt_inds_arange_pad += 1 # now 0 is bg, >1 is fg in batch_gt_indis batch_gt_indis[pred_per_cls] = gt_inds_arange_pad[ cur_assign_res.gt_inds + 1] - 1 batch_max_overlaps[ pred_per_cls] = cur_assign_res.max_overlaps batch_gt_labels[pred_per_cls] = cur_assign_res.labels assign_result = AssignResult(batch_num_gts, batch_gt_indis, batch_max_overlaps, batch_gt_labels) else: # for single class assign_result = self.bbox_assigner.assign( cur_boxes.tensor, cur_gt_bboxes.tensor, gt_labels=cur_gt_labels) # sample boxes sampling_result = self.bbox_sampler.sample(assign_result, cur_boxes.tensor, cur_gt_bboxes.tensor, cur_gt_labels) sampling_results.append(sampling_result) return sampling_results def _semantic_forward_train(self, x, voxels_dict, gt_bboxes_3d, gt_labels_3d): """Train semantic head. Args: x (torch.Tensor): Point-wise semantic features for segmentation voxels_dict (dict): Contains information of voxels. gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth boxes. gt_labels_3d (list[torch.Tensor]): Ground truth labels Returns: dict: Segmentation results including losses """ semantic_results = self.semantic_head(x) semantic_targets = self.semantic_head.get_targets( voxels_dict, gt_bboxes_3d, gt_labels_3d) loss_semantic = self.semantic_head.loss(semantic_results, semantic_targets) semantic_results.update(loss_semantic=loss_semantic) return semantic_results ================================================ FILE: mmdet3d/models/roi_heads/point_rcnn_roi_head.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import torch from torch.nn import functional as F from mmdet3d.core import AssignResult from mmdet3d.core.bbox import bbox3d2result, bbox3d2roi from mmdet.core import build_assigner, build_sampler from ..builder import HEADS, build_head, build_roi_extractor from .base_3droi_head import Base3DRoIHead @HEADS.register_module() class PointRCNNRoIHead(Base3DRoIHead): """RoI head for PointRCNN. Args: bbox_head (dict): Config of bbox_head. point_roi_extractor (dict): Config of RoI extractor. train_cfg (dict): Train configs. test_cfg (dict): Test configs. depth_normalizer (float, optional): Normalize depth feature. Defaults to 70.0. 
init_cfg (dict, optional): Config of initialization. Defaults to None. """ def __init__(self, bbox_head, point_roi_extractor, train_cfg, test_cfg, depth_normalizer=70.0, pretrained=None, init_cfg=None): super(PointRCNNRoIHead, self).__init__( bbox_head=bbox_head, train_cfg=train_cfg, test_cfg=test_cfg, pretrained=pretrained, init_cfg=init_cfg) self.depth_normalizer = depth_normalizer if point_roi_extractor is not None: self.point_roi_extractor = build_roi_extractor(point_roi_extractor) self.init_assigner_sampler() def init_bbox_head(self, bbox_head): """Initialize box head. Args: bbox_head (dict): Config dict of RoI Head. """ self.bbox_head = build_head(bbox_head) def init_mask_head(self): """Initialize maek head.""" pass def init_assigner_sampler(self): """Initialize assigner and sampler.""" self.bbox_assigner = None self.bbox_sampler = None if self.train_cfg: if isinstance(self.train_cfg.assigner, dict): self.bbox_assigner = build_assigner(self.train_cfg.assigner) elif isinstance(self.train_cfg.assigner, list): self.bbox_assigner = [ build_assigner(res) for res in self.train_cfg.assigner ] self.bbox_sampler = build_sampler(self.train_cfg.sampler) def forward_train(self, feats_dict, input_metas, proposal_list, gt_bboxes_3d, gt_labels_3d): """Training forward function of PointRCNNRoIHead. Args: feats_dict (dict): Contains features from the first stage. imput_metas (list[dict]): Meta info of each input. proposal_list (list[dict]): Proposal information from rpn. The dictionary should contain the following keys: - boxes_3d (:obj:`BaseInstance3DBoxes`): Proposal bboxes - labels_3d (torch.Tensor): Labels of proposals gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): GT bboxes of each sample. The bboxes are encapsulated by 3D box structures. gt_labels_3d (list[LongTensor]): GT labels of each sample. Returns: dict: Losses from RoI RCNN head. - loss_bbox (torch.Tensor): Loss of bboxes """ features = feats_dict['features'] points = feats_dict['points'] point_cls_preds = feats_dict['points_cls_preds'] sem_scores = point_cls_preds.sigmoid() point_scores = sem_scores.max(-1)[0] sample_results = self._assign_and_sample(proposal_list, gt_bboxes_3d, gt_labels_3d) # concat the depth, semantic features and backbone features features = features.transpose(1, 2).contiguous() point_depths = points.norm(dim=2) / self.depth_normalizer - 0.5 features_list = [ point_scores.unsqueeze(2), point_depths.unsqueeze(2), features ] features = torch.cat(features_list, dim=2) bbox_results = self._bbox_forward_train(features, points, sample_results) losses = dict() losses.update(bbox_results['loss_bbox']) return losses def simple_test(self, feats_dict, img_metas, proposal_list, **kwargs): """Simple testing forward function of PointRCNNRoIHead. Note: This function assumes that the batch size is 1 Args: feats_dict (dict): Contains features from the first stage. img_metas (list[dict]): Meta info of each image. proposal_list (list[dict]): Proposal information from rpn. Returns: dict: Bbox results of one frame. 
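Example:
    A shape-level sketch of how the per-point feature vector is assembled
    below before RoI pooling (all shapes and values are illustrative
    assumptions)::

        import torch

        B, N, C = 1, 128, 64
        depth_normalizer = 70.0
        points = torch.randn(B, N, 3)
        features = torch.randn(B, C, N)              # backbone features
        point_cls_preds = torch.randn(B, N, 3)

        point_scores = point_cls_preds.sigmoid().max(-1)[0]          # (B, N)
        point_depths = points.norm(dim=2) / depth_normalizer - 0.5   # (B, N)
        features = features.transpose(1, 2).contiguous()             # (B, N, C)
        features = torch.cat(
            [point_scores.unsqueeze(2), point_depths.unsqueeze(2), features],
            dim=2)                                                    # (B, N, C + 2)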
""" rois = bbox3d2roi([res['boxes_3d'].tensor for res in proposal_list]) labels_3d = [res['labels_3d'] for res in proposal_list] features = feats_dict['features'] points = feats_dict['points'] point_cls_preds = feats_dict['points_cls_preds'] sem_scores = point_cls_preds.sigmoid() point_scores = sem_scores.max(-1)[0] features = features.transpose(1, 2).contiguous() point_depths = points.norm(dim=2) / self.depth_normalizer - 0.5 features_list = [ point_scores.unsqueeze(2), point_depths.unsqueeze(2), features ] features = torch.cat(features_list, dim=2) batch_size = features.shape[0] bbox_results = self._bbox_forward(features, points, batch_size, rois) object_score = bbox_results['cls_score'].sigmoid() bbox_list = self.bbox_head.get_bboxes( rois, object_score, bbox_results['bbox_pred'], labels_3d, img_metas, cfg=self.test_cfg) bbox_results = [ bbox3d2result(bboxes, scores, labels) for bboxes, scores, labels in bbox_list ] return bbox_results def _bbox_forward_train(self, features, points, sampling_results): """Forward training function of roi_extractor and bbox_head. Args: features (torch.Tensor): Backbone features with depth and \ semantic features. points (torch.Tensor): Pointcloud. sampling_results (:obj:`SamplingResult`): Sampled results used for training. Returns: dict: Forward results including losses and predictions. """ rois = bbox3d2roi([res.bboxes for res in sampling_results]) batch_size = features.shape[0] bbox_results = self._bbox_forward(features, points, batch_size, rois) bbox_targets = self.bbox_head.get_targets(sampling_results, self.train_cfg) loss_bbox = self.bbox_head.loss(bbox_results['cls_score'], bbox_results['bbox_pred'], rois, *bbox_targets) bbox_results.update(loss_bbox=loss_bbox) return bbox_results def _bbox_forward(self, features, points, batch_size, rois): """Forward function of roi_extractor and bbox_head used in both training and testing. Args: features (torch.Tensor): Backbone features with depth and semantic features. points (torch.Tensor): Pointcloud. batch_size (int): Batch size. rois (torch.Tensor): RoI boxes. Returns: dict: Contains predictions of bbox_head and features of roi_extractor. """ pooled_point_feats = self.point_roi_extractor(features, points, batch_size, rois) cls_score, bbox_pred = self.bbox_head(pooled_point_feats) bbox_results = dict(cls_score=cls_score, bbox_pred=bbox_pred) return bbox_results def _assign_and_sample(self, proposal_list, gt_bboxes_3d, gt_labels_3d): """Assign and sample proposals for training. Args: proposal_list (list[dict]): Proposals produced by RPN. gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth boxes. gt_labels_3d (list[torch.Tensor]): Ground truth labels Returns: list[:obj:`SamplingResult`]: Sampled results of each training sample. 
""" sampling_results = [] # bbox assign for batch_idx in range(len(proposal_list)): cur_proposal_list = proposal_list[batch_idx] cur_boxes = cur_proposal_list['boxes_3d'] cur_labels_3d = cur_proposal_list['labels_3d'] cur_gt_bboxes = gt_bboxes_3d[batch_idx].to(cur_boxes.device) cur_gt_labels = gt_labels_3d[batch_idx] batch_num_gts = 0 # 0 is bg batch_gt_indis = cur_gt_labels.new_full((len(cur_boxes), ), 0) batch_max_overlaps = cur_boxes.tensor.new_zeros(len(cur_boxes)) # -1 is bg batch_gt_labels = cur_gt_labels.new_full((len(cur_boxes), ), -1) # each class may have its own assigner if isinstance(self.bbox_assigner, list): for i, assigner in enumerate(self.bbox_assigner): gt_per_cls = (cur_gt_labels == i) pred_per_cls = (cur_labels_3d == i) cur_assign_res = assigner.assign( cur_boxes.tensor[pred_per_cls], cur_gt_bboxes.tensor[gt_per_cls], gt_labels=cur_gt_labels[gt_per_cls]) # gather assign_results in different class into one result batch_num_gts += cur_assign_res.num_gts # gt inds (1-based) gt_inds_arange_pad = gt_per_cls.nonzero( as_tuple=False).view(-1) + 1 # pad 0 for indice unassigned gt_inds_arange_pad = F.pad( gt_inds_arange_pad, (1, 0), mode='constant', value=0) # pad -1 for indice ignore gt_inds_arange_pad = F.pad( gt_inds_arange_pad, (1, 0), mode='constant', value=-1) # convert to 0~gt_num+2 for indices gt_inds_arange_pad += 1 # now 0 is bg, >1 is fg in batch_gt_indis batch_gt_indis[pred_per_cls] = gt_inds_arange_pad[ cur_assign_res.gt_inds + 1] - 1 batch_max_overlaps[ pred_per_cls] = cur_assign_res.max_overlaps batch_gt_labels[pred_per_cls] = cur_assign_res.labels assign_result = AssignResult(batch_num_gts, batch_gt_indis, batch_max_overlaps, batch_gt_labels) else: # for single class assign_result = self.bbox_assigner.assign( cur_boxes.tensor, cur_gt_bboxes.tensor, gt_labels=cur_gt_labels) # sample boxes sampling_result = self.bbox_sampler.sample(assign_result, cur_boxes.tensor, cur_gt_bboxes.tensor, cur_gt_labels) sampling_results.append(sampling_result) return sampling_results ================================================ FILE: mmdet3d/models/roi_heads/roi_extractors/__init__.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. from mmdet.models.roi_heads.roi_extractors import SingleRoIExtractor from .single_roiaware_extractor import Single3DRoIAwareExtractor from .single_roipoint_extractor import Single3DRoIPointExtractor __all__ = [ 'SingleRoIExtractor', 'Single3DRoIAwareExtractor', 'Single3DRoIPointExtractor' ] ================================================ FILE: mmdet3d/models/roi_heads/roi_extractors/single_roiaware_extractor.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import torch from mmcv import ops from mmcv.runner import BaseModule from mmdet3d.models.builder import ROI_EXTRACTORS @ROI_EXTRACTORS.register_module() class Single3DRoIAwareExtractor(BaseModule): """Point-wise roi-aware Extractor. Extract Point-wise roi features. Args: roi_layer (dict): The config of roi layer. 
""" def __init__(self, roi_layer=None, init_cfg=None): super(Single3DRoIAwareExtractor, self).__init__(init_cfg=init_cfg) self.roi_layer = self.build_roi_layers(roi_layer) def build_roi_layers(self, layer_cfg): """Build roi layers using `layer_cfg`""" cfg = layer_cfg.copy() layer_type = cfg.pop('type') assert hasattr(ops, layer_type) layer_cls = getattr(ops, layer_type) roi_layers = layer_cls(**cfg) return roi_layers def forward(self, feats, coordinate, batch_inds, rois): """Extract point-wise roi features. Args: feats (torch.FloatTensor): Point-wise features with shape (batch, npoints, channels) for pooling. coordinate (torch.FloatTensor): Coordinate of each point. batch_inds (torch.LongTensor): Indicate the batch of each point. rois (torch.FloatTensor): Roi boxes with batch indices. Returns: torch.FloatTensor: Pooled features """ pooled_roi_feats = [] for batch_idx in range(int(batch_inds.max()) + 1): roi_inds = (rois[..., 0].int() == batch_idx) coors_inds = (batch_inds.int() == batch_idx) pooled_roi_feat = self.roi_layer(rois[..., 1:][roi_inds], coordinate[coors_inds], feats[coors_inds]) pooled_roi_feats.append(pooled_roi_feat) pooled_roi_feats = torch.cat(pooled_roi_feats, 0) return pooled_roi_feats ================================================ FILE: mmdet3d/models/roi_heads/roi_extractors/single_roipoint_extractor.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import torch from mmcv import ops from torch import nn as nn from mmdet3d.core.bbox.structures import rotation_3d_in_axis from mmdet3d.models.builder import ROI_EXTRACTORS @ROI_EXTRACTORS.register_module() class Single3DRoIPointExtractor(nn.Module): """Point-wise roi-aware Extractor. Extract Point-wise roi features. Args: roi_layer (dict): The config of roi layer. """ def __init__(self, roi_layer=None): super(Single3DRoIPointExtractor, self).__init__() self.roi_layer = self.build_roi_layers(roi_layer) def build_roi_layers(self, layer_cfg): """Build roi layers using `layer_cfg`""" cfg = layer_cfg.copy() layer_type = cfg.pop('type') assert hasattr(ops, layer_type) layer_cls = getattr(ops, layer_type) roi_layers = layer_cls(**cfg) return roi_layers def forward(self, feats, coordinate, batch_inds, rois): """Extract point-wise roi features. Args: feats (torch.FloatTensor): Point-wise features with shape (batch, npoints, channels) for pooling. coordinate (torch.FloatTensor): Coordinate of each point. batch_inds (torch.LongTensor): Indicate the batch of each point. rois (torch.FloatTensor): Roi boxes with batch indices. Returns: torch.FloatTensor: Pooled features """ rois = rois[..., 1:] rois = rois.view(batch_inds, -1, rois.shape[-1]) with torch.no_grad(): pooled_roi_feat, pooled_empty_flag = self.roi_layer( coordinate, feats, rois) # canonical transformation roi_center = rois[:, :, 0:3] pooled_roi_feat[:, :, :, 0:3] -= roi_center.unsqueeze(dim=2) pooled_roi_feat = pooled_roi_feat.view(-1, pooled_roi_feat.shape[-2], pooled_roi_feat.shape[-1]) pooled_roi_feat[:, :, 0:3] = rotation_3d_in_axis( pooled_roi_feat[:, :, 0:3], -(rois.view(-1, rois.shape[-1])[:, 6]), axis=2) pooled_roi_feat[pooled_empty_flag.view(-1) > 0] = 0 return pooled_roi_feat ================================================ FILE: mmdet3d/models/segmentors/__init__.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. 
from .base import Base3DSegmentor from .encoder_decoder import EncoderDecoder3D __all__ = ['Base3DSegmentor', 'EncoderDecoder3D'] ================================================ FILE: mmdet3d/models/segmentors/base.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. from os import path as osp import mmcv import numpy as np import torch from mmcv.parallel import DataContainer as DC from mmcv.runner import auto_fp16 from mmdet3d.core import show_seg_result from mmseg.models.segmentors import BaseSegmentor class Base3DSegmentor(BaseSegmentor): """Base class for 3D segmentors. The main difference with `BaseSegmentor` is that we modify the keys in data_dict and use a 3D seg specific visualization function. """ @property def with_regularization_loss(self): """bool: whether the segmentor has regularization loss for weight""" return hasattr(self, 'loss_regularization') and \ self.loss_regularization is not None def forward_test(self, points, img_metas, **kwargs): """Calls either simple_test or aug_test depending on the length of outer list of points. If len(points) == 1, call simple_test. Otherwise call aug_test to aggregate the test results by e.g. voting. Args: points (list[list[torch.Tensor]]): the outer list indicates test-time augmentations and inner torch.Tensor should have a shape BXNxC, which contains all points in the batch. img_metas (list[list[dict]]): the outer list indicates test-time augs (multiscale, flip, etc.) and the inner list indicates images in a batch. """ for var, name in [(points, 'points'), (img_metas, 'img_metas')]: if not isinstance(var, list): raise TypeError(f'{name} must be a list, but got {type(var)}') num_augs = len(points) if num_augs != len(img_metas): raise ValueError(f'num of augmentations ({len(points)}) != ' f'num of image meta ({len(img_metas)})') if num_augs == 1: return self.simple_test(points[0], img_metas[0], **kwargs) else: return self.aug_test(points, img_metas, **kwargs) @auto_fp16(apply_to=('points')) def forward(self, return_loss=True, **kwargs): """Calls either forward_train or forward_test depending on whether return_loss=True. Note this setting will change the expected inputs. When `return_loss=True`, point and img_metas are single-nested (i.e. torch.Tensor and list[dict]), and when `resturn_loss=False`, point and img_metas should be double nested (i.e. list[torch.Tensor], list[list[dict]]), with the outer list indicating test time augmentations. """ if return_loss: return self.forward_train(**kwargs) else: return self.forward_test(**kwargs) def show_results(self, data, result, palette=None, out_dir=None, ignore_index=None, show=False, score_thr=None): """Results visualization. Args: data (list[dict]): Input points and the information of the sample. result (list[dict]): Prediction results. palette (list[list[int]]] | np.ndarray): The palette of segmentation map. If None is given, random palette will be generated. Default: None out_dir (str): Output directory of visualization result. ignore_index (int, optional): The label index to be ignored, e.g. unannotated points. If None is given, set to len(self.CLASSES). Defaults to None. show (bool, optional): Determines whether you are going to show result by open3d. Defaults to False. TODO: implement score_thr of Base3DSegmentor. score_thr (float, optional): Score threshold of bounding boxes. Default to None. Not implemented yet, but it is here for unification. """ assert out_dir is not None, 'Expect out_dir, got none.' 
if palette is None: if self.PALETTE is None: palette = np.random.randint( 0, 255, size=(len(self.CLASSES), 3)) else: palette = self.PALETTE palette = np.array(palette) for batch_id in range(len(result)): if isinstance(data['points'][0], DC): points = data['points'][0]._data[0][batch_id].numpy() elif mmcv.is_list_of(data['points'][0], torch.Tensor): points = data['points'][0][batch_id] else: ValueError(f"Unsupported data type {type(data['points'][0])} " f'for visualization!') if isinstance(data['img_metas'][0], DC): pts_filename = data['img_metas'][0]._data[0][batch_id][ 'pts_filename'] elif mmcv.is_list_of(data['img_metas'][0], dict): pts_filename = data['img_metas'][0][batch_id]['pts_filename'] else: ValueError( f"Unsupported data type {type(data['img_metas'][0])} " f'for visualization!') file_name = osp.split(pts_filename)[-1].split('.')[0] pred_sem_mask = result[batch_id]['semantic_mask'].cpu().numpy() show_seg_result( points, None, pred_sem_mask, out_dir, file_name, palette, ignore_index, show=show) ================================================ FILE: mmdet3d/models/segmentors/encoder_decoder.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import numpy as np import torch from torch import nn as nn from torch.nn import functional as F from mmseg.core import add_prefix from ..builder import (SEGMENTORS, build_backbone, build_head, build_loss, build_neck) from .base import Base3DSegmentor @SEGMENTORS.register_module() class EncoderDecoder3D(Base3DSegmentor): """3D Encoder Decoder segmentors. EncoderDecoder typically consists of backbone, decode_head, auxiliary_head. Note that auxiliary_head is only used for deep supervision during training, which could be thrown during inference. """ def __init__(self, backbone, decode_head, neck=None, auxiliary_head=None, loss_regularization=None, train_cfg=None, test_cfg=None, pretrained=None, init_cfg=None): super(EncoderDecoder3D, self).__init__(init_cfg=init_cfg) self.backbone = build_backbone(backbone) if neck is not None: self.neck = build_neck(neck) self._init_decode_head(decode_head) self._init_auxiliary_head(auxiliary_head) self._init_loss_regularization(loss_regularization) self.train_cfg = train_cfg self.test_cfg = test_cfg assert self.with_decode_head, \ '3D EncoderDecoder Segmentor should have a decode_head' def _init_decode_head(self, decode_head): """Initialize ``decode_head``""" self.decode_head = build_head(decode_head) self.num_classes = self.decode_head.num_classes def _init_auxiliary_head(self, auxiliary_head): """Initialize ``auxiliary_head``""" if auxiliary_head is not None: if isinstance(auxiliary_head, list): self.auxiliary_head = nn.ModuleList() for head_cfg in auxiliary_head: self.auxiliary_head.append(build_head(head_cfg)) else: self.auxiliary_head = build_head(auxiliary_head) def _init_loss_regularization(self, loss_regularization): """Initialize ``loss_regularization``""" if loss_regularization is not None: if isinstance(loss_regularization, list): self.loss_regularization = nn.ModuleList() for loss_cfg in loss_regularization: self.loss_regularization.append(build_loss(loss_cfg)) else: self.loss_regularization = build_loss(loss_regularization) def extract_feat(self, points): """Extract features from points.""" x = self.backbone(points) if self.with_neck: x = self.neck(x) return x def encode_decode(self, points, img_metas): """Encode points with backbone and decode into a semantic segmentation map of the same size as input. 
Args: points (torch.Tensor): Input points of shape [B, N, 3+C]. img_metas (list[dict]): Meta information of each sample. Returns: torch.Tensor: Segmentation logits of shape [B, num_classes, N]. """ x = self.extract_feat(points) out = self._decode_head_forward_test(x, img_metas) return out def _decode_head_forward_train(self, x, img_metas, pts_semantic_mask): """Run forward function and calculate loss for decode head in training.""" losses = dict() loss_decode = self.decode_head.forward_train(x, img_metas, pts_semantic_mask, self.train_cfg) losses.update(add_prefix(loss_decode, 'decode')) return losses def _decode_head_forward_test(self, x, img_metas): """Run forward function and calculate loss for decode head in inference.""" seg_logits = self.decode_head.forward_test(x, img_metas, self.test_cfg) return seg_logits def _auxiliary_head_forward_train(self, x, img_metas, pts_semantic_mask): """Run forward function and calculate loss for auxiliary head in training.""" losses = dict() if isinstance(self.auxiliary_head, nn.ModuleList): for idx, aux_head in enumerate(self.auxiliary_head): loss_aux = aux_head.forward_train(x, img_metas, pts_semantic_mask, self.train_cfg) losses.update(add_prefix(loss_aux, f'aux_{idx}')) else: loss_aux = self.auxiliary_head.forward_train( x, img_metas, pts_semantic_mask, self.train_cfg) losses.update(add_prefix(loss_aux, 'aux')) return losses def _loss_regularization_forward_train(self): """Calculate regularization loss for model weight in training.""" losses = dict() if isinstance(self.loss_regularization, nn.ModuleList): for idx, regularize_loss in enumerate(self.loss_regularization): loss_regularize = dict( loss_regularize=regularize_loss(self.modules())) losses.update(add_prefix(loss_regularize, f'regularize_{idx}')) else: loss_regularize = dict( loss_regularize=self.loss_regularization(self.modules())) losses.update(add_prefix(loss_regularize, 'regularize')) return losses def forward_dummy(self, points): """Dummy forward function.""" seg_logit = self.encode_decode(points, None) return seg_logit def forward_train(self, points, img_metas, pts_semantic_mask): """Forward function for training. Args: points (list[torch.Tensor]): List of points of shape [N, C]. img_metas (list): Image metas. pts_semantic_mask (list[torch.Tensor]): List of point-wise semantic labels of shape [N]. Returns: dict[str, Tensor]: Losses. """ points_cat = torch.stack(points) pts_semantic_mask_cat = torch.stack(pts_semantic_mask) # extract features using backbone x = self.extract_feat(points_cat) losses = dict() loss_decode = self._decode_head_forward_train(x, img_metas, pts_semantic_mask_cat) losses.update(loss_decode) if self.with_auxiliary_head: loss_aux = self._auxiliary_head_forward_train( x, img_metas, pts_semantic_mask_cat) losses.update(loss_aux) if self.with_regularization_loss: loss_regularize = self._loss_regularization_forward_train() losses.update(loss_regularize) return losses @staticmethod def _input_generation(coords, patch_center, coord_max, feats, use_normalized_coord=False): """Generating model input. Generate input by subtracting patch center and adding additional features. Currently support colors and normalized xyz as features. Args: coords (torch.Tensor): Sampled 3D point coordinate of shape [S, 3]. patch_center (torch.Tensor): Center coordinate of the patch. coord_max (torch.Tensor): Max coordinate of all 3D points. feats (torch.Tensor): Features of sampled points of shape [S, C]. 
use_normalized_coord (bool, optional): Whether to use normalized xyz as additional features. Defaults to False. Returns: torch.Tensor: The generated input data of shape [S, 3+C']. """ # subtract patch center, the z dimension is not centered centered_coords = coords.clone() centered_coords[:, 0] -= patch_center[0] centered_coords[:, 1] -= patch_center[1] # normalized coordinates as extra features if use_normalized_coord: normalized_coord = coords / coord_max feats = torch.cat([feats, normalized_coord], dim=1) points = torch.cat([centered_coords, feats], dim=1) return points def _sliding_patch_generation(self, points, num_points, block_size, sample_rate=0.5, use_normalized_coord=False, eps=1e-3): """Sampling points in a sliding window fashion. First sample patches to cover all the input points. Then sample points in each patch to batch points of a certain number. Args: points (torch.Tensor): Input points of shape [N, 3+C]. num_points (int): Number of points to be sampled in each patch. block_size (float, optional): Size of a patch to sample. sample_rate (float, optional): Stride used in sliding patch. Defaults to 0.5. use_normalized_coord (bool, optional): Whether to use normalized xyz as additional features. Defaults to False. eps (float, optional): A value added to patch boundary to guarantee points coverage. Defaults to 1e-3. Returns: np.ndarray | np.ndarray: - patch_points (torch.Tensor): Points of different patches of shape [K, N, 3+C]. - patch_idxs (torch.Tensor): Index of each point in `patch_points`, of shape [K, N]. """ device = points.device # we assume the first three dims are points' 3D coordinates # and the rest dims are their per-point features coords = points[:, :3] feats = points[:, 3:] coord_max = coords.max(0)[0] coord_min = coords.min(0)[0] stride = block_size * sample_rate num_grid_x = int( torch.ceil((coord_max[0] - coord_min[0] - block_size) / stride).item() + 1) num_grid_y = int( torch.ceil((coord_max[1] - coord_min[1] - block_size) / stride).item() + 1) patch_points, patch_idxs = [], [] for idx_y in range(num_grid_y): s_y = coord_min[1] + idx_y * stride e_y = torch.min(s_y + block_size, coord_max[1]) s_y = e_y - block_size for idx_x in range(num_grid_x): s_x = coord_min[0] + idx_x * stride e_x = torch.min(s_x + block_size, coord_max[0]) s_x = e_x - block_size # extract points within this patch cur_min = torch.tensor([s_x, s_y, coord_min[2]]).to(device) cur_max = torch.tensor([e_x, e_y, coord_max[2]]).to(device) cur_choice = ((coords >= cur_min - eps) & (coords <= cur_max + eps)).all(dim=1) if not cur_choice.any(): # no points in this patch continue # sample points in this patch to multiple batches cur_center = cur_min + block_size / 2.0 point_idxs = torch.nonzero(cur_choice, as_tuple=True)[0] num_batch = int(np.ceil(point_idxs.shape[0] / num_points)) point_size = int(num_batch * num_points) replace = point_size > 2 * point_idxs.shape[0] num_repeat = point_size - point_idxs.shape[0] if replace: # duplicate point_idxs_repeat = point_idxs[torch.randint( 0, point_idxs.shape[0], size=(num_repeat, )).to(device)] else: point_idxs_repeat = point_idxs[torch.randperm( point_idxs.shape[0])[:num_repeat]] choices = torch.cat([point_idxs, point_idxs_repeat], dim=0) choices = choices[torch.randperm(choices.shape[0])] # construct model input point_batches = self._input_generation( coords[choices], cur_center, coord_max, feats[choices], use_normalized_coord=use_normalized_coord) patch_points.append(point_batches) patch_idxs.append(choices) patch_points = torch.cat(patch_points, dim=0) 
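            # flatten all sampled patches into a single [K*num_points, 3+C']
            # tensor; patch_idxs (concatenated below) records the original
            # index of every sampled point for later logit aggregation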
patch_idxs = torch.cat(patch_idxs, dim=0) # make sure all points are sampled at least once assert torch.unique(patch_idxs).shape[0] == points.shape[0], \ 'some points are not sampled in sliding inference' return patch_points, patch_idxs def slide_inference(self, point, img_meta, rescale): """Inference by sliding-window with overlap. Args: point (torch.Tensor): Input points of shape [N, 3+C]. img_meta (dict): Meta information of input sample. rescale (bool): Whether transform to original number of points. Will be used for voxelization based segmentors. Returns: Tensor: The output segmentation map of shape [num_classes, N]. """ num_points = self.test_cfg.num_points block_size = self.test_cfg.block_size sample_rate = self.test_cfg.sample_rate use_normalized_coord = self.test_cfg.use_normalized_coord batch_size = self.test_cfg.batch_size * num_points # patch_points is of shape [K*N, 3+C], patch_idxs is of shape [K*N] patch_points, patch_idxs = self._sliding_patch_generation( point, num_points, block_size, sample_rate, use_normalized_coord) feats_dim = patch_points.shape[1] seg_logits = [] # save patch predictions for batch_idx in range(0, patch_points.shape[0], batch_size): batch_points = patch_points[batch_idx:batch_idx + batch_size] batch_points = batch_points.view(-1, num_points, feats_dim) # batch_seg_logit is of shape [B, num_classes, N] batch_seg_logit = self.encode_decode(batch_points, img_meta) batch_seg_logit = batch_seg_logit.transpose(1, 2).contiguous() seg_logits.append(batch_seg_logit.view(-1, self.num_classes)) # aggregate per-point logits by indexing sum and dividing count seg_logits = torch.cat(seg_logits, dim=0) # [K*N, num_classes] expand_patch_idxs = patch_idxs.unsqueeze(1).repeat(1, self.num_classes) preds = point.new_zeros((point.shape[0], self.num_classes)).\ scatter_add_(dim=0, index=expand_patch_idxs, src=seg_logits) count_mat = torch.bincount(patch_idxs) preds = preds / count_mat[:, None] # TODO: if rescale and voxelization segmentor return preds.transpose(0, 1) # to [num_classes, K*N] def whole_inference(self, points, img_metas, rescale): """Inference with full scene (one forward pass without sliding).""" seg_logit = self.encode_decode(points, img_metas) # TODO: if rescale and voxelization segmentor return seg_logit def inference(self, points, img_metas, rescale): """Inference with slide/whole style. Args: points (torch.Tensor): Input points of shape [B, N, 3+C]. img_metas (list[dict]): Meta information of each sample. rescale (bool): Whether transform to original number of points. Will be used for voxelization based segmentors. Returns: Tensor: The output segmentation map. """ assert self.test_cfg.mode in ['slide', 'whole'] if self.test_cfg.mode == 'slide': seg_logit = torch.stack([ self.slide_inference(point, img_meta, rescale) for point, img_meta in zip(points, img_metas) ], 0) else: seg_logit = self.whole_inference(points, img_metas, rescale) output = F.softmax(seg_logit, dim=1) return output def simple_test(self, points, img_metas, rescale=True): """Simple test with single scene. Args: points (list[torch.Tensor]): List of points of shape [N, 3+C]. img_metas (list[dict]): Meta information of each sample. rescale (bool): Whether transform to original number of points. Will be used for voxelization based segmentors. Defaults to True. Returns: list[dict]: The output prediction result with following keys: - semantic_mask (Tensor): Segmentation mask of shape [N]. 
""" # 3D segmentation requires per-point prediction, so it's impossible # to use down-sampling to get a batch of scenes with same num_points # therefore, we only support testing one scene every time seg_pred = [] for point, img_meta in zip(points, img_metas): seg_prob = self.inference(point.unsqueeze(0), [img_meta], rescale)[0] seg_map = seg_prob.argmax(0) # [N] # to cpu tensor for consistency with det3d seg_map = seg_map.cpu() seg_pred.append(seg_map) # warp in dict seg_pred = [dict(semantic_mask=seg_map) for seg_map in seg_pred] return seg_pred def aug_test(self, points, img_metas, rescale=True): """Test with augmentations. Args: points (list[torch.Tensor]): List of points of shape [B, N, 3+C]. img_metas (list[list[dict]]): Meta information of each sample. Outer list are different samples while inner is different augs. rescale (bool): Whether transform to original number of points. Will be used for voxelization based segmentors. Defaults to True. Returns: list[dict]: The output prediction result with following keys: - semantic_mask (Tensor): Segmentation mask of shape [N]. """ # in aug_test, one scene going through different augmentations could # have the same number of points and are stacked as a batch # to save memory, we get augmented seg logit inplace seg_pred = [] for point, img_meta in zip(points, img_metas): seg_prob = self.inference(point, img_meta, rescale) seg_prob = seg_prob.mean(0) # [num_classes, N] seg_map = seg_prob.argmax(0) # [N] # to cpu tensor for consistency with det3d seg_map = seg_map.cpu() seg_pred.append(seg_map) # warp in dict seg_pred = [dict(semantic_mask=seg_map) for seg_map in seg_pred] return seg_pred ================================================ FILE: mmdet3d/models/utils/__init__.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. from .clip_sigmoid import clip_sigmoid from .edge_indices import get_edge_indices from .gen_keypoints import get_keypoints from .handle_objs import filter_outside_objs, handle_proj_objs from .mlp import MLP __all__ = [ 'clip_sigmoid', 'MLP', 'get_edge_indices', 'filter_outside_objs', 'handle_proj_objs', 'get_keypoints' ] ================================================ FILE: mmdet3d/models/utils/clip_sigmoid.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import torch def clip_sigmoid(x, eps=1e-4): """Sigmoid function for input feature. Args: x (torch.Tensor): Input feature map with the shape of [B, N, H, W]. eps (float, optional): Lower bound of the range to be clamped to. Defaults to 1e-4. Returns: torch.Tensor: Feature map after sigmoid. """ y = torch.clamp(x.sigmoid_(), min=eps, max=1 - eps) return y ================================================ FILE: mmdet3d/models/utils/edge_indices.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import numpy as np import torch def get_edge_indices(img_metas, downsample_ratio, step=1, pad_mode='default', dtype=np.float32, device='cpu'): """Function to filter the objects label outside the image. The edge_indices are generated using numpy on cpu rather than on CUDA due to the latency issue. When batch size = 8, this function with numpy array is ~8 times faster than that with CUDA tensor (0.09s and 0.72s in 100 runs). Args: img_metas (list[dict]): Meta information of each image, e.g., image size, scaling factor, etc. 
downsample_ratio (int): Downsample ratio of output feature, step (int, optional): Step size used for generateing edge indices. Default: 1. pad_mode (str, optional): Padding mode during data pipeline. Default: 'default'. dtype (torch.dtype, optional): Dtype of edge indices tensor. Default: np.float32. device (str, optional): Device of edge indices tensor. Default: 'cpu'. Returns: list[Tensor]: Edge indices for each image in batch data. """ edge_indices_list = [] for i in range(len(img_metas)): img_shape = img_metas[i]['img_shape'] pad_shape = img_metas[i]['pad_shape'] h, w = img_shape[:2] pad_h, pad_w = pad_shape edge_indices = [] if pad_mode == 'default': x_min = 0 y_min = 0 x_max = (w - 1) // downsample_ratio y_max = (h - 1) // downsample_ratio elif pad_mode == 'center': x_min = np.ceil((pad_w - w) / 2 * downsample_ratio) y_min = np.ceil((pad_h - h) / 2 * downsample_ratio) x_max = x_min + w // downsample_ratio y_max = y_min + h // downsample_ratio else: raise NotImplementedError # left y = np.arange(y_min, y_max, step, dtype=dtype) x = np.ones(len(y)) * x_min edge_indices_edge = np.stack((x, y), axis=1) edge_indices.append(edge_indices_edge) # bottom x = np.arange(x_min, x_max, step, dtype=dtype) y = np.ones(len(x)) * y_max edge_indices_edge = np.stack((x, y), axis=1) edge_indices.append(edge_indices_edge) # right y = np.arange(y_max, y_min, -step, dtype=dtype) x = np.ones(len(y)) * x_max edge_indices_edge = np.stack((x, y), axis=1) edge_indices.append(edge_indices_edge) # top x = np.arange(x_max, x_min, -step, dtype=dtype) y = np.ones(len(x)) * y_min edge_indices_edge = np.stack((x, y), axis=1) edge_indices.append(edge_indices_edge) edge_indices = \ np.concatenate([index for index in edge_indices], axis=0) edge_indices = torch.from_numpy(edge_indices).to(device).long() edge_indices_list.append(edge_indices) return edge_indices_list ================================================ FILE: mmdet3d/models/utils/gen_keypoints.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import torch from mmdet3d.core.bbox import points_cam2img def get_keypoints(gt_bboxes_3d_list, centers2d_list, img_metas, use_local_coords=True): """Function to filter the objects label outside the image. Args: gt_bboxes_list (list[Tensor]): Ground truth bboxes of each image, shape (num_gt, 4). centers2d_list (list[Tensor]): Projected 3D centers onto 2D image, shape (num_gt, 2). img_metas (list[dict]): Meta information of each image, e.g., image size, scaling factor, etc. use_local_coords (bool, optional): Wheher to use local coordinates for keypoints. Default: True. Returns: tuple[list[Tensor]]: It contains two elements, the first is the keypoints for each projected 2D bbox in batch data. The second is the visible mask of depth calculated by keypoints. 
""" assert len(gt_bboxes_3d_list) == len(centers2d_list) bs = len(gt_bboxes_3d_list) keypoints2d_list = [] keypoints_depth_mask_list = [] for i in range(bs): gt_bboxes_3d = gt_bboxes_3d_list[i] centers2d = centers2d_list[i] img_shape = img_metas[i]['img_shape'] cam2img = img_metas[i]['cam2img'] h, w = img_shape[:2] # (N, 8, 3) corners3d = gt_bboxes_3d.corners top_centers3d = torch.mean(corners3d[:, [0, 1, 4, 5], :], dim=1) bot_centers3d = torch.mean(corners3d[:, [2, 3, 6, 7], :], dim=1) # (N, 2, 3) top_bot_centers3d = torch.stack((top_centers3d, bot_centers3d), dim=1) keypoints3d = torch.cat((corners3d, top_bot_centers3d), dim=1) # (N, 10, 2) keypoints2d = points_cam2img(keypoints3d, cam2img) # keypoints mask: keypoints must be inside # the image and in front of the camera keypoints_x_visible = (keypoints2d[..., 0] >= 0) & ( keypoints2d[..., 0] <= w - 1) keypoints_y_visible = (keypoints2d[..., 1] >= 0) & ( keypoints2d[..., 1] <= h - 1) keypoints_z_visible = (keypoints3d[..., -1] > 0) # (N, 1O) keypoints_visible = keypoints_x_visible & \ keypoints_y_visible & keypoints_z_visible # center, diag-02, diag-13 keypoints_depth_valid = torch.stack( (keypoints_visible[:, [8, 9]].all(dim=1), keypoints_visible[:, [0, 3, 5, 6]].all(dim=1), keypoints_visible[:, [1, 2, 4, 7]].all(dim=1)), dim=1) keypoints_visible = keypoints_visible.float() if use_local_coords: keypoints2d = torch.cat((keypoints2d - centers2d.unsqueeze(1), keypoints_visible.unsqueeze(-1)), dim=2) else: keypoints2d = torch.cat( (keypoints2d, keypoints_visible.unsqueeze(-1)), dim=2) keypoints2d_list.append(keypoints2d) keypoints_depth_mask_list.append(keypoints_depth_valid) return (keypoints2d_list, keypoints_depth_mask_list) ================================================ FILE: mmdet3d/models/utils/handle_objs.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import torch def filter_outside_objs(gt_bboxes_list, gt_labels_list, gt_bboxes_3d_list, gt_labels_3d_list, centers2d_list, img_metas): """Function to filter the objects label outside the image. Args: gt_bboxes_list (list[Tensor]): Ground truth bboxes of each image, each has shape (num_gt, 4). gt_labels_list (list[Tensor]): Ground truth labels of each box, each has shape (num_gt,). gt_bboxes_3d_list (list[Tensor]): 3D Ground truth bboxes of each image, each has shape (num_gt, bbox_code_size). gt_labels_3d_list (list[Tensor]): 3D Ground truth labels of each box, each has shape (num_gt,). centers2d_list (list[Tensor]): Projected 3D centers onto 2D image, each has shape (num_gt, 2). img_metas (list[dict]): Meta information of each image, e.g., image size, scaling factor, etc. """ bs = len(centers2d_list) for i in range(bs): centers2d = centers2d_list[i].clone() img_shape = img_metas[i]['img_shape'] keep_inds = (centers2d[:, 0] > 0) & \ (centers2d[:, 0] < img_shape[1]) & \ (centers2d[:, 1] > 0) & \ (centers2d[:, 1] < img_shape[0]) centers2d_list[i] = centers2d[keep_inds] gt_labels_list[i] = gt_labels_list[i][keep_inds] gt_bboxes_list[i] = gt_bboxes_list[i][keep_inds] gt_bboxes_3d_list[i].tensor = gt_bboxes_3d_list[i].tensor[keep_inds] gt_labels_3d_list[i] = gt_labels_3d_list[i][keep_inds] def get_centers2d_target(centers2d, centers, img_shape): """Function to get target centers2d. Args: centers2d (Tensor): Projected 3D centers onto 2D images. centers (Tensor): Centers of 2d gt bboxes. img_shape (tuple): Resized image shape. Returns: torch.Tensor: Projected 3D centers (centers2D) target. 
""" N = centers2d.shape[0] h, w = img_shape[:2] valid_intersects = centers2d.new_zeros((N, 2)) a = (centers[:, 1] - centers2d[:, 1]) / (centers[:, 0] - centers2d[:, 0]) b = centers[:, 1] - a * centers[:, 0] left_y = b right_y = (w - 1) * a + b top_x = -b / a bottom_x = (h - 1 - b) / a left_coors = torch.stack((left_y.new_zeros(N, ), left_y), dim=1) right_coors = torch.stack((right_y.new_full((N, ), w - 1), right_y), dim=1) top_coors = torch.stack((top_x, top_x.new_zeros(N, )), dim=1) bottom_coors = torch.stack((bottom_x, bottom_x.new_full((N, ), h - 1)), dim=1) intersects = torch.stack( [left_coors, right_coors, top_coors, bottom_coors], dim=1) intersects_x = intersects[:, :, 0] intersects_y = intersects[:, :, 1] inds = (intersects_x >= 0) & (intersects_x <= w - 1) & (intersects_y >= 0) & ( intersects_y <= h - 1) valid_intersects = intersects[inds].reshape(N, 2, 2) dist = torch.norm(valid_intersects - centers2d.unsqueeze(1), dim=2) min_idx = torch.argmin(dist, dim=1) min_idx = min_idx.unsqueeze(-1).unsqueeze(-1).expand(-1, -1, 2) centers2d_target = valid_intersects.gather(dim=1, index=min_idx).squeeze(1) return centers2d_target def handle_proj_objs(centers2d_list, gt_bboxes_list, img_metas): """Function to handle projected object centers2d, generate target centers2d. Args: gt_bboxes_list (list[Tensor]): Ground truth bboxes of each image, shape (num_gt, 4). centers2d_list (list[Tensor]): Projected 3D centers onto 2D image, shape (num_gt, 2). img_metas (list[dict]): Meta information of each image, e.g., image size, scaling factor, etc. Returns: tuple[list[Tensor]]: It contains three elements. The first is the target centers2d after handling the truncated objects. The second is the offsets between target centers2d and round int dtype centers2d,and the last is the truncation mask for each object in batch data. """ bs = len(centers2d_list) centers2d_target_list = [] trunc_mask_list = [] offsets2d_list = [] # for now, only pad mode that img is padded by right and # bottom side is supported. for i in range(bs): centers2d = centers2d_list[i] gt_bbox = gt_bboxes_list[i] img_shape = img_metas[i]['img_shape'] centers2d_target = centers2d.clone() inside_inds = (centers2d[:, 0] > 0) & \ (centers2d[:, 0] < img_shape[1]) & \ (centers2d[:, 1] > 0) & \ (centers2d[:, 1] < img_shape[0]) outside_inds = ~inside_inds # if there are outside objects if outside_inds.any(): centers = (gt_bbox[:, :2] + gt_bbox[:, 2:]) / 2 outside_centers2d = centers2d[outside_inds] match_centers = centers[outside_inds] target_outside_centers2d = get_centers2d_target( outside_centers2d, match_centers, img_shape) centers2d_target[outside_inds] = target_outside_centers2d offsets2d = centers2d - centers2d_target.round().int() trunc_mask = outside_inds centers2d_target_list.append(centers2d_target) trunc_mask_list.append(trunc_mask) offsets2d_list.append(offsets2d) return (centers2d_target_list, offsets2d_list, trunc_mask_list) ================================================ FILE: mmdet3d/models/utils/mlp.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. from mmcv.cnn import ConvModule from mmcv.runner import BaseModule from torch import nn as nn class MLP(BaseModule): """A simple MLP module. Pass features (B, C, N) through an MLP. Args: in_channels (int, optional): Number of channels of input features. Default: 18. conv_channels (tuple[int], optional): Out channels of the convolution. Default: (256, 256). conv_cfg (dict, optional): Config of convolution. Default: dict(type='Conv1d'). 
norm_cfg (dict, optional): Config of normalization. Default: dict(type='BN1d'). act_cfg (dict, optional): Config of activation. Default: dict(type='ReLU'). """ def __init__(self, in_channel=18, conv_channels=(256, 256), conv_cfg=dict(type='Conv1d'), norm_cfg=dict(type='BN1d'), act_cfg=dict(type='ReLU'), init_cfg=None): super().__init__(init_cfg=init_cfg) self.mlp = nn.Sequential() prev_channels = in_channel for i, conv_channel in enumerate(conv_channels): self.mlp.add_module( f'layer{i}', ConvModule( prev_channels, conv_channels[i], 1, padding=0, conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=act_cfg, bias=True, inplace=True)) prev_channels = conv_channels[i] def forward(self, img_features): return self.mlp(img_features) ================================================ FILE: mmdet3d/models/voxel_encoders/__init__.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. from .pillar_encoder import DynamicPillarFeatureNet, PillarFeatureNet from .voxel_encoder import DynamicSimpleVFE, DynamicVFE, HardSimpleVFE, HardVFE __all__ = [ 'PillarFeatureNet', 'DynamicPillarFeatureNet', 'HardVFE', 'DynamicVFE', 'HardSimpleVFE', 'DynamicSimpleVFE' ] ================================================ FILE: mmdet3d/models/voxel_encoders/pillar_encoder.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import torch from mmcv.cnn import build_norm_layer from mmcv.ops import DynamicScatter from mmcv.runner import force_fp32 from torch import nn from ..builder import VOXEL_ENCODERS from .utils import PFNLayer, get_paddings_indicator @VOXEL_ENCODERS.register_module() class PillarFeatureNet(nn.Module): """Pillar Feature Net. The network prepares the pillar features and performs forward pass through PFNLayers. Args: in_channels (int, optional): Number of input features, either x, y, z or x, y, z, r. Defaults to 4. feat_channels (tuple, optional): Number of features in each of the N PFNLayers. Defaults to (64, ). with_distance (bool, optional): Whether to include Euclidean distance to points. Defaults to False. with_cluster_center (bool, optional): [description]. Defaults to True. with_voxel_center (bool, optional): [description]. Defaults to True. voxel_size (tuple[float], optional): Size of voxels, only utilize x and y size. Defaults to (0.2, 0.2, 4). point_cloud_range (tuple[float], optional): Point cloud range, only utilizes x and y min. Defaults to (0, -40, -3, 70.4, 40, 1). norm_cfg ([type], optional): [description]. Defaults to dict(type='BN1d', eps=1e-3, momentum=0.01). mode (str, optional): The mode to gather point features. Options are 'max' or 'avg'. Defaults to 'max'. legacy (bool, optional): Whether to use the new behavior or the original behavior. Defaults to True. 
""" def __init__(self, in_channels=4, feat_channels=(64, ), with_distance=False, with_cluster_center=True, with_voxel_center=True, voxel_size=(0.2, 0.2, 4), point_cloud_range=(0, -40, -3, 70.4, 40, 1), norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.01), mode='max', legacy=True): super(PillarFeatureNet, self).__init__() assert len(feat_channels) > 0 self.legacy = legacy if with_cluster_center: in_channels += 3 if with_voxel_center: in_channels += 3 if with_distance: in_channels += 1 self._with_distance = with_distance self._with_cluster_center = with_cluster_center self._with_voxel_center = with_voxel_center self.fp16_enabled = False # Create PillarFeatureNet layers self.in_channels = in_channels feat_channels = [in_channels] + list(feat_channels) pfn_layers = [] for i in range(len(feat_channels) - 1): in_filters = feat_channels[i] out_filters = feat_channels[i + 1] if i < len(feat_channels) - 2: last_layer = False else: last_layer = True pfn_layers.append( PFNLayer( in_filters, out_filters, norm_cfg=norm_cfg, last_layer=last_layer, mode=mode)) self.pfn_layers = nn.ModuleList(pfn_layers) # Need pillar (voxel) size and x/y offset in order to calculate offset self.vx = voxel_size[0] self.vy = voxel_size[1] self.vz = voxel_size[2] self.x_offset = self.vx / 2 + point_cloud_range[0] self.y_offset = self.vy / 2 + point_cloud_range[1] self.z_offset = self.vz / 2 + point_cloud_range[2] self.point_cloud_range = point_cloud_range @force_fp32(out_fp16=True) def forward(self, features, num_points, coors): """Forward function. Args: features (torch.Tensor): Point features or raw points in shape (N, M, C). num_points (torch.Tensor): Number of points in each pillar. coors (torch.Tensor): Coordinates of each voxel. Returns: torch.Tensor: Features of pillars. """ features_ls = [features] # Find distance of x, y, and z from cluster center if self._with_cluster_center: points_mean = features[:, :, :3].sum( dim=1, keepdim=True) / num_points.type_as(features).view( -1, 1, 1) f_cluster = features[:, :, :3] - points_mean features_ls.append(f_cluster) # Find distance of x, y, and z from pillar center dtype = features.dtype if self._with_voxel_center: if not self.legacy: f_center = torch.zeros_like(features[:, :, :3]) f_center[:, :, 0] = features[:, :, 0] - ( coors[:, 3].to(dtype).unsqueeze(1) * self.vx + self.x_offset) f_center[:, :, 1] = features[:, :, 1] - ( coors[:, 2].to(dtype).unsqueeze(1) * self.vy + self.y_offset) f_center[:, :, 2] = features[:, :, 2] - ( coors[:, 1].to(dtype).unsqueeze(1) * self.vz + self.z_offset) else: f_center = features[:, :, :3] f_center[:, :, 0] = f_center[:, :, 0] - ( coors[:, 3].type_as(features).unsqueeze(1) * self.vx + self.x_offset) f_center[:, :, 1] = f_center[:, :, 1] - ( coors[:, 2].type_as(features).unsqueeze(1) * self.vy + self.y_offset) f_center[:, :, 2] = f_center[:, :, 2] - ( coors[:, 1].type_as(features).unsqueeze(1) * self.vz + self.z_offset) features_ls.append(f_center) if self._with_distance: points_dist = torch.norm(features[:, :, :3], 2, 2, keepdim=True) features_ls.append(points_dist) # Combine together feature decorations features = torch.cat(features_ls, dim=-1) # The feature decorations were calculated without regard to whether # pillar was empty. Need to ensure that # empty pillars remain set to zeros. 
voxel_count = features.shape[1] mask = get_paddings_indicator(num_points, voxel_count, axis=0) mask = torch.unsqueeze(mask, -1).type_as(features) features *= mask for pfn in self.pfn_layers: features = pfn(features, num_points) return features.squeeze(1) @VOXEL_ENCODERS.register_module() class DynamicPillarFeatureNet(PillarFeatureNet): """Pillar Feature Net using dynamic voxelization. The network prepares the pillar features and performs forward pass through PFNLayers. The main difference is that it is used for dynamic voxels, which contains different number of points inside a voxel without limits. Args: in_channels (int, optional): Number of input features, either x, y, z or x, y, z, r. Defaults to 4. feat_channels (tuple, optional): Number of features in each of the N PFNLayers. Defaults to (64, ). with_distance (bool, optional): Whether to include Euclidean distance to points. Defaults to False. with_cluster_center (bool, optional): [description]. Defaults to True. with_voxel_center (bool, optional): [description]. Defaults to True. voxel_size (tuple[float], optional): Size of voxels, only utilize x and y size. Defaults to (0.2, 0.2, 4). point_cloud_range (tuple[float], optional): Point cloud range, only utilizes x and y min. Defaults to (0, -40, -3, 70.4, 40, 1). norm_cfg ([type], optional): [description]. Defaults to dict(type='BN1d', eps=1e-3, momentum=0.01). mode (str, optional): The mode to gather point features. Options are 'max' or 'avg'. Defaults to 'max'. legacy (bool, optional): Whether to use the new behavior or the original behavior. Defaults to True. """ def __init__(self, in_channels=4, feat_channels=(64, ), with_distance=False, with_cluster_center=True, with_voxel_center=True, voxel_size=(0.2, 0.2, 4), point_cloud_range=(0, -40, -3, 70.4, 40, 1), norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.01), mode='max', legacy=True): super(DynamicPillarFeatureNet, self).__init__( in_channels, feat_channels, with_distance, with_cluster_center=with_cluster_center, with_voxel_center=with_voxel_center, voxel_size=voxel_size, point_cloud_range=point_cloud_range, norm_cfg=norm_cfg, mode=mode, legacy=legacy) self.fp16_enabled = False feat_channels = [self.in_channels] + list(feat_channels) pfn_layers = [] # TODO: currently only support one PFNLayer for i in range(len(feat_channels) - 1): in_filters = feat_channels[i] out_filters = feat_channels[i + 1] if i > 0: in_filters *= 2 norm_name, norm_layer = build_norm_layer(norm_cfg, out_filters) pfn_layers.append( nn.Sequential( nn.Linear(in_filters, out_filters, bias=False), norm_layer, nn.ReLU(inplace=True))) self.num_pfn = len(pfn_layers) self.pfn_layers = nn.ModuleList(pfn_layers) self.pfn_scatter = DynamicScatter(voxel_size, point_cloud_range, (mode != 'max')) self.cluster_scatter = DynamicScatter( voxel_size, point_cloud_range, average_points=True) def map_voxel_center_to_point(self, pts_coors, voxel_mean, voxel_coors): """Map the centers of voxels to its corresponding points. Args: pts_coors (torch.Tensor): The coordinates of each points, shape (M, 3), where M is the number of points. voxel_mean (torch.Tensor): The mean or aggregated features of a voxel, shape (N, C), where N is the number of voxels. voxel_coors (torch.Tensor): The coordinates of each voxel. Returns: torch.Tensor: Corresponding voxel centers of each points, shape (M, C), where M is the number of points. 
""" # Step 1: scatter voxel into canvas # Calculate necessary things for canvas creation canvas_y = int( (self.point_cloud_range[4] - self.point_cloud_range[1]) / self.vy) canvas_x = int( (self.point_cloud_range[3] - self.point_cloud_range[0]) / self.vx) canvas_channel = voxel_mean.size(1) batch_size = pts_coors[-1, 0] + 1 canvas_len = canvas_y * canvas_x * batch_size # Create the canvas for this sample canvas = voxel_mean.new_zeros(canvas_channel, canvas_len) # Only include non-empty pillars indices = ( voxel_coors[:, 0] * canvas_y * canvas_x + voxel_coors[:, 2] * canvas_x + voxel_coors[:, 3]) # Scatter the blob back to the canvas canvas[:, indices.long()] = voxel_mean.t() # Step 2: get voxel mean for each point voxel_index = ( pts_coors[:, 0] * canvas_y * canvas_x + pts_coors[:, 2] * canvas_x + pts_coors[:, 3]) center_per_point = canvas[:, voxel_index.long()].t() return center_per_point @force_fp32(out_fp16=True) def forward(self, features, coors): """Forward function. Args: features (torch.Tensor): Point features or raw points in shape (N, M, C). coors (torch.Tensor): Coordinates of each voxel Returns: torch.Tensor: Features of pillars. """ features_ls = [features] # Find distance of x, y, and z from cluster center if self._with_cluster_center: voxel_mean, mean_coors = self.cluster_scatter(features, coors) points_mean = self.map_voxel_center_to_point( coors, voxel_mean, mean_coors) # TODO: maybe also do cluster for reflectivity f_cluster = features[:, :3] - points_mean[:, :3] features_ls.append(f_cluster) # Find distance of x, y, and z from pillar center if self._with_voxel_center: f_center = features.new_zeros(size=(features.size(0), 3)) f_center[:, 0] = features[:, 0] - ( coors[:, 3].type_as(features) * self.vx + self.x_offset) f_center[:, 1] = features[:, 1] - ( coors[:, 2].type_as(features) * self.vy + self.y_offset) f_center[:, 2] = features[:, 2] - ( coors[:, 1].type_as(features) * self.vz + self.z_offset) features_ls.append(f_center) if self._with_distance: points_dist = torch.norm(features[:, :3], 2, 1, keepdim=True) features_ls.append(points_dist) # Combine together feature decorations features = torch.cat(features_ls, dim=-1) for i, pfn in enumerate(self.pfn_layers): point_feats = pfn(features) voxel_feats, voxel_coors = self.pfn_scatter(point_feats, coors) if i != len(self.pfn_layers) - 1: # need to concat voxel feats if it is not the last pfn feat_per_point = self.map_voxel_center_to_point( coors, voxel_feats, voxel_coors) features = torch.cat([point_feats, feat_per_point], dim=1) return voxel_feats, voxel_coors ================================================ FILE: mmdet3d/models/voxel_encoders/utils.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import torch from mmcv.cnn import build_norm_layer from mmcv.runner import auto_fp16 from torch import nn from torch.nn import functional as F def get_paddings_indicator(actual_num, max_num, axis=0): """Create boolean mask by actually number of a padded tensor. Args: actual_num (torch.Tensor): Actual number of points in each voxel. max_num (int): Max number of points in each voxel Returns: torch.Tensor: Mask indicates which points are valid inside a voxel. 
""" actual_num = torch.unsqueeze(actual_num, axis + 1) # tiled_actual_num: [N, M, 1] max_num_shape = [1] * len(actual_num.shape) max_num_shape[axis + 1] = -1 max_num = torch.arange( max_num, dtype=torch.int, device=actual_num.device).view(max_num_shape) # tiled_actual_num: [[3,3,3,3,3], [4,4,4,4,4], [2,2,2,2,2]] # tiled_max_num: [[0,1,2,3,4], [0,1,2,3,4], [0,1,2,3,4]] paddings_indicator = actual_num.int() > max_num # paddings_indicator shape: [batch_size, max_num] return paddings_indicator class VFELayer(nn.Module): """Voxel Feature Encoder layer. The voxel encoder is composed of a series of these layers. This module do not support average pooling and only support to use max pooling to gather features inside a VFE. Args: in_channels (int): Number of input channels. out_channels (int): Number of output channels. norm_cfg (dict): Config dict of normalization layers max_out (bool): Whether aggregate the features of points inside each voxel and only return voxel features. cat_max (bool): Whether concatenate the aggregated features and pointwise features. """ def __init__(self, in_channels, out_channels, norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.01), max_out=True, cat_max=True): super(VFELayer, self).__init__() self.fp16_enabled = False self.cat_max = cat_max self.max_out = max_out # self.units = int(out_channels / 2) self.norm = build_norm_layer(norm_cfg, out_channels)[1] self.linear = nn.Linear(in_channels, out_channels, bias=False) @auto_fp16(apply_to=('inputs'), out_fp32=True) def forward(self, inputs): """Forward function. Args: inputs (torch.Tensor): Voxels features of shape (N, M, C). N is the number of voxels, M is the number of points in voxels, C is the number of channels of point features. Returns: torch.Tensor: Voxel features. There are three mode under which the features have different meaning. - `max_out=False`: Return point-wise features in shape (N, M, C). - `max_out=True` and `cat_max=False`: Return aggregated voxel features in shape (N, C) - `max_out=True` and `cat_max=True`: Return concatenated point-wise features in shape (N, M, C). """ # [K, T, 7] tensordot [7, units] = [K, T, units] voxel_count = inputs.shape[1] x = self.linear(inputs) x = self.norm(x.permute(0, 2, 1).contiguous()).permute(0, 2, 1).contiguous() pointwise = F.relu(x) # [K, T, units] if self.max_out: aggregated = torch.max(pointwise, dim=1, keepdim=True)[0] else: # this is for fusion layer return pointwise if not self.cat_max: return aggregated.squeeze(1) else: # [K, 1, units] repeated = aggregated.repeat(1, voxel_count, 1) concatenated = torch.cat([pointwise, repeated], dim=2) # [K, T, 2 * units] return concatenated class PFNLayer(nn.Module): """Pillar Feature Net Layer. The Pillar Feature Net is composed of a series of these layers, but the PointPillars paper results only used a single PFNLayer. Args: in_channels (int): Number of input channels. out_channels (int): Number of output channels. norm_cfg (dict, optional): Config dict of normalization layers. Defaults to dict(type='BN1d', eps=1e-3, momentum=0.01). last_layer (bool, optional): If last_layer, there is no concatenation of features. Defaults to False. mode (str, optional): Pooling model to gather features inside voxels. Defaults to 'max'. 
""" def __init__(self, in_channels, out_channels, norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.01), last_layer=False, mode='max'): super().__init__() self.fp16_enabled = False self.name = 'PFNLayer' self.last_vfe = last_layer if not self.last_vfe: out_channels = out_channels // 2 self.units = out_channels self.norm = build_norm_layer(norm_cfg, self.units)[1] self.linear = nn.Linear(in_channels, self.units, bias=False) assert mode in ['max', 'avg'] self.mode = mode @auto_fp16(apply_to=('inputs'), out_fp32=True) def forward(self, inputs, num_voxels=None, aligned_distance=None): """Forward function. Args: inputs (torch.Tensor): Pillar/Voxel inputs with shape (N, M, C). N is the number of voxels, M is the number of points in voxels, C is the number of channels of point features. num_voxels (torch.Tensor, optional): Number of points in each voxel. Defaults to None. aligned_distance (torch.Tensor, optional): The distance of each points to the voxel center. Defaults to None. Returns: torch.Tensor: Features of Pillars. """ x = self.linear(inputs) x = self.norm(x.permute(0, 2, 1).contiguous()).permute(0, 2, 1).contiguous() x = F.relu(x) if self.mode == 'max': if aligned_distance is not None: x = x.mul(aligned_distance.unsqueeze(-1)) x_max = torch.max(x, dim=1, keepdim=True)[0] elif self.mode == 'avg': if aligned_distance is not None: x = x.mul(aligned_distance.unsqueeze(-1)) x_max = x.sum( dim=1, keepdim=True) / num_voxels.type_as(inputs).view( -1, 1, 1) if self.last_vfe: return x_max else: x_repeat = x_max.repeat(1, inputs.shape[1], 1) x_concatenated = torch.cat([x, x_repeat], dim=2) return x_concatenated ================================================ FILE: mmdet3d/models/voxel_encoders/voxel_encoder.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import torch from mmcv.cnn import build_norm_layer from mmcv.ops import DynamicScatter from mmcv.runner import force_fp32 from torch import nn from .. import builder from ..builder import VOXEL_ENCODERS from .utils import VFELayer, get_paddings_indicator @VOXEL_ENCODERS.register_module() class HardSimpleVFE(nn.Module): """Simple voxel feature encoder used in SECOND. It simply averages the values of points in a voxel. Args: num_features (int, optional): Number of features to use. Default: 4. """ def __init__(self, num_features=4): super(HardSimpleVFE, self).__init__() self.num_features = num_features self.fp16_enabled = False @force_fp32(out_fp16=True) def forward(self, features, num_points, coors): """Forward function. Args: features (torch.Tensor): Point features in shape (N, M, 3(4)). N is the number of voxels and M is the maximum number of points inside a single voxel. num_points (torch.Tensor): Number of points in each voxel, shape (N, ). coors (torch.Tensor): Coordinates of voxels. Returns: torch.Tensor: Mean of points inside each voxel in shape (N, 3(4)) """ points_mean = features[:, :, :self.num_features].sum( dim=1, keepdim=False) / num_points.type_as(features).view(-1, 1) return points_mean.contiguous() @VOXEL_ENCODERS.register_module() class DynamicSimpleVFE(nn.Module): """Simple dynamic voxel feature encoder used in DV-SECOND. It simply averages the values of points in a voxel. But the number of points in a voxel is dynamic and varies. 
Args: voxel_size (tupe[float]): Size of a single voxel point_cloud_range (tuple[float]): Range of the point cloud and voxels """ def __init__(self, voxel_size=(0.2, 0.2, 4), point_cloud_range=(0, -40, -3, 70.4, 40, 1)): super(DynamicSimpleVFE, self).__init__() self.scatter = DynamicScatter(voxel_size, point_cloud_range, True) self.fp16_enabled = False @torch.no_grad() @force_fp32(out_fp16=True) def forward(self, features, coors): """Forward function. Args: features (torch.Tensor): Point features in shape (N, 3(4)). N is the number of points. coors (torch.Tensor): Coordinates of voxels. Returns: torch.Tensor: Mean of points inside each voxel in shape (M, 3(4)). M is the number of voxels. """ # This function is used from the start of the voxelnet # num_points: [concated_num_points] features, features_coors = self.scatter(features, coors) return features, features_coors @VOXEL_ENCODERS.register_module() class DynamicVFE(nn.Module): """Dynamic Voxel feature encoder used in DV-SECOND. It encodes features of voxels and their points. It could also fuse image feature into voxel features in a point-wise manner. The number of points inside the voxel varies. Args: in_channels (int, optional): Input channels of VFE. Defaults to 4. feat_channels (list(int), optional): Channels of features in VFE. with_distance (bool, optional): Whether to use the L2 distance of points to the origin point. Defaults to False. with_cluster_center (bool, optional): Whether to use the distance to cluster center of points inside a voxel. Defaults to False. with_voxel_center (bool, optional): Whether to use the distance to center of voxel for each points inside a voxel. Defaults to False. voxel_size (tuple[float], optional): Size of a single voxel. Defaults to (0.2, 0.2, 4). point_cloud_range (tuple[float], optional): The range of points or voxels. Defaults to (0, -40, -3, 70.4, 40, 1). norm_cfg (dict, optional): Config dict of normalization layers. mode (str, optional): The mode when pooling features of points inside a voxel. Available options include 'max' and 'avg'. Defaults to 'max'. fusion_layer (dict, optional): The config dict of fusion layer used in multi-modal detectors. Defaults to None. return_point_feats (bool, optional): Whether to return the features of each points. Defaults to False. 
""" def __init__(self, in_channels=4, feat_channels=[], with_distance=False, with_cluster_center=False, with_voxel_center=False, voxel_size=(0.2, 0.2, 4), point_cloud_range=(0, -40, -3, 70.4, 40, 1), norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.01), mode='max', fusion_layer=None, return_point_feats=False): super(DynamicVFE, self).__init__() assert mode in ['avg', 'max'] assert len(feat_channels) > 0 if with_cluster_center: in_channels += 3 if with_voxel_center: in_channels += 3 if with_distance: in_channels += 1 self.in_channels = in_channels self._with_distance = with_distance self._with_cluster_center = with_cluster_center self._with_voxel_center = with_voxel_center self.return_point_feats = return_point_feats self.fp16_enabled = False # Need pillar (voxel) size and x/y offset in order to calculate offset self.vx = voxel_size[0] self.vy = voxel_size[1] self.vz = voxel_size[2] self.x_offset = self.vx / 2 + point_cloud_range[0] self.y_offset = self.vy / 2 + point_cloud_range[1] self.z_offset = self.vz / 2 + point_cloud_range[2] self.point_cloud_range = point_cloud_range self.scatter = DynamicScatter(voxel_size, point_cloud_range, True) feat_channels = [self.in_channels] + list(feat_channels) vfe_layers = [] for i in range(len(feat_channels) - 1): in_filters = feat_channels[i] out_filters = feat_channels[i + 1] if i > 0: in_filters *= 2 norm_name, norm_layer = build_norm_layer(norm_cfg, out_filters) vfe_layers.append( nn.Sequential( nn.Linear(in_filters, out_filters, bias=False), norm_layer, nn.ReLU(inplace=True))) self.vfe_layers = nn.ModuleList(vfe_layers) self.num_vfe = len(vfe_layers) self.vfe_scatter = DynamicScatter(voxel_size, point_cloud_range, (mode != 'max')) self.cluster_scatter = DynamicScatter( voxel_size, point_cloud_range, average_points=True) self.fusion_layer = None if fusion_layer is not None: self.fusion_layer = builder.build_fusion_layer(fusion_layer) def map_voxel_center_to_point(self, pts_coors, voxel_mean, voxel_coors): """Map voxel features to its corresponding points. Args: pts_coors (torch.Tensor): Voxel coordinate of each point. voxel_mean (torch.Tensor): Voxel features to be mapped. voxel_coors (torch.Tensor): Coordinates of valid voxels Returns: torch.Tensor: Features or centers of each point. """ # Step 1: scatter voxel into canvas # Calculate necessary things for canvas creation canvas_z = int( (self.point_cloud_range[5] - self.point_cloud_range[2]) / self.vz) canvas_y = int( (self.point_cloud_range[4] - self.point_cloud_range[1]) / self.vy) canvas_x = int( (self.point_cloud_range[3] - self.point_cloud_range[0]) / self.vx) # canvas_channel = voxel_mean.size(1) batch_size = pts_coors[-1, 0] + 1 canvas_len = canvas_z * canvas_y * canvas_x * batch_size # Create the canvas for this sample canvas = voxel_mean.new_zeros(canvas_len, dtype=torch.long) # Only include non-empty pillars indices = ( voxel_coors[:, 0] * canvas_z * canvas_y * canvas_x + voxel_coors[:, 1] * canvas_y * canvas_x + voxel_coors[:, 2] * canvas_x + voxel_coors[:, 3]) # Scatter the blob back to the canvas canvas[indices.long()] = torch.arange( start=0, end=voxel_mean.size(0), device=voxel_mean.device) # Step 2: get voxel mean for each point voxel_index = ( pts_coors[:, 0] * canvas_z * canvas_y * canvas_x + pts_coors[:, 1] * canvas_y * canvas_x + pts_coors[:, 2] * canvas_x + pts_coors[:, 3]) voxel_inds = canvas[voxel_index.long()] center_per_point = voxel_mean[voxel_inds, ...] 
return center_per_point @force_fp32(out_fp16=True) def forward(self, features, coors, points=None, img_feats=None, img_metas=None): """Forward functions. Args: features (torch.Tensor): Features of voxels, shape is NxC. coors (torch.Tensor): Coordinates of voxels, shape is Nx(1+NDim). points (list[torch.Tensor], optional): Raw points used to guide the multi-modality fusion. Defaults to None. img_feats (list[torch.Tensor], optional): Image features used for multi-modality fusion. Defaults to None. img_metas (dict, optional): [description]. Defaults to None. Returns: tuple: If `return_point_feats` is False, returns voxel features and its coordinates. If `return_point_feats` is True, returns feature of each points inside voxels. """ features_ls = [features] # Find distance of x, y, and z from cluster center if self._with_cluster_center: voxel_mean, mean_coors = self.cluster_scatter(features, coors) points_mean = self.map_voxel_center_to_point( coors, voxel_mean, mean_coors) # TODO: maybe also do cluster for reflectivity f_cluster = features[:, :3] - points_mean[:, :3] features_ls.append(f_cluster) # Find distance of x, y, and z from pillar center if self._with_voxel_center: f_center = features.new_zeros(size=(features.size(0), 3)) f_center[:, 0] = features[:, 0] - ( coors[:, 3].type_as(features) * self.vx + self.x_offset) f_center[:, 1] = features[:, 1] - ( coors[:, 2].type_as(features) * self.vy + self.y_offset) f_center[:, 2] = features[:, 2] - ( coors[:, 1].type_as(features) * self.vz + self.z_offset) features_ls.append(f_center) if self._with_distance: points_dist = torch.norm(features[:, :3], 2, 1, keepdim=True) features_ls.append(points_dist) # Combine together feature decorations features = torch.cat(features_ls, dim=-1) for i, vfe in enumerate(self.vfe_layers): point_feats = vfe(features) if (i == len(self.vfe_layers) - 1 and self.fusion_layer is not None and img_feats is not None): point_feats = self.fusion_layer(img_feats, points, point_feats, img_metas) voxel_feats, voxel_coors = self.vfe_scatter(point_feats, coors) if i != len(self.vfe_layers) - 1: # need to concat voxel feats if it is not the last vfe feat_per_point = self.map_voxel_center_to_point( coors, voxel_feats, voxel_coors) features = torch.cat([point_feats, feat_per_point], dim=1) if self.return_point_feats: return point_feats return voxel_feats, voxel_coors @VOXEL_ENCODERS.register_module() class HardVFE(nn.Module): """Voxel feature encoder used in DV-SECOND. It encodes features of voxels and their points. It could also fuse image feature into voxel features in a point-wise manner. Args: in_channels (int, optional): Input channels of VFE. Defaults to 4. feat_channels (list(int), optional): Channels of features in VFE. with_distance (bool, optional): Whether to use the L2 distance of points to the origin point. Defaults to False. with_cluster_center (bool, optional): Whether to use the distance to cluster center of points inside a voxel. Defaults to False. with_voxel_center (bool, optional): Whether to use the distance to center of voxel for each points inside a voxel. Defaults to False. voxel_size (tuple[float], optional): Size of a single voxel. Defaults to (0.2, 0.2, 4). point_cloud_range (tuple[float], optional): The range of points or voxels. Defaults to (0, -40, -3, 70.4, 40, 1). norm_cfg (dict, optional): Config dict of normalization layers. mode (str, optional): The mode when pooling features of points inside a voxel. Available options include 'max' and 'avg'. Defaults to 'max'. 
fusion_layer (dict, optional): The config dict of fusion layer used in multi-modal detectors. Defaults to None. return_point_feats (bool, optional): Whether to return the features of each points. Defaults to False. """ def __init__(self, in_channels=4, feat_channels=[], with_distance=False, with_cluster_center=False, with_voxel_center=False, voxel_size=(0.2, 0.2, 4), point_cloud_range=(0, -40, -3, 70.4, 40, 1), norm_cfg=dict(type='BN1d', eps=1e-3, momentum=0.01), mode='max', fusion_layer=None, return_point_feats=False): super(HardVFE, self).__init__() assert len(feat_channels) > 0 if with_cluster_center: in_channels += 3 if with_voxel_center: in_channels += 3 if with_distance: in_channels += 1 self.in_channels = in_channels self._with_distance = with_distance self._with_cluster_center = with_cluster_center self._with_voxel_center = with_voxel_center self.return_point_feats = return_point_feats self.fp16_enabled = False # Need pillar (voxel) size and x/y offset to calculate pillar offset self.vx = voxel_size[0] self.vy = voxel_size[1] self.vz = voxel_size[2] self.x_offset = self.vx / 2 + point_cloud_range[0] self.y_offset = self.vy / 2 + point_cloud_range[1] self.z_offset = self.vz / 2 + point_cloud_range[2] self.point_cloud_range = point_cloud_range self.scatter = DynamicScatter(voxel_size, point_cloud_range, True) feat_channels = [self.in_channels] + list(feat_channels) vfe_layers = [] for i in range(len(feat_channels) - 1): in_filters = feat_channels[i] out_filters = feat_channels[i + 1] if i > 0: in_filters *= 2 # TODO: pass norm_cfg to VFE # norm_name, norm_layer = build_norm_layer(norm_cfg, out_filters) if i == (len(feat_channels) - 2): cat_max = False max_out = True if fusion_layer: max_out = False else: max_out = True cat_max = True vfe_layers.append( VFELayer( in_filters, out_filters, norm_cfg=norm_cfg, max_out=max_out, cat_max=cat_max)) self.vfe_layers = nn.ModuleList(vfe_layers) self.num_vfe = len(vfe_layers) self.fusion_layer = None if fusion_layer is not None: self.fusion_layer = builder.build_fusion_layer(fusion_layer) @force_fp32(out_fp16=True) def forward(self, features, num_points, coors, img_feats=None, img_metas=None): """Forward functions. Args: features (torch.Tensor): Features of voxels, shape is MxNxC. num_points (torch.Tensor): Number of points in each voxel. coors (torch.Tensor): Coordinates of voxels, shape is Mx(1+NDim). img_feats (list[torch.Tensor], optional): Image features used for multi-modality fusion. Defaults to None. img_metas (dict, optional): [description]. Defaults to None. Returns: tuple: If `return_point_feats` is False, returns voxel features and its coordinates. If `return_point_feats` is True, returns feature of each points inside voxels. 
""" features_ls = [features] # Find distance of x, y, and z from cluster center if self._with_cluster_center: points_mean = ( features[:, :, :3].sum(dim=1, keepdim=True) / num_points.type_as(features).view(-1, 1, 1)) # TODO: maybe also do cluster for reflectivity f_cluster = features[:, :, :3] - points_mean features_ls.append(f_cluster) # Find distance of x, y, and z from pillar center if self._with_voxel_center: f_center = features.new_zeros( size=(features.size(0), features.size(1), 3)) f_center[:, :, 0] = features[:, :, 0] - ( coors[:, 3].type_as(features).unsqueeze(1) * self.vx + self.x_offset) f_center[:, :, 1] = features[:, :, 1] - ( coors[:, 2].type_as(features).unsqueeze(1) * self.vy + self.y_offset) f_center[:, :, 2] = features[:, :, 2] - ( coors[:, 1].type_as(features).unsqueeze(1) * self.vz + self.z_offset) features_ls.append(f_center) if self._with_distance: points_dist = torch.norm(features[:, :, :3], 2, 2, keepdim=True) features_ls.append(points_dist) # Combine together feature decorations voxel_feats = torch.cat(features_ls, dim=-1) # The feature decorations were calculated without regard to whether # pillar was empty. # Need to ensure that empty voxels remain set to zeros. voxel_count = voxel_feats.shape[1] mask = get_paddings_indicator(num_points, voxel_count, axis=0) voxel_feats *= mask.unsqueeze(-1).type_as(voxel_feats) for i, vfe in enumerate(self.vfe_layers): voxel_feats = vfe(voxel_feats) if (self.fusion_layer is not None and img_feats is not None): voxel_feats = self.fusion_with_mask(features, mask, voxel_feats, coors, img_feats, img_metas) return voxel_feats def fusion_with_mask(self, features, mask, voxel_feats, coors, img_feats, img_metas): """Fuse image and point features with mask. Args: features (torch.Tensor): Features of voxel, usually it is the values of points in voxels. mask (torch.Tensor): Mask indicates valid features in each voxel. voxel_feats (torch.Tensor): Features of voxels. coors (torch.Tensor): Coordinates of each single voxel. img_feats (list[torch.Tensor]): Multi-scale feature maps of image. img_metas (list(dict)): Meta information of image and points. Returns: torch.Tensor: Fused features of each voxel. """ # the features is consist of a batch of points batch_size = coors[-1, 0] + 1 points = [] for i in range(batch_size): single_mask = (coors[:, 0] == i) points.append(features[single_mask][mask[single_mask]]) point_feats = voxel_feats[mask] point_feats = self.fusion_layer(img_feats, points, point_feats, img_metas) voxel_canvas = voxel_feats.new_zeros( size=(voxel_feats.size(0), voxel_feats.size(1), point_feats.size(-1))) voxel_canvas[mask] = point_feats out = torch.max(voxel_canvas, dim=1)[0] return out ================================================ FILE: mmdet3d/ops/__init__.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. 
from mmcv.ops import (RoIAlign, SigmoidFocalLoss, get_compiler_version,
                      get_compiling_cuda_version, nms, roi_align,
                      sigmoid_focal_loss)
from mmcv.ops.assign_score_withk import assign_score_withk
from mmcv.ops.ball_query import ball_query
from mmcv.ops.furthest_point_sample import (furthest_point_sample,
                                            furthest_point_sample_with_dist)
from mmcv.ops.gather_points import gather_points
from mmcv.ops.group_points import GroupAll, QueryAndGroup, grouping_operation
from mmcv.ops.knn import knn
from mmcv.ops.points_in_boxes import (points_in_boxes_all, points_in_boxes_cpu,
                                      points_in_boxes_part)
from mmcv.ops.points_sampler import PointsSampler as Points_Sampler
from mmcv.ops.roiaware_pool3d import RoIAwarePool3d
from mmcv.ops.roipoint_pool3d import RoIPointPool3d
from mmcv.ops.scatter_points import DynamicScatter, dynamic_scatter
from mmcv.ops.three_interpolate import three_interpolate
from mmcv.ops.three_nn import three_nn
from mmcv.ops.voxelize import Voxelization, voxelization

from .dgcnn_modules import DGCNNFAModule, DGCNNFPModule, DGCNNGFModule
from .norm import NaiveSyncBatchNorm1d, NaiveSyncBatchNorm2d
from .paconv import PAConv, PAConvCUDA
from .pointnet_modules import (PAConvCUDASAModule, PAConvCUDASAModuleMSG,
                               PAConvSAModule, PAConvSAModuleMSG,
                               PointFPModule, PointSAModule, PointSAModuleMSG,
                               build_sa_module)
from .sparse_block import (SparseBasicBlock, SparseBottleneck,
                           make_sparse_convmodule)

__all__ = [
    'nms', 'soft_nms', 'RoIAlign', 'roi_align', 'get_compiler_version',
    'get_compiling_cuda_version', 'NaiveSyncBatchNorm1d',
    'NaiveSyncBatchNorm2d', 'batched_nms', 'Voxelization', 'voxelization',
    'dynamic_scatter', 'DynamicScatter', 'sigmoid_focal_loss',
    'SigmoidFocalLoss', 'SparseBasicBlock', 'SparseBottleneck',
    'RoIAwarePool3d', 'points_in_boxes_part', 'points_in_boxes_cpu',
    'make_sparse_convmodule', 'ball_query', 'knn', 'furthest_point_sample',
    'furthest_point_sample_with_dist', 'three_interpolate', 'three_nn',
    'gather_points', 'grouping_operation', 'GroupAll', 'QueryAndGroup',
    'PointSAModule', 'PointSAModuleMSG', 'PointFPModule', 'DGCNNFPModule',
    'DGCNNGFModule', 'DGCNNFAModule', 'points_in_boxes_all',
    'get_compiler_version', 'assign_score_withk', 'get_compiling_cuda_version',
    'Points_Sampler', 'build_sa_module', 'PAConv', 'PAConvCUDA',
    'PAConvSAModuleMSG', 'PAConvSAModule', 'PAConvCUDASAModule',
    'PAConvCUDASAModuleMSG', 'RoIPointPool3d'
]

================================================
FILE: mmdet3d/ops/bev_pool_v2/__init__.py
================================================
# Copyright (c) Phigent Robotics. All rights reserved.

================================================
FILE: mmdet3d/ops/bev_pool_v2/bev_pool.py
================================================
# Copyright (c) Phigent Robotics. All rights reserved.
import numpy as np
import torch

from .
import bev_pool_v2_ext __all__ = ['bev_pool_v2', 'TRTBEVPoolv2'] class QuickCumsumCuda(torch.autograd.Function): @staticmethod def forward(ctx, depth, feat, ranks_depth, ranks_feat, ranks_bev, bev_feat_shape, interval_starts, interval_lengths): ranks_bev = ranks_bev.int() depth = depth.contiguous().float() feat = feat.contiguous().float() ranks_depth = ranks_depth.contiguous().int() ranks_feat = ranks_feat.contiguous().int() interval_lengths = interval_lengths.contiguous().int() interval_starts = interval_starts.contiguous().int() out = feat.new_zeros(bev_feat_shape) bev_pool_v2_ext.bev_pool_v2_forward( depth, feat, out, ranks_depth, ranks_feat, ranks_bev, interval_lengths, interval_starts, ) ctx.save_for_backward(ranks_bev, depth, feat, ranks_feat, ranks_depth) return out @staticmethod def backward(ctx, out_grad): ranks_bev, depth, feat, ranks_feat, ranks_depth = ctx.saved_tensors order = ranks_feat.argsort() ranks_feat, ranks_depth, ranks_bev = \ ranks_feat[order], ranks_depth[order], ranks_bev[order] kept = torch.ones( ranks_bev.shape[0], device=ranks_bev.device, dtype=torch.bool) kept[1:] = ranks_feat[1:] != ranks_feat[:-1] interval_starts_bp = torch.where(kept)[0].int() interval_lengths_bp = torch.zeros_like(interval_starts_bp) interval_lengths_bp[:-1] = interval_starts_bp[ 1:] - interval_starts_bp[:-1] interval_lengths_bp[-1] = ranks_bev.shape[0] - interval_starts_bp[-1] depth = depth.contiguous() feat = feat.contiguous() ranks_depth = ranks_depth.contiguous() ranks_feat = ranks_feat.contiguous() ranks_bev = ranks_bev.contiguous() interval_lengths_bp = interval_lengths_bp.contiguous() interval_starts_bp = interval_starts_bp.contiguous() depth_grad = depth.new_zeros(depth.shape) feat_grad = feat.new_zeros(feat.shape) out_grad = out_grad.contiguous() bev_pool_v2_ext.bev_pool_v2_backward( out_grad, depth_grad, feat_grad, depth, feat, ranks_depth, ranks_feat, ranks_bev, interval_lengths_bp, interval_starts_bp, ) return depth_grad, feat_grad, None, None, None, None, None, \ None, None, None def bev_pool_v2(depth, feat, ranks_depth, ranks_feat, ranks_bev, bev_feat_shape, interval_starts, interval_lengths): x = QuickCumsumCuda.apply(depth, feat, ranks_depth, ranks_feat, ranks_bev, bev_feat_shape, interval_starts, interval_lengths) x = x.permute(0, 4, 1, 2, 3).contiguous() return x class TRTBEVPoolv2(torch.autograd.Function): @staticmethod def symbolic(g, depth, feat, ranks_depth, ranks_feat, ranks_bev, interval_starts, interval_lengths, out_height=128, out_width=128): """symbolic function for creating onnx op.""" return g.op( 'mmdeploy::bev_pool_v2', depth, feat, ranks_depth, ranks_feat, ranks_bev, interval_starts, interval_lengths, out_height_i=out_height, out_width_i=out_width) @staticmethod def forward(g, depth, feat, ranks_depth, ranks_feat, ranks_bev, interval_starts, interval_lengths, out_height=128, out_width=128): """run forward.""" n, d, h, w = depth.shape feat = feat.view(1, n, feat.shape[3], h, w) feat = feat.permute(0, 1, 3, 4, 2) depth = depth.view(1, n, d, h, w) bev_feat_shape = (depth.shape[0], 1, out_height, out_width, feat.shape[-1]) # (B, Z, Y, X, C) bev_feat = bev_pool_v2(depth, feat, ranks_depth, ranks_feat, ranks_bev, bev_feat_shape, interval_starts, interval_lengths) bev_feat = bev_feat.squeeze(2) bev_feat = bev_feat.permute(0, 2, 3, 1) return bev_feat def test_bev_pool_v2(): depth = np.array([0.3, 0.4, 0.2, 0.1, 0.7, 0.6, 0.8, 0.9]) depth = torch.from_numpy(depth).float().cuda() depth = depth.view(1, 1, 2, 2, 2).requires_grad_() feat = torch.ones( size=[1, 1, 2, 2, 
2], dtype=torch.float, device='cuda').requires_grad_() ranks_depth = torch.from_numpy(np.array([0, 4, 1, 6])).int().cuda() ranks_feat = torch.from_numpy(np.array([0, 0, 1, 2])).int().cuda() ranks_bev = torch.from_numpy(np.array([0, 0, 1, 1])).int().cuda() kept = torch.ones( ranks_bev.shape[0], device=ranks_bev.device, dtype=torch.bool) kept[1:] = ranks_bev[1:] != ranks_bev[:-1] interval_starts = torch.where(kept)[0].int() if len(interval_starts) == 0: return None, None, None, None, None interval_lengths = torch.zeros_like(interval_starts) interval_lengths[:-1] = interval_starts[1:] - interval_starts[:-1] interval_lengths[-1] = ranks_bev.shape[0] - interval_starts[-1] bev_feat = bev_pool_v2(depth, feat, ranks_depth, ranks_feat, ranks_bev, (1, 1, 2, 2, 2), interval_starts, interval_lengths) loss = torch.sum(bev_feat) loss.backward() assert loss == 4.4 grad_depth = np.array([2., 2., 0., 0., 2., 0., 2., 0.]) grad_depth = torch.from_numpy(grad_depth).float() grad_depth = grad_depth.cuda().view(1, 1, 2, 2, 2) assert depth.grad.allclose(grad_depth) grad_feat = np.array([1.0, 1.0, 0.4, 0.4, 0.8, 0.8, 0., 0.]) grad_feat = torch.from_numpy(grad_feat).float().cuda().view(1, 1, 2, 2, 2) assert feat.grad.allclose(grad_feat) ================================================ FILE: mmdet3d/ops/bev_pool_v2/src/bev_pool.cpp ================================================ #include #include // CUDA function declarations void bev_pool_v2(int c, int n_intervals, const float* depth, const float* feat, const int* ranks_depth, const int* ranks_feat, const int* ranks_bev, const int* interval_starts, const int* interval_lengths, float* out); void bev_pool_v2_grad(int c, int n_intervals, const float* out_grad, const float* depth, const float* feat, const int* ranks_depth, const int* ranks_feat, const int* ranks_bev, const int* interval_starts, const int* interval_lengths, float* depth_grad, float* feat_grad); /* Function: pillar pooling (forward, cuda) Args: depth : input depth, FloatTensor[n, d, h, w] feat : input features, FloatTensor[n, h, w, c] out : output features, FloatTensor[b, c, h_out, w_out] ranks_depth : depth index of points, IntTensor[n_points] ranks_feat : feat index of points, IntTensor[n_points] ranks_bev : output index of points, IntTensor[n_points] interval_lengths : starting position for pooled point, IntTensor[n_intervals] interval_starts : how many points in each pooled point, IntTensor[n_intervals] Return: */ void bev_pool_v2_forward( const at::Tensor _depth, const at::Tensor _feat, at::Tensor _out, const at::Tensor _ranks_depth, const at::Tensor _ranks_feat, const at::Tensor _ranks_bev, const at::Tensor _interval_lengths, const at::Tensor _interval_starts ) { int c = _feat.size(4); int n_intervals = _interval_lengths.size(0); const at::cuda::OptionalCUDAGuard device_guard(device_of(_depth)); const float* depth = _depth.data_ptr(); const float* feat = _feat.data_ptr(); const int* ranks_depth = _ranks_depth.data_ptr(); const int* ranks_feat = _ranks_feat.data_ptr(); const int* ranks_bev = _ranks_bev.data_ptr(); const int* interval_lengths = _interval_lengths.data_ptr(); const int* interval_starts = _interval_starts.data_ptr(); float* out = _out.data_ptr(); bev_pool_v2( c, n_intervals, depth, feat, ranks_depth, ranks_feat, ranks_bev, interval_starts, interval_lengths, out ); } /* Function: pillar pooling (backward, cuda) Args: out_grad : grad of output bev feature, FloatTensor[b, c, h_out, w_out] depth_grad : grad of input depth, FloatTensor[n, d, h, w] feat_grad : grad of input feature, 
FloatTensor[n, h, w, c] depth : input depth, FloatTensor[n, d, h, w] feat : input features, FloatTensor[n, h, w, c] ranks_depth : depth index of points, IntTensor[n_points] ranks_feat : feat index of points, IntTensor[n_points] ranks_bev : output index of points, IntTensor[n_points] interval_lengths : starting position for pooled point, IntTensor[n_intervals] interval_starts : how many points in each pooled point, IntTensor[n_intervals] */ void bev_pool_v2_backward( const at::Tensor _out_grad, at::Tensor _depth_grad, at::Tensor _feat_grad, const at::Tensor _depth, const at::Tensor _feat, const at::Tensor _ranks_depth, const at::Tensor _ranks_feat, const at::Tensor _ranks_bev, const at::Tensor _interval_lengths, const at::Tensor _interval_starts ) { int c = _out_grad.size(4); int n_intervals = _interval_lengths.size(0); const at::cuda::OptionalCUDAGuard device_guard(device_of(_out_grad)); const float* out_grad = _out_grad.data_ptr(); float* depth_grad = _depth_grad.data_ptr(); float* feat_grad = _feat_grad.data_ptr(); const float* depth = _depth.data_ptr(); const float* feat = _feat.data_ptr(); const int* ranks_depth = _ranks_depth.data_ptr(); const int* ranks_feat = _ranks_feat.data_ptr(); const int* ranks_bev = _ranks_bev.data_ptr(); const int* interval_lengths = _interval_lengths.data_ptr(); const int* interval_starts = _interval_starts.data_ptr(); bev_pool_v2_grad( c, n_intervals, out_grad, depth, feat, ranks_depth, ranks_feat, ranks_bev, interval_starts, interval_lengths, depth_grad, feat_grad ); } PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { m.def("bev_pool_v2_forward", &bev_pool_v2_forward, "bev_pool_v2_forward"); m.def("bev_pool_v2_backward", &bev_pool_v2_backward, "bev_pool_v2_backward"); } ================================================ FILE: mmdet3d/ops/bev_pool_v2/src/bev_pool_cuda.cu ================================================ #include #include /* Function: pillar pooling Args: c : number of channels n_intervals : number of unique points depth : input depth, FloatTensor[b,n,d,h,w] feat : input feat, FloatTensor[b,n,h,w,c] ranks_depth : input index of depth, IntTensor[n] ranks_feat : input index of feat, IntTensor[n] ranks_bev : output index, IntTensor[n] interval_lengths : starting position for pooled point, IntTensor[n_intervals] interval_starts : how many points in each pooled point, IntTensor[n_intervals] out : output features, FloatTensor[b, d, h, w, c] */ __global__ void bev_pool_v2_kernel(int c, int n_intervals, const float *__restrict__ depth, const float *__restrict__ feat, const int *__restrict__ ranks_depth, const int *__restrict__ ranks_feat, const int *__restrict__ ranks_bev, const int *__restrict__ interval_starts, const int *__restrict__ interval_lengths, float* __restrict__ out) { int idx = blockIdx.x * blockDim.x + threadIdx.x; int index = idx / c; int cur_c = idx % c; if (index >= n_intervals) return; int interval_start = interval_starts[index]; int interval_length = interval_lengths[index]; float psum = 0; const float* cur_depth; const float* cur_feat; for(int i = 0; i < interval_length; i++){ cur_depth = depth + ranks_depth[interval_start+i]; cur_feat = feat + ranks_feat[interval_start+i] * c + cur_c; psum += *cur_feat * *cur_depth; } const int* cur_rank = ranks_bev + interval_start; float* cur_out = out + *cur_rank * c + cur_c; *cur_out = psum; } /* Function: pillar pooling backward Args: c : number of channels n_intervals : number of unique points out_grad : gradient of the BEV fmap from top, FloatTensor[b, d, h, w, c] depth : input depth, 
FloatTensor[b,n,d,h,w] feat : input feat, FloatTensor[b,n,h,w,c] ranks_depth : input index of depth, IntTensor[n] ranks_feat : input index of feat, IntTensor[n] ranks_bev : output index, IntTensor[n] interval_lengths : starting position for pooled point, IntTensor[n_intervals] interval_starts : how many points in each pooled point, IntTensor[n_intervals] depth_grad : gradient of the depth fmap, FloatTensor feat_grad : gradient of the feature fmap, FloatTensor */ __global__ void bev_pool_grad_kernel(int c, int n_intervals, const float *__restrict__ out_grad, const float *__restrict__ depth, const float *__restrict__ feat, const int *__restrict__ ranks_depth, const int *__restrict__ ranks_feat, const int *__restrict__ ranks_bev, const int *__restrict__ interval_starts, const int *__restrict__ interval_lengths, float* __restrict__ depth_grad, float* __restrict__ feat_grad) { int idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx >= n_intervals) return; int interval_start = interval_starts[idx]; int interval_length = interval_lengths[idx]; const int* cur_rank; const float* cur_out_grad; const float* cur_out_grad_start; const float* cur_feat; const float* cur_feat_start; float* cur_depth_grad; float grad_sum; for(int i = 0; i < interval_length; i++){ cur_rank = ranks_bev + interval_start + i; cur_out_grad_start = out_grad + * cur_rank * c; cur_feat_start = feat + ranks_feat[interval_start+i] * c; grad_sum = 0; for(int cur_c = 0; cur_c < c; cur_c++){ cur_out_grad = cur_out_grad_start + cur_c; cur_feat = cur_feat_start + cur_c; grad_sum += *cur_out_grad * *cur_feat; } cur_depth_grad = depth_grad + ranks_depth[interval_start+i]; *cur_depth_grad = grad_sum; } float* cur_feat_grad; const float* cur_depth; for(int cur_c = 0; cur_c < c; cur_c++){ grad_sum = 0; for(int i = 0; i < interval_length; i++){ cur_rank = ranks_bev + interval_start + i; cur_out_grad = out_grad + *cur_rank * c + cur_c; cur_depth = depth + ranks_depth[interval_start+i]; grad_sum += *cur_out_grad * *cur_depth; } cur_feat_grad = feat_grad + ranks_feat[interval_start] * c + cur_c ; * cur_feat_grad = grad_sum; } } void bev_pool_v2(int c, int n_intervals, const float* depth, const float* feat, const int* ranks_depth, const int* ranks_feat, const int* ranks_bev, const int* interval_starts, const int* interval_lengths, float* out) { bev_pool_v2_kernel<<<(int)ceil(((double)n_intervals * c / 256)), 256>>>( c, n_intervals, depth, feat, ranks_depth, ranks_feat, ranks_bev, interval_starts, interval_lengths, out ); } void bev_pool_v2_grad(int c, int n_intervals, const float* out_grad, const float* depth, const float* feat, const int* ranks_depth, const int* ranks_feat, const int* ranks_bev, const int* interval_starts, const int* interval_lengths, float* depth_grad, float* feat_grad) { bev_pool_grad_kernel<<<(int)ceil(((double)n_intervals / 256)), 256>>>( c, n_intervals, out_grad, depth, feat, ranks_depth, ranks_feat, ranks_bev, interval_starts, interval_lengths, depth_grad, feat_grad ); } ================================================ FILE: mmdet3d/ops/dgcnn_modules/__init__.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. from .dgcnn_fa_module import DGCNNFAModule from .dgcnn_fp_module import DGCNNFPModule from .dgcnn_gf_module import DGCNNGFModule __all__ = ['DGCNNFAModule', 'DGCNNFPModule', 'DGCNNGFModule'] ================================================ FILE: mmdet3d/ops/dgcnn_modules/dgcnn_fa_module.py ================================================ # Copyright (c) OpenMMLab. 
All rights reserved. import torch from mmcv.cnn import ConvModule from mmcv.runner import BaseModule, force_fp32 from torch import nn as nn class DGCNNFAModule(BaseModule): """Point feature aggregation module used in DGCNN. Aggregate all the features of points. Args: mlp_channels (list[int]): List of mlp channels. norm_cfg (dict, optional): Type of normalization method. Defaults to dict(type='BN1d'). act_cfg (dict, optional): Type of activation method. Defaults to dict(type='ReLU'). init_cfg (dict, optional): Initialization config. Defaults to None. """ def __init__(self, mlp_channels, norm_cfg=dict(type='BN1d'), act_cfg=dict(type='ReLU'), init_cfg=None): super().__init__(init_cfg=init_cfg) self.fp16_enabled = False self.mlps = nn.Sequential() for i in range(len(mlp_channels) - 1): self.mlps.add_module( f'layer{i}', ConvModule( mlp_channels[i], mlp_channels[i + 1], kernel_size=(1, ), stride=(1, ), conv_cfg=dict(type='Conv1d'), norm_cfg=norm_cfg, act_cfg=act_cfg)) @force_fp32() def forward(self, points): """forward. Args: points (List[Tensor]): tensor of the features to be aggregated. Returns: Tensor: (B, N, M) M = mlp[-1], tensor of the output points. """ if len(points) > 1: new_points = torch.cat(points[1:], dim=-1) new_points = new_points.transpose(1, 2).contiguous() # (B, C, N) new_points_copy = new_points new_points = self.mlps(new_points) new_fa_points = new_points.max(dim=-1, keepdim=True)[0] new_fa_points = new_fa_points.repeat(1, 1, new_points.shape[-1]) new_points = torch.cat([new_fa_points, new_points_copy], dim=1) new_points = new_points.transpose(1, 2).contiguous() else: new_points = points return new_points ================================================ FILE: mmdet3d/ops/dgcnn_modules/dgcnn_fp_module.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. from mmcv.cnn import ConvModule from mmcv.runner import BaseModule, force_fp32 from torch import nn as nn class DGCNNFPModule(BaseModule): """Point feature propagation module used in DGCNN. Propagate the features from one set to another. Args: mlp_channels (list[int]): List of mlp channels. norm_cfg (dict, optional): Type of activation method. Defaults to dict(type='BN1d'). act_cfg (dict, optional): Type of activation method. Defaults to dict(type='ReLU'). init_cfg (dict, optional): Initialization config. Defaults to None. """ def __init__(self, mlp_channels, norm_cfg=dict(type='BN1d'), act_cfg=dict(type='ReLU'), init_cfg=None): super().__init__(init_cfg=init_cfg) self.fp16_enabled = False self.mlps = nn.Sequential() for i in range(len(mlp_channels) - 1): self.mlps.add_module( f'layer{i}', ConvModule( mlp_channels[i], mlp_channels[i + 1], kernel_size=(1, ), stride=(1, ), conv_cfg=dict(type='Conv1d'), norm_cfg=norm_cfg, act_cfg=act_cfg)) @force_fp32() def forward(self, points): """forward. Args: points (Tensor): (B, N, C) tensor of the input points. Returns: Tensor: (B, N, M) M = mlp[-1], tensor of the new points. """ if points is not None: new_points = points.transpose(1, 2).contiguous() # (B, C, N) new_points = self.mlps(new_points) new_points = new_points.transpose(1, 2).contiguous() else: new_points = points return new_points ================================================ FILE: mmdet3d/ops/dgcnn_modules/dgcnn_gf_module.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. 
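The graph-feature module defined below (BaseDGCNNGFModule / DGCNNGFModule) builds DGCNN-style edge features: for every point it groups K neighbours, subtracts the centre point, concatenates the centre feature, applies a shared MLP and finally max-pools over the K neighbours. Below is a rough pure-PyTorch sketch of that grouping; the mmcv knn/grouping ops are replaced here by a simple coordinate-distance KNN, and all names and sizes are illustrative.

import torch

B, N, C, K = 2, 64, 3, 8
points = torch.randn(B, N, C)

# K nearest neighbours per point (here simply by coordinate distance).
dist = torch.cdist(points, points)                        # (B, N, N)
idx = dist.topk(K, dim=-1, largest=False).indices         # (B, N, K)

# Gather neighbour features: (B, N, K, C).
neighbours = torch.gather(
    points.unsqueeze(2).expand(B, N, N, C), 2,
    idx.unsqueeze(-1).expand(B, N, K, C))

centre = points.unsqueeze(2)                               # (B, N, 1, C)
edge_feat = torch.cat([neighbours - centre,
                       centre.expand(B, N, K, C)], dim=-1) # (B, N, K, 2C)

# Channel-first layout; a shared MLP would act on dim 1, then the features
# are max-pooled over the K neighbours, as in _pool_features(pool_mode='max').
edge_feat = edge_feat.permute(0, 3, 1, 2)                  # (B, 2C, N, K)
pooled = edge_feat.max(dim=-1).values                      # (B, 2C, N)
print(pooled.shape)                                        # torch.Size([2, 6, 64])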
import torch from mmcv.cnn import ConvModule from mmcv.ops.group_points import GroupAll, QueryAndGroup, grouping_operation from torch import nn as nn from torch.nn import functional as F class BaseDGCNNGFModule(nn.Module): """Base module for point graph feature module used in DGCNN. Args: radii (list[float]): List of radius in each knn or ball query. sample_nums (list[int]): Number of samples in each knn or ball query. mlp_channels (list[list[int]]): Specify of the dgcnn before the global pooling for each graph feature module. knn_modes (list[str], optional): Type of KNN method, valid mode ['F-KNN', 'D-KNN'], Defaults to ['F-KNN']. dilated_group (bool, optional): Whether to use dilated ball query. Defaults to False. use_xyz (bool, optional): Whether to use xyz as point features. Defaults to True. pool_mode (str, optional): Type of pooling method. Defaults to 'max'. normalize_xyz (bool, optional): If ball query, whether to normalize local XYZ with radius. Defaults to False. grouper_return_grouped_xyz (bool, optional): Whether to return grouped xyz in `QueryAndGroup`. Defaults to False. grouper_return_grouped_idx (bool, optional): Whether to return grouped idx in `QueryAndGroup`. Defaults to False. """ def __init__(self, radii, sample_nums, mlp_channels, knn_modes=['F-KNN'], dilated_group=False, use_xyz=True, pool_mode='max', normalize_xyz=False, grouper_return_grouped_xyz=False, grouper_return_grouped_idx=False): super(BaseDGCNNGFModule, self).__init__() assert len(sample_nums) == len( mlp_channels ), 'Num_samples and mlp_channels should have the same length.' assert pool_mode in ['max', 'avg' ], "Pool_mode should be one of ['max', 'avg']." assert isinstance(knn_modes, list) or isinstance( knn_modes, tuple), 'The type of knn_modes should be list or tuple.' if isinstance(mlp_channels, tuple): mlp_channels = list(map(list, mlp_channels)) self.mlp_channels = mlp_channels self.pool_mode = pool_mode self.groupers = nn.ModuleList() self.mlps = nn.ModuleList() self.knn_modes = knn_modes for i in range(len(sample_nums)): sample_num = sample_nums[i] if sample_num is not None: if self.knn_modes[i] == 'D-KNN': grouper = QueryAndGroup( radii[i], sample_num, use_xyz=use_xyz, normalize_xyz=normalize_xyz, return_grouped_xyz=grouper_return_grouped_xyz, return_grouped_idx=True) else: grouper = QueryAndGroup( radii[i], sample_num, use_xyz=use_xyz, normalize_xyz=normalize_xyz, return_grouped_xyz=grouper_return_grouped_xyz, return_grouped_idx=grouper_return_grouped_idx) else: grouper = GroupAll(use_xyz) self.groupers.append(grouper) def _pool_features(self, features): """Perform feature aggregation using pooling operation. Args: features (torch.Tensor): (B, C, N, K) Features of locally grouped points before pooling. Returns: torch.Tensor: (B, C, N) Pooled features aggregating local information. """ if self.pool_mode == 'max': # (B, C, N, 1) new_features = F.max_pool2d( features, kernel_size=[1, features.size(3)]) elif self.pool_mode == 'avg': # (B, C, N, 1) new_features = F.avg_pool2d( features, kernel_size=[1, features.size(3)]) else: raise NotImplementedError return new_features.squeeze(-1).contiguous() def forward(self, points): """forward. Args: points (Tensor): (B, N, C) input points. Returns: List[Tensor]: (B, N, C1) new points generated from each graph feature module. 
""" new_points_list = [points] for i in range(len(self.groupers)): new_points = new_points_list[i] new_points_trans = new_points.transpose( 1, 2).contiguous() # (B, C, N) if self.knn_modes[i] == 'D-KNN': # (B, N, C) -> (B, N, K) idx = self.groupers[i](new_points[..., -3:].contiguous(), new_points[..., -3:].contiguous())[-1] grouped_results = grouping_operation( new_points_trans, idx) # (B, C, N) -> (B, C, N, K) grouped_results -= new_points_trans.unsqueeze(-1) else: grouped_results = self.groupers[i]( new_points, new_points) # (B, N, C) -> (B, C, N, K) new_points = new_points_trans.unsqueeze(-1).repeat( 1, 1, 1, grouped_results.shape[-1]) new_points = torch.cat([grouped_results, new_points], dim=1) # (B, mlp[-1], N, K) new_points = self.mlps[i](new_points) # (B, mlp[-1], N) new_points = self._pool_features(new_points) new_points = new_points.transpose(1, 2).contiguous() new_points_list.append(new_points) return new_points class DGCNNGFModule(BaseDGCNNGFModule): """Point graph feature module used in DGCNN. Args: mlp_channels (list[int]): Specify of the dgcnn before the global pooling for each graph feature module. num_sample (int, optional): Number of samples in each knn or ball query. Defaults to None. knn_mode (str, optional): Type of KNN method, valid mode ['F-KNN', 'D-KNN']. Defaults to 'F-KNN'. radius (float, optional): Radius to group with. Defaults to None. dilated_group (bool, optional): Whether to use dilated ball query. Defaults to False. norm_cfg (dict, optional): Type of normalization method. Defaults to dict(type='BN2d'). act_cfg (dict, optional): Type of activation method. Defaults to dict(type='ReLU'). use_xyz (bool, optional): Whether to use xyz as point features. Defaults to True. pool_mode (str, optional): Type of pooling method. Defaults to 'max'. normalize_xyz (bool, optional): If ball query, whether to normalize local XYZ with radius. Defaults to False. bias (bool | str, optional): If specified as `auto`, it will be decided by the norm_cfg. Bias will be set as True if `norm_cfg` is None, otherwise False. Defaults to 'auto'. """ def __init__(self, mlp_channels, num_sample=None, knn_mode='F-KNN', radius=None, dilated_group=False, norm_cfg=dict(type='BN2d'), act_cfg=dict(type='ReLU'), use_xyz=True, pool_mode='max', normalize_xyz=False, bias='auto'): super(DGCNNGFModule, self).__init__( mlp_channels=[mlp_channels], sample_nums=[num_sample], knn_modes=[knn_mode], radii=[radius], use_xyz=use_xyz, pool_mode=pool_mode, normalize_xyz=normalize_xyz, dilated_group=dilated_group) for i in range(len(self.mlp_channels)): mlp_channel = self.mlp_channels[i] mlp = nn.Sequential() for i in range(len(mlp_channel) - 1): mlp.add_module( f'layer{i}', ConvModule( mlp_channel[i], mlp_channel[i + 1], kernel_size=(1, 1), stride=(1, 1), conv_cfg=dict(type='Conv2d'), norm_cfg=norm_cfg, act_cfg=act_cfg, bias=bias)) self.mlps.append(mlp) ================================================ FILE: mmdet3d/ops/norm.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. 
import torch from mmcv.cnn import NORM_LAYERS from mmcv.runner import force_fp32 from torch import distributed as dist from torch import nn as nn from torch.autograd.function import Function class AllReduce(Function): @staticmethod def forward(ctx, input): input_list = [ torch.zeros_like(input) for k in range(dist.get_world_size()) ] # Use allgather instead of allreduce in-place operations is unreliable dist.all_gather(input_list, input, async_op=False) inputs = torch.stack(input_list, dim=0) return torch.sum(inputs, dim=0) @staticmethod def backward(ctx, grad_output): dist.all_reduce(grad_output, async_op=False) return grad_output @NORM_LAYERS.register_module('naiveSyncBN1d') class NaiveSyncBatchNorm1d(nn.BatchNorm1d): """Synchronized Batch Normalization for 3D Tensors. Note: This implementation is modified from https://github.com/facebookresearch/detectron2/ `torch.nn.SyncBatchNorm` has known unknown bugs. It produces significantly worse AP (and sometimes goes NaN) when the batch size on each worker is quite different (e.g., when scale augmentation is used). In 3D detection, different workers has points of different shapes, which also cause instability. Use this implementation before `nn.SyncBatchNorm` is fixed. It is slower than `nn.SyncBatchNorm`. """ def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self.fp16_enabled = False # customized normalization layer still needs this decorator # to force the input to be fp32 and the output to be fp16 # TODO: make mmcv fp16 utils handle customized norm layers @force_fp32(out_fp16=True) def forward(self, input): """ Args: input (tensor): Has shape (N, C) or (N, C, L), where N is the batch size, C is the number of features or channels, and L is the sequence length Returns: tensor: Has shape (N, C) or (N, C, L), has same shape as input. """ assert input.dtype == torch.float32, \ f'input should be in float32 type, got {input.dtype}' using_dist = dist.is_available() and dist.is_initialized() if (not using_dist) or dist.get_world_size() == 1 \ or not self.training: return super().forward(input) assert input.shape[0] > 0, 'SyncBN does not support empty inputs' is_two_dim = input.dim() == 2 if is_two_dim: input = input.unsqueeze(2) C = input.shape[1] mean = torch.mean(input, dim=[0, 2]) meansqr = torch.mean(input * input, dim=[0, 2]) vec = torch.cat([mean, meansqr], dim=0) vec = AllReduce.apply(vec) * (1.0 / dist.get_world_size()) mean, meansqr = torch.split(vec, C) var = meansqr - mean * mean self.running_mean += self.momentum * ( mean.detach() - self.running_mean) self.running_var += self.momentum * (var.detach() - self.running_var) invstd = torch.rsqrt(var + self.eps) scale = self.weight * invstd bias = self.bias - mean * scale scale = scale.reshape(1, -1, 1) bias = bias.reshape(1, -1, 1) output = input * scale + bias if is_two_dim: output = output.squeeze(2) return output @NORM_LAYERS.register_module('naiveSyncBN2d') class NaiveSyncBatchNorm2d(nn.BatchNorm2d): """Synchronized Batch Normalization for 4D Tensors. Note: This implementation is modified from https://github.com/facebookresearch/detectron2/ `torch.nn.SyncBatchNorm` has known unknown bugs. It produces significantly worse AP (and sometimes goes NaN) when the batch size on each worker is quite different (e.g., when scale augmentation is used). This phenomenon also occurs when the multi-modality feature fusion modules of multi-modality detectors use SyncBN. Use this implementation before `nn.SyncBatchNorm` is fixed. It is slower than `nn.SyncBatchNorm`. 
""" def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self.fp16_enabled = False # customized normalization layer still needs this decorator # to force the input to be fp32 and the output to be fp16 # TODO: make mmcv fp16 utils handle customized norm layers @force_fp32(out_fp16=True) def forward(self, input): """ Args: Input (tensor): Feature has shape (N, C, H, W). Returns: tensor: Has shape (N, C, H, W), same shape as input. """ assert input.dtype == torch.float32, \ f'input should be in float32 type, got {input.dtype}' using_dist = dist.is_available() and dist.is_initialized() if (not using_dist) or \ dist.get_world_size() == 1 or \ not self.training: return super().forward(input) assert input.shape[0] > 0, 'SyncBN does not support empty inputs' C = input.shape[1] mean = torch.mean(input, dim=[0, 2, 3]) meansqr = torch.mean(input * input, dim=[0, 2, 3]) vec = torch.cat([mean, meansqr], dim=0) vec = AllReduce.apply(vec) * (1.0 / dist.get_world_size()) mean, meansqr = torch.split(vec, C) var = meansqr - mean * mean self.running_mean += self.momentum * ( mean.detach() - self.running_mean) self.running_var += self.momentum * (var.detach() - self.running_var) invstd = torch.rsqrt(var + self.eps) scale = self.weight * invstd bias = self.bias - mean * scale scale = scale.reshape(1, -1, 1, 1) bias = bias.reshape(1, -1, 1, 1) return input * scale + bias ================================================ FILE: mmdet3d/ops/ops_dcnv3/functions/__init__.py ================================================ # -------------------------------------------------------- # InternImage # Copyright (c) 2022 OpenGVLab # Licensed under The MIT License [see LICENSE for details] # -------------------------------------------------------- from .dcnv3_func import DCNv3Function, dcnv3_core_pytorch ================================================ FILE: mmdet3d/ops/ops_dcnv3/functions/dcnv3_func.py ================================================ # -------------------------------------------------------- # InternImage # Copyright (c) 2022 OpenGVLab # Licensed under The MIT License [see LICENSE for details] # -------------------------------------------------------- from __future__ import absolute_import from __future__ import print_function from __future__ import division import torch import torch.nn.functional as F from torch.autograd import Function from torch.autograd.function import once_differentiable from torch.cuda.amp import custom_bwd, custom_fwd import DCNv3 class DCNv3Function(Function): @staticmethod @custom_fwd def forward( ctx, input, offset, mask, kernel_h, kernel_w, stride_h, stride_w, pad_h, pad_w, dilation_h, dilation_w, group, group_channels, offset_scale, im2col_step): ctx.kernel_h = kernel_h ctx.kernel_w = kernel_w ctx.stride_h = stride_h ctx.stride_w = stride_w ctx.pad_h = pad_h ctx.pad_w = pad_w ctx.dilation_h = dilation_h ctx.dilation_w = dilation_w ctx.group = group ctx.group_channels = group_channels ctx.offset_scale = offset_scale ctx.im2col_step = im2col_step output = DCNv3.dcnv3_forward( input, offset, mask, kernel_h, kernel_w, stride_h, stride_w, pad_h, pad_w, dilation_h, dilation_w, group, group_channels, offset_scale, ctx.im2col_step) ctx.save_for_backward(input, offset, mask) return output @staticmethod @once_differentiable @custom_bwd def backward(ctx, grad_output): input, offset, mask = ctx.saved_tensors grad_input, grad_offset, grad_mask = \ DCNv3.dcnv3_backward( input, offset, mask, ctx.kernel_h, ctx.kernel_w, ctx.stride_h, ctx.stride_w, ctx.pad_h, ctx.pad_w, 
ctx.dilation_h, ctx.dilation_w, ctx.group, ctx.group_channels, ctx.offset_scale, grad_output.contiguous(), ctx.im2col_step) return grad_input, grad_offset, grad_mask, \ None, None, None, None, None, None, None, None, None, None, None, None @staticmethod def symbolic(g, input, offset, mask, kernel_h, kernel_w, stride_h, stride_w, pad_h, pad_w, dilation_h, dilation_w, group, group_channels, offset_scale, im2col_step): """Symbolic function for mmdeploy::DCNv3. Returns: DCNv3 op for onnx. """ return g.op( 'mmdeploy::TRTDCNv3', input, offset, mask, kernel_h_i=int(kernel_h), kernel_w_i=int(kernel_w), stride_h_i=int(stride_h), stride_w_i=int(stride_w), pad_h_i=int(pad_h), pad_w_i=int(pad_w), dilation_h_i=int(dilation_h), dilation_w_i=int(dilation_w), group_i=int(group), group_channels_i=int(group_channels), offset_scale_f=float(offset_scale), im2col_step_i=int(im2col_step), ) def _get_reference_points(spatial_shapes, device, kernel_h, kernel_w, dilation_h, dilation_w, pad_h=0, pad_w=0, stride_h=1, stride_w=1): _, H_, W_, _ = spatial_shapes H_out = (H_ - (dilation_h * (kernel_h - 1) + 1)) // stride_h + 1 W_out = (W_ - (dilation_w * (kernel_w - 1) + 1)) // stride_w + 1 ref_y, ref_x = torch.meshgrid( torch.linspace( # pad_h + 0.5, # H_ - pad_h - 0.5, (dilation_h * (kernel_h - 1)) // 2 + 0.5, (dilation_h * (kernel_h - 1)) // 2 + 0.5 + (H_out - 1) * stride_h, H_out, dtype=torch.float32, device=device), torch.linspace( # pad_w + 0.5, # W_ - pad_w - 0.5, (dilation_w * (kernel_w - 1)) // 2 + 0.5, (dilation_w * (kernel_w - 1)) // 2 + 0.5 + (W_out - 1) * stride_w, W_out, dtype=torch.float32, device=device)) ref_y = ref_y.reshape(-1)[None] / H_ ref_x = ref_x.reshape(-1)[None] / W_ ref = torch.stack((ref_x, ref_y), -1).reshape( 1, H_out, W_out, 1, 2) return ref def _generate_dilation_grids(spatial_shapes, kernel_h, kernel_w, dilation_h, dilation_w, group, device): _, H_, W_, _ = spatial_shapes points_list = [] x, y = torch.meshgrid( torch.linspace( -((dilation_w * (kernel_w - 1)) // 2), -((dilation_w * (kernel_w - 1)) // 2) + (kernel_w - 1) * dilation_w, kernel_w, dtype=torch.float32, device=device), torch.linspace( -((dilation_h * (kernel_h - 1)) // 2), -((dilation_h * (kernel_h - 1)) // 2) + (kernel_h - 1) * dilation_h, kernel_h, dtype=torch.float32, device=device)) points_list.extend([x / W_, y / H_]) grid = torch.stack(points_list, -1).reshape(-1, 1, 2).\ repeat(1, group, 1).permute(1, 0, 2) grid = grid.reshape(1, 1, 1, group * kernel_h * kernel_w, 2) return grid def dcnv3_core_pytorch( input, offset, mask, kernel_h, kernel_w, stride_h, stride_w, pad_h, pad_w, dilation_h, dilation_w, group, group_channels, offset_scale): # for debug and test only, # need to use cuda version instead input = F.pad( input, [0, 0, pad_h, pad_h, pad_w, pad_w]) N_, H_in, W_in, _ = input.shape _, H_out, W_out, _ = offset.shape ref = _get_reference_points( input.shape, input.device, kernel_h, kernel_w, dilation_h, dilation_w, pad_h, pad_w, stride_h, stride_w) grid = _generate_dilation_grids( input.shape, kernel_h, kernel_w, dilation_h, dilation_w, group, input.device) spatial_norm = torch.tensor([W_in, H_in]).reshape(1, 1, 1, 2).\ repeat(1, 1, 1, group*kernel_h*kernel_w).to(input.device) sampling_locations = (ref + grid * offset_scale).repeat(N_, 1, 1, 1, 1).flatten(3, 4) + \ offset * offset_scale / spatial_norm P_ = kernel_h * kernel_w sampling_grids = 2 * sampling_locations - 1 # N_, H_in, W_in, group*group_channels -> N_, H_in*W_in, group*group_channels -> N_, group*group_channels, H_in*W_in -> N_*group, group_channels, H_in, 
W_in input_ = input.view(N_, H_in*W_in, group*group_channels).transpose(1, 2).\ reshape(N_*group, group_channels, H_in, W_in) # N_, H_out, W_out, group*P_*2 -> N_, H_out*W_out, group, P_, 2 -> N_, group, H_out*W_out, P_, 2 -> N_*group, H_out*W_out, P_, 2 sampling_grid_ = sampling_grids.view(N_, H_out*W_out, group, P_, 2).transpose(1, 2).\ flatten(0, 1) # N_*group, group_channels, H_out*W_out, P_ sampling_input_ = F.grid_sample( input_, sampling_grid_, mode='bilinear', padding_mode='zeros', align_corners=False) # (N_, H_out, W_out, group*P_) -> N_, H_out*W_out, group, P_ -> (N_, group, H_out*W_out, P_) -> (N_*group, 1, H_out*W_out, P_) mask = mask.view(N_, H_out*W_out, group, P_).transpose(1, 2).\ reshape(N_*group, 1, H_out*W_out, P_) output = (sampling_input_ * mask).sum(-1).view(N_, group*group_channels, H_out*W_out) return output.transpose(1, 2).reshape(N_, H_out, W_out, -1).contiguous() ================================================ FILE: mmdet3d/ops/ops_dcnv3/make.sh ================================================ #!/usr/bin/env bash # -------------------------------------------------------- # InternImage # Copyright (c) 2022 OpenGVLab # Licensed under The MIT License [see LICENSE for details] # -------------------------------------------------------- python setup.py build install ================================================ FILE: mmdet3d/ops/ops_dcnv3/modules/__init__.py ================================================ # -------------------------------------------------------- # InternImage # Copyright (c) 2022 OpenGVLab # Licensed under The MIT License [see LICENSE for details] # -------------------------------------------------------- from .dcnv3 import DCNv3, DCNv3_pytorch ================================================ FILE: mmdet3d/ops/ops_dcnv3/modules/dcnv3.py ================================================ # -------------------------------------------------------- # InternImage # Copyright (c) 2022 OpenGVLab # Licensed under The MIT License [see LICENSE for details] # -------------------------------------------------------- from __future__ import absolute_import from __future__ import print_function from __future__ import division import warnings import torch from torch import nn import torch.nn.functional as F from torch.nn.init import xavier_uniform_, constant_ from ..functions import DCNv3Function, dcnv3_core_pytorch class to_channels_first(nn.Module): def __init__(self): super().__init__() def forward(self, x): return x.permute(0, 3, 1, 2) class to_channels_last(nn.Module): def __init__(self): super().__init__() def forward(self, x): return x.permute(0, 2, 3, 1) def build_norm_layer(dim, norm_layer, in_format='channels_last', out_format='channels_last', eps=1e-6): layers = [] if norm_layer == 'BN': if in_format == 'channels_last': layers.append(to_channels_first()) layers.append(nn.BatchNorm2d(dim)) if out_format == 'channels_last': layers.append(to_channels_last()) elif norm_layer == 'LN': if in_format == 'channels_first': layers.append(to_channels_last()) layers.append(nn.LayerNorm(dim, eps=eps)) if out_format == 'channels_first': layers.append(to_channels_first()) else: raise NotImplementedError( f'build_norm_layer does not support {norm_layer}') return nn.Sequential(*layers) def build_act_layer(act_layer): if act_layer == 'ReLU': return nn.ReLU(inplace=True) elif act_layer == 'SiLU': return nn.SiLU(inplace=True) elif act_layer == 'GELU': return nn.GELU() raise NotImplementedError(f'build_act_layer does not support {act_layer}') def _is_power_of_2(n): if (not 
isinstance(n, int)) or (n < 0): raise ValueError( "invalid input for _is_power_of_2: {} (type: {})".format(n, type(n))) return (n & (n - 1) == 0) and n != 0 class CenterFeatureScaleModule(nn.Module): def forward(self, query, center_feature_scale_proj_weight, center_feature_scale_proj_bias): center_feature_scale = F.linear(query, weight=center_feature_scale_proj_weight, bias=center_feature_scale_proj_bias).sigmoid() return center_feature_scale class DCNv3_pytorch(nn.Module): def __init__( self, channels=64, kernel_size=3, dw_kernel_size=None, stride=1, pad=1, dilation=1, group=4, offset_scale=1.0, act_layer='GELU', norm_layer='LN', center_feature_scale=False): """ DCNv3 Module :param channels :param kernel_size :param stride :param pad :param dilation :param group :param offset_scale :param act_layer :param norm_layer """ super().__init__() if channels % group != 0: raise ValueError( f'channels must be divisible by group, but got {channels} and {group}') _d_per_group = channels // group dw_kernel_size = dw_kernel_size if dw_kernel_size is not None else kernel_size # you'd better set _d_per_group to a power of 2 which is more efficient in our CUDA implementation if not _is_power_of_2(_d_per_group): warnings.warn( "You'd better set channels in DCNv3 to make the dimension of each attention head a power of 2 " "which is more efficient in our CUDA implementation.") self.offset_scale = offset_scale self.channels = channels self.kernel_size = kernel_size self.dw_kernel_size = dw_kernel_size self.stride = stride self.dilation = dilation self.pad = pad self.group = group self.group_channels = channels // group self.offset_scale = offset_scale self.center_feature_scale = center_feature_scale self.dw_conv = nn.Sequential( nn.Conv2d( channels, channels, kernel_size=dw_kernel_size, stride=1, padding=(dw_kernel_size - 1) // 2, groups=channels), build_norm_layer( channels, norm_layer, 'channels_first', 'channels_last'), build_act_layer(act_layer)) self.offset = nn.Linear( channels, group * kernel_size * kernel_size * 2) self.mask = nn.Linear( channels, group * kernel_size * kernel_size) self.input_proj = nn.Linear(channels, channels) self.output_proj = nn.Linear(channels, channels) self._reset_parameters() if center_feature_scale: self.center_feature_scale_proj_weight = nn.Parameter( torch.zeros((group, channels), dtype=torch.float)) self.center_feature_scale_proj_bias = nn.Parameter( torch.tensor(0.0, dtype=torch.float).view((1,)).repeat(group, )) self.center_feature_scale_module = CenterFeatureScaleModule() def _reset_parameters(self): constant_(self.offset.weight.data, 0.) constant_(self.offset.bias.data, 0.) constant_(self.mask.weight.data, 0.) constant_(self.mask.bias.data, 0.) xavier_uniform_(self.input_proj.weight.data) constant_(self.input_proj.bias.data, 0.) xavier_uniform_(self.output_proj.weight.data) constant_(self.output_proj.bias.data, 0.) 
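The forward below ultimately relies on dcnv3_core_pytorch (defined earlier in functions/dcnv3_func.py), which turns the learned offsets into grid_sample coordinates by adding a per-pixel reference point and a fixed dilated-kernel grid, both normalised to [0, 1] and then mapped to [-1, 1]. Here is a condensed sketch of that construction for a single group; padding and the learned offset term are omitted and all shapes are illustrative.

import torch

H, W, kernel, dilation = 8, 8, 3, 1

# Per-output-pixel reference points, normalised to [0, 1].
ref_y, ref_x = torch.meshgrid(
    torch.arange(H, dtype=torch.float32) + 0.5,
    torch.arange(W, dtype=torch.float32) + 0.5,
    indexing='ij')
ref = torch.stack([ref_x / W, ref_y / H], dim=-1)             # (H, W, 2)

# Fixed (dilated) kernel grid around each reference point.
k = (torch.arange(kernel, dtype=torch.float32) - (kernel - 1) / 2) * dilation
gy, gx = torch.meshgrid(k, k, indexing='ij')
grid = torch.stack([gx / W, gy / H], dim=-1).reshape(-1, 2)   # (kernel*kernel, 2)

# Sampling locations before the learned offsets are added; grid_sample
# finally expects coordinates in [-1, 1].
locations = ref[:, :, None, :] + grid[None, None, :, :]       # (H, W, K, 2)
sampling_grid = 2 * locations - 1
print(sampling_grid.shape)                                    # torch.Size([8, 8, 9, 2])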
def forward(self, input): """ :param query (N, H, W, C) :return output (N, H, W, C) """ N, H, W, _ = input.shape x = self.input_proj(input) x_proj = x x1 = input.permute(0, 3, 1, 2) x1 = self.dw_conv(x1) offset = self.offset(x1) mask = self.mask(x1).reshape(N, H, W, self.group, -1) mask = F.softmax(mask, -1).reshape(N, H, W, -1) x = dcnv3_core_pytorch( x, offset, mask, self.kernel_size, self.kernel_size, self.stride, self.stride, self.pad, self.pad, self.dilation, self.dilation, self.group, self.group_channels, self.offset_scale) if self.center_feature_scale: center_feature_scale = self.center_feature_scale_module( x1, self.center_feature_scale_proj_weight, self.center_feature_scale_proj_bias) # N, H, W, groups -> N, H, W, groups, 1 -> N, H, W, groups, _d_per_group -> N, H, W, channels center_feature_scale = center_feature_scale[..., None].repeat( 1, 1, 1, 1, self.channels // self.group).flatten(-2) x = x * (1 - center_feature_scale) + x_proj * center_feature_scale x = self.output_proj(x) return x class DCNv3(nn.Module): def __init__( self, channels=64, kernel_size=3, dw_kernel_size=None, stride=1, pad=1, dilation=1, group=4, offset_scale=1.0, act_layer='GELU', norm_layer='LN', center_feature_scale=False): """ DCNv3 Module :param channels :param kernel_size :param stride :param pad :param dilation :param group :param offset_scale :param act_layer :param norm_layer """ super().__init__() if channels % group != 0: raise ValueError( f'channels must be divisible by group, but got {channels} and {group}') _d_per_group = channels // group dw_kernel_size = dw_kernel_size if dw_kernel_size is not None else kernel_size # you'd better set _d_per_group to a power of 2 which is more efficient in our CUDA implementation if not _is_power_of_2(_d_per_group): warnings.warn( "You'd better set channels in DCNv3 to make the dimension of each attention head a power of 2 " "which is more efficient in our CUDA implementation.") self.offset_scale = offset_scale self.channels = channels self.kernel_size = kernel_size self.dw_kernel_size = dw_kernel_size self.stride = stride self.dilation = dilation self.pad = pad self.group = group self.group_channels = channels // group self.offset_scale = offset_scale self.center_feature_scale = center_feature_scale self.dw_conv = nn.Sequential( nn.Conv2d( channels, channels, kernel_size=dw_kernel_size, stride=1, padding=(dw_kernel_size - 1) // 2, groups=channels), build_norm_layer( channels, norm_layer, 'channels_first', 'channels_last'), build_act_layer(act_layer)) self.offset = nn.Linear( channels, group * kernel_size * kernel_size * 2) self.mask = nn.Linear( channels, group * kernel_size * kernel_size) self.input_proj = nn.Linear(channels, channels) self.output_proj = nn.Linear(channels, channels) self._reset_parameters() if center_feature_scale: self.center_feature_scale_proj_weight = nn.Parameter( torch.zeros((group, channels), dtype=torch.float)) self.center_feature_scale_proj_bias = nn.Parameter( torch.tensor(0.0, dtype=torch.float).view((1,)).repeat(group, )) self.center_feature_scale_module = CenterFeatureScaleModule() def _reset_parameters(self): constant_(self.offset.weight.data, 0.) constant_(self.offset.bias.data, 0.) constant_(self.mask.weight.data, 0.) constant_(self.mask.bias.data, 0.) xavier_uniform_(self.input_proj.weight.data) constant_(self.input_proj.bias.data, 0.) xavier_uniform_(self.output_proj.weight.data) constant_(self.output_proj.bias.data, 0.) 
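A hypothetical usage sketch of the two modules defined in this file: DCNv3_pytorch computes everything with dcnv3_core_pytorch (F.grid_sample), while DCNv3 below dispatches to the compiled extension. Note that functions/dcnv3_func.py imports the compiled DCNv3 package at module level, so this sketch assumes the extension has been built first (bash make.sh in mmdet3d/ops/ops_dcnv3) even for the pure-PyTorch variant; the import path is also an assumption. Inputs are channels-last (N, H, W, C) and channels must be divisible by group.

import torch

# Assumes the compiled DCNv3 extension has been built (bash make.sh),
# since the functions sub-package imports it unconditionally.
from mmdet3d.ops.ops_dcnv3.modules import DCNv3, DCNv3_pytorch

x = torch.randn(2, 32, 32, 64)                    # (N, H, W, C), channels-last
ref = DCNv3_pytorch(channels=64, kernel_size=3, group=4)
print(ref(x).shape)                               # torch.Size([2, 32, 32, 64])

cuda_layer = DCNv3(channels=64, kernel_size=3, group=4).cuda()
print(cuda_layer(x.cuda()).shape)                 # same shape, CUDA kernel path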
def forward(self, input): """ :param query (N, H, W, C) :return output (N, H, W, C) """ N, H, W, _ = input.shape x = self.input_proj(input) x_proj = x dtype = x.dtype x1 = input.permute(0, 3, 1, 2) x1 = self.dw_conv(x1) offset = self.offset(x1) mask = self.mask(x1).reshape(N, H, W, self.group, -1) mask = F.softmax(mask, -1).reshape(N, H, W, -1).type(dtype) x = DCNv3Function.apply( x, offset, mask, self.kernel_size, self.kernel_size, self.stride, self.stride, self.pad, self.pad, self.dilation, self.dilation, self.group, self.group_channels, self.offset_scale, 256) if self.center_feature_scale: center_feature_scale = self.center_feature_scale_module( x1, self.center_feature_scale_proj_weight, self.center_feature_scale_proj_bias) # N, H, W, groups -> N, H, W, groups, 1 -> N, H, W, groups, _d_per_group -> N, H, W, channels center_feature_scale = center_feature_scale[..., None].repeat( 1, 1, 1, 1, self.channels // self.group).flatten(-2) x = x * (1 - center_feature_scale) + x_proj * center_feature_scale x = self.output_proj(x) return x ================================================ FILE: mmdet3d/ops/ops_dcnv3/setup.py ================================================ # -------------------------------------------------------- # InternImage # Copyright (c) 2022 OpenGVLab # Licensed under The MIT License [see LICENSE for details] # -------------------------------------------------------- import os import glob import torch from torch.utils.cpp_extension import CUDA_HOME from torch.utils.cpp_extension import CppExtension from torch.utils.cpp_extension import CUDAExtension from setuptools import find_packages from setuptools import setup requirements = ["torch", "torchvision"] def get_extensions(): this_dir = os.path.dirname(os.path.abspath(__file__)) extensions_dir = os.path.join(this_dir, "src") main_file = glob.glob(os.path.join(extensions_dir, "*.cpp")) source_cpu = glob.glob(os.path.join(extensions_dir, "cpu", "*.cpp")) source_cuda = glob.glob(os.path.join(extensions_dir, "cuda", "*.cu")) sources = main_file + source_cpu extension = CppExtension extra_compile_args = {"cxx": []} define_macros = [] if torch.cuda.is_available() and CUDA_HOME is not None: extension = CUDAExtension sources += source_cuda define_macros += [("WITH_CUDA", None)] extra_compile_args["nvcc"] = [ # "-DCUDA_HAS_FP16=1", # "-D__CUDA_NO_HALF_OPERATORS__", # "-D__CUDA_NO_HALF_CONVERSIONS__", # "-D__CUDA_NO_HALF2_OPERATORS__", ] else: raise NotImplementedError('Cuda is not availabel') sources = [os.path.join(extensions_dir, s) for s in sources] include_dirs = [extensions_dir] ext_modules = [ extension( "DCNv3", sources, include_dirs=include_dirs, define_macros=define_macros, extra_compile_args=extra_compile_args, ) ] return ext_modules setup( name="DCNv3", version="1.0", author="InternImage", url="https://github.com/OpenGVLab/InternImage", description= "PyTorch Wrapper for CUDA Functions of DCNv3", packages=find_packages(exclude=( "configs", "tests", )), ext_modules=get_extensions(), cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension}, ) ================================================ FILE: mmdet3d/ops/ops_dcnv3/src/cpu/dcnv3_cpu.cpp ================================================ /*! 
************************************************************************************************** * InternImage * Copyright (c) 2022 OpenGVLab * Licensed under The MIT License [see LICENSE for details] ************************************************************************************************** * Modified from *https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 ************************************************************************************************** */ #include #include #include at::Tensor dcnv3_cpu_forward(const at::Tensor &input, const at::Tensor &offset, const at::Tensor &mask, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, const int dilation_h, const int dilation_w, const int group, const int group_channels, const float offset_scale, const int im2col_step) { AT_ERROR("Not implement on cpu"); } std::vector dcnv3_cpu_backward(const at::Tensor &input, const at::Tensor &offset, const at::Tensor &mask, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, const int dilation_h, const int dilation_w, const int group, const int group_channels, const float offset_scale, const at::Tensor &grad_output, const int im2col_step) { AT_ERROR("Not implement on cpu"); } ================================================ FILE: mmdet3d/ops/ops_dcnv3/src/cpu/dcnv3_cpu.h ================================================ /*! ************************************************************************************************** * InternImage * Copyright (c) 2022 OpenGVLab * Licensed under The MIT License [see LICENSE for details] ************************************************************************************************** * Modified from *https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 ************************************************************************************************** */ #pragma once #include at::Tensor dcnv3_cpu_forward(const at::Tensor &input, const at::Tensor &offset, const at::Tensor &mask, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, const int dilation_h, const int dilation_w, const int group, const int group_channels, const float offset_scale, const int im2col_step); std::vector dcnv3_cpu_backward(const at::Tensor &input, const at::Tensor &offset, const at::Tensor &mask, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, const int dilation_h, const int dilation_w, const int group, const int group_channels, const float offset_scale, const at::Tensor &grad_output, const int im2col_step); ================================================ FILE: mmdet3d/ops/ops_dcnv3/src/cuda/dcnv3_cuda.cu ================================================ /*! 
************************************************************************************************** * InternImage * Copyright (c) 2022 OpenGVLab * Licensed under The MIT License [see LICENSE for details] ************************************************************************************************** * Modified from *https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 ************************************************************************************************** */ #include "cuda/dcnv3_im2col_cuda.cuh" #include #include #include #include #include #include at::Tensor dcnv3_cuda_forward(const at::Tensor &input, const at::Tensor &offset, const at::Tensor &mask, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, const int dilation_h, const int dilation_w, const int group, const int group_channels, const float offset_scale, const int im2col_step) { AT_ASSERTM(input.is_contiguous(), "input tensor has to be contiguous"); AT_ASSERTM(offset.is_contiguous(), "offset tensor has to be contiguous"); AT_ASSERTM(mask.is_contiguous(), "mask tensor has to be contiguous"); AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor"); AT_ASSERTM(offset.type().is_cuda(), "offset must be a CUDA tensor"); AT_ASSERTM(mask.type().is_cuda(), "mask must be a CUDA tensor"); const int batch = input.size(0); const int height_in = input.size(1); const int width_in = input.size(2); const int channels = input.size(3); const int height_out = (height_in + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1; const int width_out = (width_in + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1; const int im2col_step_ = std::min(batch, im2col_step); AT_ASSERTM(batch % im2col_step_ == 0, "batch(%d) must divide im2col_step(%d)", batch, im2col_step_); AT_ASSERTM( channels == (group * group_channels), "Input channels and group times group channels wont match: (%d vs %d).", channels, group * group_channels); auto output = at::zeros({batch, height_out, width_out, group * group_channels}, input.options()); const int batch_n = im2col_step_; auto output_n = output.view({batch / batch_n, batch_n, height_out, width_out, group * group_channels}); auto per_input_size = height_in * width_in * group * group_channels; auto per_offset_size = height_out * width_out * group * kernel_h * kernel_w * 2; auto per_mask_size = height_out * width_out * group * kernel_h * kernel_w; for (int n = 0; n < batch / im2col_step_; ++n) { auto columns = output_n.select(0, n); // AT_DISPATCH_FLOATING_TYPES( AT_DISPATCH_FLOATING_TYPES_AND_HALF( input.type(), "ms_deform_attn_forward_cuda", ([&] { dcnv3_im2col_cuda( at::cuda::getCurrentCUDAStream(), input.data() + n * im2col_step_ * per_input_size, offset.data() + n * im2col_step_ * per_offset_size, mask.data() + n * im2col_step_ * per_mask_size, columns.data(), kernel_h, kernel_w, stride_h, stride_w, pad_h, pad_w, dilation_h, dilation_w, group, group_channels, batch_n, height_in, width_in, height_out, width_out, offset_scale); })); } return output; } std::vector dcnv3_cuda_backward(const at::Tensor &input, const at::Tensor &offset, const at::Tensor &mask, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, const int dilation_h, const int dilation_w, const int group, const int group_channels, const float offset_scale, const at::Tensor &grad_output, const int im2col_step) { AT_ASSERTM(input.is_contiguous(), "input tensor has to be 
contiguous"); AT_ASSERTM(offset.is_contiguous(), "offset tensor has to be contiguous"); AT_ASSERTM(mask.is_contiguous(), "mask tensor has to be contiguous"); AT_ASSERTM(grad_output.is_contiguous(), "grad_output tensor has to be contiguous"); AT_ASSERTM(input.type().is_cuda(), "input must be a CUDA tensor"); AT_ASSERTM(offset.type().is_cuda(), "offset must be a CUDA tensor"); AT_ASSERTM(mask.type().is_cuda(), "mask must be a CUDA tensor"); AT_ASSERTM(grad_output.type().is_cuda(), "grad_output must be a CUDA tensor"); const int batch = input.size(0); const int height_in = input.size(1); const int width_in = input.size(2); const int channels = input.size(3); const int height_out = (height_in + 2 * pad_h - (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1; const int width_out = (width_in + 2 * pad_w - (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1; const int im2col_step_ = std::min(batch, im2col_step); AT_ASSERTM(batch % im2col_step_ == 0, "batch(%d) must divide im2col_step(%d)", batch, im2col_step_); AT_ASSERTM( channels == (group * group_channels), "Input channels and group times group channels wont match: (%d vs %d).", channels, group * group_channels); auto dtype = input.dtype(); if (dtype == at::kHalf) { dtype = at::kFloat; } auto grad_input = at::zeros_like(input, dtype); auto grad_offset = at::zeros_like(offset, dtype); auto grad_mask = at::zeros_like(mask, dtype); const int batch_n = im2col_step_; auto per_input_size = height_in * width_in * group * group_channels; auto per_offset_size = height_out * width_out * group * kernel_h * kernel_w * 2; auto per_mask_size = height_out * width_out * group * kernel_h * kernel_w; auto grad_output_n = grad_output.view({batch / im2col_step_, batch_n, height_out * width_out, group, group_channels}); for (int n = 0; n < batch / im2col_step_; ++n) { auto grad_output_g = grad_output_n.select(0, n); // AT_DISPATCH_FLOATING_TYPES( AT_DISPATCH_FLOATING_TYPES_AND_HALF( input.type(), "ms_deform_attn_backward_cuda", ([&] { dcnv3_col2im_cuda( at::cuda::getCurrentCUDAStream(), grad_output_g.data(), input.data() + n * im2col_step_ * per_input_size, offset.data() + n * im2col_step_ * per_offset_size, mask.data() + n * im2col_step_ * per_mask_size, kernel_h, kernel_w, stride_h, stride_w, pad_h, pad_w, dilation_h, dilation_w, group, group_channels, batch_n, height_in, width_in, height_out, width_out, offset_scale, grad_input.data() + n * im2col_step_ * per_input_size, grad_offset.data() + n * im2col_step_ * per_offset_size, grad_mask.data() + n * im2col_step_ * per_mask_size); })); } if (input.dtype() == torch::kHalf) { return {grad_input.to(torch::kHalf), grad_offset.to(torch::kHalf), grad_mask.to(torch::kHalf)}; } else { return {grad_input, grad_offset, grad_mask}; } } ================================================ FILE: mmdet3d/ops/ops_dcnv3/src/cuda/dcnv3_cuda.h ================================================ /*! 
************************************************************************************************** * InternImage * Copyright (c) 2022 OpenGVLab * Licensed under The MIT License [see LICENSE for details] ************************************************************************************************** * Modified from *https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 ************************************************************************************************** */ #pragma once #include at::Tensor dcnv3_cuda_forward(const at::Tensor &input, const at::Tensor &offset, const at::Tensor &mask, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, const int dilation_h, const int dilation_w, const int group, const int group_channels, const float offset_scale, const int im2col_step); std::vector dcnv3_cuda_backward(const at::Tensor &input, const at::Tensor &offset, const at::Tensor &mask, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, const int dilation_h, const int dilation_w, const int group, const int group_channels, const float offset_scale, const at::Tensor &grad_output, const int im2col_step); ================================================ FILE: mmdet3d/ops/ops_dcnv3/src/cuda/dcnv3_im2col_cuda.cuh ================================================ /*! ************************************************************************************************** * InternImage * Copyright (c) 2022 OpenGVLab * Licensed under The MIT License [see LICENSE for details] ************************************************************************************************** * Modified from *https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 ************************************************************************************************** */ #include #include #include #include #include #include #include #define CUDA_KERNEL_LOOP(i, n) \ for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ i += blockDim.x * gridDim.x) const int CUDA_NUM_THREADS = 256; inline int GET_BLOCKS(const int N, const int num_threads) { return (N + num_threads - 1) / num_threads; } #define opmath_t at::opmath_type template __device__ opmath_t dcnv3_im2col_bilinear(const scalar_t *&bottom_data, const int &height, const int &width, const int &group, const int &group_channels, const opmath_t &h, const opmath_t &w, const int &g, const int &c) { const int h_low = floor(h); const int w_low = floor(w); const int h_high = h_low + 1; const int w_high = w_low + 1; const opmath_t lh = h - h_low; const opmath_t lw = w - w_low; const opmath_t hh = 1 - lh, hw = 1 - lw; const int w_stride = group * group_channels; const int h_stride = width * w_stride; const int h_low_ptr_offset = h_low * h_stride; const int h_high_ptr_offset = h_low_ptr_offset + h_stride; const int w_low_ptr_offset = w_low * w_stride; const int w_high_ptr_offset = w_low_ptr_offset + w_stride; const int base_ptr = g * group_channels + c; opmath_t v1 = 0; if (h_low >= 0 && w_low >= 0) { const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr; v1 = bottom_data[ptr1]; } opmath_t v2 = 0; if (h_low >= 0 && w_high <= width - 1) { const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr; v2 = bottom_data[ptr2]; } opmath_t v3 = 0; if (h_high <= height - 1 && w_low >= 0) { const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr; v3 = bottom_data[ptr3]; } opmath_t v4 = 0; if 
(h_high <= height - 1 && w_high <= width - 1) { const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr; v4 = bottom_data[ptr4]; } const opmath_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; const opmath_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); return val; } template __device__ void dcnv3_col2im_bilinear( const scalar_t *&bottom_data, const int &height, const int &width, const int &nheads, const int &group_channels, const opmath_t &h, const opmath_t &w, const int &m, const int &c, const opmath_t offset_scale, const opmath_t &top_grad, const opmath_t &mask, opmath_t *&grad_im, opmath_t *grad_offset, opmath_t *grad_mask) { const int h_low = floor(h); const int w_low = floor(w); const int h_high = h_low + 1; const int w_high = w_low + 1; const opmath_t lh = h - h_low; const opmath_t lw = w - w_low; const opmath_t hh = 1 - lh, hw = 1 - lw; const int w_stride = nheads * group_channels; const int h_stride = width * w_stride; const int h_low_ptr_offset = h_low * h_stride; const int h_high_ptr_offset = h_low_ptr_offset + h_stride; const int w_low_ptr_offset = w_low * w_stride; const int w_high_ptr_offset = w_low_ptr_offset + w_stride; const int base_ptr = m * group_channels + c; const opmath_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; const opmath_t top_grad_im = top_grad * mask; opmath_t grad_h_weight = 0, grad_w_weight = 0; opmath_t v1 = 0; if (h_low >= 0 && w_low >= 0) { const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr; v1 = bottom_data[ptr1]; grad_h_weight -= hw * v1; grad_w_weight -= hh * v1; atomicAdd(grad_im + ptr1, w1 * top_grad_im); } opmath_t v2 = 0; if (h_low >= 0 && w_high <= width - 1) { const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr; v2 = bottom_data[ptr2]; grad_h_weight -= lw * v2; grad_w_weight += hh * v2; atomicAdd(grad_im + ptr2, w2 * top_grad_im); } opmath_t v3 = 0; if (h_high <= height - 1 && w_low >= 0) { const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr; v3 = bottom_data[ptr3]; grad_h_weight += hw * v3; grad_w_weight -= lh * v3; atomicAdd(grad_im + ptr3, w3 * top_grad_im); } opmath_t v4 = 0; if (h_high <= height - 1 && w_high <= width - 1) { const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr; v4 = bottom_data[ptr4]; grad_h_weight += lw * v4; grad_w_weight += lh * v4; atomicAdd(grad_im + ptr4, w4 * top_grad_im); } const opmath_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); *grad_mask = top_grad * val; *grad_offset = offset_scale * grad_w_weight * top_grad_im; *(grad_offset + 1) = offset_scale * grad_h_weight * top_grad_im; } template __device__ void dcnv3_col2im_bilinear_gm( const scalar_t *&bottom_data, const int &height, const int &width, const int &nheads, const int &group_channels, const opmath_t &h, const opmath_t &w, const int &m, const int &c, const opmath_t offset_scale, const opmath_t &top_grad, const opmath_t &mask, opmath_t *&grad_im, opmath_t *grad_offset, opmath_t *grad_mask) { const int h_low = floor(h); const int w_low = floor(w); const int h_high = h_low + 1; const int w_high = w_low + 1; const opmath_t lh = h - h_low; const opmath_t lw = w - w_low; const opmath_t hh = 1 - lh, hw = 1 - lw; const int w_stride = nheads * group_channels; const int h_stride = width * w_stride; const int h_low_ptr_offset = h_low * h_stride; const int h_high_ptr_offset = h_low_ptr_offset + h_stride; const int w_low_ptr_offset = w_low * w_stride; const int w_high_ptr_offset = w_low_ptr_offset + w_stride; const int base_ptr = m * group_channels + c; const opmath_t w1 = hh * hw, 
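// Bilinear sampling used by the helpers above: (lh, lw) are the fractional
// parts of the sampling location, giving the four corner weights
//   w1 = (1 - lh) * (1 - lw), w2 = (1 - lh) * lw,
//   w3 = lh * (1 - lw),       w4 = lh * lw,
// and the interpolated value val = w1*v1 + w2*v2 + w3*v3 + w4*v4.
// In the backward helpers, top_grad * mask is scattered back onto the four
// corner pixels with these same weights via atomicAdd (grad_im), grad_offset
// is offset_scale times the derivative of val with respect to the sampling
// location times top_grad * mask, and grad_mask is top_grad * val.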
w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; const opmath_t top_grad_im = top_grad * mask; opmath_t grad_h_weight = 0, grad_w_weight = 0; opmath_t v1 = 0; if (h_low >= 0 && w_low >= 0) { const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr; v1 = bottom_data[ptr1]; grad_h_weight -= hw * v1; grad_w_weight -= hh * v1; atomicAdd(grad_im + ptr1, w1 * top_grad_im); } opmath_t v2 = 0; if (h_low >= 0 && w_high <= width - 1) { const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr; v2 = bottom_data[ptr2]; grad_h_weight -= lw * v2; grad_w_weight += hh * v2; atomicAdd(grad_im + ptr2, w2 * top_grad_im); } opmath_t v3 = 0; if (h_high <= height - 1 && w_low >= 0) { const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr; v3 = bottom_data[ptr3]; grad_h_weight += hw * v3; grad_w_weight -= lh * v3; atomicAdd(grad_im + ptr3, w3 * top_grad_im); } opmath_t v4 = 0; if (h_high <= height - 1 && w_high <= width - 1) { const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr; v4 = bottom_data[ptr4]; grad_h_weight += lw * v4; grad_w_weight += lh * v4; atomicAdd(grad_im + ptr4, w4 * top_grad_im); } const opmath_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); atomicAdd(grad_mask, top_grad * val); atomicAdd(grad_offset, offset_scale * grad_w_weight * top_grad_im); atomicAdd(grad_offset + 1, offset_scale * grad_h_weight * top_grad_im); } template __global__ void dcnv3_im2col_gpu_kernel( const int num_kernels, const scalar_t *data_im, const scalar_t *data_offset, const scalar_t *data_mask, scalar_t *data_col, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, const int dilation_h, const int dilation_w, const int group, const int group_channels, const int height_in, const int width_in, const int height_out, const int width_out, const opmath_t offset_scale) { CUDA_KERNEL_LOOP(index, num_kernels) { int _temp = index; const int c_col = _temp % group_channels; _temp /= group_channels; const int sampling_index = _temp; const int g_col = _temp % group; _temp /= group; const int p0_w = ((dilation_w * (kernel_w - 1)) >> 1) - pad_w + (_temp % width_out) * stride_w; _temp /= width_out; const int p0_h = ((dilation_h * (kernel_h - 1)) >> 1) - pad_h + (_temp % height_out) * stride_h; _temp /= height_out; const int b_col = _temp; const int input_size = height_in * width_in; scalar_t *data_col_ptr = data_col + index; const int kernel_size = kernel_h * kernel_w; int data_weight_ptr = sampling_index * kernel_size; int data_loc_w_ptr = data_weight_ptr << 1; const int qid_stride = group * group_channels; opmath_t col = 0; const scalar_t *data_im_ptr = data_im + b_col * input_size * qid_stride; // top-left const opmath_t p0_w_ = p0_w - ((dilation_w * (kernel_w - 1)) >> 1) * offset_scale; const opmath_t p0_h_ = p0_h - ((dilation_h * (kernel_h - 1)) >> 1) * offset_scale; for (int i = 0; i < kernel_w; ++i) { for (int j = 0; j < kernel_h; ++j) { const opmath_t offset_w = data_offset[data_loc_w_ptr]; const opmath_t offset_h = data_offset[data_loc_w_ptr + 1]; const opmath_t loc_w = p0_w_ + (i * dilation_w + offset_w) * offset_scale; const opmath_t loc_h = p0_h_ + (j * dilation_h + offset_h) * offset_scale; const opmath_t weight = data_mask[data_weight_ptr]; if (loc_h > -1 && loc_w > -1 && loc_h < height_in && loc_w < width_in) { col += dcnv3_im2col_bilinear( data_im_ptr, height_in, width_in, group, group_channels, loc_h, loc_w, g_col, c_col) * weight; } data_weight_ptr += 1; data_loc_w_ptr += 2; } } *data_col_ptr = col; } } // debug template 
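// dcnv3_im2col_gpu_kernel above computes one output element per thread: the
// flat index is decoded into (batch, out_y, out_x, group, channel), the
// kernel_h x kernel_w sampling taps are displaced by the learned offsets
// (scaled by offset_scale), each tap is bilinearly sampled from data_im,
// weighted by its modulation value from data_mask, and accumulated into
// data_col. The col2im kernels below implement the matching backward pass;
// they differ only in how the per-thread offset/mask gradients are reduced
// across a block (a thread-0 loop, a tree reduction in statically or
// dynamically sized shared memory, or plain global-memory atomics).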
__global__ void dcnv3_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1( const int num_kernels, const scalar_t *grad_col, const scalar_t *data_im, const scalar_t *data_offset, const scalar_t *data_mask, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, const int dilation_h, const int dilation_w, const int group, const int group_channels, const int height_in, const int width_in, const int height_out, const int width_out, const opmath_t offset_scale, opmath_t *grad_im, opmath_t *grad_offset, opmath_t *grad_mask) { CUDA_KERNEL_LOOP(index, num_kernels) { __shared__ opmath_t cache_grad_offset[blockSize * 2]; __shared__ opmath_t cache_grad_mask[blockSize]; unsigned int tid = threadIdx.x; int _temp = index; const int c_col = _temp % group_channels; _temp /= group_channels; const int sampling_index = _temp; const int g_col = _temp % group; _temp /= group; const int p0_w = ((dilation_w * (kernel_w - 1)) >> 1) - pad_w + (_temp % width_out) * stride_w; _temp /= width_out; const int p0_h = ((dilation_h * (kernel_h - 1)) >> 1) - pad_h + (_temp % height_out) * stride_h; _temp /= height_out; const int b_col = _temp; const opmath_t top_grad = grad_col[index]; const int input_size = height_in * width_in; const int kernel_size = kernel_h * kernel_w; int data_weight_ptr = sampling_index * kernel_size; int data_loc_w_ptr = data_weight_ptr << 1; const int grad_sampling_ptr = data_weight_ptr; grad_offset += grad_sampling_ptr << 1; grad_mask += grad_sampling_ptr; const int qid_stride = group * group_channels; const int im_ptr_offset = b_col * input_size * qid_stride; const scalar_t *data_im_ptr = data_im + im_ptr_offset; opmath_t *grad_im_ptr = grad_im + im_ptr_offset; const opmath_t p0_w_ = p0_w - ((dilation_w * (kernel_w - 1)) >> 1) * offset_scale; const opmath_t p0_h_ = p0_h - ((dilation_h * (kernel_h - 1)) >> 1) * offset_scale; for (int i = 0; i < kernel_w; ++i) { for (int j = 0; j < kernel_h; ++j) { const opmath_t offset_w = data_offset[data_loc_w_ptr]; const opmath_t offset_h = data_offset[data_loc_w_ptr + 1]; const opmath_t loc_w = p0_w_ + (i * dilation_w + offset_w) * offset_scale; const opmath_t loc_h = p0_h_ + (j * dilation_h + offset_h) * offset_scale; const opmath_t weight = data_mask[data_weight_ptr]; *(cache_grad_offset + (threadIdx.x << 1)) = 0; *(cache_grad_offset + ((threadIdx.x << 1) + 1)) = 0; *(cache_grad_mask + threadIdx.x) = 0; if (loc_h > -1 && loc_w > -1 && loc_h < height_in && loc_w < width_in) { dcnv3_col2im_bilinear( data_im_ptr, height_in, width_in, group, group_channels, loc_h, loc_w, g_col, c_col, offset_scale, top_grad, weight, grad_im_ptr, cache_grad_offset + (threadIdx.x << 1), cache_grad_mask + threadIdx.x); } __syncthreads(); if (tid == 0) { opmath_t _grad_w = cache_grad_offset[0], _grad_h = cache_grad_offset[1], _grad_a = cache_grad_mask[0]; int sid = 2; for (unsigned int tid = 1; tid < blockSize; ++tid) { _grad_w += cache_grad_offset[sid]; _grad_h += cache_grad_offset[sid + 1]; _grad_a += cache_grad_mask[tid]; sid += 2; } *grad_offset = _grad_w; *(grad_offset + 1) = _grad_h; *grad_mask = _grad_a; } __syncthreads(); data_weight_ptr += 1; data_loc_w_ptr += 2; grad_mask += 1; grad_offset += 2; } } } } template __global__ void dcnv3_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2( const int num_kernels, const scalar_t *grad_col, const scalar_t *data_im, const scalar_t *data_offset, const scalar_t *data_mask, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int 
pad_w, const int dilation_h, const int dilation_w, const int group, const int group_channels, const int height_in, const int width_in, const int height_out, const int width_out, const opmath_t offset_scale, opmath_t *grad_im, opmath_t *grad_offset, opmath_t *grad_mask) { CUDA_KERNEL_LOOP(index, num_kernels) { __shared__ opmath_t cache_grad_offset[blockSize * 2]; __shared__ opmath_t cache_grad_mask[blockSize]; unsigned int tid = threadIdx.x; int _temp = index; const int c_col = _temp % group_channels; _temp /= group_channels; const int sampling_index = _temp; const int g_col = _temp % group; _temp /= group; const int p0_w = ((dilation_w * (kernel_w - 1)) >> 1) - pad_w + (_temp % width_out) * stride_w; _temp /= width_out; const int p0_h = ((dilation_h * (kernel_h - 1)) >> 1) - pad_h + (_temp % height_out) * stride_h; _temp /= height_out; const int b_col = _temp; const opmath_t top_grad = grad_col[index]; const int input_size = height_in * width_in; const int kernel_size = kernel_h * kernel_w; int data_weight_ptr = sampling_index * kernel_size; int data_loc_w_ptr = data_weight_ptr << 1; const int grad_sampling_ptr = data_weight_ptr; grad_offset += grad_sampling_ptr << 1; grad_mask += grad_sampling_ptr; const int qid_stride = group * group_channels; const int im_ptr_offset = b_col * input_size * qid_stride; const scalar_t *data_im_ptr = data_im + im_ptr_offset; opmath_t *grad_im_ptr = grad_im + im_ptr_offset; const opmath_t p0_w_ = p0_w - ((dilation_w * (kernel_w - 1)) >> 1) * offset_scale; const opmath_t p0_h_ = p0_h - ((dilation_h * (kernel_h - 1)) >> 1) * offset_scale; for (int i = 0; i < kernel_w; ++i) { for (int j = 0; j < kernel_h; ++j) { const opmath_t offset_w = data_offset[data_loc_w_ptr]; const opmath_t offset_h = data_offset[data_loc_w_ptr + 1]; const opmath_t loc_w = p0_w_ + (i * dilation_w + offset_w) * offset_scale; const opmath_t loc_h = p0_h_ + (j * dilation_h + offset_h) * offset_scale; const opmath_t weight = data_mask[data_weight_ptr]; *(cache_grad_offset + (threadIdx.x << 1)) = 0; *(cache_grad_offset + ((threadIdx.x << 1) + 1)) = 0; *(cache_grad_mask + threadIdx.x) = 0; if (loc_h > -1 && loc_w > -1 && loc_h < height_in && loc_w < width_in) { dcnv3_col2im_bilinear( data_im_ptr, height_in, width_in, group, group_channels, loc_h, loc_w, g_col, c_col, offset_scale, top_grad, weight, grad_im_ptr, cache_grad_offset + (threadIdx.x << 1), cache_grad_mask + threadIdx.x); } __syncthreads(); for (unsigned int s = blockSize / 2; s > 0; s >>= 1) { if (tid < s) { const unsigned int xid1 = tid << 1; const unsigned int xid2 = (tid + s) << 1; cache_grad_mask[tid] += cache_grad_mask[tid + s]; cache_grad_offset[xid1] += cache_grad_offset[xid2]; cache_grad_offset[xid1 + 1] += cache_grad_offset[xid2 + 1]; } __syncthreads(); } if (tid == 0) { *grad_offset = cache_grad_offset[0]; *(grad_offset + 1) = cache_grad_offset[1]; *grad_mask = cache_grad_mask[0]; } __syncthreads(); data_weight_ptr += 1; data_loc_w_ptr += 2; grad_mask += 1; grad_offset += 2; } } } } template __global__ void dcnv3_col2im_gpu_kernel_shm_reduce_v1( const int num_kernels, const scalar_t *grad_col, const scalar_t *data_im, const scalar_t *data_offset, const scalar_t *data_mask, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, const int dilation_h, const int dilation_w, const int group, const int group_channels, const int height_in, const int width_in, const int height_out, const int width_out, const opmath_t offset_scale, opmath_t *grad_im, opmath_t *grad_offset, 
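// The blocksize_aware_reduce_v2 kernel above replaces the serial thread-0
// accumulation of v1 with a shared-memory tree reduction: the number of
// active threads s is halved each step and thread tid adds the partial sums
// of thread tid + s, so the per-channel contributions to one sampling tap's
// offset/mask gradient are combined in log2(blockSize) steps instead of a
// blockSize-iteration loop.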
opmath_t *grad_mask) { CUDA_KERNEL_LOOP(index, num_kernels) { extern __shared__ int _s[]; opmath_t *cache_grad_offset = (opmath_t *)_s; opmath_t *cache_grad_mask = cache_grad_offset + 2 * blockDim.x; unsigned int tid = threadIdx.x; int _temp = index; const int c_col = _temp % group_channels; _temp /= group_channels; const int sampling_index = _temp; const int g_col = _temp % group; _temp /= group; const int p0_w = ((dilation_w * (kernel_w - 1)) >> 1) - pad_w + (_temp % width_out) * stride_w; _temp /= width_out; const int p0_h = ((dilation_h * (kernel_h - 1)) >> 1) - pad_h + (_temp % height_out) * stride_h; _temp /= height_out; const int b_col = _temp; const opmath_t top_grad = grad_col[index]; const int input_size = height_in * width_in; const int kernel_size = kernel_h * kernel_w; int data_weight_ptr = sampling_index * kernel_size; int data_loc_w_ptr = data_weight_ptr << 1; const int grad_sampling_ptr = data_weight_ptr; grad_offset += grad_sampling_ptr << 1; grad_mask += grad_sampling_ptr; const int qid_stride = group * group_channels; const int im_ptr_offset = b_col * input_size * qid_stride; const scalar_t *data_im_ptr = data_im + im_ptr_offset; opmath_t *grad_im_ptr = grad_im + im_ptr_offset; const opmath_t p0_w_ = p0_w - ((dilation_w * (kernel_w - 1)) >> 1) * offset_scale; const opmath_t p0_h_ = p0_h - ((dilation_h * (kernel_h - 1)) >> 1) * offset_scale; for (int i = 0; i < kernel_w; ++i) { for (int j = 0; j < kernel_h; ++j) { const opmath_t offset_w = data_offset[data_loc_w_ptr]; const opmath_t offset_h = data_offset[data_loc_w_ptr + 1]; const opmath_t loc_w = p0_w_ + (i * dilation_w + offset_w) * offset_scale; const opmath_t loc_h = p0_h_ + (j * dilation_h + offset_h) * offset_scale; const opmath_t weight = data_mask[data_weight_ptr]; *(cache_grad_offset + (threadIdx.x << 1)) = 0; *(cache_grad_offset + ((threadIdx.x << 1) + 1)) = 0; *(cache_grad_mask + threadIdx.x) = 0; if (loc_h > -1 && loc_w > -1 && loc_h < height_in && loc_w < width_in) { dcnv3_col2im_bilinear( data_im_ptr, height_in, width_in, group, group_channels, loc_h, loc_w, g_col, c_col, offset_scale, top_grad, weight, grad_im_ptr, cache_grad_offset + (threadIdx.x << 1), cache_grad_mask + threadIdx.x); } __syncthreads(); if (tid == 0) { opmath_t _grad_w = cache_grad_offset[0], _grad_h = cache_grad_offset[1], _grad_a = cache_grad_mask[0]; int sid = 2; for (unsigned int tid = 1; tid < blockDim.x; ++tid) { _grad_w += cache_grad_offset[sid]; _grad_h += cache_grad_offset[sid + 1]; _grad_a += cache_grad_mask[tid]; sid += 2; } *grad_offset = _grad_w; *(grad_offset + 1) = _grad_h; *grad_mask = _grad_a; } __syncthreads(); data_weight_ptr += 1; data_loc_w_ptr += 2; grad_mask += 1; grad_offset += 2; } } } } template __global__ void dcnv3_col2im_gpu_kernel_shm_reduce_v2( const int num_kernels, const scalar_t *grad_col, const scalar_t *data_im, const scalar_t *data_offset, const scalar_t *data_mask, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, const int dilation_h, const int dilation_w, const int group, const int group_channels, const int height_in, const int width_in, const int height_out, const int width_out, const opmath_t offset_scale, opmath_t *grad_im, opmath_t *grad_offset, opmath_t *grad_mask) { CUDA_KERNEL_LOOP(index, num_kernels) { extern __shared__ int _s[]; opmath_t *cache_grad_offset = (opmath_t *)_s; opmath_t *cache_grad_mask = cache_grad_offset + 2 * blockDim.x; unsigned int tid = threadIdx.x; int _temp = index; const int c_col = _temp % group_channels; 
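// Flat-index decomposition (the same pattern appears in every kernel in this
// file): starting from _temp = index, successive modulo/divide steps peel off
//   c_col - channel within the group,
//   g_col - group id,
//   the output x position (folded with stride/pad/dilation into p0_w),
//   the output y position (folded into p0_h),
//   b_col - the remaining quotient, i.e. the batch element.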
_temp /= group_channels; const int sampling_index = _temp; const int g_col = _temp % group; _temp /= group; const int p0_w = ((dilation_w * (kernel_w - 1)) >> 1) - pad_w + (_temp % width_out) * stride_w; _temp /= width_out; const int p0_h = ((dilation_h * (kernel_h - 1)) >> 1) - pad_h + (_temp % height_out) * stride_h; _temp /= height_out; const int b_col = _temp; const opmath_t top_grad = grad_col[index]; const int input_size = height_in * width_in; const int kernel_size = kernel_h * kernel_w; int data_weight_ptr = sampling_index * kernel_size; int data_loc_w_ptr = data_weight_ptr << 1; const int grad_sampling_ptr = data_weight_ptr; grad_offset += grad_sampling_ptr << 1; grad_mask += grad_sampling_ptr; const int qid_stride = group * group_channels; const int im_ptr_offset = b_col * input_size * qid_stride; const scalar_t *data_im_ptr = data_im + im_ptr_offset; opmath_t *grad_im_ptr = grad_im + im_ptr_offset; const opmath_t p0_w_ = p0_w - ((dilation_w * (kernel_w - 1)) >> 1) * offset_scale; const opmath_t p0_h_ = p0_h - ((dilation_h * (kernel_h - 1)) >> 1) * offset_scale; for (int i = 0; i < kernel_w; ++i) { for (int j = 0; j < kernel_h; ++j) { const opmath_t offset_w = data_offset[data_loc_w_ptr]; const opmath_t offset_h = data_offset[data_loc_w_ptr + 1]; const opmath_t loc_w = p0_w_ + (i * dilation_w + offset_w) * offset_scale; const opmath_t loc_h = p0_h_ + (j * dilation_h + offset_h) * offset_scale; const opmath_t weight = data_mask[data_weight_ptr]; *(cache_grad_offset + (threadIdx.x << 1)) = 0; *(cache_grad_offset + ((threadIdx.x << 1) + 1)) = 0; *(cache_grad_mask + threadIdx.x) = 0; if (loc_h > -1 && loc_w > -1 && loc_h < height_in && loc_w < width_in) { dcnv3_col2im_bilinear( data_im_ptr, height_in, width_in, group, group_channels, loc_h, loc_w, g_col, c_col, offset_scale, top_grad, weight, grad_im_ptr, cache_grad_offset + (threadIdx.x << 1), cache_grad_mask + threadIdx.x); } __syncthreads(); for (unsigned int s = blockDim.x / 2, spre = blockDim.x; s > 0; s >>= 1, spre >>= 1) { if (tid < s) { const unsigned int xid1 = tid << 1; const unsigned int xid2 = (tid + s) << 1; cache_grad_mask[tid] += cache_grad_mask[tid + s]; cache_grad_offset[xid1] += cache_grad_offset[xid2]; cache_grad_offset[xid1 + 1] += cache_grad_offset[xid2 + 1]; if (tid + (s << 1) < spre) { cache_grad_mask[tid] += cache_grad_mask[tid + (s << 1)]; cache_grad_offset[xid1] += cache_grad_offset[xid2 + (s << 1)]; cache_grad_offset[xid1 + 1] += cache_grad_offset[xid2 + 1 + (s << 1)]; } } __syncthreads(); } if (tid == 0) { *grad_offset = cache_grad_offset[0]; *(grad_offset + 1) = cache_grad_offset[1]; *grad_mask = cache_grad_mask[0]; } __syncthreads(); data_weight_ptr += 1; data_loc_w_ptr += 2; grad_mask += 1; grad_offset += 2; } } } } template __global__ void dcnv3_col2im_gpu_kernel_shm_reduce_v2_multi_blocks( const int num_kernels, const scalar_t *grad_col, const scalar_t *data_im, const scalar_t *data_offset, const scalar_t *data_mask, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, const int dilation_h, const int dilation_w, const int group, const int group_channels, const int height_in, const int width_in, const int height_out, const int width_out, const opmath_t offset_scale, opmath_t *grad_im, opmath_t *grad_offset, opmath_t *grad_mask) { CUDA_KERNEL_LOOP(index, num_kernels) { extern __shared__ int _s[]; opmath_t *cache_grad_offset = (opmath_t *)_s; opmath_t *cache_grad_mask = cache_grad_offset + 2 * blockDim.x; unsigned int tid = threadIdx.x; int 
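// Unlike the fixed-blockSize variants, blockDim.x here need not be a power
// of two. The extra term guarded by (tid + (s << 1) < spre) folds in the
// leftover element at index spre - 1 whenever the number of live partial
// sums (spre) is odd, so no contribution is dropped during the tree
// reduction.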
_temp = index; const int c_col = _temp % group_channels; _temp /= group_channels; const int sampling_index = _temp; const int g_col = _temp % group; _temp /= group; const int p0_w = ((dilation_w * (kernel_w - 1)) >> 1) - pad_w + (_temp % width_out) * stride_w; _temp /= width_out; const int p0_h = ((dilation_h * (kernel_h - 1)) >> 1) - pad_h + (_temp % height_out) * stride_h; _temp /= height_out; const int b_col = _temp; const opmath_t top_grad = grad_col[index]; const int input_size = height_in * width_in; const int kernel_size = kernel_h * kernel_w; int data_weight_ptr = sampling_index * kernel_size; int data_loc_w_ptr = data_weight_ptr << 1; const int grad_sampling_ptr = data_weight_ptr; grad_offset += grad_sampling_ptr << 1; grad_mask += grad_sampling_ptr; const int qid_stride = group * group_channels; const int im_ptr_offset = b_col * input_size * qid_stride; const scalar_t *data_im_ptr = data_im + im_ptr_offset; opmath_t *grad_im_ptr = grad_im + im_ptr_offset; const opmath_t p0_w_ = p0_w - ((dilation_w * (kernel_w - 1)) >> 1) * offset_scale; const opmath_t p0_h_ = p0_h - ((dilation_h * (kernel_h - 1)) >> 1) * offset_scale; for (int i = 0; i < kernel_w; ++i) { for (int j = 0; j < kernel_h; ++j) { const opmath_t offset_w = data_offset[data_loc_w_ptr]; const opmath_t offset_h = data_offset[data_loc_w_ptr + 1]; const opmath_t loc_w = p0_w_ + (i * dilation_w + offset_w) * offset_scale; const opmath_t loc_h = p0_h_ + (j * dilation_h + offset_h) * offset_scale; const opmath_t weight = data_mask[data_weight_ptr]; *(cache_grad_offset + (threadIdx.x << 1)) = 0; *(cache_grad_offset + ((threadIdx.x << 1) + 1)) = 0; *(cache_grad_mask + threadIdx.x) = 0; if (loc_h > -1 && loc_w > -1 && loc_h < height_in && loc_w < width_in) { dcnv3_col2im_bilinear( data_im_ptr, height_in, width_in, group, group_channels, loc_h, loc_w, g_col, c_col, offset_scale, top_grad, weight, grad_im_ptr, cache_grad_offset + (threadIdx.x << 1), cache_grad_mask + threadIdx.x); } __syncthreads(); for (unsigned int s = blockDim.x / 2, spre = blockDim.x; s > 0; s >>= 1, spre >>= 1) { if (tid < s) { const unsigned int xid1 = tid << 1; const unsigned int xid2 = (tid + s) << 1; cache_grad_mask[tid] += cache_grad_mask[tid + s]; cache_grad_offset[xid1] += cache_grad_offset[xid2]; cache_grad_offset[xid1 + 1] += cache_grad_offset[xid2 + 1]; if (tid + (s << 1) < spre) { cache_grad_mask[tid] += cache_grad_mask[tid + (s << 1)]; cache_grad_offset[xid1] += cache_grad_offset[xid2 + (s << 1)]; cache_grad_offset[xid1 + 1] += cache_grad_offset[xid2 + 1 + (s << 1)]; } } __syncthreads(); } if (tid == 0) { atomicAdd(grad_offset, cache_grad_offset[0]); atomicAdd(grad_offset + 1, cache_grad_offset[1]); atomicAdd(grad_mask, cache_grad_mask[0]); } __syncthreads(); data_weight_ptr += 1; data_loc_w_ptr += 2; grad_mask += 1; grad_offset += 2; } } } } template __global__ void dcnv3_col2im_gpu_kernel_gm( const int num_kernels, const scalar_t *grad_col, const scalar_t *data_im, const scalar_t *data_offset, const scalar_t *data_mask, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, const int dilation_h, const int dilation_w, const int group, const int group_channels, const int height_in, const int width_in, const int height_out, const int width_out, const opmath_t offset_scale, opmath_t *grad_im, opmath_t *grad_offset, opmath_t *grad_mask) { CUDA_KERNEL_LOOP(index, num_kernels) { int _temp = index; const int c_col = _temp % group_channels; _temp /= group_channels; const int sampling_index = _temp; 
const int g_col = _temp % group; _temp /= group; const int p0_w = ((dilation_w * (kernel_w - 1)) >> 1) - pad_w + (_temp % width_out) * stride_w; _temp /= width_out; const int p0_h = ((dilation_h * (kernel_h - 1)) >> 1) - pad_h + (_temp % height_out) * stride_h; _temp /= height_out; const int b_col = _temp; const opmath_t top_grad = grad_col[index]; const int input_size = height_in * width_in; const int kernel_size = kernel_h * kernel_w; int data_weight_ptr = sampling_index * kernel_size; int data_loc_w_ptr = data_weight_ptr << 1; const int grad_sampling_ptr = data_weight_ptr; grad_offset += grad_sampling_ptr << 1; grad_mask += grad_sampling_ptr; const int qid_stride = group * group_channels; const int im_ptr_offset = b_col * input_size * qid_stride; const scalar_t *data_im_ptr = data_im + im_ptr_offset; opmath_t *grad_im_ptr = grad_im + im_ptr_offset; const opmath_t p0_w_ = p0_w - ((dilation_w * (kernel_w - 1)) >> 1) * offset_scale; const opmath_t p0_h_ = p0_h - ((dilation_h * (kernel_h - 1)) >> 1) * offset_scale; for (int i = 0; i < kernel_w; ++i) { for (int j = 0; j < kernel_h; ++j) { const opmath_t offset_w = data_offset[data_loc_w_ptr]; const opmath_t offset_h = data_offset[data_loc_w_ptr + 1]; const opmath_t loc_w = p0_w_ + (i * dilation_w + offset_w) * offset_scale; const opmath_t loc_h = p0_h_ + (j * dilation_h + offset_h) * offset_scale; const opmath_t weight = data_mask[data_weight_ptr]; if (loc_h > -1 && loc_w > -1 && loc_h < height_in && loc_w < width_in) { dcnv3_col2im_bilinear_gm( data_im_ptr, height_in, width_in, group, group_channels, loc_h, loc_w, g_col, c_col, offset_scale, top_grad, weight, grad_im_ptr, grad_offset, grad_mask); } data_weight_ptr += 1; data_loc_w_ptr += 2; grad_mask += 1; grad_offset += 2; } } } } template void dcnv3_im2col_cuda(cudaStream_t stream, const scalar_t *data_im, const scalar_t *data_offset, const scalar_t *data_mask, scalar_t *data_col, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, const int dilation_h, const int dilation_w, const int group, const int group_channels, const int batch_n, const int height_in, const int width_in, const int height_out, const int width_out, const opmath_t offset_scale) { const int num_kernels = batch_n * height_out * width_out * group * group_channels; const int num_actual_kernels = batch_n * height_out * width_out * group * group_channels; const int num_threads = CUDA_NUM_THREADS; dcnv3_im2col_gpu_kernel <<>>(num_kernels, data_im, data_offset, data_mask, data_col, kernel_h, kernel_w, stride_h, stride_w, pad_h, pad_w, dilation_h, dilation_w, group, group_channels, height_in, width_in, height_out, width_out, offset_scale); cudaError_t err = cudaGetLastError(); if (err != cudaSuccess) { printf("error in dcnv3_im2col_cuda: %s\n", cudaGetErrorString(err)); } } template void dcnv3_col2im_cuda( cudaStream_t stream, const scalar_t *grad_col, const scalar_t *data_im, const scalar_t *data_offset, const scalar_t *data_mask, const int kernel_h, const int kernel_w, const int stride_h, const int stride_w, const int pad_h, const int pad_w, const int dilation_h, const int dilation_w, const int group, const int group_channels, const int batch_n, const int height_in, const int width_in, const int height_out, const int width_out, const opmath_t offset_scale, opmath_t *grad_im, opmath_t *grad_offset, opmath_t *grad_mask) { const int num_threads = (group_channels > CUDA_NUM_THREADS) ? 
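// Backward launch configuration: one thread per output element
// (batch_n * height_out * width_out * group * group_channels in total). The
// block size is min(group_channels, CUDA_NUM_THREADS), with CUDA_NUM_THREADS
// defined as 256 above, so for the shared-memory variants all threads of a
// block handle the same (batch, out_y, out_x, group) sampling location and
// can reduce their offset/mask gradients inside the block.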
CUDA_NUM_THREADS : group_channels; const int num_kernels = batch_n * height_out * width_out * group * group_channels; const int num_actual_kernels = batch_n * height_out * width_out * group * group_channels; if (group_channels > 1024) { if ((group_channels & 1023) == 0) { dcnv3_col2im_gpu_kernel_shm_reduce_v2_multi_blocks <<>>( num_kernels, grad_col, data_im, data_offset, data_mask, kernel_h, kernel_w, stride_h, stride_w, pad_h, pad_w, dilation_h, dilation_w, group, group_channels, height_in, width_in, height_out, width_out, offset_scale, grad_im, grad_offset, grad_mask); } else { dcnv3_col2im_gpu_kernel_gm <<>>(num_kernels, grad_col, data_im, data_offset, data_mask, kernel_h, kernel_w, stride_h, stride_w, pad_h, pad_w, dilation_h, dilation_w, group, group_channels, height_in, width_in, height_out, width_out, offset_scale, grad_im, grad_offset, grad_mask); } } else { switch (group_channels) { case 1: dcnv3_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1 <<>>(num_kernels, grad_col, data_im, data_offset, data_mask, kernel_h, kernel_w, stride_h, stride_w, pad_h, pad_w, dilation_h, dilation_w, group, group_channels, height_in, width_in, height_out, width_out, offset_scale, grad_im, grad_offset, grad_mask); break; case 2: dcnv3_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1 <<>>(num_kernels, grad_col, data_im, data_offset, data_mask, kernel_h, kernel_w, stride_h, stride_w, pad_h, pad_w, dilation_h, dilation_w, group, group_channels, height_in, width_in, height_out, width_out, offset_scale, grad_im, grad_offset, grad_mask); break; case 4: dcnv3_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1 <<>>(num_kernels, grad_col, data_im, data_offset, data_mask, kernel_h, kernel_w, stride_h, stride_w, pad_h, pad_w, dilation_h, dilation_w, group, group_channels, height_in, width_in, height_out, width_out, offset_scale, grad_im, grad_offset, grad_mask); break; case 8: dcnv3_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1 <<>>(num_kernels, grad_col, data_im, data_offset, data_mask, kernel_h, kernel_w, stride_h, stride_w, pad_h, pad_w, dilation_h, dilation_w, group, group_channels, height_in, width_in, height_out, width_out, offset_scale, grad_im, grad_offset, grad_mask); break; case 16: dcnv3_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1 <<>>(num_kernels, grad_col, data_im, data_offset, data_mask, kernel_h, kernel_w, stride_h, stride_w, pad_h, pad_w, dilation_h, dilation_w, group, group_channels, height_in, width_in, height_out, width_out, offset_scale, grad_im, grad_offset, grad_mask); break; case 32: dcnv3_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1 <<>>(num_kernels, grad_col, data_im, data_offset, data_mask, kernel_h, kernel_w, stride_h, stride_w, pad_h, pad_w, dilation_h, dilation_w, group, group_channels, height_in, width_in, height_out, width_out, offset_scale, grad_im, grad_offset, grad_mask); break; case 64: dcnv3_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2 <<>>(num_kernels, grad_col, data_im, data_offset, data_mask, kernel_h, kernel_w, stride_h, stride_w, pad_h, pad_w, dilation_h, dilation_w, group, group_channels, height_in, width_in, height_out, width_out, offset_scale, grad_im, grad_offset, grad_mask); break; case 128: dcnv3_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2 <<>>(num_kernels, grad_col, data_im, data_offset, data_mask, kernel_h, kernel_w, stride_h, stride_w, pad_h, pad_w, dilation_h, dilation_w, group, group_channels, height_in, width_in, height_out, width_out, offset_scale, grad_im, grad_offset, grad_mask); break; case 256: 
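// Dispatch heuristic for this switch: power-of-two group_channels up to 1024
// use kernels templated on a compile-time blockSize (statically sized
// __shared__ arrays) - a thread-0 loop for 1-32 channels and a tree
// reduction for 64-1024. Other sizes fall through to the dynamically sized
// shared-memory kernels (v1 below 64 channels, v2 otherwise), while
// group_channels > 1024 uses the multi-block variant when divisible by 1024
// and global-memory atomics otherwise.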
dcnv3_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2 <<>>(num_kernels, grad_col, data_im, data_offset, data_mask, kernel_h, kernel_w, stride_h, stride_w, pad_h, pad_w, dilation_h, dilation_w, group, group_channels, height_in, width_in, height_out, width_out, offset_scale, grad_im, grad_offset, grad_mask); break; case 512: dcnv3_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2 <<>>(num_kernels, grad_col, data_im, data_offset, data_mask, kernel_h, kernel_w, stride_h, stride_w, pad_h, pad_w, dilation_h, dilation_w, group, group_channels, height_in, width_in, height_out, width_out, offset_scale, grad_im, grad_offset, grad_mask); break; case 1024: dcnv3_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2 <<>>(num_kernels, grad_col, data_im, data_offset, data_mask, kernel_h, kernel_w, stride_h, stride_w, pad_h, pad_w, dilation_h, dilation_w, group, group_channels, height_in, width_in, height_out, width_out, offset_scale, grad_im, grad_offset, grad_mask); break; default: if (group_channels < 64) { dcnv3_col2im_gpu_kernel_shm_reduce_v1 <<>>( num_kernels, grad_col, data_im, data_offset, data_mask, kernel_h, kernel_w, stride_h, stride_w, pad_h, pad_w, dilation_h, dilation_w, group, group_channels, height_in, width_in, height_out, width_out, offset_scale, grad_im, grad_offset, grad_mask); } else { dcnv3_col2im_gpu_kernel_shm_reduce_v2 <<>>( num_kernels, grad_col, data_im, data_offset, data_mask, kernel_h, kernel_w, stride_h, stride_w, pad_h, pad_w, dilation_h, dilation_w, group, group_channels, height_in, width_in, height_out, width_out, offset_scale, grad_im, grad_offset, grad_mask); } } } cudaError_t err = cudaGetLastError(); if (err != cudaSuccess) { printf("error in dcnv3_col2im_cuda: %s\n", cudaGetErrorString(err)); } } ================================================ FILE: mmdet3d/ops/ops_dcnv3/src/dcnv3.h ================================================ /*! 
**************************************************************************************************
* InternImage
* Copyright (c) 2022 OpenGVLab
* Licensed under The MIT License [see LICENSE for details]
**************************************************************************************************
* Modified from
*https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
**************************************************************************************************
*/

#pragma once

#include "cpu/dcnv3_cpu.h"

#ifdef WITH_CUDA
#include "cuda/dcnv3_cuda.h"
#endif

at::Tensor dcnv3_forward(const at::Tensor &input, const at::Tensor &offset,
                         const at::Tensor &mask, const int kernel_h,
                         const int kernel_w, const int stride_h,
                         const int stride_w, const int pad_h, const int pad_w,
                         const int dilation_h, const int dilation_w,
                         const int group, const int group_channels,
                         const float offset_scale, const int im2col_step) {
    if (input.type().is_cuda()) {
#ifdef WITH_CUDA
        return dcnv3_cuda_forward(input, offset, mask, kernel_h, kernel_w,
                                  stride_h, stride_w, pad_h, pad_w, dilation_h,
                                  dilation_w, group, group_channels,
                                  offset_scale, im2col_step);
#else
        AT_ERROR("Not compiled with GPU support");
#endif
    }
    AT_ERROR("Not implemented on the CPU");
}

std::vector<at::Tensor>
dcnv3_backward(const at::Tensor &input, const at::Tensor &offset,
               const at::Tensor &mask, const int kernel_h, const int kernel_w,
               const int stride_h, const int stride_w, const int pad_h,
               const int pad_w, const int dilation_h, const int dilation_w,
               const int group, const int group_channels,
               const float offset_scale, const at::Tensor &grad_output,
               const int im2col_step) {
    if (input.type().is_cuda()) {
#ifdef WITH_CUDA
        return dcnv3_cuda_backward(input, offset, mask, kernel_h, kernel_w,
                                   stride_h, stride_w, pad_h, pad_w,
                                   dilation_h, dilation_w, group,
                                   group_channels, offset_scale, grad_output,
                                   im2col_step);
#else
        AT_ERROR("Not compiled with GPU support");
#endif
    }
    AT_ERROR("Not implemented on the CPU");
}

================================================
FILE: mmdet3d/ops/ops_dcnv3/src/vision.cpp
================================================
/*!
************************************************************************************************** * InternImage * Copyright (c) 2022 OpenGVLab * Licensed under The MIT License [see LICENSE for details] ************************************************************************************************** * Modified from *https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 ************************************************************************************************** */ #include "dcnv3.h" PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { m.def("dcnv3_forward", &dcnv3_forward, "dcnv3_forward"); m.def("dcnv3_backward", &dcnv3_backward, "dcnv3_backward"); } ================================================ FILE: mmdet3d/ops/ops_dcnv3/test.py ================================================ # -------------------------------------------------------- # InternImage # Copyright (c) 2022 OpenGVLab # Licensed under The MIT License [see LICENSE for details] # -------------------------------------------------------- from __future__ import absolute_import from __future__ import print_function from __future__ import division import time import torch import torch.nn as nn import math from torch.autograd import gradcheck from functions.dcnv3_func import DCNv3Function, dcnv3_core_pytorch H_in, W_in = 8, 8 N, M, D = 2, 4, 16 Kh, Kw = 3, 3 P = Kh * Kw offset_scale = 2.0 pad = 1 dilation = 1 stride = 1 H_out = (H_in + 2 * pad - (dilation * (Kh - 1) + 1)) // stride + 1 W_out = (W_in + 2 * pad - (dilation * (Kw - 1) + 1)) // stride + 1 torch.manual_seed(3) @torch.no_grad() def check_forward_equal_with_pytorch_double(): input = torch.rand(N, H_in, W_in, M*D).cuda() * 0.01 offset = torch.rand(N, H_out, W_out, M*P*2).cuda() * 10 mask = torch.rand(N, H_out, W_out, M, P).cuda() + 1e-5 mask /= mask.sum(-1, keepdim=True) mask = mask.reshape(N, H_out, W_out, M*P) output_pytorch = dcnv3_core_pytorch( input.double(), offset.double(), mask.double(), Kh, Kw, stride, stride, Kh // 2, Kw // 2, dilation, dilation, M, D, offset_scale).detach().cpu() im2col_step = 2 output_cuda = DCNv3Function.apply( input.double(), offset.double(), mask.double(), Kh, Kw, stride, stride, Kh // 2, Kw // 2, dilation, dilation, M, D, offset_scale, im2col_step).detach().cpu() fwdok = torch.allclose(output_cuda, output_pytorch) max_abs_err = (output_cuda - output_pytorch).abs().max() max_rel_err = ((output_cuda - output_pytorch).abs() / output_pytorch.abs()).max() print('>>> forward double') print(f'* {fwdok} check_forward_equal_with_pytorch_double: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}') @torch.no_grad() def check_forward_equal_with_pytorch_float(): input = torch.rand(N, H_in, W_in, M*D).cuda() * 0.01 offset = torch.rand(N, H_out, W_out, M*P*2).cuda() * 10 mask = torch.rand(N, H_out, W_out, M, P).cuda() + 1e-5 mask /= mask.sum(-1, keepdim=True) mask = mask.reshape(N, H_out, W_out, M*P) output_pytorch = dcnv3_core_pytorch( input, offset, mask, Kh, Kw, stride, stride, Kh // 2, Kw // 2, dilation, dilation, M, D, offset_scale).detach().cpu() im2col_step = 2 output_cuda = DCNv3Function.apply( input, offset, mask, Kh, Kw, stride, stride, Kh // 2, Kw // 2, dilation, dilation, M, D, offset_scale, im2col_step).detach().cpu() fwdok = torch.allclose(output_cuda, output_pytorch, rtol=1e-2, atol=1e-3) max_abs_err = (output_cuda - output_pytorch).abs().max() max_rel_err = ((output_cuda - output_pytorch).abs() / output_pytorch.abs()).max() print('>>> forward float') print(f'* {fwdok} 
check_forward_equal_with_pytorch_float: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}') def check_backward_equal_with_pytorch_double(channels=4, grad_input=True, grad_offset=True, grad_mask=True): # H_in, W_in = 4, 4 N = 2 M = 2 H_out = (H_in + 2 * pad - (dilation * (Kh - 1) + 1)) // stride + 1 W_out = (W_in + 2 * pad - (dilation * (Kw - 1) + 1)) // stride + 1 D = channels input0 = torch.rand(N, H_in, W_in, M*D).cuda() * 0.01 offset0 = torch.rand(N, H_out, W_out, M*P*2).cuda() * 10 mask0 = torch.rand(N, H_out, W_out, M, P).cuda() + 1e-5 mask0 /= mask0.sum(-1, keepdim=True) mask0 = mask0.reshape(N, H_out, W_out, M*P) input0.requires_grad = grad_input offset0.requires_grad = grad_offset mask0.requires_grad = grad_mask output_pytorch = dcnv3_core_pytorch( input0.double(), offset0.double(), mask0.double(), Kh, Kw, stride, stride, Kh // 2, Kw // 2, dilation, dilation, M, D, offset_scale) output_pytorch.sum().backward() input1 = input0.detach() offset1 = offset0.detach() mask1 = mask0.detach() input1.requires_grad = grad_input offset1.requires_grad = grad_offset mask1.requires_grad = grad_mask im2col_step = 2 output_cuda = DCNv3Function.apply( input1.double(), offset1.double(), mask1.double(), Kh, Kw, stride, stride, Kh // 2, Kw // 2, dilation, dilation, M, D, offset_scale, im2col_step) output_cuda.sum().backward() print(f'>>> backward double: channels {D}') bwdok = torch.allclose(input0.grad, input1.grad, rtol=1e-2, atol=1e-3) max_abs_err = (input0.grad - input1.grad).abs().max() max_rel_err = ((input0.grad - input1.grad).abs() / input0.grad.abs()).max() print( f'* {bwdok} input_grad check_backward_equal_with_pytorch_double: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}') bwdok = torch.allclose(offset0.grad, offset1.grad, rtol=1e-2, atol=1e-3) max_abs_err = (offset0.grad - offset1.grad).abs().max() max_rel_err = ((offset0.grad - offset1.grad).abs() / offset0.grad.abs()).max() print( f'* {bwdok} offset_grad check_backward_equal_with_pytorch_double: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}') bwdok = torch.allclose(mask0.grad, mask1.grad, rtol=1e-2, atol=1e-3) max_abs_err = (mask0.grad - mask1.grad).abs().max() max_rel_err = ((mask0.grad - mask1.grad).abs() / mask0.grad.abs()).max() print( f'* {bwdok} mask_grad check_backward_equal_with_pytorch_double: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}') def check_backward_equal_with_pytorch_float(channels=4, grad_input=True, grad_offset=True, grad_mask=True): # H_in, W_in = 4, 4 N = 2 M = 2 H_out = (H_in + 2 * pad - (dilation * (Kh - 1) + 1)) // stride + 1 W_out = (W_in + 2 * pad - (dilation * (Kw - 1) + 1)) // stride + 1 D = channels input0 = torch.rand(N, H_in, W_in, M*D).cuda() * 0.01 offset0 = torch.rand(N, H_out, W_out, M*P*2).cuda() * 10 mask0 = torch.rand(N, H_out, W_out, M, P).cuda() + 1e-5 mask0 /= mask0.sum(-1, keepdim=True) mask0 = mask0.reshape(N, H_out, W_out, M*P) input0.requires_grad = grad_input offset0.requires_grad = grad_offset mask0.requires_grad = grad_mask output_pytorch = dcnv3_core_pytorch( input0, offset0, mask0, Kh, Kw, stride, stride, Kh // 2, Kw // 2, dilation, dilation, M, D, offset_scale) output_pytorch.sum().backward() input1 = input0.detach() offset1 = offset0.detach() mask1 = mask0.detach() input1.requires_grad = grad_input offset1.requires_grad = grad_offset mask1.requires_grad = grad_mask im2col_step = 2 output_cuda = DCNv3Function.apply( input1, offset1, mask1, Kh, Kw, stride, stride, Kh // 2, Kw // 2, dilation, dilation, M, D, offset_scale, im2col_step) 
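# The CUDA path runs on detached copies (input1 / offset1 / mask1) that share
# storage with the reference tensors but are independent autograd leaves, so
# both implementations see identical inputs while accumulating separate
# gradients; summing the output and calling backward() then produces the
# gradients that are compared element-wise below.
# Note that gradcheck is imported at the top of this file but never used; a
# numerical check along the same lines (not part of the original tests) could
# look roughly like this for tiny double-precision inputs:
#   torch.autograd.gradcheck(
#       lambda i, o, m: DCNv3Function.apply(
#           i, o, m, Kh, Kw, stride, stride, Kh // 2, Kw // 2,
#           dilation, dilation, M, D, offset_scale, im2col_step),
#       (input1, offset1, mask1))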
output_cuda.sum().backward() print(f'>>> backward float: channels {D}') bwdok = torch.allclose(input0.grad, input1.grad, rtol=1e-2, atol=1e-3) max_abs_err = (input0.grad - input1.grad).abs().max() max_rel_err = ((input0.grad - input1.grad).abs() / input0.grad.abs()).max() print( f'* {bwdok} input_grad check_backward_equal_with_pytorch_float: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}') bwdok = torch.allclose(offset0.grad, offset1.grad, rtol=1e-2, atol=1e-3) max_abs_err = (offset0.grad - offset1.grad).abs().max() max_rel_err = ((offset0.grad - offset1.grad).abs() / offset0.grad.abs()).max() print( f'* {bwdok} offset_grad check_backward_equal_with_pytorch_float: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}') bwdok = torch.allclose(mask0.grad, mask1.grad, rtol=1e-2, atol=1e-3) max_abs_err = (mask0.grad - mask1.grad).abs().max() max_rel_err = ((mask0.grad - mask1.grad).abs() / mask0.grad.abs()).max() print( f'* {bwdok} mask_grad check_backward_equal_with_pytorch_float: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}') @torch.no_grad() def check_time_cost(im2col_step=128): N = 512 H_in, W_in = 64, 64 H_out = (H_in + 2 * pad - (dilation * (Kh - 1) + 1)) // stride + 1 W_out = (W_in + 2 * pad - (dilation * (Kw - 1) + 1)) // stride + 1 input = torch.rand(N, H_in, W_in, M*D).cuda() * 0.01 offset = torch.rand(N, H_out, W_out, M*P*2).cuda() * 10 mask = torch.rand(N, H_out, W_out, M, P).cuda() + 1e-5 mask /= mask.sum(-1, keepdim=True) mask = mask.reshape(N, H_out, W_out, M*P) print( f'>>> time cost: im2col_step {im2col_step}; input {input.shape}; points {P} ') repeat = 100 for i in range(repeat): output_cuda = DCNv3Function.apply( input, offset, mask, Kh, Kw, stride, stride, Kh // 2, Kw // 2, dilation, dilation, M, D, 1.0, im2col_step) torch.cuda.synchronize() start = time.time() for i in range(repeat): output_cuda = DCNv3Function.apply( input, offset, mask, Kh, Kw, stride, stride, Kh // 2, Kw // 2, dilation, dilation, M, D, 1.0, im2col_step) torch.cuda.synchronize() print(f'foward time cost: {(time.time() - start) / repeat}') if __name__ == '__main__': check_forward_equal_with_pytorch_double() check_forward_equal_with_pytorch_float() for channels in [1, 16, 30, 32, 64, 71, 1025]: check_backward_equal_with_pytorch_double(channels, True, True, True) for channels in [1, 16, 30, 32, 64, 71, 1025]: check_backward_equal_with_pytorch_float(channels, True, True, True) for i in range(3): im2col_step = 128 * (2 ** i) check_time_cost(im2col_step) ================================================ FILE: mmdet3d/ops/paconv/__init__.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. from .paconv import PAConv, PAConvCUDA __all__ = ['PAConv', 'PAConvCUDA'] ================================================ FILE: mmdet3d/ops/paconv/paconv.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import copy import torch from mmcv.cnn import (ConvModule, build_activation_layer, build_norm_layer, constant_init) from mmcv.ops import assign_score_withk as assign_score_cuda from torch import nn as nn from torch.nn import functional as F from .utils import assign_kernel_withoutk, assign_score, calc_euclidian_dist class ScoreNet(nn.Module): r"""ScoreNet that outputs coefficient scores to assemble kernel weights in the weight bank according to the relative position of point pairs. Args: mlp_channels (List[int]): Hidden unit sizes of SharedMLP layers. 
last_bn (bool, optional): Whether to use BN on the last output of mlps. Defaults to False. score_norm (str, optional): Normalization function of output scores. Can be 'softmax', 'sigmoid' or 'identity'. Defaults to 'softmax'. temp_factor (float, optional): Temperature factor to scale the output scores before softmax. Defaults to 1.0. norm_cfg (dict, optional): Type of normalization method. Defaults to dict(type='BN2d'). bias (bool | str, optional): If specified as `auto`, it will be decided by the norm_cfg. Bias will be set as True if `norm_cfg` is None, otherwise False. Defaults to 'auto'. Note: The official code applies xavier_init to all Conv layers in ScoreNet, see `PAConv `_. However in our experiments, we did not find much difference in applying such xavier initialization or not. So we neglect this initialization in our implementation. """ def __init__(self, mlp_channels, last_bn=False, score_norm='softmax', temp_factor=1.0, norm_cfg=dict(type='BN2d'), bias='auto'): super(ScoreNet, self).__init__() assert score_norm in ['softmax', 'sigmoid', 'identity'], \ f'unsupported score_norm function {score_norm}' self.score_norm = score_norm self.temp_factor = temp_factor self.mlps = nn.Sequential() for i in range(len(mlp_channels) - 2): self.mlps.add_module( f'layer{i}', ConvModule( mlp_channels[i], mlp_channels[i + 1], kernel_size=(1, 1), stride=(1, 1), conv_cfg=dict(type='Conv2d'), norm_cfg=norm_cfg, bias=bias)) # for the last mlp that outputs scores, no relu and possibly no bn i = len(mlp_channels) - 2 self.mlps.add_module( f'layer{i}', ConvModule( mlp_channels[i], mlp_channels[i + 1], kernel_size=(1, 1), stride=(1, 1), conv_cfg=dict(type='Conv2d'), norm_cfg=norm_cfg if last_bn else None, act_cfg=None, bias=bias)) def forward(self, xyz_features): """Forward. Args: xyz_features (torch.Tensor): (B, C, N, K), features constructed from xyz coordinates of point pairs. May contain relative positions, Euclidean distance, etc. Returns: torch.Tensor: (B, N, K, M), predicted scores for `M` kernels. """ scores = self.mlps(xyz_features) # (B, M, N, K) # perform score normalization if self.score_norm == 'softmax': scores = F.softmax(scores / self.temp_factor, dim=1) elif self.score_norm == 'sigmoid': scores = torch.sigmoid(scores / self.temp_factor) else: # 'identity' scores = scores scores = scores.permute(0, 2, 3, 1) # (B, N, K, M) return scores class PAConv(nn.Module): """Non-CUDA version of PAConv. PAConv stores a trainable weight bank containing several kernel weights. Given input points and features, it computes coefficient scores to assemble those kernels to form conv kernels, and then runs convolution on the input. Args: in_channels (int): Input channels of point features. out_channels (int): Output channels of point features. num_kernels (int): Number of kernel weights in the weight bank. norm_cfg (dict, optional): Type of normalization method. Defaults to dict(type='BN2d', momentum=0.1). act_cfg (dict, optional): Type of activation method. Defaults to dict(type='ReLU', inplace=True). scorenet_input (str, optional): Type of input to ScoreNet. Can be 'identity', 'w_neighbor' or 'w_neighbor_dist'. Defaults to 'w_neighbor_dist'. weight_bank_init (str, optional): Init method of weight bank kernels. Can be 'kaiming' or 'xavier'. Defaults to 'kaiming'. kernel_input (str, optional): Input features to be multiplied with kernel weights. Can be 'identity' or 'w_neighbor'. Defaults to 'w_neighbor'. 
scorenet_cfg (dict, optional): Config of the ScoreNet module, which may contain the following keys and values: - mlp_channels (List[int]): Hidden units of MLPs. - score_norm (str): Normalization function of output scores. Can be 'softmax', 'sigmoid' or 'identity'. - temp_factor (float): Temperature factor to scale the output scores before softmax. - last_bn (bool): Whether to use BN on the last output of mlps. """ def __init__(self, in_channels, out_channels, num_kernels, norm_cfg=dict(type='BN2d', momentum=0.1), act_cfg=dict(type='ReLU', inplace=True), scorenet_input='w_neighbor_dist', weight_bank_init='kaiming', kernel_input='w_neighbor', scorenet_cfg=dict( mlp_channels=[16, 16, 16], score_norm='softmax', temp_factor=1.0, last_bn=False)): super(PAConv, self).__init__() # determine weight kernel size according to used features if kernel_input == 'identity': # only use grouped_features kernel_mul = 1 elif kernel_input == 'w_neighbor': # concat of (grouped_features - center_features, grouped_features) kernel_mul = 2 else: raise NotImplementedError( f'unsupported kernel_input {kernel_input}') self.kernel_input = kernel_input in_channels = kernel_mul * in_channels # determine mlp channels in ScoreNet according to used xyz features if scorenet_input == 'identity': # only use relative position (grouped_xyz - center_xyz) self.scorenet_in_channels = 3 elif scorenet_input == 'w_neighbor': # (grouped_xyz - center_xyz, grouped_xyz) self.scorenet_in_channels = 6 elif scorenet_input == 'w_neighbor_dist': # (center_xyz, grouped_xyz - center_xyz, Euclidean distance) self.scorenet_in_channels = 7 else: raise NotImplementedError( f'unsupported scorenet_input {scorenet_input}') self.scorenet_input = scorenet_input # construct kernel weights in weight bank # self.weight_bank is of shape [C, num_kernels * out_c] # where C can be in_c or (2 * in_c) if weight_bank_init == 'kaiming': weight_init = nn.init.kaiming_normal_ elif weight_bank_init == 'xavier': weight_init = nn.init.xavier_normal_ else: raise NotImplementedError( f'unsupported weight bank init method {weight_bank_init}') self.num_kernels = num_kernels # the parameter `m` in the paper weight_bank = weight_init( torch.empty(self.num_kernels, in_channels, out_channels)) weight_bank = weight_bank.permute(1, 0, 2).reshape( in_channels, self.num_kernels * out_channels).contiguous() self.weight_bank = nn.Parameter(weight_bank, requires_grad=True) # construct ScoreNet scorenet_cfg_ = copy.deepcopy(scorenet_cfg) scorenet_cfg_['mlp_channels'].insert(0, self.scorenet_in_channels) scorenet_cfg_['mlp_channels'].append(self.num_kernels) self.scorenet = ScoreNet(**scorenet_cfg_) self.bn = build_norm_layer(norm_cfg, out_channels)[1] if \ norm_cfg is not None else None self.activate = build_activation_layer(act_cfg) if \ act_cfg is not None else None # set some basic attributes of Conv layers self.in_channels = in_channels self.out_channels = out_channels self.init_weights() def init_weights(self): """Initialize weights of shared MLP layers and BN layers.""" if self.bn is not None: constant_init(self.bn, val=1, bias=0) def _prepare_scorenet_input(self, points_xyz): """Prepare input point pairs features for self.ScoreNet. Args: points_xyz (torch.Tensor): (B, 3, npoint, K) Coordinates of the grouped points. Returns: torch.Tensor: (B, C, npoint, K) The generated features per point pair. 
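        Note:
            The channel dimension C of the returned features is 3, 6 or 7 for
            `scorenet_input` set to 'identity', 'w_neighbor' and
            'w_neighbor_dist' respectively: relative positions only; relative
            positions concatenated with the neighbors' absolute coordinates;
            or center coordinates, relative positions and the point-pair
            Euclidean distance.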
""" B, _, npoint, K = points_xyz.size() center_xyz = points_xyz[..., :1].repeat(1, 1, 1, K) xyz_diff = points_xyz - center_xyz # [B, 3, npoint, K] if self.scorenet_input == 'identity': xyz_features = xyz_diff elif self.scorenet_input == 'w_neighbor': xyz_features = torch.cat((xyz_diff, points_xyz), dim=1) else: # w_neighbor_dist euclidian_dist = calc_euclidian_dist( center_xyz.permute(0, 2, 3, 1).reshape(B * npoint * K, 3), points_xyz.permute(0, 2, 3, 1).reshape(B * npoint * K, 3)).\ reshape(B, 1, npoint, K) xyz_features = torch.cat((center_xyz, xyz_diff, euclidian_dist), dim=1) return xyz_features def forward(self, inputs): """Forward. Args: inputs (tuple(torch.Tensor)): - features (torch.Tensor): (B, in_c, npoint, K) Features of the queried points. - points_xyz (torch.Tensor): (B, 3, npoint, K) Coordinates of the grouped points. Returns: Tuple[torch.Tensor]: - new_features: (B, out_c, npoint, K), features after PAConv. - points_xyz: same as input. """ features, points_xyz = inputs B, _, npoint, K = features.size() if self.kernel_input == 'w_neighbor': center_features = features[..., :1].repeat(1, 1, 1, K) features_diff = features - center_features # to (B, 2 * in_c, npoint, K) features = torch.cat((features_diff, features), dim=1) # prepare features for between each point and its grouping center xyz_features = self._prepare_scorenet_input(points_xyz) # scores to assemble kernel weights scores = self.scorenet(xyz_features) # [B, npoint, K, m] # first compute out features over all kernels # features is [B, C, npoint, K], weight_bank is [C, m * out_c] new_features = torch.matmul( features.permute(0, 2, 3, 1), self.weight_bank).view(B, npoint, K, self.num_kernels, -1) # [B, npoint, K, m, out_c] # then aggregate using scores new_features = assign_score(scores, new_features) # to [B, out_c, npoint, K] new_features = new_features.permute(0, 3, 1, 2).contiguous() if self.bn is not None: new_features = self.bn(new_features) if self.activate is not None: new_features = self.activate(new_features) # in order to keep input output consistency # so that we can wrap PAConv in Sequential return (new_features, points_xyz) class PAConvCUDA(PAConv): """CUDA version of PAConv that implements a cuda op to efficiently perform kernel assembling. Different from vanilla PAConv, the input features of this function is not grouped by centers. Instead, they will be queried on-the-fly by the additional input `points_idx`. This avoids the large intermediate matrix. See the `paper `_ appendix Sec. D for more detailed descriptions. """ def __init__(self, in_channels, out_channels, num_kernels, norm_cfg=dict(type='BN2d', momentum=0.1), act_cfg=dict(type='ReLU', inplace=True), scorenet_input='w_neighbor_dist', weight_bank_init='kaiming', kernel_input='w_neighbor', scorenet_cfg=dict( mlp_channels=[8, 16, 16], score_norm='softmax', temp_factor=1.0, last_bn=False)): super(PAConvCUDA, self).__init__( in_channels=in_channels, out_channels=out_channels, num_kernels=num_kernels, norm_cfg=norm_cfg, act_cfg=act_cfg, scorenet_input=scorenet_input, weight_bank_init=weight_bank_init, kernel_input=kernel_input, scorenet_cfg=scorenet_cfg) assert self.kernel_input == 'w_neighbor', \ 'CUDA implemented PAConv only supports w_neighbor kernel_input' def forward(self, inputs): """Forward. Args: inputs (tuple(torch.Tensor)): - features (torch.Tensor): (B, in_c, N) Features of all points in the current point cloud. Different from non-CUDA version PAConv, here the features are not grouped by each center to form a K dim. 
            - points_xyz (torch.Tensor): (B, 3, npoint, K)
                Coordinates of the grouped points.
            - points_idx (torch.Tensor): (B, npoint, K)
                Index of the grouped points.

        Returns:
            Tuple[torch.Tensor]:
                - new_features: (B, out_c, npoint, K), features after PAConv.
                - points_xyz: same as input.
                - points_idx: same as input.
        """
        features, points_xyz, points_idx = inputs

        # prepare features between each point and its grouping center
        xyz_features = self._prepare_scorenet_input(points_xyz)

        # scores to assemble kernel weights
        scores = self.scorenet(xyz_features)  # [B, npoint, K, m]

        # pre-compute features for points and centers separately
        # features is [B, in_c, N], weight_bank is [C, m * out_dim]
        point_feat, center_feat = assign_kernel_withoutk(
            features, self.weight_bank, self.num_kernels)

        # aggregate features using custom cuda op
        new_features = assign_score_cuda(
            scores, point_feat, center_feat, points_idx,
            'sum').contiguous()  # [B, out_c, npoint, K]

        if self.bn is not None:
            new_features = self.bn(new_features)
        if self.activate is not None:
            new_features = self.activate(new_features)

        # in order to keep input output consistency
        return (new_features, points_xyz, points_idx)


================================================
FILE: mmdet3d/ops/paconv/utils.py
================================================
# Copyright (c) OpenMMLab. All rights reserved.
import torch


def calc_euclidian_dist(xyz1, xyz2):
    """Calculate the Euclidean distance between two sets of points.

    Args:
        xyz1 (torch.Tensor): (N, 3), the first set of points.
        xyz2 (torch.Tensor): (N, 3), the second set of points.

    Returns:
        torch.Tensor: (N, ), the Euclidean distance between each point pair.
    """
    assert xyz1.shape[0] == xyz2.shape[0], 'number of points are not the same'
    assert xyz1.shape[1] == xyz2.shape[1] == 3, \
        'points coordinates dimension is not 3'
    return torch.norm(xyz1 - xyz2, dim=-1)


def assign_score(scores, point_features):
    """Perform weighted sum to aggregate output features according to scores.

    This function is used in the non-CUDA version of PAConv. Compared to the
    cuda op assign_score_withk, this pytorch implementation pre-computes
    output features for the neighbors of all centers, and then performs
    aggregation. It consumes more GPU memory.

    Args:
        scores (torch.Tensor): (B, npoint, K, M), predicted scores to
            aggregate weight matrices in the weight bank.
            `npoint` is the number of sampled centers.
            `K` is the number of queried neighbors.
            `M` is the number of weight matrices in the weight bank.
        point_features (torch.Tensor): (B, npoint, K, M, out_dim)
            Pre-computed point features to be aggregated.

    Returns:
        torch.Tensor: (B, npoint, K, out_dim), the aggregated features.
    """
    B, npoint, K, M = scores.size()
    scores = scores.view(B, npoint, K, 1, M)
    output = torch.matmul(scores, point_features).view(B, npoint, K, -1)
    return output


def assign_kernel_withoutk(features, kernels, M):
    """Pre-compute features with weight matrices in weight bank.

    This function is used before the cuda op assign_score_withk in the CUDA
    version of PAConv.

    Args:
        features (torch.Tensor): (B, in_dim, N), input features of all points.
            `N` is the number of points in the current point cloud.
        kernels (torch.Tensor): (2 * in_dim, M * out_dim), weight matrices in
            the weight bank, transformed from (M, 2 * in_dim, out_dim).
            `2 * in_dim` is because the input features are concatenation of
            (point_features - center_features, point_features).
        M (int): Number of weight matrices in the weight bank.

    Returns:
        Tuple[torch.Tensor]: both of shape (B, N, M, out_dim):
            - point_features: Pre-computed features for points.
- center_features: Pre-computed features for centers. """ B, in_dim, N = features.size() feat_trans = features.permute(0, 2, 1) # [B, N, in_dim] out_feat_half1 = torch.matmul(feat_trans, kernels[:in_dim]).view( B, N, M, -1) # [B, N, M, out_dim] out_feat_half2 = torch.matmul(feat_trans, kernels[in_dim:]).view( B, N, M, -1) # [B, N, M, out_dim] # TODO: why this hard-coded if condition? # when the network input is only xyz without additional features # xyz will be used as features, so that features.size(1) == 3 % 2 != 0 # we need to compensate center_features because otherwise # `point_features - center_features` will result in all zeros? if features.size(1) % 2 != 0: out_feat_half_coord = torch.matmul( feat_trans[:, :, :3], # [B, N, 3] kernels[in_dim:in_dim + 3]).view(B, N, M, -1) # [B, N, M, out_dim] else: out_feat_half_coord = torch.zeros_like(out_feat_half2) point_features = out_feat_half1 + out_feat_half2 center_features = out_feat_half1 + out_feat_half_coord return point_features, center_features ================================================ FILE: mmdet3d/ops/pointnet_modules/__init__.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. from .builder import build_sa_module from .paconv_sa_module import (PAConvCUDASAModule, PAConvCUDASAModuleMSG, PAConvSAModule, PAConvSAModuleMSG) from .point_fp_module import PointFPModule from .point_sa_module import PointSAModule, PointSAModuleMSG __all__ = [ 'build_sa_module', 'PointSAModuleMSG', 'PointSAModule', 'PointFPModule', 'PAConvSAModule', 'PAConvSAModuleMSG', 'PAConvCUDASAModule', 'PAConvCUDASAModuleMSG' ] ================================================ FILE: mmdet3d/ops/pointnet_modules/builder.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. from mmcv.utils import Registry SA_MODULES = Registry('point_sa_module') def build_sa_module(cfg, *args, **kwargs): """Build PointNet2 set abstraction (SA) module. Args: cfg (None or dict): The SA module config, which should contain: - type (str): Module type. - module args: Args needed to instantiate an SA module. args (argument list): Arguments passed to the `__init__` method of the corresponding module. kwargs (keyword arguments): Keyword arguments passed to the `__init__` method of the corresponding SA module . Returns: nn.Module: Created SA module. """ if cfg is None: cfg_ = dict(type='PointSAModule') else: if not isinstance(cfg, dict): raise TypeError('cfg must be a dict') if 'type' not in cfg: raise KeyError('the cfg dict must contain the key "type"') cfg_ = cfg.copy() module_type = cfg_.pop('type') if module_type not in SA_MODULES: raise KeyError(f'Unrecognized module type {module_type}') else: sa_module = SA_MODULES.get(module_type) module = sa_module(*args, **kwargs, **cfg_) return module ================================================ FILE: mmdet3d/ops/pointnet_modules/paconv_sa_module.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import torch from torch import nn as nn from mmdet3d.ops import PAConv, PAConvCUDA from .builder import SA_MODULES from .point_sa_module import BasePointSAModule @SA_MODULES.register_module() class PAConvSAModuleMSG(BasePointSAModule): r"""Point set abstraction module with multi-scale grouping (MSG) used in PAConv networks. Replace the MLPs in `PointSAModuleMSG` with PAConv layers. See the `paper `_ for more details. 
Args: paconv_num_kernels (list[list[int]]): Number of kernel weights in the weight banks of each layer's PAConv. paconv_kernel_input (str, optional): Input features to be multiplied with kernel weights. Can be 'identity' or 'w_neighbor'. Defaults to 'w_neighbor'. scorenet_input (str, optional): Type of the input to ScoreNet. Defaults to 'w_neighbor_dist'. Can be the following values: - 'identity': Use xyz coordinates as input. - 'w_neighbor': Use xyz coordinates and the difference with center points as input. - 'w_neighbor_dist': Use xyz coordinates, the difference with center points and the Euclidean distance as input. scorenet_cfg (dict, optional): Config of the ScoreNet module, which may contain the following keys and values: - mlp_channels (List[int]): Hidden units of MLPs. - score_norm (str): Normalization function of output scores. Can be 'softmax', 'sigmoid' or 'identity'. - temp_factor (float): Temperature factor to scale the output scores before softmax. - last_bn (bool): Whether to use BN on the last output of mlps. """ def __init__(self, num_point, radii, sample_nums, mlp_channels, paconv_num_kernels, fps_mod=['D-FPS'], fps_sample_range_list=[-1], dilated_group=False, norm_cfg=dict(type='BN2d', momentum=0.1), use_xyz=True, pool_mod='max', normalize_xyz=False, bias='auto', paconv_kernel_input='w_neighbor', scorenet_input='w_neighbor_dist', scorenet_cfg=dict( mlp_channels=[16, 16, 16], score_norm='softmax', temp_factor=1.0, last_bn=False)): super(PAConvSAModuleMSG, self).__init__( num_point=num_point, radii=radii, sample_nums=sample_nums, mlp_channels=mlp_channels, fps_mod=fps_mod, fps_sample_range_list=fps_sample_range_list, dilated_group=dilated_group, use_xyz=use_xyz, pool_mod=pool_mod, normalize_xyz=normalize_xyz, grouper_return_grouped_xyz=True) assert len(paconv_num_kernels) == len(mlp_channels) for i in range(len(mlp_channels)): assert len(paconv_num_kernels[i]) == len(mlp_channels[i]) - 1, \ 'PAConv number of kernel weights wrong' # in PAConv, bias only exists in ScoreNet scorenet_cfg['bias'] = bias for i in range(len(self.mlp_channels)): mlp_channel = self.mlp_channels[i] if use_xyz: mlp_channel[0] += 3 num_kernels = paconv_num_kernels[i] mlp = nn.Sequential() for i in range(len(mlp_channel) - 1): mlp.add_module( f'layer{i}', PAConv( mlp_channel[i], mlp_channel[i + 1], num_kernels[i], norm_cfg=norm_cfg, kernel_input=paconv_kernel_input, scorenet_input=scorenet_input, scorenet_cfg=scorenet_cfg)) self.mlps.append(mlp) @SA_MODULES.register_module() class PAConvSAModule(PAConvSAModuleMSG): r"""Point set abstraction module with single-scale grouping (SSG) used in PAConv networks. Replace the MLPs in `PointSAModule` with PAConv layers. See the `paper `_ for more details. 
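Example:
    A construction-only sketch; the channel widths, kernel counts and
    grouping parameters below are illustrative:

        >>> sa = PAConvSAModule(
        ...     mlp_channels=[64, 64, 128],
        ...     paconv_num_kernels=[8, 8],
        ...     num_point=256,
        ...     radius=0.2,
        ...     num_sample=32)

    With ``use_xyz=True`` (the default), the first channel width is
    increased by 3 internally before the PAConv layers are built.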
""" def __init__(self, mlp_channels, paconv_num_kernels, num_point=None, radius=None, num_sample=None, norm_cfg=dict(type='BN2d', momentum=0.1), use_xyz=True, pool_mod='max', fps_mod=['D-FPS'], fps_sample_range_list=[-1], normalize_xyz=False, paconv_kernel_input='w_neighbor', scorenet_input='w_neighbor_dist', scorenet_cfg=dict( mlp_channels=[16, 16, 16], score_norm='softmax', temp_factor=1.0, last_bn=False)): super(PAConvSAModule, self).__init__( mlp_channels=[mlp_channels], paconv_num_kernels=[paconv_num_kernels], num_point=num_point, radii=[radius], sample_nums=[num_sample], norm_cfg=norm_cfg, use_xyz=use_xyz, pool_mod=pool_mod, fps_mod=fps_mod, fps_sample_range_list=fps_sample_range_list, normalize_xyz=normalize_xyz, paconv_kernel_input=paconv_kernel_input, scorenet_input=scorenet_input, scorenet_cfg=scorenet_cfg) @SA_MODULES.register_module() class PAConvCUDASAModuleMSG(BasePointSAModule): r"""Point set abstraction module with multi-scale grouping (MSG) used in PAConv networks. Replace the non CUDA version PAConv with CUDA implemented PAConv for efficient computation. See the `paper `_ for more details. """ def __init__(self, num_point, radii, sample_nums, mlp_channels, paconv_num_kernels, fps_mod=['D-FPS'], fps_sample_range_list=[-1], dilated_group=False, norm_cfg=dict(type='BN2d', momentum=0.1), use_xyz=True, pool_mod='max', normalize_xyz=False, bias='auto', paconv_kernel_input='w_neighbor', scorenet_input='w_neighbor_dist', scorenet_cfg=dict( mlp_channels=[8, 16, 16], score_norm='softmax', temp_factor=1.0, last_bn=False)): super(PAConvCUDASAModuleMSG, self).__init__( num_point=num_point, radii=radii, sample_nums=sample_nums, mlp_channels=mlp_channels, fps_mod=fps_mod, fps_sample_range_list=fps_sample_range_list, dilated_group=dilated_group, use_xyz=use_xyz, pool_mod=pool_mod, normalize_xyz=normalize_xyz, grouper_return_grouped_xyz=True, grouper_return_grouped_idx=True) assert len(paconv_num_kernels) == len(mlp_channels) for i in range(len(mlp_channels)): assert len(paconv_num_kernels[i]) == len(mlp_channels[i]) - 1, \ 'PAConv number of kernel weights wrong' # in PAConv, bias only exists in ScoreNet scorenet_cfg['bias'] = bias # we need to manually concat xyz for CUDA implemented PAConv self.use_xyz = use_xyz for i in range(len(self.mlp_channels)): mlp_channel = self.mlp_channels[i] if use_xyz: mlp_channel[0] += 3 num_kernels = paconv_num_kernels[i] # can't use `nn.Sequential` for PAConvCUDA because its input and # output have different shapes mlp = nn.ModuleList() for i in range(len(mlp_channel) - 1): mlp.append( PAConvCUDA( mlp_channel[i], mlp_channel[i + 1], num_kernels[i], norm_cfg=norm_cfg, kernel_input=paconv_kernel_input, scorenet_input=scorenet_input, scorenet_cfg=scorenet_cfg)) self.mlps.append(mlp) def forward( self, points_xyz, features=None, indices=None, target_xyz=None, ): """forward. Args: points_xyz (Tensor): (B, N, 3) xyz coordinates of the features. features (Tensor, optional): (B, C, N) features of each point. Default: None. indices (Tensor, optional): (B, num_point) Index of the features. Default: None. target_xyz (Tensor, optional): (B, M, 3) new coords of the outputs. Default: None. Returns: Tensor: (B, M, 3) where M is the number of points. New features xyz. Tensor: (B, M, sum_k(mlps[k][-1])) where M is the number of points. New feature descriptors. Tensor: (B, M) where M is the number of points. Index of the features. 
""" new_features_list = [] # sample points, (B, num_point, 3), (B, num_point) new_xyz, indices = self._sample_points(points_xyz, features, indices, target_xyz) for i in range(len(self.groupers)): xyz = points_xyz new_features = features for j in range(len(self.mlps[i])): # we don't use grouped_features here to avoid large GPU memory # _, (B, 3, num_point, nsample), (B, num_point, nsample) _, grouped_xyz, grouped_idx = self.groupers[i](xyz, new_xyz, new_features) # concat xyz as additional features if self.use_xyz and j == 0: # (B, C+3, N) new_features = torch.cat( (points_xyz.permute(0, 2, 1), new_features), dim=1) # (B, out_c, num_point, nsample) grouped_new_features = self.mlps[i][j]( (new_features, grouped_xyz, grouped_idx.long()))[0] # different from PointNet++ and non CUDA version of PAConv # CUDA version of PAConv needs to aggregate local features # every time after it passes through a Conv layer # in order to transform to valid input shape # (B, out_c, num_point) new_features = self._pool_features(grouped_new_features) # constrain the points to be grouped for next PAConv layer # because new_features only contains sampled centers now # (B, num_point, 3) xyz = new_xyz new_features_list.append(new_features) return new_xyz, torch.cat(new_features_list, dim=1), indices @SA_MODULES.register_module() class PAConvCUDASAModule(PAConvCUDASAModuleMSG): r"""Point set abstraction module with single-scale grouping (SSG) used in PAConv networks. Replace the non CUDA version PAConv with CUDA implemented PAConv for efficient computation. See the `paper `_ for more details. """ def __init__(self, mlp_channels, paconv_num_kernels, num_point=None, radius=None, num_sample=None, norm_cfg=dict(type='BN2d', momentum=0.1), use_xyz=True, pool_mod='max', fps_mod=['D-FPS'], fps_sample_range_list=[-1], normalize_xyz=False, paconv_kernel_input='w_neighbor', scorenet_input='w_neighbor_dist', scorenet_cfg=dict( mlp_channels=[8, 16, 16], score_norm='softmax', temp_factor=1.0, last_bn=False)): super(PAConvCUDASAModule, self).__init__( mlp_channels=[mlp_channels], paconv_num_kernels=[paconv_num_kernels], num_point=num_point, radii=[radius], sample_nums=[num_sample], norm_cfg=norm_cfg, use_xyz=use_xyz, pool_mod=pool_mod, fps_mod=fps_mod, fps_sample_range_list=fps_sample_range_list, normalize_xyz=normalize_xyz, paconv_kernel_input=paconv_kernel_input, scorenet_input=scorenet_input, scorenet_cfg=scorenet_cfg) ================================================ FILE: mmdet3d/ops/pointnet_modules/point_fp_module.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. from typing import List import torch from mmcv.cnn import ConvModule from mmcv.ops import three_interpolate, three_nn from mmcv.runner import BaseModule, force_fp32 from torch import nn as nn class PointFPModule(BaseModule): """Point feature propagation module used in PointNets. Propagate the features from one set to another. Args: mlp_channels (list[int]): List of mlp channels. norm_cfg (dict, optional): Type of normalization method. Default: dict(type='BN2d'). 
""" def __init__(self, mlp_channels: List[int], norm_cfg: dict = dict(type='BN2d'), init_cfg=None): super().__init__(init_cfg=init_cfg) self.fp16_enabled = False self.mlps = nn.Sequential() for i in range(len(mlp_channels) - 1): self.mlps.add_module( f'layer{i}', ConvModule( mlp_channels[i], mlp_channels[i + 1], kernel_size=(1, 1), stride=(1, 1), conv_cfg=dict(type='Conv2d'), norm_cfg=norm_cfg)) @force_fp32() def forward(self, target: torch.Tensor, source: torch.Tensor, target_feats: torch.Tensor, source_feats: torch.Tensor) -> torch.Tensor: """forward. Args: target (Tensor): (B, n, 3) tensor of the xyz positions of the target features. source (Tensor): (B, m, 3) tensor of the xyz positions of the source features. target_feats (Tensor): (B, C1, n) tensor of the features to be propagated to. source_feats (Tensor): (B, C2, m) tensor of features to be propagated. Return: Tensor: (B, M, N) M = mlp[-1], tensor of the target features. """ if source is not None: dist, idx = three_nn(target, source) dist_reciprocal = 1.0 / (dist + 1e-8) norm = torch.sum(dist_reciprocal, dim=2, keepdim=True) weight = dist_reciprocal / norm interpolated_feats = three_interpolate(source_feats, idx, weight) else: interpolated_feats = source_feats.expand(*source_feats.size()[0:2], target.size(1)) if target_feats is not None: new_features = torch.cat([interpolated_feats, target_feats], dim=1) # (B, C2 + C1, n) else: new_features = interpolated_feats new_features = new_features.unsqueeze(-1) new_features = self.mlps(new_features) return new_features.squeeze(-1) ================================================ FILE: mmdet3d/ops/pointnet_modules/point_sa_module.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import torch from mmcv.cnn import ConvModule from mmcv.ops import GroupAll from mmcv.ops import PointsSampler as Points_Sampler from mmcv.ops import QueryAndGroup, gather_points from torch import nn as nn from torch.nn import functional as F from mmdet3d.ops import PAConv from .builder import SA_MODULES class BasePointSAModule(nn.Module): """Base module for point set abstraction module used in PointNets. Args: num_point (int): Number of points. radii (list[float]): List of radius in each ball query. sample_nums (list[int]): Number of samples in each ball query. mlp_channels (list[list[int]]): Specify of the pointnet before the global pooling for each scale. fps_mod (list[str], optional): Type of FPS method, valid mod ['F-FPS', 'D-FPS', 'FS'], Default: ['D-FPS']. F-FPS: using feature distances for FPS. D-FPS: using Euclidean distances of points for FPS. FS: using F-FPS and D-FPS simultaneously. fps_sample_range_list (list[int], optional): Range of points to apply FPS. Default: [-1]. dilated_group (bool, optional): Whether to use dilated ball query. Default: False. use_xyz (bool, optional): Whether to use xyz. Default: True. pool_mod (str, optional): Type of pooling method. Default: 'max_pool'. normalize_xyz (bool, optional): Whether to normalize local XYZ with radius. Default: False. grouper_return_grouped_xyz (bool, optional): Whether to return grouped xyz in `QueryAndGroup`. Defaults to False. grouper_return_grouped_idx (bool, optional): Whether to return grouped idx in `QueryAndGroup`. Defaults to False. 
""" def __init__(self, num_point, radii, sample_nums, mlp_channels, fps_mod=['D-FPS'], fps_sample_range_list=[-1], dilated_group=False, use_xyz=True, pool_mod='max', normalize_xyz=False, grouper_return_grouped_xyz=False, grouper_return_grouped_idx=False): super(BasePointSAModule, self).__init__() assert len(radii) == len(sample_nums) == len(mlp_channels) assert pool_mod in ['max', 'avg'] assert isinstance(fps_mod, list) or isinstance(fps_mod, tuple) assert isinstance(fps_sample_range_list, list) or isinstance( fps_sample_range_list, tuple) assert len(fps_mod) == len(fps_sample_range_list) if isinstance(mlp_channels, tuple): mlp_channels = list(map(list, mlp_channels)) self.mlp_channels = mlp_channels if isinstance(num_point, int): self.num_point = [num_point] elif isinstance(num_point, list) or isinstance(num_point, tuple): self.num_point = num_point elif num_point is None: self.num_point = None else: raise NotImplementedError('Error type of num_point!') self.pool_mod = pool_mod self.groupers = nn.ModuleList() self.mlps = nn.ModuleList() self.fps_mod_list = fps_mod self.fps_sample_range_list = fps_sample_range_list if self.num_point is not None: self.points_sampler = Points_Sampler(self.num_point, self.fps_mod_list, self.fps_sample_range_list) else: self.points_sampler = None for i in range(len(radii)): radius = radii[i] sample_num = sample_nums[i] if num_point is not None: if dilated_group and i != 0: min_radius = radii[i - 1] else: min_radius = 0 grouper = QueryAndGroup( radius, sample_num, min_radius=min_radius, use_xyz=use_xyz, normalize_xyz=normalize_xyz, return_grouped_xyz=grouper_return_grouped_xyz, return_grouped_idx=grouper_return_grouped_idx) else: grouper = GroupAll(use_xyz) self.groupers.append(grouper) def _sample_points(self, points_xyz, features, indices, target_xyz): """Perform point sampling based on inputs. If `indices` is specified, directly sample corresponding points. Else if `target_xyz` is specified, use is as sampled points. Otherwise sample points using `self.points_sampler`. Args: points_xyz (Tensor): (B, N, 3) xyz coordinates of the features. features (Tensor): (B, C, N) features of each point. indices (Tensor): (B, num_point) Index of the features. target_xyz (Tensor): (B, M, 3) new_xyz coordinates of the outputs. Returns: Tensor: (B, num_point, 3) sampled xyz coordinates of points. Tensor: (B, num_point) sampled points' index. """ xyz_flipped = points_xyz.transpose(1, 2).contiguous() if indices is not None: assert (indices.shape[1] == self.num_point[0]) new_xyz = gather_points(xyz_flipped, indices).transpose( 1, 2).contiguous() if self.num_point is not None else None elif target_xyz is not None: new_xyz = target_xyz.contiguous() else: if self.num_point is not None: indices = self.points_sampler(points_xyz, features) new_xyz = gather_points(xyz_flipped, indices).transpose(1, 2).contiguous() else: new_xyz = None return new_xyz, indices def _pool_features(self, features): """Perform feature aggregation using pooling operation. Args: features (torch.Tensor): (B, C, N, K) Features of locally grouped points before pooling. Returns: torch.Tensor: (B, C, N) Pooled features aggregating local information. 
""" if self.pool_mod == 'max': # (B, C, N, 1) new_features = F.max_pool2d( features, kernel_size=[1, features.size(3)]) elif self.pool_mod == 'avg': # (B, C, N, 1) new_features = F.avg_pool2d( features, kernel_size=[1, features.size(3)]) else: raise NotImplementedError return new_features.squeeze(-1).contiguous() def forward( self, points_xyz, features=None, indices=None, target_xyz=None, ): """forward. Args: points_xyz (Tensor): (B, N, 3) xyz coordinates of the features. features (Tensor, optional): (B, C, N) features of each point. Default: None. indices (Tensor, optional): (B, num_point) Index of the features. Default: None. target_xyz (Tensor, optional): (B, M, 3) new coords of the outputs. Default: None. Returns: Tensor: (B, M, 3) where M is the number of points. New features xyz. Tensor: (B, M, sum_k(mlps[k][-1])) where M is the number of points. New feature descriptors. Tensor: (B, M) where M is the number of points. Index of the features. """ new_features_list = [] # sample points, (B, num_point, 3), (B, num_point) new_xyz, indices = self._sample_points(points_xyz, features, indices, target_xyz) for i in range(len(self.groupers)): # grouped_results may contain: # - grouped_features: (B, C, num_point, nsample) # - grouped_xyz: (B, 3, num_point, nsample) # - grouped_idx: (B, num_point, nsample) grouped_results = self.groupers[i](points_xyz, new_xyz, features) # (B, mlp[-1], num_point, nsample) new_features = self.mlps[i](grouped_results) # this is a bit hack because PAConv outputs two values # we take the first one as feature if isinstance(self.mlps[i][0], PAConv): assert isinstance(new_features, tuple) new_features = new_features[0] # (B, mlp[-1], num_point) new_features = self._pool_features(new_features) new_features_list.append(new_features) return new_xyz, torch.cat(new_features_list, dim=1), indices @SA_MODULES.register_module() class PointSAModuleMSG(BasePointSAModule): """Point set abstraction module with multi-scale grouping (MSG) used in PointNets. Args: num_point (int): Number of points. radii (list[float]): List of radius in each ball query. sample_nums (list[int]): Number of samples in each ball query. mlp_channels (list[list[int]]): Specify of the pointnet before the global pooling for each scale. fps_mod (list[str], optional): Type of FPS method, valid mod ['F-FPS', 'D-FPS', 'FS'], Default: ['D-FPS']. F-FPS: using feature distances for FPS. D-FPS: using Euclidean distances of points for FPS. FS: using F-FPS and D-FPS simultaneously. fps_sample_range_list (list[int], optional): Range of points to apply FPS. Default: [-1]. dilated_group (bool, optional): Whether to use dilated ball query. Default: False. norm_cfg (dict, optional): Type of normalization method. Default: dict(type='BN2d'). use_xyz (bool, optional): Whether to use xyz. Default: True. pool_mod (str, optional): Type of pooling method. Default: 'max_pool'. normalize_xyz (bool, optional): Whether to normalize local XYZ with radius. Default: False. bias (bool | str, optional): If specified as `auto`, it will be decided by `norm_cfg`. `bias` will be set as True if `norm_cfg` is None, otherwise False. Default: 'auto'. 
""" def __init__(self, num_point, radii, sample_nums, mlp_channels, fps_mod=['D-FPS'], fps_sample_range_list=[-1], dilated_group=False, norm_cfg=dict(type='BN2d'), use_xyz=True, pool_mod='max', normalize_xyz=False, bias='auto'): super(PointSAModuleMSG, self).__init__( num_point=num_point, radii=radii, sample_nums=sample_nums, mlp_channels=mlp_channels, fps_mod=fps_mod, fps_sample_range_list=fps_sample_range_list, dilated_group=dilated_group, use_xyz=use_xyz, pool_mod=pool_mod, normalize_xyz=normalize_xyz) for i in range(len(self.mlp_channels)): mlp_channel = self.mlp_channels[i] if use_xyz: mlp_channel[0] += 3 mlp = nn.Sequential() for i in range(len(mlp_channel) - 1): mlp.add_module( f'layer{i}', ConvModule( mlp_channel[i], mlp_channel[i + 1], kernel_size=(1, 1), stride=(1, 1), conv_cfg=dict(type='Conv2d'), norm_cfg=norm_cfg, bias=bias)) self.mlps.append(mlp) @SA_MODULES.register_module() class PointSAModule(PointSAModuleMSG): """Point set abstraction module with single-scale grouping (SSG) used in PointNets. Args: mlp_channels (list[int]): Specify of the pointnet before the global pooling for each scale. num_point (int, optional): Number of points. Default: None. radius (float, optional): Radius to group with. Default: None. num_sample (int, optional): Number of samples in each ball query. Default: None. norm_cfg (dict, optional): Type of normalization method. Default: dict(type='BN2d'). use_xyz (bool, optional): Whether to use xyz. Default: True. pool_mod (str, optional): Type of pooling method. Default: 'max_pool'. fps_mod (list[str], optional): Type of FPS method, valid mod ['F-FPS', 'D-FPS', 'FS'], Default: ['D-FPS']. fps_sample_range_list (list[int], optional): Range of points to apply FPS. Default: [-1]. normalize_xyz (bool, optional): Whether to normalize local XYZ with radius. Default: False. """ def __init__(self, mlp_channels, num_point=None, radius=None, num_sample=None, norm_cfg=dict(type='BN2d'), use_xyz=True, pool_mod='max', fps_mod=['D-FPS'], fps_sample_range_list=[-1], normalize_xyz=False): super(PointSAModule, self).__init__( mlp_channels=[mlp_channels], num_point=num_point, radii=[radius], sample_nums=[num_sample], norm_cfg=norm_cfg, use_xyz=use_xyz, pool_mod=pool_mod, fps_mod=fps_mod, fps_sample_range_list=fps_sample_range_list, normalize_xyz=normalize_xyz) ================================================ FILE: mmdet3d/ops/sparse_block.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. from mmcv.cnn import build_conv_layer, build_norm_layer from torch import nn from mmdet.models.backbones.resnet import BasicBlock, Bottleneck from .spconv import IS_SPCONV2_AVAILABLE if IS_SPCONV2_AVAILABLE: from spconv.pytorch import SparseModule, SparseSequential else: from mmcv.ops import SparseModule, SparseSequential def replace_feature(out, new_features): if 'replace_feature' in out.__dir__(): # spconv 2.x behaviour return out.replace_feature(new_features) else: out.features = new_features return out class SparseBottleneck(Bottleneck, SparseModule): """Sparse bottleneck block for PartA^2. Bottleneck block implemented with submanifold sparse convolution. Args: inplanes (int): inplanes of block. planes (int): planes of block. stride (int, optional): stride of the first block. Default: 1. downsample (Module, optional): down sample module for block. conv_cfg (dict, optional): dictionary to construct and config conv layer. Default: None. norm_cfg (dict, optional): dictionary to construct and config norm layer. Default: dict(type='BN'). 
""" expansion = 4 def __init__(self, inplanes, planes, stride=1, downsample=None, conv_cfg=None, norm_cfg=None): SparseModule.__init__(self) Bottleneck.__init__( self, inplanes, planes, stride=stride, downsample=downsample, conv_cfg=conv_cfg, norm_cfg=norm_cfg) def forward(self, x): identity = x.features out = self.conv1(x) out = replace_feature(out, self.bn1(out.features)) out = replace_feature(out, self.relu(out.features)) out = self.conv2(out) out = replace_feature(out, self.bn2(out.features)) out = replace_feature(out, self.relu(out.features)) out = self.conv3(out) out = replace_feature(out, self.bn3(out.features)) if self.downsample is not None: identity = self.downsample(x) out = replace_feature(out, out.features + identity) out = replace_feature(out, self.relu(out.features)) return out class SparseBasicBlock(BasicBlock, SparseModule): """Sparse basic block for PartA^2. Sparse basic block implemented with submanifold sparse convolution. Args: inplanes (int): inplanes of block. planes (int): planes of block. stride (int, optional): stride of the first block. Default: 1. downsample (Module, optional): down sample module for block. conv_cfg (dict, optional): dictionary to construct and config conv layer. Default: None. norm_cfg (dict, optional): dictionary to construct and config norm layer. Default: dict(type='BN'). """ expansion = 1 def __init__(self, inplanes, planes, stride=1, downsample=None, conv_cfg=None, norm_cfg=None): SparseModule.__init__(self) BasicBlock.__init__( self, inplanes, planes, stride=stride, downsample=downsample, conv_cfg=conv_cfg, norm_cfg=norm_cfg) def forward(self, x): identity = x.features assert x.features.dim() == 2, f'x.features.dim()={x.features.dim()}' out = self.conv1(x) out = replace_feature(out, self.norm1(out.features)) out = replace_feature(out, self.relu(out.features)) out = self.conv2(out) out = replace_feature(out, self.norm2(out.features)) if self.downsample is not None: identity = self.downsample(x) out = replace_feature(out, out.features + identity) out = replace_feature(out, self.relu(out.features)) return out def make_sparse_convmodule(in_channels, out_channels, kernel_size, indice_key, stride=1, padding=0, conv_type='SubMConv3d', norm_cfg=None, order=('conv', 'norm', 'act')): """Make sparse convolution module. Args: in_channels (int): the number of input channels out_channels (int): the number of out channels kernel_size (int|tuple(int)): kernel size of convolution indice_key (str): the indice key used for sparse tensor stride (int|tuple(int)): the stride of convolution padding (int or list[int]): the padding number of input conv_type (str): sparse conv type in spconv norm_cfg (dict[str]): config of normalization layer order (tuple[str]): The order of conv/norm/activation layers. It is a sequence of "conv", "norm" and "act". Common examples are ("conv", "norm", "act") and ("act", "conv", "norm"). Returns: spconv.SparseSequential: sparse convolution module. 
""" assert isinstance(order, tuple) and len(order) <= 3 assert set(order) | {'conv', 'norm', 'act'} == {'conv', 'norm', 'act'} conv_cfg = dict(type=conv_type, indice_key=indice_key) layers = list() for layer in order: if layer == 'conv': if conv_type not in [ 'SparseInverseConv3d', 'SparseInverseConv2d', 'SparseInverseConv1d' ]: layers.append( build_conv_layer( conv_cfg, in_channels, out_channels, kernel_size, stride=stride, padding=padding, bias=False)) else: layers.append( build_conv_layer( conv_cfg, in_channels, out_channels, kernel_size, bias=False)) elif layer == 'norm': layers.append(build_norm_layer(norm_cfg, out_channels)[1]) elif layer == 'act': layers.append(nn.ReLU(inplace=True)) layers = SparseSequential(*layers) return layers ================================================ FILE: mmdet3d/ops/spconv/__init__.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. from .overwrite_spconv.write_spconv2 import register_spconv2 try: import spconv except ImportError: IS_SPCONV2_AVAILABLE = False else: if hasattr(spconv, '__version__') and spconv.__version__ >= '2.0.0': IS_SPCONV2_AVAILABLE = register_spconv2() else: IS_SPCONV2_AVAILABLE = False __all__ = ['IS_SPCONV2_AVAILABLE'] ================================================ FILE: mmdet3d/ops/spconv/overwrite_spconv/__init__.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. from .write_spconv2 import register_spconv2 __all__ = ['register_spconv2'] ================================================ FILE: mmdet3d/ops/spconv/overwrite_spconv/write_spconv2.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import itertools from mmcv.cnn.bricks.registry import CONV_LAYERS from torch.nn.parameter import Parameter def register_spconv2(): """This func registers spconv2.0 spconv ops to overwrite the default mmcv spconv ops.""" try: from spconv.pytorch import (SparseConv2d, SparseConv3d, SparseConv4d, SparseConvTranspose2d, SparseConvTranspose3d, SparseInverseConv2d, SparseInverseConv3d, SparseModule, SubMConv2d, SubMConv3d, SubMConv4d) except ImportError: return False else: CONV_LAYERS._register_module(SparseConv2d, 'SparseConv2d', force=True) CONV_LAYERS._register_module(SparseConv3d, 'SparseConv3d', force=True) CONV_LAYERS._register_module(SparseConv4d, 'SparseConv4d', force=True) CONV_LAYERS._register_module( SparseConvTranspose2d, 'SparseConvTranspose2d', force=True) CONV_LAYERS._register_module( SparseConvTranspose3d, 'SparseConvTranspose3d', force=True) CONV_LAYERS._register_module( SparseInverseConv2d, 'SparseInverseConv2d', force=True) CONV_LAYERS._register_module( SparseInverseConv3d, 'SparseInverseConv3d', force=True) CONV_LAYERS._register_module(SubMConv2d, 'SubMConv2d', force=True) CONV_LAYERS._register_module(SubMConv3d, 'SubMConv3d', force=True) CONV_LAYERS._register_module(SubMConv4d, 'SubMConv4d', force=True) SparseModule._version = 2 SparseModule._load_from_state_dict = _load_from_state_dict return True def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs): """Rewrite this func to compat the convolutional kernel weights between spconv 1.x in MMCV and 2.x in spconv2.x. Kernel weights in MMCV spconv has shape in (D,H,W,in_channel,out_channel) , while those in spcon2.x is in (out_channel,D,H,W,in_channel). 
""" version = local_metadata.get('version', None) for hook in self._load_state_dict_pre_hooks.values(): hook(state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs) local_name_params = itertools.chain(self._parameters.items(), self._buffers.items()) local_state = {k: v.data for k, v in local_name_params if v is not None} for name, param in local_state.items(): key = prefix + name if key in state_dict: input_param = state_dict[key] # Backward compatibility: loading 1-dim tensor from # 0.3.* to version 0.4+ if len(param.shape) == 0 and len(input_param.shape) == 1: input_param = input_param[0] if version != 2: dims = [len(input_param.shape) - 1] + list( range(len(input_param.shape) - 1)) input_param = input_param.permute(*dims) if input_param.shape != param.shape: # local shape should match the one in checkpoint error_msgs.append( f'size mismatch for {key}: copying a param with ' f'shape {key, input_param.shape} from checkpoint,' f'the shape in current model is {param.shape}.') continue if isinstance(input_param, Parameter): # backwards compatibility for serialized parameters input_param = input_param.data try: param.copy_(input_param) except Exception: error_msgs.append( f'While copying the parameter named "{key}", whose ' f'dimensions in the model are {param.size()} and whose ' f'dimensions in the checkpoint are {input_param.size()}.') elif strict: missing_keys.append(key) if strict: for key, input_param in state_dict.items(): if key.startswith(prefix): input_name = key[len(prefix):] input_name = input_name.split( '.', 1)[0] # get the name of param/buffer/child if input_name not in self._modules \ and input_name not in local_state: unexpected_keys.append(key) ================================================ FILE: mmdet3d/utils/__init__.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. from mmcv.utils import Registry, build_from_cfg, print_log from .collect_env import collect_env from .compat_cfg import compat_cfg from .logger import get_root_logger from .misc import find_latest_checkpoint from .setup_env import setup_multi_processes __all__ = [ 'Registry', 'build_from_cfg', 'get_root_logger', 'collect_env', 'print_log', 'setup_multi_processes', 'find_latest_checkpoint', 'compat_cfg' ] ================================================ FILE: mmdet3d/utils/collect_env.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. from mmcv.utils import collect_env as collect_base_env from mmcv.utils import get_git_hash import mmdet import mmdet3d import mmseg from mmdet3d.ops.spconv import IS_SPCONV2_AVAILABLE def collect_env(): """Collect the information of the running environments.""" env_info = collect_base_env() env_info['MMDetection'] = mmdet.__version__ env_info['MMSegmentation'] = mmseg.__version__ env_info['MMDetection3D'] = mmdet3d.__version__ + '+' + get_git_hash()[:7] env_info['spconv2.0'] = IS_SPCONV2_AVAILABLE return env_info if __name__ == '__main__': for name, val in collect_env().items(): print(f'{name}: {val}') ================================================ FILE: mmdet3d/utils/compat_cfg.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import copy import warnings from mmcv import ConfigDict def compat_cfg(cfg): """This function would modify some filed to keep the compatibility of config. For example, it will move some args which will be deprecated to the correct fields. 
""" cfg = copy.deepcopy(cfg) cfg = compat_imgs_per_gpu(cfg) cfg = compat_loader_args(cfg) cfg = compat_runner_args(cfg) return cfg def compat_runner_args(cfg): if 'runner' not in cfg: cfg.runner = ConfigDict({ 'type': 'EpochBasedRunner', 'max_epochs': cfg.total_epochs }) warnings.warn( 'config is now expected to have a `runner` section, ' 'please set `runner` in your config.', UserWarning) else: if 'total_epochs' in cfg: assert cfg.total_epochs == cfg.runner.max_epochs return cfg def compat_imgs_per_gpu(cfg): cfg = copy.deepcopy(cfg) if 'imgs_per_gpu' in cfg.data: warnings.warn('"imgs_per_gpu" is deprecated in MMDet V2.0. ' 'Please use "samples_per_gpu" instead') if 'samples_per_gpu' in cfg.data: warnings.warn( f'Got "imgs_per_gpu"={cfg.data.imgs_per_gpu} and ' f'"samples_per_gpu"={cfg.data.samples_per_gpu}, "imgs_per_gpu"' f'={cfg.data.imgs_per_gpu} is used in this experiments') else: warnings.warn('Automatically set "samples_per_gpu"="imgs_per_gpu"=' f'{cfg.data.imgs_per_gpu} in this experiments') cfg.data.samples_per_gpu = cfg.data.imgs_per_gpu return cfg def compat_loader_args(cfg): """Deprecated sample_per_gpu in cfg.data.""" cfg = copy.deepcopy(cfg) if 'train_dataloader' not in cfg.data: cfg.data['train_dataloader'] = ConfigDict() if 'val_dataloader' not in cfg.data: cfg.data['val_dataloader'] = ConfigDict() if 'test_dataloader' not in cfg.data: cfg.data['test_dataloader'] = ConfigDict() # special process for train_dataloader if 'samples_per_gpu' in cfg.data: samples_per_gpu = cfg.data.pop('samples_per_gpu') assert 'samples_per_gpu' not in \ cfg.data.train_dataloader, ('`samples_per_gpu` are set ' 'in `data` field and ` ' 'data.train_dataloader` ' 'at the same time. ' 'Please only set it in ' '`data.train_dataloader`. ') cfg.data.train_dataloader['samples_per_gpu'] = samples_per_gpu if 'persistent_workers' in cfg.data: persistent_workers = cfg.data.pop('persistent_workers') assert 'persistent_workers' not in \ cfg.data.train_dataloader, ('`persistent_workers` are set ' 'in `data` field and ` ' 'data.train_dataloader` ' 'at the same time. ' 'Please only set it in ' '`data.train_dataloader`. ') cfg.data.train_dataloader['persistent_workers'] = persistent_workers if 'workers_per_gpu' in cfg.data: workers_per_gpu = cfg.data.pop('workers_per_gpu') cfg.data.train_dataloader['workers_per_gpu'] = workers_per_gpu cfg.data.val_dataloader['workers_per_gpu'] = workers_per_gpu cfg.data.test_dataloader['workers_per_gpu'] = workers_per_gpu # special process for val_dataloader if 'samples_per_gpu' in cfg.data.val: # keep default value of `sample_per_gpu` is 1 assert 'samples_per_gpu' not in \ cfg.data.val_dataloader, ('`samples_per_gpu` are set ' 'in `data.val` field and ` ' 'data.val_dataloader` at ' 'the same time. ' 'Please only set it in ' '`data.val_dataloader`. ') cfg.data.val_dataloader['samples_per_gpu'] = \ cfg.data.val.pop('samples_per_gpu') # special process for val_dataloader # in case the test dataset is concatenated if isinstance(cfg.data.test, dict): if 'samples_per_gpu' in cfg.data.test: assert 'samples_per_gpu' not in \ cfg.data.test_dataloader, ('`samples_per_gpu` are set ' 'in `data.test` field and ` ' 'data.test_dataloader` ' 'at the same time. ' 'Please only set it in ' '`data.test_dataloader`. 
') cfg.data.test_dataloader['samples_per_gpu'] = \ cfg.data.test.pop('samples_per_gpu') elif isinstance(cfg.data.test, list): for ds_cfg in cfg.data.test: if 'samples_per_gpu' in ds_cfg: assert 'samples_per_gpu' not in \ cfg.data.test_dataloader, ('`samples_per_gpu` are set ' 'in `data.test` field and ` ' 'data.test_dataloader` at' ' the same time. ' 'Please only set it in ' '`data.test_dataloader`. ') samples_per_gpu = max( [ds_cfg.pop('samples_per_gpu', 1) for ds_cfg in cfg.data.test]) cfg.data.test_dataloader['samples_per_gpu'] = samples_per_gpu return cfg ================================================ FILE: mmdet3d/utils/logger.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import logging from mmcv.utils import get_logger def get_root_logger(log_file=None, log_level=logging.INFO, name='mmdet3d'): """Get root logger and add a keyword filter to it. The logger will be initialized if it has not been initialized. By default a StreamHandler will be added. If `log_file` is specified, a FileHandler will also be added. The name of the root logger is the top-level package name, e.g., "mmdet3d". Args: log_file (str, optional): File path of log. Defaults to None. log_level (int, optional): The level of logger. Defaults to logging.INFO. name (str, optional): The name of the root logger, also used as a filter keyword. Defaults to 'mmdet3d'. Returns: :obj:`logging.Logger`: The obtained logger """ logger = get_logger(name=name, log_file=log_file, log_level=log_level) # add a logging filter logging_filter = logging.Filter(name) logging_filter.filter = lambda record: record.find(name) != -1 return logger ================================================ FILE: mmdet3d/utils/misc.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import glob import os.path as osp import warnings def find_latest_checkpoint(path, suffix='pth'): """Find the latest checkpoint from the working directory. This function is copied from mmdetection. Args: path(str): The path to find checkpoints. suffix(str): File extension. Defaults to pth. Returns: latest_path(str | None): File path of the latest checkpoint. References: .. [1] https://github.com/microsoft/SoftTeacher /blob/main/ssod/utils/patch.py """ if not osp.exists(path): warnings.warn('The path of checkpoints does not exist.') return None if osp.exists(osp.join(path, f'latest.{suffix}')): return osp.join(path, f'latest.{suffix}') checkpoints = glob.glob(osp.join(path, f'*.{suffix}')) if len(checkpoints) == 0: warnings.warn('There are no checkpoints in the path.') return None latest = -1 latest_path = None for checkpoint in checkpoints: count = int(osp.basename(checkpoint).split('_')[-1].split('.')[0]) if count > latest: latest = count latest_path = checkpoint return latest_path ================================================ FILE: mmdet3d/utils/setup_env.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. 
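# A minimal usage sketch for the helper defined below; the config keys shown
# are the ones this module reads, while the values are illustrative:
#
#     from mmcv import Config
#     cfg = Config(dict(mp_start_method='fork',
#                       opencv_num_threads=0,
#                       data=dict(workers_per_gpu=4)))
#     setup_multi_processes(cfg)  # sets the mp start method and caps
#                                 # OpenCV/OMP/MKL thread counts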
import os import platform import warnings import cv2 from torch import multiprocessing as mp def setup_multi_processes(cfg): """Setup multi-processing environment variables.""" # set multi-process start method as `fork` to speed up the training if platform.system() != 'Windows': mp_start_method = cfg.get('mp_start_method', 'fork') current_method = mp.get_start_method(allow_none=True) if current_method is not None and current_method != mp_start_method: warnings.warn( f'Multi-processing start method `{mp_start_method}` is ' f'different from the previous setting `{current_method}`.' f'It will be force set to `{mp_start_method}`. You can change ' f'this behavior by changing `mp_start_method` in your config.') mp.set_start_method(mp_start_method, force=True) # disable opencv multithreading to avoid system being overloaded opencv_num_threads = cfg.get('opencv_num_threads', 0) cv2.setNumThreads(opencv_num_threads) # setup OMP threads # This code is referred from https://github.com/pytorch/pytorch/blob/master/torch/distributed/run.py # noqa workers_per_gpu = cfg.data.get('workers_per_gpu', 1) if 'train_dataloader' in cfg.data: workers_per_gpu = \ max(cfg.data.train_dataloader.get('workers_per_gpu', 1), workers_per_gpu) if 'OMP_NUM_THREADS' not in os.environ and workers_per_gpu > 1: omp_num_threads = 1 warnings.warn( f'Setting OMP_NUM_THREADS environment variable for each process ' f'to be {omp_num_threads} in default, to avoid your system being ' f'overloaded, please further tune the variable for optimal ' f'performance in your application as needed.') os.environ['OMP_NUM_THREADS'] = str(omp_num_threads) # setup MKL threads if 'MKL_NUM_THREADS' not in os.environ and workers_per_gpu > 1: mkl_num_threads = 1 warnings.warn( f'Setting MKL_NUM_THREADS environment variable for each process ' f'to be {mkl_num_threads} in default, to avoid your system being ' f'overloaded, please further tune the variable for optimal ' f'performance in your application as needed.') os.environ['MKL_NUM_THREADS'] = str(mkl_num_threads) ================================================ FILE: mmdet3d/version.py ================================================ # Copyright (c) Open-MMLab. All rights reserved. 
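# Illustrative behaviour of the `parse_version_info` helper defined below
# (the inputs are examples, not additional released versions):
#
#     parse_version_info('1.0.0rc4')  ->  (1, 0, 0, 'rc4')
#     parse_version_info('1.0.0')     ->  (1, 0, 0)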
__version__ = '1.0.0rc4' short_version = __version__ def parse_version_info(version_str): version_info = [] for x in version_str.split('.'): if x.isdigit(): version_info.append(int(x)) elif x.find('rc') != -1: patch_version = x.split('rc') version_info.append(int(patch_version[0])) version_info.append(f'rc{patch_version[1]}') return tuple(version_info) version_info = parse_version_info(__version__) ================================================ FILE: requirements/build.txt ================================================ ================================================ FILE: requirements/docs.txt ================================================ docutils==0.16.0 m2r mistune==0.8.4 myst-parser -e git+https://github.com/open-mmlab/pytorch_sphinx_theme.git#egg=pytorch_sphinx_theme sphinx==4.0.2 sphinx-copybutton sphinx_markdown_tables ================================================ FILE: requirements/mminstall.txt ================================================ mmcv-full>=1.4.8,<=1.6.0 mmdet>=2.24.0,<=3.0.0 mmsegmentation>=0.20.0,<=1.0.0 ================================================ FILE: requirements/optional.txt ================================================ open3d spconv waymo-open-dataset-tf-2-1-0==1.2.0 ================================================ FILE: requirements/readthedocs.txt ================================================ mmcv>=1.4.8 mmdet>=2.24.0 mmsegmentation>=0.20.1 torch torchvision ================================================ FILE: requirements/runtime.txt ================================================ lyft_dataset_sdk networkx>=2.2,<2.3 numba==0.53.0 numpy nuscenes-devkit plyfile scikit-image # by default we also use tensorboard to log results tensorboard trimesh>=2.35.39,<2.35.40 ================================================ FILE: requirements/tests.txt ================================================ asynctest codecov flake8 interrogate isort # Note: used for kwarray.group_items, this may be ported to mmcv in the future. kwarray pytest pytest-cov pytest-runner ubelt xdoctest >= 0.10.0 yapf ================================================ FILE: tools/analysis_tools/analyze_logs.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. 
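# Usage sketches inferred from the argument parsers defined below; the log
# paths and metric keys are placeholders:
#
#     # plot one or more metrics from a json training log
#     python tools/analysis_tools/analyze_logs.py plot_curve path/to/log.json \
#         --keys loss --out losses.png
#
#     # report per-epoch training-time statistics
#     python tools/analysis_tools/analyze_logs.py cal_train_time path/to/log.json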
import argparse import json from collections import defaultdict import numpy as np import seaborn as sns from matplotlib import pyplot as plt def cal_train_time(log_dicts, args): for i, log_dict in enumerate(log_dicts): print(f'{"-" * 5}Analyze train time of {args.json_logs[i]}{"-" * 5}') all_times = [] for epoch in log_dict.keys(): if args.include_outliers: all_times.append(log_dict[epoch]['time']) else: all_times.append(log_dict[epoch]['time'][1:]) all_times = np.array(all_times) epoch_ave_time = all_times.mean(-1) slowest_epoch = epoch_ave_time.argmax() fastest_epoch = epoch_ave_time.argmin() std_over_epoch = epoch_ave_time.std() print(f'slowest epoch {slowest_epoch + 1}, ' f'average time is {epoch_ave_time[slowest_epoch]:.4f}') print(f'fastest epoch {fastest_epoch + 1}, ' f'average time is {epoch_ave_time[fastest_epoch]:.4f}') print(f'time std over epochs is {std_over_epoch:.4f}') print(f'average iter time: {np.mean(all_times):.4f} s/iter') print() def plot_curve(log_dicts, args): if args.backend is not None: plt.switch_backend(args.backend) sns.set_style(args.style) # if legend is None, use {filename}_{key} as legend legend = args.legend if legend is None: legend = [] for json_log in args.json_logs: for metric in args.keys: legend.append(f'{json_log}_{metric}') assert len(legend) == (len(args.json_logs) * len(args.keys)) metrics = args.keys num_metrics = len(metrics) for i, log_dict in enumerate(log_dicts): epochs = list(log_dict.keys()) for j, metric in enumerate(metrics): print(f'plot curve of {args.json_logs[i]}, metric is {metric}') if metric not in log_dict[epochs[args.interval - 1]]: raise KeyError( f'{args.json_logs[i]} does not contain metric {metric}') if args.mode == 'eval': if min(epochs) == args.interval: x0 = args.interval else: # if current training is resumed from previous checkpoint # we lost information in early epochs # `xs` should start according to `min(epochs)` if min(epochs) % args.interval == 0: x0 = min(epochs) else: # find the first epoch that do eval x0 = min(epochs) + args.interval - \ min(epochs) % args.interval xs = np.arange(x0, max(epochs) + 1, args.interval) ys = [] for epoch in epochs[args.interval - 1::args.interval]: ys += log_dict[epoch][metric] # if training is aborted before eval of the last epoch # `xs` and `ys` will have different length and cause an error # check if `ys[-1]` is empty here if not log_dict[epoch][metric]: xs = xs[:-1] ax = plt.gca() ax.set_xticks(xs) plt.xlabel('epoch') plt.plot(xs, ys, label=legend[i * num_metrics + j], marker='o') else: xs = [] ys = [] num_iters_per_epoch = \ log_dict[epochs[args.interval-1]]['iter'][-1] for epoch in epochs[args.interval - 1::args.interval]: iters = log_dict[epoch]['iter'] if log_dict[epoch]['mode'][-1] == 'val': iters = iters[:-1] xs.append( np.array(iters) + (epoch - 1) * num_iters_per_epoch) ys.append(np.array(log_dict[epoch][metric][:len(iters)])) xs = np.concatenate(xs) ys = np.concatenate(ys) plt.xlabel('iter') plt.plot( xs, ys, label=legend[i * num_metrics + j], linewidth=0.5) plt.legend() if args.title is not None: plt.title(args.title) if args.out is None: plt.show() else: print(f'save curve to: {args.out}') plt.savefig(args.out) plt.cla() def add_plot_parser(subparsers): parser_plt = subparsers.add_parser( 'plot_curve', help='parser for plotting curves') parser_plt.add_argument( 'json_logs', type=str, nargs='+', help='path of train log in json format') parser_plt.add_argument( '--keys', type=str, nargs='+', default=['mAP_0.25'], help='the metric that you want to plot') 
parser_plt.add_argument('--title', type=str, help='title of figure') parser_plt.add_argument( '--legend', type=str, nargs='+', default=None, help='legend of each plot') parser_plt.add_argument( '--backend', type=str, default=None, help='backend of plt') parser_plt.add_argument( '--style', type=str, default='dark', help='style of plt') parser_plt.add_argument('--out', type=str, default=None) parser_plt.add_argument('--mode', type=str, default='train') parser_plt.add_argument('--interval', type=int, default=1) def add_time_parser(subparsers): parser_time = subparsers.add_parser( 'cal_train_time', help='parser for computing the average time per training iteration') parser_time.add_argument( 'json_logs', type=str, nargs='+', help='path of train log in json format') parser_time.add_argument( '--include-outliers', action='store_true', help='include the first value of every epoch when computing ' 'the average time') def parse_args(): parser = argparse.ArgumentParser(description='Analyze Json Log') # currently only support plot curve and calculate average train time subparsers = parser.add_subparsers(dest='task', help='task parser') add_plot_parser(subparsers) add_time_parser(subparsers) args = parser.parse_args() return args def load_json_logs(json_logs): # load and convert json_logs to log_dict, key is epoch, value is a sub dict # keys of sub dict is different metrics, e.g. memory, bbox_mAP # value of sub dict is a list of corresponding values of all iterations log_dicts = [dict() for _ in json_logs] for json_log, log_dict in zip(json_logs, log_dicts): with open(json_log, 'r') as log_file: for line in log_file: log = json.loads(line.strip()) # skip lines without `epoch` field if 'epoch' not in log: continue epoch = log.pop('epoch') if epoch not in log_dict: log_dict[epoch] = defaultdict(list) for k, v in log.items(): log_dict[epoch][k].append(v) return log_dicts def main(): args = parse_args() json_logs = args.json_logs for json_log in json_logs: assert json_log.endswith('.json') log_dicts = load_json_logs(json_logs) eval(args.task)(log_dicts, args) if __name__ == '__main__': main() ================================================ FILE: tools/analysis_tools/benchmark.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. 
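Before the benchmark scripts, a minimal sketch of the structure that load_json_logs (analyze_logs.py above) builds. The log records are hypothetical, and the import assumes the repository root is on PYTHONPATH:

import json
import tempfile

from tools.analysis_tools.analyze_logs import load_json_logs  # assumption: run from the repo root

# Two hypothetical train-iteration records for epoch 1, one JSON object per line.
records = [
    {"epoch": 1, "iter": 50, "loss": 0.9, "time": 0.31, "mode": "train"},
    {"epoch": 1, "iter": 100, "loss": 0.8, "time": 0.30, "mode": "train"},
]
with tempfile.NamedTemporaryFile('w', suffix='.json', delete=False) as f:
    f.write('\n'.join(json.dumps(r) for r in records))

# Keys are epochs; each epoch maps every logged field to a list over iterations:
# [{1: {'iter': [50, 100], 'loss': [0.9, 0.8], 'time': [0.31, 0.30], 'mode': ['train', 'train']}}]
print(load_json_logs([f.name]))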
import argparse import time import torch from mmcv import Config from mmcv.parallel import MMDataParallel from mmcv.runner import load_checkpoint, wrap_fp16_model from mmdet3d.datasets import build_dataloader, build_dataset from mmdet3d.models import build_detector from tools.misc.fuse_conv_bn import fuse_module def parse_args(): parser = argparse.ArgumentParser(description='MMDet benchmark a model') parser.add_argument('config', help='test config file path') parser.add_argument('checkpoint', help='checkpoint file') parser.add_argument('--samples', default=2000, help='samples to benchmark') parser.add_argument( '--log-interval', default=50, help='interval of logging') parser.add_argument( '--fuse-conv-bn', action='store_true', help='Whether to fuse conv and bn, this will slightly increase' 'the inference speed') parser.add_argument( '--no-acceleration', action='store_true', help='Omit the pre-computation acceleration') args = parser.parse_args() return args def main(): args = parse_args() cfg = Config.fromfile(args.config) # set cudnn_benchmark if cfg.get('cudnn_benchmark', False): torch.backends.cudnn.benchmark = True cfg.model.pretrained = None cfg.data.test.test_mode = True # build the dataloader # TODO: support multiple images per gpu (only minor changes are needed) dataset = build_dataset(cfg.data.test) data_loader = build_dataloader( dataset, samples_per_gpu=1, workers_per_gpu=cfg.data.workers_per_gpu, dist=False, shuffle=False) # build the model and load checkpoint if not args.no_acceleration: cfg.model.img_view_transformer.accelerate=True cfg.model.train_cfg = None model = build_detector(cfg.model, test_cfg=cfg.get('test_cfg')) fp16_cfg = cfg.get('fp16', None) if fp16_cfg is not None: wrap_fp16_model(model) # load_checkpoint(model, args.checkpoint, map_location='cpu') if args.fuse_conv_bn: model = fuse_module(model) model = MMDataParallel(model, device_ids=[0]) model.eval() # the first several iterations may be very slow so skip them num_warmup = 5 pure_inf_time = 0 # benchmark with several samples and take the average for i, data in enumerate(data_loader): torch.cuda.synchronize() start_time = time.perf_counter() with torch.no_grad(): model(return_loss=False, rescale=True, **data) torch.cuda.synchronize() elapsed = time.perf_counter() - start_time if i >= num_warmup: pure_inf_time += elapsed if (i + 1) % args.log_interval == 0: fps = (i + 1 - num_warmup) / pure_inf_time print(f'Done image [{i + 1:<3}/ {args.samples}], ' f'fps: {fps:.1f} img / s') if (i + 1) == args.samples: pure_inf_time += elapsed fps = (i + 1 - num_warmup) / pure_inf_time print(f'Overall fps: {fps:.1f} img / s') break if __name__ == '__main__': main() ================================================ FILE: tools/analysis_tools/benchmark_sequential.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. 
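benchmark.py above and benchmark_sequential.py below share the same timing bookkeeping: the first num_warmup iterations are skipped, and the running throughput is (i + 1 - num_warmup) / pure_inf_time. Note that --samples and --log-interval are declared without type=int, so values passed on the command line arrive as strings while the defaults remain integers. The accounting, isolated as a self-contained sketch with a synthetic workload standing in for the model call:

import time

num_warmup, log_interval, samples = 5, 50, 200
pure_inf_time = 0.0
for i in range(samples):
    start = time.perf_counter()
    time.sleep(0.001)                       # stand-in for model(return_loss=False, rescale=True, **data)
    elapsed = time.perf_counter() - start
    if i >= num_warmup:                     # warm-up iterations are excluded from the average
        pure_inf_time += elapsed
        if (i + 1) % log_interval == 0:
            print(f'fps: {(i + 1 - num_warmup) / pure_inf_time:.1f}')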
import argparse import time import torch from mmcv import Config from mmcv.parallel import MMDataParallel from mmcv.runner import load_checkpoint, wrap_fp16_model from mmdet3d.datasets import build_dataloader, build_dataset from mmdet3d.models import build_detector from tools.misc.fuse_conv_bn import fuse_module def parse_args(): parser = argparse.ArgumentParser(description='MMDet benchmark a model') parser.add_argument('config', help='test config file path') parser.add_argument('checkpoint', help='checkpoint file') parser.add_argument('--samples', default=400, help='samples to benchmark') parser.add_argument( '--log-interval', default=50, help='interval of logging') parser.add_argument( '--fuse-conv-bn', action='store_true', help='Whether to fuse conv and bn, this will slightly increase' 'the inference speed') parser.add_argument( '--no-acceleration', action='store_true', help='Omit the pre-computation acceleration') args = parser.parse_args() return args def main(): args = parse_args() cfg = Config.fromfile(args.config) # set cudnn_benchmark if cfg.get('cudnn_benchmark', False): torch.backends.cudnn.benchmark = True cfg.model.pretrained = None cfg.data.test.test_mode = True # build the dataloader # TODO: support multiple images per gpu (only minor changes are needed) dataset = build_dataset(cfg.data.test) data_loader = build_dataloader( dataset, samples_per_gpu=1, workers_per_gpu=cfg.data.workers_per_gpu, dist=False, shuffle=False) # build the model and load checkpoint cfg.model.train_cfg = None cfg.model.align_after_view_transfromation=True if not args.no_acceleration: cfg.model.img_view_transformer.accelerate=True model = build_detector(cfg.model, test_cfg=cfg.get('test_cfg')) fp16_cfg = cfg.get('fp16', None) if fp16_cfg is not None: wrap_fp16_model(model) load_checkpoint(model, args.checkpoint, map_location='cpu') if args.fuse_conv_bn: model = fuse_module(model) model = MMDataParallel(model, device_ids=[0]) model.eval() # the first several iterations may be very slow so skip them num_warmup = 5 pure_inf_time = 0 # benchmark with several samples and take the average for i, data in enumerate(data_loader): inputs = [d.cuda() for d in data['img_inputs'][0]] with torch.no_grad(): feat_prev, inputs = model.module.extract_img_feat( inputs, pred_prev=True, img_metas=None) data['img_inputs'][0] = inputs torch.cuda.synchronize() start_time = time.perf_counter() with torch.no_grad(): model( return_loss=False, rescale=True, sequential=True, feat_prev=feat_prev, **data) torch.cuda.synchronize() elapsed = time.perf_counter() - start_time if i >= num_warmup: pure_inf_time += elapsed if (i + 1) % args.log_interval == 0: fps = (i + 1 - num_warmup) / pure_inf_time print(f'Done image [{i + 1:<3}/ {args.samples}], ' f'fps: {fps:.1f} img / s') if (i + 1) == args.samples: pure_inf_time += elapsed fps = (i + 1 - num_warmup) / pure_inf_time print(f'Overall fps: {fps:.1f} img / s') break if __name__ == '__main__': main() ================================================ FILE: tools/analysis_tools/benchmark_trt.py ================================================ import time from typing import Dict, Optional, Sequence, Union import tensorrt as trt import torch import torch.onnx from mmcv import Config from mmdeploy.backend.tensorrt import load_tensorrt_plugin try: # If mmdet version > 2.23.0, compat_cfg would be imported and # used from mmdet instead of mmdet3d. 
from mmdet.utils import compat_cfg except ImportError: from mmdet3d.utils import compat_cfg import argparse from mmdet3d.core import bbox3d2result from mmdet3d.core.bbox.structures.box_3d_mode import LiDARInstance3DBoxes from mmdet3d.datasets import build_dataloader, build_dataset from mmdet3d.models import build_model def parse_args(): parser = argparse.ArgumentParser(description='Deploy BEVDet with Tensorrt') parser.add_argument('config', help='deploy config file path') parser.add_argument('engine', help='checkpoint file') parser.add_argument('--samples', default=500, help='samples to benchmark') parser.add_argument('--postprocessing', action='store_true') args = parser.parse_args() return args def torch_dtype_from_trt(dtype: trt.DataType) -> torch.dtype: """Convert pytorch dtype to TensorRT dtype. Args: dtype (str.DataType): The data type in tensorrt. Returns: torch.dtype: The corresponding data type in torch. """ if dtype == trt.bool: return torch.bool elif dtype == trt.int8: return torch.int8 elif dtype == trt.int32: return torch.int32 elif dtype == trt.float16: return torch.float16 elif dtype == trt.float32: return torch.float32 else: raise TypeError(f'{dtype} is not supported by torch') class TRTWrapper(torch.nn.Module): def __init__(self, engine: Union[str, trt.ICudaEngine], output_names: Optional[Sequence[str]] = None) -> None: super().__init__() self.engine = engine if isinstance(self.engine, str): with trt.Logger() as logger, trt.Runtime(logger) as runtime: with open(self.engine, mode='rb') as f: engine_bytes = f.read() self.engine = runtime.deserialize_cuda_engine(engine_bytes) self.context = self.engine.create_execution_context() names = [_ for _ in self.engine] input_names = list(filter(self.engine.binding_is_input, names)) self._input_names = input_names self._output_names = output_names if self._output_names is None: output_names = list(set(names) - set(input_names)) self._output_names = output_names def forward(self, inputs: Dict[str, torch.Tensor]): bindings = [None] * (len(self._input_names) + len(self._output_names)) for input_name, input_tensor in inputs.items(): idx = self.engine.get_binding_index(input_name) self.context.set_binding_shape(idx, tuple(input_tensor.shape)) bindings[idx] = input_tensor.contiguous().data_ptr() # create output tensors outputs = {} for output_name in self._output_names: idx = self.engine.get_binding_index(output_name) dtype = torch_dtype_from_trt(self.engine.get_binding_dtype(idx)) shape = tuple(self.context.get_binding_shape(idx)) device = torch.device('cuda') output = torch.zeros(size=shape, dtype=dtype, device=device) outputs[output_name] = output bindings[idx] = output.data_ptr() self.context.execute_async_v2(bindings, torch.cuda.current_stream().cuda_stream) return outputs def get_plugin_names(): return [pc.name for pc in trt.get_plugin_registry().plugin_creator_list] def main(): load_tensorrt_plugin() args = parse_args() cfg = Config.fromfile(args.config) cfg.model.pretrained = None cfg.model.type = cfg.model.type + 'TRT' cfg = compat_cfg(cfg) cfg.gpu_ids = [0] # build dataloader assert cfg.data.test.test_mode test_dataloader_default_args = dict( samples_per_gpu=1, workers_per_gpu=2, dist=False, shuffle=False) test_loader_cfg = { **test_dataloader_default_args, **cfg.data.get('test_dataloader', {}) } dataset = build_dataset(cfg.data.test) data_loader = build_dataloader(dataset, **test_loader_cfg) # build the model cfg.model.train_cfg = None model = build_model(cfg.model, test_cfg=cfg.get('test_cfg')) # build tensorrt model trt_model = 
TRTWrapper(args.engine, [f'output_{i}' for i in range(36)]) num_warmup = 50 pure_inf_time = 0 init_ = True metas = dict() # benchmark with several samples and take the average for i, data in enumerate(data_loader): if init_: inputs = [t.cuda() for t in data['img_inputs'][0]] metas_ = model.get_bev_pool_input(inputs) metas = dict( ranks_bev=metas_[0].int().contiguous(), ranks_depth=metas_[1].int().contiguous(), ranks_feat=metas_[2].int().contiguous(), interval_starts=metas_[3].int().contiguous(), interval_lengths=metas_[4].int().contiguous()) init_ = False img = data['img_inputs'][0][0].cuda().squeeze(0).contiguous() torch.cuda.synchronize() start_time = time.perf_counter() trt_output = trt_model.forward(dict(img=img, **metas)) # postprocessing if args.postprocessing: trt_output = [trt_output[f'output_{i}'] for i in range(36)] pred = model.result_deserialize(trt_output) img_metas = [dict(box_type_3d=LiDARInstance3DBoxes)] bbox_list = model.pts_bbox_head.get_bboxes( pred, img_metas, rescale=True) bbox_results = [ bbox3d2result(bboxes, scores, labels) for bboxes, scores, labels in bbox_list ] torch.cuda.synchronize() elapsed = time.perf_counter() - start_time if i >= num_warmup: pure_inf_time += elapsed if (i + 1) % 50 == 0: fps = (i + 1 - num_warmup) / pure_inf_time print(f'Done image [{i + 1:<3}/ {args.samples}], ' f'fps: {fps:.1f} img / s') if (i + 1) == args.samples: pure_inf_time += elapsed fps = (i + 1 - num_warmup) / pure_inf_time print(f'Overall \nfps: {fps:.1f} img / s ' f'\ninference time: {1000/fps:.1f} ms') return fps if __name__ == '__main__': fps = main() ================================================ FILE: tools/analysis_tools/benchmark_view_transformer.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. 
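A condensed sketch of the calling contract that main() above uses for TRTWrapper: the wrapper is built from a serialized engine plus the expected output binding names, and forward() maps a dict of named CUDA tensors to a dict of output tensors. The engine path and image shape below are placeholders, so this illustrates the interface rather than a runnable end-to-end example:

import torch

# Placeholder engine path; the 36 output names follow main() above.
trt_model = TRTWrapper('bevdet_fp16.engine', [f'output_{i}' for i in range(36)])

# 'img' shape depends on the exported model; the ranks_* / interval_* index
# tensors are produced once by model.get_bev_pool_input(...) as in main().
img = torch.randn(6, 3, 256, 704, device='cuda')
feeds = dict(img=img,
             ranks_bev=ranks_bev, ranks_depth=ranks_depth, ranks_feat=ranks_feat,
             interval_starts=interval_starts, interval_lengths=interval_lengths)
outputs = trt_model.forward(feeds)          # {'output_0': tensor, ..., 'output_35': tensor}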
import argparse import time import numpy as np import torch from mmcv import Config from mmcv.parallel import MMDataParallel from mmcv.runner import load_checkpoint from mmdet3d.datasets import build_dataloader, build_dataset from mmdet3d.models import build_detector def parse_args(): parser = argparse.ArgumentParser(description='MMDet benchmark a model') parser.add_argument('config', help='test config file path') parser.add_argument('checkpoint', help='checkpoint file') parser.add_argument('--samples', default=1000, help='samples to benchmark') parser.add_argument( '--log-interval', default=50, help='interval of logging') parser.add_argument( '--mem-only', action='store_true', help='Conduct the memory analysis only') parser.add_argument( '--no-acceleration', action='store_true', help='Omit the pre-computation acceleration') args = parser.parse_args() return args def main(): args = parse_args() cfg = Config.fromfile(args.config) # set cudnn_benchmark if cfg.get('cudnn_benchmark', False): torch.backends.cudnn.benchmark = True cfg.model.pretrained = None cfg.data.test.test_mode = True # build the dataloader # TODO: support multiple images per gpu (only minor changes are needed) dataset = build_dataset(cfg.data.test) data_loader = build_dataloader( dataset, samples_per_gpu=1, workers_per_gpu=cfg.data.workers_per_gpu, dist=False, shuffle=False) # build the model and load checkpoint if not args.no_acceleration: cfg.model.img_view_transformer.accelerate=True cfg.model.train_cfg = None model = build_detector(cfg.model, test_cfg=cfg.get('test_cfg')) load_checkpoint(model, args.checkpoint, map_location='cpu') model = MMDataParallel(model, device_ids=[0]) model.eval() # the first several iterations may be very slow so skip them num_warmup = 100 pure_inf_time = 0 D = model.module.img_view_transformer.D out_channels = model.module.img_view_transformer.out_channels depth_net = model.module.img_view_transformer.depth_net view_transformer = model.module.img_view_transformer # benchmark with several samples and take the average for i, data in enumerate(data_loader): with torch.no_grad(): img_feat = \ model.module.image_encoder(data['img_inputs'][0][0].cuda()) B, N, C, H, W = img_feat.shape x = depth_net(img_feat.reshape(B * N, C, H, W)) depth_digit = x[:, :D, ...] tran_feat = x[:, D:D + out_channels, ...] 
depth = depth_digit.softmax(dim=1) input = [img_feat] + [d.cuda() for d in data['img_inputs'][0][1:]] if i == 0: precomputed_memory_allocated = 0.0 if view_transformer.accelerate: start_mem_allocated = torch.cuda.memory_allocated() view_transformer.pre_compute(input) end_mem_allocated = torch.cuda.memory_allocated() precomputed_memory_allocated = \ end_mem_allocated - start_mem_allocated ref_max_mem_allocated = torch.cuda.max_memory_allocated() # occupy the memory size = (ref_max_mem_allocated - end_mem_allocated) // 4 occupy_tensor = torch.zeros( size=(size, ), device='cuda', dtype=torch.float32) print('Memory analysis: \n' 'precomputed_memory_allocated : %d B / %.01f MB \n' % (precomputed_memory_allocated, precomputed_memory_allocated / 1024 / 1024)) start_mem_allocated = torch.cuda.memory_allocated() bev_feat = view_transformer.view_transform_core( input, depth, tran_feat)[0] end_max_mem_allocated = torch.cuda.max_memory_allocated() peak_memory_allocated = \ end_max_mem_allocated - start_mem_allocated total_memory_requirement = \ precomputed_memory_allocated + peak_memory_allocated print('Memory analysis: \n' 'Memory requirement : %d B / %.01f MB \n' % (total_memory_requirement, total_memory_requirement / 1024 / 1024)) if args.mem_only: return torch.cuda.synchronize() start_time = time.perf_counter() with torch.no_grad(): view_transformer.view_transform(input, depth, tran_feat)[0] torch.cuda.synchronize() elapsed = time.perf_counter() - start_time if i >= num_warmup: pure_inf_time += elapsed if (i + 1) % args.log_interval == 0: fps = (i + 1 - num_warmup) / pure_inf_time print(f'Done image [{i + 1:<3}/ {args.samples}], ' f'fps: {fps:.1f} img / s') if (i + 1) == args.samples: pure_inf_time += elapsed fps = (i + 1 - num_warmup) / pure_inf_time print(f'Overall fps: {fps:.1f} img / s') return fps if __name__ == '__main__': repeat_times = 1 fps_list = [] for _ in range(repeat_times): fps = main() time.sleep(5) fps_list.append(fps) fps_list = np.array(fps_list, dtype=np.float32) print(f'Mean Overall fps: {fps_list.mean():.4f} +' f' {np.sqrt(fps_list.var()):.4f} img / s') ================================================ FILE: tools/analysis_tools/create_video.py ================================================ import random as rd import cv2 as cv import numpy as np class RecordMovie(object): def __init__(self, img_width, img_height): self.video_writer = None self.is_end = False self.img_width = img_width self.img_height = img_height def start(self, file_name, freq): four_cc = cv.VideoWriter_fourcc(*'mp4v') img_size = (self.img_width, self.img_height) self.video_writer = cv.VideoWriter() self.video_writer.open(file_name, four_cc, freq, img_size, True) def record(self, img): if self.is_end is False: self.video_writer.write(img) def end(self): self.is_end = True self.video_writer.release() import os import mmcv def main_waymo(): rm = RecordMovie(200, 200) rm.start("test_waymo.mp4", 10) # base_path = 'test/anchor_traintest_noflip_1.0/Fri_Jun__3_17_10_33_2022/show_dirs/testing_camera/image_0/' files = os.listdir('/mount/data/lsbevv2/vis') for i in range(320): imgs = cv.imread(os.path.join('/mount/data/lsbevv2/vis', f'a_{i}.png')) print(i) print(imgs.shape) rm.record(imgs) rm.end() if __name__ == '__main__': #main_nuscenes() main_waymo() ================================================ FILE: tools/analysis_tools/generate_mask_based_on_lidar_points.py ================================================ from mmdet3d.datasets import build_dataset import mmcv from mmcv import Config, DictAction from 
mmdet3d.datasets import build_dataset cfg = Config.fromfile('/mount/data/lsbevv2/occupancy_configs/occupancy/debug.py') dataset = build_dataset(cfg.data.test, dict(test_mode=True)) import numpy as np import torch import numpy as np import torch import matplotlib.pyplot as plt import cv2 import torch from torchvision.utils import make_grid import torchvision import matplotlib.pyplot as plt import cv2 import json import os def convert_color(img_path): plt.figure() img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE) plt.imsave(img_path, img, cmap=plt.get_cmap('viridis')) plt.close() def save_tensor(tensor, path, pad_value=254.0,normalize=False): print('save_tensor', path) tensor = tensor.to(torch.float).detach().cpu() max_ = tensor.flatten(1).max(-1).values[:, None, None] min_ = tensor.flatten(1).min(-1).values[:, None, None] tensor = (tensor-min_)/(max_-min_) if tensor.type() == 'torch.BoolTensor': tensor = tensor*255 if len(tensor.shape) == 3: tensor = tensor.unsqueeze(1) tensor = make_grid(tensor, pad_value=pad_value, normalize=normalize).permute(1, 2, 0).numpy().copy() torchvision.utils.save_image(torch.tensor(tensor).permute(2, 0, 1), path) convert_color(path) def generate_forward_transformation_matrix(bda, img_meta_dict=None): b = bda.size(0) hom_res = torch.eye(4)[None].repeat(b, 1, 1).to(bda.device) for i in range(b): hom_res[i, :3, :3] = bda[i] return hom_res from segment_anything import sam_model_registry, SamPredictor def show_mask(mask, ax, random_color=False, cls_=None): classname_to_color= {'ignore_class': (255, 255, 255), 'barrier': (112, 128, 144), # Slategrey 'bicycle': (220, 20, 60), # Crimson 'bus': (255, 127, 80), # Coral 'car': (255, 158, 0), # Orange 'construction_vehicle': (233, 150, 70), # Darksalmon 'motorcycle': (255, 61, 99), # Red 'pedestrian': (0, 0, 230), # Blue 'traffic_cone': (47, 79, 79), # Darkslategrey 'trailer': (255, 140, 0), # Darkorange 'truck': (255, 99, 71), # Tomato 'driveable_surface': (0, 207, 191), # nuTonomy green 'other_flat': (175, 0, 75), 'sidewalk': (75, 0, 75), 'terrain': (112, 180, 60), 'manmade': (222, 184, 135), # Burlywood 'vegetation': (0, 175, 0)} colors = np.array(list(classname_to_color.values())).astype(np.uint8) alpha = np.ones((colors.shape[0], 1), dtype=np.uint8) * 0.5 colors = np.hstack([colors/255, alpha]) if random_color: color = np.concatenate([np.random.random(3), np.array([0.6])], axis=0) elif cls_ is not None: color = colors[cls_] else: color = np.array([30/255, 144/255, 255/255, 0.6]) h, w = mask.shape[-2:] mask_image = mask.reshape(h, w, 1) * color.reshape(1, 1, -1) ax.imshow(mask_image) def show_points(coords, labels, ax, marker_size=375): pos_points = coords[labels==1] neg_points = coords[labels==0] ax.scatter(pos_points[:, 0], pos_points[:, 1], color='green', marker='*', s=marker_size, edgecolor='white', linewidth=1.25) ax.scatter(neg_points[:, 0], neg_points[:, 1], color='red', marker='*', s=marker_size, edgecolor='white', linewidth=1.25) def show_box(box, ax): x0, y0 = box[0], box[1] w, h = box[2] - box[0], box[3] - box[1] ax.add_patch(plt.Rectangle((x0, y0), w, h, edgecolor='green', facecolor=(0,0,0,0), lw=2)) idx_to_name = mmcv.load('/mount/data/lsbevv2/data/nuscenes/v1.0-trainval/category.json') idx_to_name = [each['name'] for each in idx_to_name] name_category = {'animal':0, 'human.pedestrian.personal_mobility':0, 'human.pedestrian.stroller':0, 'human.pedestrian.wheelchair':0, 'movable_object.debris':0, 'movable_object.pushable_pullable':0, 'static_object.bicycle_rack':0, 'vehicle.emergency.ambulance':0, 
'vehicle.emergency.police':0, 'noise':0, 'static.other':0, 'vehicle.ego':0, 'movable_object.barrier':1, 'vehicle.bicycle':2, 'vehicle.bus.bendy':3, 'vehicle.bus.rigid':3, 'vehicle.car':4, 'vehicle.construction':5, 'vehicle.motorcycle':6, 'human.pedestrian.adult':7, 'human.pedestrian.child':7, 'human.pedestrian.construction_worker': 7, 'human.pedestrian.police_officer':7, 'movable_object.trafficcone': 8, 'vehicle.trailer': 9, 'vehicle.truck': 10, 'flat.driveable_surface': 11, 'flat.other': 12, 'flat.sidewalk': 13, 'flat.terrain': 14, 'static.manmade': 15, 'static.vegetation': 16} idx_to_category = [name_category[each] for each in idx_to_name] from segment_anything import sam_model_registry, SamPredictor sam_checkpoint = "/mount/data/segment-anything/sam_vit_h_4b8939.pth" model_type = "vit_h" device = "cuda" sam = sam_model_registry[model_type](checkpoint=sam_checkpoint) sam.to(device=device) predictor = SamPredictor(sam) # front_1 = './data/nuscenes/samples/CAM_FRONT_LEFT/n015-2018-07-11-11-54-16+0800__CAM_FRONT_LEFT__1531281439754844.jpg' # import cv2 # image = cv2.imread(front_1) # image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) import json from collections import defaultdict file_path = '/mount/data/lsbevv2/data/nuscenes/bevdetv2-nuscenes_infos_val.coco.json' data = json.load(open(file_path, 'r')) category_map_from_det_to_set = { 0:4, 1:10, 2:9, 3:3, 4:5, 5:2, 6:6, 7:7, 8:8, 9:1 } sample_map = defaultdict(lambda: []) image_map = defaultdict(lambda: []) for each in data['images']: sample_map[each['token']].append(each['id']) for i, each in enumerate(data['annotations']): image_map[each['image_id']].append(i) import argparse import random from tqdm import tqdm def f(gap=0): co = 0 for i in tqdm(range(gap, len(dataset))): co +=1 print(i) info = dataset[i] category_map = info['gt_depth'][0] for j in range(len(idx_to_category)): category_map[category_map==j] = idx_to_category[j] imgs = info['img_inputs'][0][0] cams = [ 'CAM_FRONT_LEFT', 'CAM_FRONT', 'CAM_FRONT_RIGHT', 'CAM_BACK_LEFT', 'CAM_BACK', 'CAM_BACK_RIGHT' ] for ind, img in enumerate(imgs): img = img.permute(1, 2, 0).to(torch.uint8) image = img.cpu().numpy() predictor.set_image(image) per_category_map = category_map[ind] sample_data_token = info['img_metas'][0].data['curr']['cams'][cams[ind]]['sample_data_token'] # if os.path.isfile(f'/mount/data/lsbevv2/data/nus_sem/{sample_data_token}.png'): continue bboxes =[data['annotations'][each_idx] for each_idx in image_map[sample_data_token]] input_boxes = [] for bbox in bboxes: bbox['category_id'] = category_map_from_det_to_set[bbox['category_id']] x, y, w, h = bbox['bbox'] input_boxes.append([x, y, x+w, y+h]) # input_box = np.array([x, y, x+w, y+h]) # xyxy format input_boxes = torch.tensor(input_boxes, device=predictor.device) transformed_boxes = predictor.transform.apply_boxes_torch(input_boxes, image.shape[:2]) sem_masks = np.zeros([17, 900, 1600]) + 0.05 thing_mask = np.zeros([900, 1600]) if len(input_boxes)>0: try: masks, scores, logits = predictor.predict_torch( point_coords=None, point_labels=None, boxes=torch.tensor(transformed_boxes).to(device), multimask_output=False, return_logits=False, ) masks, scores = masks.squeeze(1).cpu().numpy(), scores.squeeze(1).cpu().numpy() for index, mask in enumerate(masks): id = bboxes[index]['category_id'] sem_masks[id][mask] = scores[index] + 0.4 # 0.4 is the bias of bbox prompt campared to point prompt thing_mask[mask] = 1 except: print(sample_data_token, ' thing error!!!!') for stuff_class in [11, 12, 13, 14, 15, 16]: points = 
torch.tensor((per_category_map == stuff_class).nonzero()) if points.size(0)==0: continue else: xs = [each[0].item() for each in points] ys = [each[1].item() for each in points] points = points[thing_mask[xs, ys]==0] if points.size(0)==0: continue if points.size(0)<=5: points = random.choices(points, k=min(3, points.size(0))) else: try: y = points[:, 0].to(torch.float).mean() x = points[:, 1].to(torch.float).mean() right_up = random.choices(points[(points[:,0]>=y) & (points[:,1]>=x)], k=1) left_up = random.choices(points[(points[:,0]=x)], k=1) right_bottom = random.choices(points[(points[:,0]>=y) & (points[:,1]0.6.2') def parse_args(): parser = argparse.ArgumentParser(description='Train a detector') parser.add_argument('config', help='train config file path') parser.add_argument( '--shape', type=int, nargs='+', default=[40000, 4], help='input point cloud size') parser.add_argument( '--modality', type=str, default='point', choices=['point', 'image', 'multi'], help='input data modality') parser.add_argument( '--cfg-options', nargs='+', action=DictAction, help='override some settings in the used config, the key-value pair ' 'in xxx=yyy format will be merged into config file. If the value to ' 'be overwritten is a list, it should be like key="[a,b]" or key=a,b ' 'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" ' 'Note that the quotation marks are necessary and that no white space ' 'is allowed.') args = parser.parse_args() return args def construct_input(input_shape): rot = torch.eye(3).float().cuda().view(1, 3, 3) rot = torch.cat([rot for _ in range(6)], axis=0).view(1, 6, 3, 3) input = dict(img_inputs=[ torch.ones(()).new_empty((1, 6, 3, *input_shape)).cuda(), rot, torch.ones((1, 6, 3)).cuda(), rot, rot, torch.ones((1, 6, 3)).cuda(), torch.eye(3).float().cuda().view(1, 3, 3) ]) return input def main(): args = parse_args() if args.modality == 'point': assert len(args.shape) == 2, 'invalid input shape' input_shape = tuple(args.shape) elif args.modality == 'image': if len(args.shape) == 1: input_shape = (3, args.shape[0], args.shape[0]) elif len(args.shape) == 2: input_shape = (3, ) + tuple(args.shape) else: raise ValueError('invalid input shape') elif args.modality == 'multi': raise NotImplementedError( 'FLOPs counter is currently not supported for models with ' 'multi-modality input') cfg = Config.fromfile(args.config) if args.cfg_options is not None: cfg.merge_from_dict(args.cfg_options) model = build_model( cfg.model, train_cfg=cfg.get('train_cfg'), test_cfg=cfg.get('test_cfg')) if torch.cuda.is_available(): model.cuda() model.eval() if hasattr(model, 'forward_dummy'): model.forward = model.forward_dummy else: raise NotImplementedError( 'FLOPs counter is currently not supported for {}'.format( model.__class__.__name__)) flops, params = get_model_complexity_info( model, input_shape, input_constructor=construct_input) split_line = '=' * 30 print(f'{split_line}\nInput shape: {input_shape}\n' f'Flops: {flops}\nParams: {params}\n{split_line}') print('!!!Please be cautious if you use the results in papers. 
' 'You may need to check if all ops are supported and verify that the ' 'flops computation is correct.') if __name__ == '__main__': main() ================================================ FILE: tools/analysis_tools/model_converter.py ================================================ import torch model = torch.load('/mount/data/FBBEV/work_dirs/mappetrv3_noaug_8x8_36ep_102x102/iter_31644_ema.pth') keys = list(model['state_dict'].keys()) for k in keys: model['state_dict'][k.replace('pts_bbox_head', 'uni_perceive_head')] = model['state_dict'][k] torch.save(model, '/mount/data/FBBEV/work_dirs/mappetrv3_noaug_8x8_36ep_102x102/iter_31644_ema2.pth') ================================================ FILE: tools/analysis_tools/occupancy_cbgs.py ================================================ import os import os.path as osp import sys import mmcv import numpy as np from collections import Counter, defaultdict from tqdm import tqdm total_counter = defaultdict(lambda: 0) info = mmcv.load('/mount/dnn_data/occupancy_2023/annotations.json') p1 = '/mount/dnn_data/occupancy_2023/gts' json_map = {} scenes = os.listdir(p1) for scene in tqdm(info['train_split']): for sample in os.listdir(osp.join(p1, scene)): data = np.load(osp.join(p1, scene, sample, 'labels.npz')) occupancy = data['semantics'] visible_mask = data['mask_camera'] index = (visible_mask>0).nonzero() seen = occupancy[index[0],index[1],index[2]] counter = Counter(seen) json_map[sample] = {} for a,b in counter.items(): total_counter[int(a)]+=b json_map[sample][int(a)] = b from IPython import embed embed() exit() new_json_map = {} for key in json_map.keys() new_json_map[key] = {} for k, v in json_map[key].items(): new_json_map[key][int(k)] = int(v) # for scene in scenes: ================================================ FILE: tools/analysis_tools/vis.py ================================================ # Copyright (c) Phigent Robotics. All rights reserved. 
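model_converter.py above rewrites checkpoint keys in place with hard-coded paths; the same pattern in a generic, hedged form (paths and prefixes are placeholders, and note that the original keeps the old keys alongside the renamed ones):

import torch

ckpt = torch.load('old_checkpoint.pth', map_location='cpu')   # placeholder path
state = ckpt['state_dict']
for k in list(state.keys()):
    # duplicate each tensor under the renamed head prefix; old keys are retained
    state[k.replace('pts_bbox_head', 'uni_perceive_head')] = state[k]
torch.save(ckpt, 'renamed_checkpoint.pth')                     # placeholder path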
import argparse import json import os import pickle import cv2 import numpy as np from pyquaternion.quaternion import Quaternion from mmdet3d.core.bbox.structures.lidar_box3d import LiDARInstance3DBoxes as LB def check_point_in_img(points, height, width): valid = np.logical_and(points[:, 0] >= 0, points[:, 1] >= 0) valid = np.logical_and( valid, np.logical_and(points[:, 0] < width, points[:, 1] < height)) return valid def depth2color(depth): gray = max(0, min((depth + 2.5) / 3.0, 1.0)) max_lumi = 200 colors = np.array( [[max_lumi, 0, max_lumi], [max_lumi, 0, 0], [max_lumi, max_lumi, 0], [0, max_lumi, 0], [0, max_lumi, max_lumi], [0, 0, max_lumi]], dtype=np.float32) if gray == 1: return tuple(colors[-1].tolist()) num_rank = len(colors) - 1 rank = np.floor(gray * num_rank).astype(np.int) diff = (gray - rank / num_rank) * num_rank return tuple( (colors[rank] + (colors[rank + 1] - colors[rank]) * diff).tolist()) def lidar2img(points_lidar, camrera_info): points_lidar_homogeneous = \ np.concatenate([points_lidar, np.ones((points_lidar.shape[0], 1), dtype=points_lidar.dtype)], axis=1) camera2lidar = np.eye(4, dtype=np.float32) camera2lidar[:3, :3] = camrera_info['sensor2lidar_rotation'] camera2lidar[:3, 3] = camrera_info['sensor2lidar_translation'] lidar2camera = np.linalg.inv(camera2lidar) points_camera_homogeneous = points_lidar_homogeneous @ lidar2camera.T points_camera = points_camera_homogeneous[:, :3] valid = np.ones((points_camera.shape[0]), dtype=bool) valid = np.logical_and(points_camera[:, -1] > 0.5, valid) points_camera = points_camera / points_camera[:, 2:3] camera2img = camrera_info['cam_intrinsic'] points_img = points_camera @ camera2img.T points_img = points_img[:, :2] return points_img, valid def get_lidar2global(infos): lidar2ego = np.eye(4, dtype=np.float32) lidar2ego[:3, :3] = Quaternion(infos['lidar2ego_rotation']).rotation_matrix lidar2ego[:3, 3] = infos['lidar2ego_translation'] ego2global = np.eye(4, dtype=np.float32) ego2global[:3, :3] = Quaternion( infos['ego2global_rotation']).rotation_matrix ego2global[:3, 3] = infos['ego2global_translation'] return ego2global @ lidar2ego def parse_args(): parser = argparse.ArgumentParser(description='Visualize the predicted ' 'result of nuScenes') parser.add_argument( 'res', help='Path to the predicted result in json format') parser.add_argument( '--show-range', type=int, default=50, help='Range of visualization in BEV') parser.add_argument( '--canva-size', type=int, default=1000, help='Size of canva in pixel') parser.add_argument( '--vis-frames', type=int, default=500, help='Number of frames for visualization') parser.add_argument( '--scale-factor', type=int, default=4, help='Trade-off between image-view and bev in size of ' 'the visualized canvas') parser.add_argument( '--vis-thred', type=float, default=0.3, help='Threshold the predicted results') parser.add_argument('--draw-gt', action='store_true') parser.add_argument( '--version', type=str, default='val', help='Version of nuScenes dataset') parser.add_argument( '--root_path', type=str, default='./data/nuscenes', help='Path to nuScenes dataset') parser.add_argument( '--save_path', type=str, default='./vis', help='Path to save visualization results') parser.add_argument( '--format', type=str, default='video', choices=['video', 'image'], help='The desired format of the visualization result') parser.add_argument( '--fps', type=int, default=20, help='Frame rate of video') parser.add_argument( '--video-prefix', type=str, default='vis', help='name of video') args = parser.parse_args() 
return args color_map = {0: (255, 255, 0), 1: (0, 255, 255)} def main(): args = parse_args() # load predicted results res = json.load(open(args.res, 'r')) # load dataset information info_path = \ args.root_path + '/bevdetv2-nuscenes_infos_%s.pkl' % args.version dataset = pickle.load(open(info_path, 'rb')) # prepare save path and medium vis_dir = args.save_path if not os.path.exists(vis_dir): os.makedirs(vis_dir) print('saving visualized result to %s' % vis_dir) scale_factor = args.scale_factor canva_size = args.canva_size show_range = args.show_range if args.format == 'video': fourcc = cv2.VideoWriter_fourcc(*'MP4V') vout = cv2.VideoWriter( os.path.join(vis_dir, '%s.mp4' % args.video_prefix), fourcc, args.fps, (int(1600 / scale_factor * 3), int(900 / scale_factor * 2 + canva_size))) draw_boxes_indexes_bev = [(0, 1), (1, 2), (2, 3), (3, 0)] draw_boxes_indexes_img_view = [(0, 1), (1, 2), (2, 3), (3, 0), (4, 5), (5, 6), (6, 7), (7, 4), (0, 4), (1, 5), (2, 6), (3, 7)] views = [ 'CAM_FRONT_LEFT', 'CAM_FRONT', 'CAM_FRONT_RIGHT', 'CAM_BACK_LEFT', 'CAM_BACK', 'CAM_BACK_RIGHT' ] print('start visualizing results') for cnt, infos in enumerate( dataset['infos'][:min(args.vis_frames, len(dataset['infos']))]): if cnt % 10 == 0: print('%d/%d' % (cnt, min(args.vis_frames, len(dataset['infos'])))) # collect instances pred_res = res['results'][infos['token']] pred_boxes = [ pred_res[rid]['translation'] + pred_res[rid]['size'] + [ Quaternion(pred_res[rid]['rotation']).yaw_pitch_roll[0] + np.pi / 2 ] for rid in range(len(pred_res)) ] if len(pred_boxes) == 0: corners_lidar = np.zeros((0, 3), dtype=np.float32) else: pred_boxes = np.array(pred_boxes, dtype=np.float32) boxes = LB(pred_boxes, origin=(0.5, 0.5, 0.0)) corners_global = boxes.corners.numpy().reshape(-1, 3) corners_global = np.concatenate( [corners_global, np.ones([corners_global.shape[0], 1])], axis=1) l2g = get_lidar2global(infos) corners_lidar = corners_global @ np.linalg.inv(l2g).T corners_lidar = corners_lidar[:, :3] pred_flag = np.ones((corners_lidar.shape[0] // 8, ), dtype=np.bool) scores = [ pred_res[rid]['detection_score'] for rid in range(len(pred_res)) ] if args.draw_gt: gt_boxes = infos['gt_boxes'] gt_boxes[:, -1] = gt_boxes[:, -1] + np.pi / 2 width = gt_boxes[:, 4].copy() gt_boxes[:, 4] = gt_boxes[:, 3] gt_boxes[:, 3] = width corners_lidar_gt = \ LB(infos['gt_boxes'], origin=(0.5, 0.5, 0.5)).corners.numpy().reshape(-1, 3) corners_lidar = np.concatenate([corners_lidar, corners_lidar_gt], axis=0) gt_flag = np.ones((corners_lidar_gt.shape[0] // 8), dtype=np.bool) pred_flag = np.concatenate( [pred_flag, np.logical_not(gt_flag)], axis=0) scores = scores + [0 for _ in range(infos['gt_boxes'].shape[0])] scores = np.array(scores, dtype=np.float32) sort_ids = np.argsort(scores) # image view imgs = [] for view in views: img = cv2.imread(infos['cams'][view]['data_path']) # draw instances corners_img, valid = lidar2img(corners_lidar, infos['cams'][view]) valid = np.logical_and( valid, check_point_in_img(corners_img, img.shape[0], img.shape[1])) valid = valid.reshape(-1, 8) corners_img = corners_img.reshape(-1, 8, 2).astype(np.int) for aid in range(valid.shape[0]): for index in draw_boxes_indexes_img_view: if valid[aid, index[0]] and valid[aid, index[1]]: cv2.line( img, corners_img[aid, index[0]], corners_img[aid, index[1]], color=color_map[int(pred_flag[aid])], thickness=scale_factor) imgs.append(img) # bird-eye-view canvas = np.zeros((int(canva_size), int(canva_size), 3), dtype=np.uint8) # draw lidar points lidar_points = 
np.fromfile(infos['lidar_path'], dtype=np.float32) lidar_points = lidar_points.reshape(-1, 5)[:, :3] lidar_points[:, 1] = -lidar_points[:, 1] lidar_points[:, :2] = \ (lidar_points[:, :2] + show_range) / show_range / 2.0 * canva_size for p in lidar_points: if check_point_in_img( p.reshape(1, 3), canvas.shape[1], canvas.shape[0])[0]: color = depth2color(p[2]) cv2.circle( canvas, (int(p[0]), int(p[1])), radius=0, color=color, thickness=1) # draw instances corners_lidar = corners_lidar.reshape(-1, 8, 3) corners_lidar[:, :, 1] = -corners_lidar[:, :, 1] bottom_corners_bev = corners_lidar[:, [0, 3, 7, 4], :2] bottom_corners_bev = \ (bottom_corners_bev + show_range) / show_range / 2.0 * canva_size bottom_corners_bev = np.round(bottom_corners_bev).astype(np.int32) center_bev = corners_lidar[:, [0, 3, 7, 4], :2].mean(axis=1) head_bev = corners_lidar[:, [0, 4], :2].mean(axis=1) canter_canvas = \ (center_bev + show_range) / show_range / 2.0 * canva_size center_canvas = canter_canvas.astype(np.int32) head_canvas = (head_bev + show_range) / show_range / 2.0 * canva_size head_canvas = head_canvas.astype(np.int32) for rid in sort_ids: score = scores[rid] if score < args.vis_thred and pred_flag[rid]: continue score = min(score * 2.0, 1.0) if pred_flag[rid] else 1.0 color = color_map[int(pred_flag[rid])] for index in draw_boxes_indexes_bev: cv2.line( canvas, bottom_corners_bev[rid, index[0]], bottom_corners_bev[rid, index[1]], [color[0] * score, color[1] * score, color[2] * score], thickness=1) cv2.line( canvas, center_canvas[rid], head_canvas[rid], [color[0] * score, color[1] * score, color[2] * score], 1, lineType=8) # fuse image-view and bev img = np.zeros((900 * 2 + canva_size * scale_factor, 1600 * 3, 3), dtype=np.uint8) img[:900, :, :] = np.concatenate(imgs[:3], axis=1) img_back = np.concatenate( [imgs[3][:, ::-1, :], imgs[4][:, ::-1, :], imgs[5][:, ::-1, :]], axis=1) img[900 + canva_size * scale_factor:, :, :] = img_back img = cv2.resize(img, (int(1600 / scale_factor * 3), int(900 / scale_factor * 2 + canva_size))) w_begin = int((1600 * 3 / scale_factor - canva_size) // 2) img[int(900 / scale_factor):int(900 / scale_factor) + canva_size, w_begin:w_begin + canva_size, :] = canvas if args.format == 'image': cv2.imwrite(os.path.join(vis_dir, '%s.jpg' % infos['token']), img) elif args.format == 'video': vout.write(img) if args.format == 'video': vout.release() if __name__ == '__main__': main() ================================================ FILE: tools/analysis_tools/vis_occupancy.py ================================================ # pythonw vis_fru.py # from operator import gt import pickle import numpy as np # from omegaconf import DictConfig from mayavi import mlab from collections import Counter # path = r'n008-2018-08-28-16-16-48-0400__LIDAR_TOP__1535488206297315.pcd.bin' # points = np.fromfile(path, dtype=np.float16).reshape(-1, 5) # print(points.shape) import argparse point_cloud_range = [-50, -50, -2, 50, 50, 5] voxel_size=[0.2, 0.2, 0.2] voxel_shape=(int((point_cloud_range[3]-point_cloud_range[0])/voxel_size[0]), int((point_cloud_range[4]-point_cloud_range[1])/voxel_size[1]), int((point_cloud_range[5]-point_cloud_range[2])/voxel_size[2])) map_label = {0: 0, 1: 1, 2: 1, 3: 1, 4: 1, 5: 1, 6: 1, 7: 1, 8: 1, 9: 2, 10: 2, 11: 2, 12: 2, 13: 2, 14: 3, 15: 3, 16: 3, 17: 3, 18: 3, 19: 3, 20: 3, 21: 3, 22: 3, 23: 3, 24: 4, 25: 4, 26: 4, 27: 4, 28: 4, 29: 4, 30: 4, 31: 3} def remove_far(points, point_cloud_range): mask = (points[:, 0]>point_cloud_range[0]) & (points[:, 0]point_cloud_range[1]) & (points[:, 
1]point_cloud_range[2]) & (points[:, 2] 0) & (fov_grid_coords[:, 3] < 255) ] # outfov_voxels = outfov_grid_coords[ # (outfov_grid_coords[:, 3] > 0) & (outfov_grid_coords[:, 3] < 255) # ] figure = mlab.figure(size=(1400, 1400), bgcolor=(1, 1, 1)) # Draw the camera # mlab.triangular_mesh( # x, y, z, triangles, representation="wireframe", color=(0, 0, 0), line_width=5 # ) # counter = Counter(list(fov_voxels[:,3].reshape(-1))) # for key in counter: # if counter[key] < 100: # index = fov_voxels[:,3] != key # fov_voxels = fov_voxels[index] # Draw occupied inside FOV voxels plt_plot_fov = mlab.points3d( fov_voxels[:, 0], fov_voxels[:, 1], fov_voxels[:, 2], fov_voxels[:, 3], colormap="viridis", scale_factor=voxel_size - 0.05 * voxel_size, mode="cube", opacity=1.0, vmin=1, vmax=19, ) # Draw occupied outside FOV voxels # plt_plot_outfov = mlab.points3d( # outfov_voxels[:, 0], # outfov_voxels[:, 1], # outfov_voxels[:, 2], # outfov_voxels[:, 3], # colormap="viridis", # scale_factor=voxel_size - 0.05 * voxel_size, # mode="cube", # opacity=1.0, # vmin=1, # vmax=19, # ) classname_to_color = { # RGB. "noise": (0, 0, 0), # Black. "animal": (70, 130, 180), # Steelblue "human.pedestrian.adult": (0, 0, 230), # Blue "human.pedestrian.child":(0, 0, 230), # Skyblue, "human.pedestrian.construction_worker":(0, 0, 230), # Cornflowerblue "human.pedestrian.personal_mobility": (0, 0, 230), # Palevioletred "human.pedestrian.police_officer":(0, 0, 230), # Navy, "human.pedestrian.stroller": (0, 0, 230), # Lightcoral "human.pedestrian.wheelchair": (0, 0, 230), # Blueviolet "movable_object.barrier": (112, 128, 144), # Slategrey "movable_object.debris": (112, 128, 144), # Chocolate "movable_object.pushable_pullable":(112, 128, 144), # Dimgrey "movable_object.trafficcone":(112, 128, 144), # Darkslategrey "static_object.bicycle_rack": (188, 143, 143), # Rosybrown "vehicle.bicycle": (220, 20, 60), # Crimson "vehicle.bus.bendy":(255, 158, 0), # Coral "vehicle.bus.rigid": (255, 158, 0), # Orangered "vehicle.car": (255, 158, 0), # Orange "vehicle.construction":(255, 158, 0), # Darksalmon "vehicle.emergency.ambulance":(255, 158, 0), "vehicle.emergency.police": (255, 158, 0), # Gold "vehicle.motorcycle": (255, 158, 0), # Red "vehicle.trailer":(255, 158, 0), # Darkorange "vehicle.truck": (255, 158, 0), # Tomato "flat.driveable_surface": (0, 207, 191), # nuTonomy green "flat.other":(0, 207, 191), "flat.sidewalk": (75, 0, 75), "flat.terrain": (0, 207, 191), "static.manmade": (222, 184, 135), # Burlywood "static.other": (0, 207, 191), # Bisque "static.vegetation": (0, 175, 0), # Green "vehicle.ego": (255, 240, 245) } classname_to_color= {'ignore_class': (0, 0, 0), # Black. 
'barrier': (112, 128, 144), # Slategrey 'bicycle': (220, 20, 60), # Crimson 'bus': (255, 127, 80), # Coral 'car': (255, 158, 0), # Orange 'construction_vehicle': (233, 150, 70), # Darksalmon 'motorcycle': (255, 61, 99), # Red 'pedestrian': (0, 0, 230), # Blue 'traffic_cone': (47, 79, 79), # Darkslategrey 'trailer': (255, 140, 0), # Darkorange 'truck': (255, 99, 71), # Tomato 'driveable_surface': (0, 207, 191), # nuTonomy green 'other_flat': (175, 0, 75), 'sidewalk': (75, 0, 75), 'terrain': (112, 180, 60), 'manmade': (222, 184, 135), # Burlywood 'vegetation': (0, 175, 0)} colors = np.array(list(classname_to_color.values())).astype(np.uint8) alpha = np.ones((colors.shape[0], 1), dtype=np.uint8) * 255 colors = np.hstack([colors, alpha]) plt_plot_fov.glyph.scale_mode = "scale_by_vector" # plt_plot_outfov.glyph.scale_mode = "scale_by_vector" plt_plot_fov.module_manager.scalar_lut_manager.lut.table = colors plt_plot_fov.module_manager.scalar_lut_manager.data_range = [0, 17] mlab.show() def voxel_exist(voxels, x,y,z): if x < 0 or y < 0 or z < 0 or x >= voxels.shape[0] or y >= voxels.shape[1] or z >= voxels.shape[2]: return False else: return voxels[x,y,z] def max_connected(voxels, distance=3): """ Keep the max connected component of the voxels (a boolean matrix). distance is the distance considered as neighbors, i.e. if distance = 2, then two blocks are considered connected even with a hole in between""" assert(distance > 0) component_list = [] # max_component = np.zeros(voxels.shape) voxels_copy = np.copy(voxels) for startx in range(voxels.shape[0]): for starty in range(voxels.shape[1]): for startz in range(voxels.shape[2]): if not voxels_copy[startx,starty,startz]: continue # start a new component component = np.zeros(voxels.shape, dtype=bool) stack = [[startx,starty,startz]] component[startx,starty,startz] = True voxels_copy[startx,starty,startz] = False while len(stack) > 0: x,y,z = stack.pop() category = voxels[x,y,z] for i in range(x-distance, x+distance + 1): for j in range(y-distance, y+distance + 1): for k in range(z-distance, z+distance + 1): if (i-x)**2+(j-y)**2+(k-z)**2 > distance * distance: continue category = voxels[x,y,z] if voxel_exist(voxels_copy, i,j,k) and voxels[i,j,k] == category: voxels_copy[i,j,k] = False component[i,j,k] = True stack.append([i,j,k]) component_list.append(component) # if component.sum() > max_component.sum(): # max_component = component max_component = np.zeros(voxels.shape, dtype=bool) for each in component_list: if each.sum()>10: max_component |= each return max_component # points = remove_far(points, point_cloud_range) def main(filepath='*.npz'): vox_origin = np.array([0, 0, -2]) # y_pred = points2voxel(points, voxel_shape, voxel_size, 20) # y_del = ~max_connected(y_pred) # y_pred[y_del] = 0 if filepath.endswith('npy'): y_pred = np.load(filepath) elif filepath.endswith('npz'): y_pred = np.load(filepath)['pred']# ['semantics'] # y_pred: shape 200x200x16 draw( y_pred, None, vox_origin, None, voxel_size=0.2, f=552.55426, img_size=(1600, 900), d=7, ) if __name__ == "__main__": parser = argparse.ArgumentParser(description='vis occ') parser.add_argument('path', help='path to npz') args = parser.parse_args() main(args.path) ================================================ FILE: tools/create_data.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. 
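A tiny sanity check for the max_connected helper above (a sketch only; vis_occupancy.py is a script, so assume the function is available in the current scope). Components are kept only if they contain more than 10 voxels, so the 27-voxel block survives while the isolated voxel is dropped:

import numpy as np

vox = np.zeros((10, 10, 10), dtype=bool)
vox[0:3, 0:3, 0:3] = True       # 27-voxel block: kept (component size > 10)
vox[9, 9, 9] = True             # isolated voxel, farther than distance=3: dropped
kept = max_connected(vox, distance=3)
assert kept[1, 1, 1] and not kept[9, 9, 9]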
import argparse from os import path as osp from tools.data_converter import indoor_converter as indoor from tools.data_converter import kitti_converter as kitti from tools.data_converter import lyft_converter as lyft_converter from tools.data_converter import nuscenes_converter as nuscenes_converter from tools.data_converter.create_gt_database import ( GTDatabaseCreater, create_groundtruth_database) def kitti_data_prep(root_path, info_prefix, version, out_dir, with_plane=False): """Prepare data related to Kitti dataset. Related data consists of '.pkl' files recording basic infos, 2D annotations and groundtruth database. Args: root_path (str): Path of dataset root. info_prefix (str): The prefix of info filenames. version (str): Dataset version. out_dir (str): Output directory of the groundtruth database info. with_plane (bool, optional): Whether to use plane information. Default: False. """ kitti.create_kitti_info_file(root_path, info_prefix, with_plane) kitti.create_reduced_point_cloud(root_path, info_prefix) info_train_path = osp.join(root_path, f'{info_prefix}_infos_train.pkl') info_val_path = osp.join(root_path, f'{info_prefix}_infos_val.pkl') info_trainval_path = osp.join(root_path, f'{info_prefix}_infos_trainval.pkl') info_test_path = osp.join(root_path, f'{info_prefix}_infos_test.pkl') kitti.export_2d_annotation(root_path, info_train_path) kitti.export_2d_annotation(root_path, info_val_path) kitti.export_2d_annotation(root_path, info_trainval_path) kitti.export_2d_annotation(root_path, info_test_path) create_groundtruth_database( 'KittiDataset', root_path, info_prefix, f'{out_dir}/{info_prefix}_infos_train.pkl', relative_path=False, mask_anno_path='instances_train.json', with_mask=(version == 'mask')) def nuscenes_data_prep(root_path, info_prefix, version, dataset_name, out_dir, max_sweeps=10): """Prepare data related to nuScenes dataset. Related data consists of '.pkl' files recording basic infos, 2D annotations and groundtruth database. Args: root_path (str): Path of dataset root. info_prefix (str): The prefix of info filenames. version (str): Dataset version. dataset_name (str): The dataset class name. out_dir (str): Output directory of the groundtruth database info. max_sweeps (int, optional): Number of input consecutive frames. Default: 10 """ nuscenes_converter.create_nuscenes_infos( root_path, info_prefix, version=version, max_sweeps=max_sweeps) if version == 'v1.0-test': info_test_path = osp.join(root_path, f'{info_prefix}_infos_test.pkl') nuscenes_converter.export_2d_annotation( root_path, info_test_path, version=version) return info_train_path = osp.join(root_path, f'{info_prefix}_infos_train.pkl') info_val_path = osp.join(root_path, f'{info_prefix}_infos_val.pkl') nuscenes_converter.export_2d_annotation( root_path, info_train_path, version=version) nuscenes_converter.export_2d_annotation( root_path, info_val_path, version=version) create_groundtruth_database(dataset_name, root_path, info_prefix, f'{out_dir}/{info_prefix}_infos_train.pkl') def lyft_data_prep(root_path, info_prefix, version, max_sweeps=10): """Prepare data related to Lyft dataset. Related data consists of '.pkl' files recording basic infos. Although the ground truth database and 2D annotations are not used in Lyft, it can also be generated like nuScenes. Args: root_path (str): Path of dataset root. info_prefix (str): The prefix of info filenames. version (str): Dataset version. max_sweeps (int, optional): Number of input consecutive frames. Defaults to 10. 
""" lyft_converter.create_lyft_infos( root_path, info_prefix, version=version, max_sweeps=max_sweeps) def scannet_data_prep(root_path, info_prefix, out_dir, workers): """Prepare the info file for scannet dataset. Args: root_path (str): Path of dataset root. info_prefix (str): The prefix of info filenames. out_dir (str): Output directory of the generated info file. workers (int): Number of threads to be used. """ indoor.create_indoor_info_file( root_path, info_prefix, out_dir, workers=workers) def s3dis_data_prep(root_path, info_prefix, out_dir, workers): """Prepare the info file for s3dis dataset. Args: root_path (str): Path of dataset root. info_prefix (str): The prefix of info filenames. out_dir (str): Output directory of the generated info file. workers (int): Number of threads to be used. """ indoor.create_indoor_info_file( root_path, info_prefix, out_dir, workers=workers) def sunrgbd_data_prep(root_path, info_prefix, out_dir, workers, num_points): """Prepare the info file for sunrgbd dataset. Args: root_path (str): Path of dataset root. info_prefix (str): The prefix of info filenames. out_dir (str): Output directory of the generated info file. workers (int): Number of threads to be used. """ indoor.create_indoor_info_file( root_path, info_prefix, out_dir, workers=workers, num_points=num_points) def waymo_data_prep(root_path, info_prefix, version, out_dir, workers, max_sweeps=5): """Prepare the info file for waymo dataset. Args: root_path (str): Path of dataset root. info_prefix (str): The prefix of info filenames. out_dir (str): Output directory of the generated info file. workers (int): Number of threads to be used. max_sweeps (int, optional): Number of input consecutive frames. Default: 5. Here we store pose information of these frames for later use. 
""" from tools.data_converter import waymo_converter as waymo splits = ['training', 'validation', 'testing'] for i, split in enumerate(splits): load_dir = osp.join(root_path, 'waymo_format', split) if split == 'validation': save_dir = osp.join(out_dir, 'kitti_format', 'training') else: save_dir = osp.join(out_dir, 'kitti_format', split) converter = waymo.Waymo2KITTI( load_dir, save_dir, prefix=str(i), workers=workers, test_mode=(split == 'testing')) converter.convert() # Generate waymo infos out_dir = osp.join(out_dir, 'kitti_format') kitti.create_waymo_info_file( out_dir, info_prefix, max_sweeps=max_sweeps, workers=workers) GTDatabaseCreater( 'WaymoDataset', out_dir, info_prefix, f'{out_dir}/{info_prefix}_infos_train.pkl', relative_path=False, with_mask=False, num_worker=workers).create() parser = argparse.ArgumentParser(description='Data converter arg parser') parser.add_argument('dataset', metavar='kitti', help='name of the dataset') parser.add_argument( '--root-path', type=str, default='./data/kitti', help='specify the root path of dataset') parser.add_argument( '--version', type=str, default='v1.0', required=False, help='specify the dataset version, no need for kitti') parser.add_argument( '--max-sweeps', type=int, default=10, required=False, help='specify sweeps of lidar per example') parser.add_argument( '--with-plane', action='store_true', help='Whether to use plane information for kitti.') parser.add_argument( '--num-points', type=int, default=-1, help='Number of points to sample for indoor datasets.') parser.add_argument( '--out-dir', type=str, default='./data/kitti', required=False, help='name of info pkl') parser.add_argument('--extra-tag', type=str, default='kitti') parser.add_argument( '--workers', type=int, default=4, help='number of threads to be used') args = parser.parse_args() if __name__ == '__main__': if args.dataset == 'kitti': kitti_data_prep( root_path=args.root_path, info_prefix=args.extra_tag, version=args.version, out_dir=args.out_dir, with_plane=args.with_plane) elif args.dataset == 'nuscenes' and args.version != 'v1.0-mini': train_version = f'{args.version}-trainval' nuscenes_data_prep( root_path=args.root_path, info_prefix=args.extra_tag, version=train_version, dataset_name='NuScenesDataset', out_dir=args.out_dir, max_sweeps=args.max_sweeps) test_version = f'{args.version}-test' nuscenes_data_prep( root_path=args.root_path, info_prefix=args.extra_tag, version=test_version, dataset_name='NuScenesDataset', out_dir=args.out_dir, max_sweeps=args.max_sweeps) elif args.dataset == 'nuscenes' and args.version == 'v1.0-mini': train_version = f'{args.version}' nuscenes_data_prep( root_path=args.root_path, info_prefix=args.extra_tag, version=train_version, dataset_name='NuScenesDataset', out_dir=args.out_dir, max_sweeps=args.max_sweeps) elif args.dataset == 'lyft': train_version = f'{args.version}-train' lyft_data_prep( root_path=args.root_path, info_prefix=args.extra_tag, version=train_version, max_sweeps=args.max_sweeps) test_version = f'{args.version}-test' lyft_data_prep( root_path=args.root_path, info_prefix=args.extra_tag, version=test_version, max_sweeps=args.max_sweeps) elif args.dataset == 'waymo': waymo_data_prep( root_path=args.root_path, info_prefix=args.extra_tag, version=args.version, out_dir=args.out_dir, workers=args.workers, max_sweeps=args.max_sweeps) elif args.dataset == 'scannet': scannet_data_prep( root_path=args.root_path, info_prefix=args.extra_tag, out_dir=args.out_dir, workers=args.workers) elif args.dataset == 's3dis': s3dis_data_prep( 
root_path=args.root_path, info_prefix=args.extra_tag, out_dir=args.out_dir, workers=args.workers) elif args.dataset == 'sunrgbd': sunrgbd_data_prep( root_path=args.root_path, info_prefix=args.extra_tag, num_points=args.num_points, out_dir=args.out_dir, workers=args.workers) ================================================ FILE: tools/create_data.sh ================================================ #!/usr/bin/env bash set -x export PYTHONPATH=`pwd`:$PYTHONPATH PARTITION=$1 JOB_NAME=$2 DATASET=$3 GPUS=${GPUS:-1} GPUS_PER_NODE=${GPUS_PER_NODE:-1} SRUN_ARGS=${SRUN_ARGS:-""} JOB_NAME=create_data srun -p ${PARTITION} \ --job-name=${JOB_NAME} \ --gres=gpu:${GPUS_PER_NODE} \ --ntasks=${GPUS} \ --ntasks-per-node=${GPUS_PER_NODE} \ --kill-on-bad-exit=1 \ ${SRUN_ARGS} \ python -u tools/create_data.py ${DATASET} \ --root-path ./data/${DATASET} \ --out-dir ./data/${DATASET} \ --extra-tag ${DATASET} ================================================ FILE: tools/create_data_bev_planner.py ================================================ # Copyright (c) 2023-2024, NVIDIA Corporation & Affiliates. All rights reserved. # # This work is made available under the Nvidia Source Code License-NC. # To view a copy of this license, visit # TODO: add license here import pickle import numpy as np from nuscenes import NuScenes from nuscenes.utils.data_classes import Box from pyquaternion import Quaternion from tools.data_converter import nuscenes_converter as nuscenes_converter # from tools.data_converter.nuscenes_prediction_tools import get_forecasting_annotations map_name_from_general_to_detection = { 'human.pedestrian.adult': 'pedestrian', 'human.pedestrian.child': 'pedestrian', 'human.pedestrian.wheelchair': 'ignore', 'human.pedestrian.stroller': 'ignore', 'human.pedestrian.personal_mobility': 'ignore', 'human.pedestrian.police_officer': 'pedestrian', 'human.pedestrian.construction_worker': 'pedestrian', 'animal': 'ignore', 'vehicle.car': 'car', 'vehicle.motorcycle': 'motorcycle', 'vehicle.bicycle': 'bicycle', 'vehicle.bus.bendy': 'bus', 'vehicle.bus.rigid': 'bus', 'vehicle.truck': 'truck', 'vehicle.construction': 'construction_vehicle', 'vehicle.emergency.ambulance': 'ignore', 'vehicle.emergency.police': 'ignore', 'vehicle.trailer': 'trailer', 'movable_object.barrier': 'barrier', 'movable_object.trafficcone': 'traffic_cone', 'movable_object.pushable_pullable': 'ignore', 'movable_object.debris': 'ignore', 'static_object.bicycle_rack': 'ignore', } classes = [ 'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier', 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone' ] VERSION= 'v1.0-mini' NUSCENES = 'nuscenes-mini' # VERSION= 'v1.0-trainval' # NUSCENES = 'nuscenes' def get_gt(info, traj_in_lidar_coor=None, traj_mask_in_lidar_coor=None): """Generate gt labels from info. Args: info(dict): Infos needed to generate gt labels. Returns: Tensor: GT bboxes. Tensor: GT labels. """ ego2global_rotation = info['cams']['CAM_FRONT']['ego2global_rotation'] ego2global_translation = info['cams']['CAM_FRONT'][ 'ego2global_translation'] trans = -np.array(ego2global_translation) rot = Quaternion(ego2global_rotation).inverse gt_boxes = list() gt_boxes_in_global = list() gt_labels = list() fut_traj = list() fut_traj_mask = list() valid_flag = list() for i, ann_info in enumerate(info['ann_infos']): # Use ego coordinate. 
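# Annotation filtering and coordinate handling for the loop below: a box is kept
# only if its category maps to one of the ten detection classes and it has at
# least one lidar or radar point; otherwise it is skipped and marked False in
# valid_flag. Each surviving box is recorded twice -- once in the global frame
# (center, wlh reordered to l/w/h, yaw, xy velocity) and once in the ego frame,
# obtained by translating with -ego2global_translation and rotating with the
# inverse of ego2global_rotation taken from the CAM_FRONT ego pose.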
if (map_name_from_general_to_detection[ann_info['category_name']] not in classes or ann_info['num_lidar_pts'] + ann_info['num_radar_pts'] <= 0): valid_flag.append(False) continue valid_flag.append(True) box = Box( ann_info['translation'], ann_info['size'], Quaternion(ann_info['rotation']), velocity=ann_info['velocity'], ) box_xyz_in_global = np.array(box.center) box_dxdydz_in_global = np.array(box.wlh)[[1, 0, 2]] box_yaw_in_global = np.array([box.orientation.yaw_pitch_roll[0]]) box_velo_in_global = np.array(box.velocity[:2]) box.translate(trans) box.rotate(rot) box_xyz = np.array(box.center) box_dxdydz = np.array(box.wlh)[[1, 0, 2]] box_yaw = np.array([box.orientation.yaw_pitch_roll[0]]) box_velo = np.array(box.velocity[:2]) gt_box = np.concatenate([box_xyz, box_dxdydz, box_yaw, box_velo]) gt_box_in_global = np.concatenate([box_xyz_in_global, box_dxdydz_in_global, box_yaw_in_global, box_velo_in_global]) gt_boxes.append(gt_box) gt_boxes_in_global.append(gt_box_in_global) gt_labels.append( classes.index( map_name_from_general_to_detection[ann_info['category_name']])) if traj_in_lidar_coor is not None: # traj = np.dot(Quaternion(info['lidar2ego_rotation']).rotation_matrix[:2,:2],traj_in_lidar_coor[i].transpose(1,0)).transpose(1,0) fut_traj.append(traj_in_lidar_coor[i]) fut_traj_mask.append(traj_mask_in_lidar_coor[i]) return gt_boxes, gt_labels, fut_traj, fut_traj_mask, np.array(valid_flag), gt_boxes_in_global def nuscenes_data_prep(root_path, info_prefix, version, max_sweeps=10): """Prepare data related to nuScenes dataset. Related data consists of '.pkl' files recording basic infos, 2D annotations and groundtruth database. Args: root_path (str): Path of dataset root. info_prefix (str): The prefix of info filenames. version (str): Dataset version. max_sweeps (int, optional): Number of input consecutive frames. 
Default: 10 """ nuscenes_converter.create_nuscenes_infos( root_path, info_prefix, version=version, max_sweeps=max_sweeps) def add_ann_adj_info(extra_tag, with_lidar_seg=False): nuscenes_version = VERSION dataroot = f'./data/{NUSCENES}/' nuscenes = NuScenes(nuscenes_version, dataroot) # for set in ['test']: # dataset = pickle.load( # open('./data/%s/%s_infos_%s.pkl' % (NUSCENES, extra_tag, set), 'rb')) # for id in range(len(dataset['infos'])): # if id % 10 == 0: # print('%d/%d' % (id, len(dataset['infos']))) # info = dataset['infos'][id] # # get sweep adjacent frame info # sample = nuscenes.get('sample', info['token']) # ann_infos = list() # for ann in sample['anns']: # ann_info = nuscenes.get('sample_annotation', ann) # velocity = nuscenes.box_velocity(ann_info['token']) # if np.any(np.isnan(velocity)): # velocity = np.zeros(3) # ann_info['velocity'] = velocity # ann_infos.append(ann_info) # dataset['infos'][id]['ann_infos'] = ann_infos # dataset['infos'][id]['ann_infos'] = get_gt(dataset['infos'][id]) # dataset['infos'][id]['scene_token'] = sample['scene_token'] # scene = nuscenes.get('scene', sample['scene_token']) # dataset['infos'][id]['scene_name'] = scene['name'] # dataset['infos'][id]['prev'] = sample['prev'] # # description = scene['description'] # if with_lidar_seg: # lidar_sd_token = sample['data']['LIDAR_TOP'] # dataset['infos'][id]['lidarseg_filename'] = nuscenes.get('lidarseg', lidar_sd_token)['filename'] # scene = nuscenes.get('scene', sample['scene_token']) # dataset['infos'][id]['occ_path'] = \ # './data/nuscenes/gts/%s/%s'%(scene['name'], info['token']) # with open('./data/%s/%s_infos_%s.pkl' % (NUSCENES, extra_tag, set), # 'wb') as fid: # pickle.dump(dataset, fid) for set in ['train', 'val']: dataset = pickle.load( open('./data/%s/%s_infos_%s.pkl' % (NUSCENES, extra_tag, set), 'rb')) # traj_data = pickle.load(open(f'/mount/data/GoGo/data/infos/nuscenes_infos_temporal_{set}.pkl', 'rb')) # traj_data = None for id in range(len(dataset['infos'])): if id % 10 == 0: print('%d/%d' % (id, len(dataset['infos']))) info = dataset['infos'][id] # get sweep adjacent frame info sample = nuscenes.get('sample', info['token']) ann_infos = list() for ann in sample['anns']: ann_info = nuscenes.get('sample_annotation', ann) velocity = nuscenes.box_velocity(ann_info['token']) if np.any(np.isnan(velocity)): velocity = np.zeros(3) ann_info['velocity'] = velocity ann_infos.append(ann_info) dataset['infos'][id]['ann_infos'] = ann_infos # traj_info = traj_data['infos'][id] if traj_data is not None else None future_traj_all, future_traj_valid_mask_all = dataset['infos'][id]['fut_traj'], dataset['infos'][id]['fut_traj_valid_mask'] gt_boxes_3d, gt_labels_3d, fut_traj, fut_traj_mask, valid_flag, gt_boxes_3d_in_global = get_gt(dataset['infos'][id], future_traj_all, future_traj_valid_mask_all) dataset['infos'][id]['ann_infos'] = {} if fut_traj is not None: dataset['infos'][id]['ann_infos']['fut_traj'] = fut_traj dataset['infos'][id]['ann_infos']['fut_traj_mask'] = fut_traj_mask dataset['infos'][id]['ann_infos']['gt_boxes_2d'] = dataset['infos'][id]['bboxes2d'] dataset['infos'][id]['ann_infos']['gt_labels_2d'] = dataset['infos'][id]['labels2d'] dataset['infos'][id]['ann_infos']['depths'] = dataset['infos'][id]['depths'] dataset['infos'][id]['ann_infos']['centers2d'] = dataset['infos'][id]['centers2d'] dataset['infos'][id]['ann_infos']['gt_boxes_3d'] = gt_boxes_3d dataset['infos'][id]['ann_infos']['gt_boxes_3d_in_global'] = gt_boxes_3d_in_global dataset['infos'][id]['ann_infos']['gt_labels_3d'] = 
gt_labels_3d dataset['infos'][id]['scene_token'] = sample['scene_token'] scene = nuscenes.get('scene', sample['scene_token']) map_location = nuscenes.get('log', scene['log_token'])['location'] dataset['infos'][id]['map_location'] = map_location dataset['infos'][id]['scene_name'] = scene['name'] dataset['infos'][id]['prev'] = sample['prev'] annotations = [ nuscenes.get('sample_annotation', token) for token in sample['anns'] ] instance_inds = [nuscenes.getind('instance', ann['instance_token']) for ann in annotations] info['instance_inds'] = instance_inds info['valid_flag'] = valid_flag # description = scene['description'] if with_lidar_seg: lidar_sd_token = sample['data']['LIDAR_TOP'] dataset['infos'][id]['lidarseg_filename'] = nuscenes.get('lidarseg', lidar_sd_token)['filename'] scene = nuscenes.get('scene', sample['scene_token']) dataset['infos'][id]['occ_path'] = \ './data/nuscenes/gts/%s/%s'%(scene['name'], info['token']) with open('./data/%s/%s_infos_%s.pkl' % (NUSCENES, extra_tag, set), 'wb') as fid: pickle.dump(dataset, fid) if __name__ == '__main__': dataset = 'nuscenes' version = 'v1.0' train_version = VERSION root_path = f'./data/{NUSCENES}' extra_tag = 'bev-next-nuscenes' nuscenes_data_prep( root_path=root_path, info_prefix=extra_tag, version=train_version, max_sweeps=10) print('add_ann_infos') add_ann_adj_info(extra_tag) ================================================ FILE: tools/data_converter/__init__.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. ================================================ FILE: tools/data_converter/create_gt_database.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import pickle from os import path as osp import mmcv import numpy as np from mmcv import track_iter_progress from mmcv.ops import roi_align from pycocotools import mask as maskUtils from pycocotools.coco import COCO from mmdet3d.core.bbox import box_np_ops as box_np_ops from mmdet3d.datasets import build_dataset from mmdet.core.evaluation.bbox_overlaps import bbox_overlaps def _poly2mask(mask_ann, img_h, img_w): if isinstance(mask_ann, list): # polygon -- a single object might consist of multiple parts # we merge all parts into one mask rle code rles = maskUtils.frPyObjects(mask_ann, img_h, img_w) rle = maskUtils.merge(rles) elif isinstance(mask_ann['counts'], list): # uncompressed RLE rle = maskUtils.frPyObjects(mask_ann, img_h, img_w) else: # rle rle = mask_ann mask = maskUtils.decode(rle) return mask def _parse_coco_ann_info(ann_info): gt_bboxes = [] gt_labels = [] gt_bboxes_ignore = [] gt_masks_ann = [] for i, ann in enumerate(ann_info): if ann.get('ignore', False): continue x1, y1, w, h = ann['bbox'] if ann['area'] <= 0: continue bbox = [x1, y1, x1 + w, y1 + h] if ann.get('iscrowd', False): gt_bboxes_ignore.append(bbox) else: gt_bboxes.append(bbox) gt_masks_ann.append(ann['segmentation']) if gt_bboxes: gt_bboxes = np.array(gt_bboxes, dtype=np.float32) gt_labels = np.array(gt_labels, dtype=np.int64) else: gt_bboxes = np.zeros((0, 4), dtype=np.float32) gt_labels = np.array([], dtype=np.int64) if gt_bboxes_ignore: gt_bboxes_ignore = np.array(gt_bboxes_ignore, dtype=np.float32) else: gt_bboxes_ignore = np.zeros((0, 4), dtype=np.float32) ann = dict( bboxes=gt_bboxes, bboxes_ignore=gt_bboxes_ignore, masks=gt_masks_ann) return ann def crop_image_patch_v2(pos_proposals, pos_assigned_gt_inds, gt_masks): import torch from torch.nn.modules.utils import _pair device = pos_proposals.device num_pos 
= pos_proposals.size(0) fake_inds = ( torch.arange(num_pos, device=device).to(dtype=pos_proposals.dtype)[:, None]) rois = torch.cat([fake_inds, pos_proposals], dim=1) # Nx5 mask_size = _pair(28) rois = rois.to(device=device) gt_masks_th = ( torch.from_numpy(gt_masks).to(device).index_select( 0, pos_assigned_gt_inds).to(dtype=rois.dtype)) # Use RoIAlign could apparently accelerate the training (~0.1s/iter) targets = ( roi_align(gt_masks_th, rois, mask_size[::-1], 1.0, 0, True).squeeze(1)) return targets def crop_image_patch(pos_proposals, gt_masks, pos_assigned_gt_inds, org_img): num_pos = pos_proposals.shape[0] masks = [] img_patches = [] for i in range(num_pos): gt_mask = gt_masks[pos_assigned_gt_inds[i]] bbox = pos_proposals[i, :].astype(np.int32) x1, y1, x2, y2 = bbox w = np.maximum(x2 - x1 + 1, 1) h = np.maximum(y2 - y1 + 1, 1) mask_patch = gt_mask[y1:y1 + h, x1:x1 + w] masked_img = gt_mask[..., None] * org_img img_patch = masked_img[y1:y1 + h, x1:x1 + w] img_patches.append(img_patch) masks.append(mask_patch) return img_patches, masks def create_groundtruth_database(dataset_class_name, data_path, info_prefix, info_path=None, gap=0, mask_anno_path=None, used_classes=None, database_save_path=None, db_info_save_path=None, relative_path=True, add_rgb=False, lidar_only=False, bev_only=False, coors_range=None, with_mask=False): """Given the raw data, generate the ground truth database. Args: dataset_class_name (str): Name of the input dataset. data_path (str): Path of the data. info_prefix (str): Prefix of the info file. info_path (str, optional): Path of the info file. Default: None. mask_anno_path (str, optional): Path of the mask_anno. Default: None. used_classes (list[str], optional): Classes have been used. Default: None. database_save_path (str, optional): Path to save database. Default: None. db_info_save_path (str, optional): Path to save db_info. Default: None. relative_path (bool, optional): Whether to use relative path. Default: True. with_mask (bool, optional): Whether to use mask. Default: False. 
""" print(f'Create GT Database of {dataset_class_name}') dataset_cfg = dict( type=dataset_class_name, data_root=data_path, ann_file=info_path) if dataset_class_name == 'KittiDataset': file_client_args = dict(backend='disk') dataset_cfg.update( test_mode=False, split='training', modality=dict( use_lidar=True, use_depth=False, use_lidar_intensity=True, use_camera=with_mask, ), pipeline=[ dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=4, use_dim=4, file_client_args=file_client_args), dict( type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True, file_client_args=file_client_args) ]) elif dataset_class_name == 'NuScenesDataset': class_names = [ 'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier', 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone' ] file_client_args = dict(backend='disk') data_config = { 'cams': [ 'CAM_FRONT_LEFT', 'CAM_FRONT', 'CAM_FRONT_RIGHT', 'CAM_BACK_LEFT', 'CAM_BACK', 'CAM_BACK_RIGHT' ], 'Ncams': 6, 'input_size': (256, 704), 'src_size': (900, 1600), # Augmentation 'resize': (-0.06, 0.11), 'rot': (-5.4, 5.4), 'flip': True, 'crop_h': (0.0, 0.0), 'resize_test': 0.00, } input_modality = dict( use_lidar=False, use_camera=True, use_radar=False, use_map=False, use_external=False) dataset_cfg.update( img_info_prototype='bevdet', use_valid_flag=True, box_type_3d='LiDAR', modality=input_modality, test_mode=True, pipeline=[ dict(type='PrepareImageInputs', data_config=data_config, is_train=False, ), dict( type='LoadAnnotationsBEVDepth', bda_aug_conf=None, is_train=False, classes=class_names), dict( type='LoadPointsFromFile', coord_type='LIDAR', dtype='float32', load_dim=5, use_dim=[0, 1, 2, 3, 4], translate2ego=False, file_client_args=file_client_args), dict( type='LoadPointsFromMultiSweeps', sweeps_num=10, use_dim=[0, 1, 2, 3, 4], file_client_args=file_client_args, pad_empty_sweeps=True, translate2ego=False, remove_close=True), # dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True), dict(type='PointsFromLidartoEgo'), ]) elif dataset_class_name == 'WaymoDataset': file_client_args = dict(backend='disk') dataset_cfg.update( test_mode=False, split='training', modality=dict( use_lidar=True, use_depth=False, use_lidar_intensity=True, use_camera=False, ), pipeline=[ dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=6, use_dim=6, file_client_args=file_client_args), dict( type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True, file_client_args=file_client_args) ]) dataset = build_dataset(dataset_cfg) if database_save_path is None: database_save_path = osp.join(data_path, f'{info_prefix}_gt_database') if db_info_save_path is None: db_info_save_path = osp.join(data_path, f'{info_prefix}_dbinfos_train.pkl') mmcv.mkdir_or_exist(database_save_path) all_db_infos = dict() if with_mask: coco = COCO(osp.join(data_path, mask_anno_path)) imgIds = coco.getImgIds() file2id = dict() for i in imgIds: info = coco.loadImgs([i])[0] file2id.update({info['file_name']: i}) group_counter = 0 for j in track_iter_progress(list(range(gap, len(dataset), 8))): input_dict = dataset.get_data_info(j) dataset.pre_pipeline(input_dict) example = dataset.pipeline(input_dict) # annos = example['ann_info'] annos = {} image_idx = example['sample_idx'] points = example['points'].tensor.numpy() gt_boxes_3d = example['gt_bboxes_3d'].tensor.numpy() class_names = [ 'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier', 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone' ] names = [class_names[i] for i in example['gt_labels_3d']] # 
annos['gt_names'] group_dict = dict() if 'group_ids' in annos: group_ids = annos['group_ids'] else: group_ids = np.arange(gt_boxes_3d.shape[0], dtype=np.int64) difficulty = np.zeros(gt_boxes_3d.shape[0], dtype=np.int32) if 'difficulty' in annos: difficulty = annos['difficulty'] num_obj = gt_boxes_3d.shape[0] point_indices = box_np_ops.points_in_rbbox(points, gt_boxes_3d) if with_mask: # prepare masks gt_boxes = annos['gt_bboxes'] img_path = osp.split(example['img_info']['filename'])[-1] if img_path not in file2id.keys(): print(f'skip image {img_path} for empty mask') continue img_id = file2id[img_path] kins_annIds = coco.getAnnIds(imgIds=img_id) kins_raw_info = coco.loadAnns(kins_annIds) kins_ann_info = _parse_coco_ann_info(kins_raw_info) h, w = annos['img_shape'][:2] gt_masks = [ _poly2mask(mask, h, w) for mask in kins_ann_info['masks'] ] # get mask inds based on iou mapping bbox_iou = bbox_overlaps(kins_ann_info['bboxes'], gt_boxes) mask_inds = bbox_iou.argmax(axis=0) valid_inds = (bbox_iou.max(axis=0) > 0.5) # mask the image # use more precise crop when it is ready # object_img_patches = np.ascontiguousarray( # np.stack(object_img_patches, axis=0).transpose(0, 3, 1, 2)) # crop image patches using roi_align # object_img_patches = crop_image_patch_v2( # torch.Tensor(gt_boxes), # torch.Tensor(mask_inds).long(), object_img_patches) object_img_patches, object_masks = crop_image_patch( gt_boxes, gt_masks, mask_inds, annos['img']) for i in range(num_obj): filename = f'{image_idx}_{names[i]}_{i}.bin' abs_filepath = osp.join(database_save_path, filename) rel_filepath = osp.join(f'{info_prefix}_gt_database', filename) # save point clouds and image patches for each object gt_points = points[point_indices[:, i]] gt_points[:, :3] -= gt_boxes_3d[i, :3] if with_mask: if object_masks[i].sum() == 0 or not valid_inds[i]: # Skip object for empty or invalid mask continue img_patch_path = abs_filepath + '.png' mask_patch_path = abs_filepath + '.mask.png' mmcv.imwrite(object_img_patches[i], img_patch_path) mmcv.imwrite(object_masks[i], mask_patch_path) with open(abs_filepath, 'w') as f: gt_points.tofile(f) if (used_classes is None) or names[i] in used_classes: db_info = { 'name': names[i], 'path': rel_filepath, 'image_idx': image_idx, 'gt_idx': i, 'box3d_lidar': gt_boxes_3d[i], 'num_points_in_gt': gt_points.shape[0], 'difficulty': difficulty[i], } local_group_id = group_ids[i] # if local_group_id >= 0: if local_group_id not in group_dict: group_dict[local_group_id] = group_counter group_counter += 1 db_info['group_id'] = group_dict[local_group_id] if 'score' in annos: db_info['score'] = annos['score'][i] if with_mask: db_info.update({'box2d_camera': gt_boxes[i]}) if names[i] in all_db_infos: all_db_infos[names[i]].append(db_info) else: all_db_infos[names[i]] = [db_info] for k, v in all_db_infos.items(): print(f'load {len(v)} {k} database infos') with open(db_info_save_path, 'wb') as f: pickle.dump(all_db_infos, f) class GTDatabaseCreater: """Given the raw data, generate the ground truth database. This is the parallel version. For serialized version, please refer to `create_groundtruth_database` Args: dataset_class_name (str): Name of the input dataset. data_path (str): Path of the data. info_prefix (str): Prefix of the info file. info_path (str, optional): Path of the info file. Default: None. mask_anno_path (str, optional): Path of the mask_anno. Default: None. used_classes (list[str], optional): Classes have been used. Default: None. database_save_path (str, optional): Path to save database. Default: None. 
db_info_save_path (str, optional): Path to save db_info. Default: None. relative_path (bool, optional): Whether to use relative path. Default: True. with_mask (bool, optional): Whether to use mask. Default: False. num_worker (int, optional): the number of parallel workers to use. Default: 8. """ def __init__(self, dataset_class_name, data_path, info_prefix, info_path=None, mask_anno_path=None, used_classes=None, database_save_path=None, db_info_save_path=None, relative_path=True, add_rgb=False, lidar_only=False, bev_only=False, coors_range=None, with_mask=False, num_worker=8) -> None: self.dataset_class_name = dataset_class_name self.data_path = data_path self.info_prefix = info_prefix self.info_path = info_path self.mask_anno_path = mask_anno_path self.used_classes = used_classes self.database_save_path = database_save_path self.db_info_save_path = db_info_save_path self.relative_path = relative_path self.add_rgb = add_rgb self.lidar_only = lidar_only self.bev_only = bev_only self.coors_range = coors_range self.with_mask = with_mask self.num_worker = num_worker self.pipeline = None def create_single(self, input_dict): group_counter = 0 single_db_infos = dict() example = self.pipeline(input_dict) annos = example['ann_info'] image_idx = example['sample_idx'] points = example['points'].tensor.numpy() gt_boxes_3d = annos['gt_bboxes_3d'].tensor.numpy() names = annos['gt_names'] group_dict = dict() if 'group_ids' in annos: group_ids = annos['group_ids'] else: group_ids = np.arange(gt_boxes_3d.shape[0], dtype=np.int64) difficulty = np.zeros(gt_boxes_3d.shape[0], dtype=np.int32) if 'difficulty' in annos: difficulty = annos['difficulty'] num_obj = gt_boxes_3d.shape[0] point_indices = box_np_ops.points_in_rbbox(points, gt_boxes_3d) if self.with_mask: # prepare masks gt_boxes = annos['gt_bboxes'] img_path = osp.split(example['img_info']['filename'])[-1] if img_path not in self.file2id.keys(): print(f'skip image {img_path} for empty mask') return single_db_infos img_id = self.file2id[img_path] kins_annIds = self.coco.getAnnIds(imgIds=img_id) kins_raw_info = self.coco.loadAnns(kins_annIds) kins_ann_info = _parse_coco_ann_info(kins_raw_info) h, w = annos['img_shape'][:2] gt_masks = [ _poly2mask(mask, h, w) for mask in kins_ann_info['masks'] ] # get mask inds based on iou mapping bbox_iou = bbox_overlaps(kins_ann_info['bboxes'], gt_boxes) mask_inds = bbox_iou.argmax(axis=0) valid_inds = (bbox_iou.max(axis=0) > 0.5) # mask the image # use more precise crop when it is ready # object_img_patches = np.ascontiguousarray( # np.stack(object_img_patches, axis=0).transpose(0, 3, 1, 2)) # crop image patches using roi_align # object_img_patches = crop_image_patch_v2( # torch.Tensor(gt_boxes), # torch.Tensor(mask_inds).long(), object_img_patches) object_img_patches, object_masks = crop_image_patch( gt_boxes, gt_masks, mask_inds, annos['img']) for i in range(num_obj): filename = f'{image_idx}_{names[i]}_{i}.bin' abs_filepath = osp.join(self.database_save_path, filename) rel_filepath = osp.join(f'{self.info_prefix}_gt_database', filename) # save point clouds and image patches for each object gt_points = points[point_indices[:, i]] gt_points[:, :3] -= gt_boxes_3d[i, :3] if self.with_mask: if object_masks[i].sum() == 0 or not valid_inds[i]: # Skip object for empty or invalid mask continue img_patch_path = abs_filepath + '.png' mask_patch_path = abs_filepath + '.mask.png' mmcv.imwrite(object_img_patches[i], img_patch_path) mmcv.imwrite(object_masks[i], mask_patch_path) with open(abs_filepath, 'w') as f: 
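# The object's points were shifted a few statements above (gt_points[:, :3] -=
# gt_boxes_3d[i, :3]), so the buffer written here stores them relative to the
# box center under the name f'{image_idx}_{names[i]}_{i}.bin'; a GT-sampling
# augmentation can then paste the object back at any sampled box location.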
gt_points.tofile(f) if (self.used_classes is None) or names[i] in self.used_classes: db_info = { 'name': names[i], 'path': rel_filepath, 'image_idx': image_idx, 'gt_idx': i, 'box3d_lidar': gt_boxes_3d[i], 'num_points_in_gt': gt_points.shape[0], 'difficulty': difficulty[i], } local_group_id = group_ids[i] # if local_group_id >= 0: if local_group_id not in group_dict: group_dict[local_group_id] = group_counter group_counter += 1 db_info['group_id'] = group_dict[local_group_id] if 'score' in annos: db_info['score'] = annos['score'][i] if self.with_mask: db_info.update({'box2d_camera': gt_boxes[i]}) if names[i] in single_db_infos: single_db_infos[names[i]].append(db_info) else: single_db_infos[names[i]] = [db_info] return single_db_infos def create(self): print(f'Create GT Database of {self.dataset_class_name}') dataset_cfg = dict( type=self.dataset_class_name, data_root=self.data_path, ann_file=self.info_path) if self.dataset_class_name == 'KittiDataset': file_client_args = dict(backend='disk') dataset_cfg.update( test_mode=False, split='training', modality=dict( use_lidar=True, use_depth=False, use_lidar_intensity=True, use_camera=self.with_mask, ), pipeline=[ dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=4, use_dim=4, file_client_args=file_client_args), dict( type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True, file_client_args=file_client_args) ]) elif self.dataset_class_name == 'NuScenesDataset': dataset_cfg.update( img_info_prototype='bevdet', use_valid_flag=True, box_type_3d='LiDAR', test_mode=True, pipeline=[ dict(type='PrepareImageInputs', is_train=False, ), dict( type='LoadAnnotationsBEVDepth', bda_aug_conf=None, is_train=False, classes=class_names), dict( type='LoadPointsFromFile', coord_type='LIDAR', dtype='float32', load_dim=5, use_dim=[0, 1, 2, 3, 4], translate2ego=False, file_client_args=file_client_args), dict( type='LoadPointsFromMultiSweeps', sweeps_num=10, use_dim=[0, 1, 2, 3, 4], file_client_args=file_client_args, pad_empty_sweeps=True, translate2ego=False, remove_close=True), # dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True), dict(type='PointsFromLidartoEgo'), ]) elif self.dataset_class_name == 'WaymoDataset': file_client_args = dict(backend='disk') dataset_cfg.update( test_mode=False, split='training', modality=dict( use_lidar=True, use_depth=False, use_lidar_intensity=True, use_camera=False, ), pipeline=[ dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=6, use_dim=6, file_client_args=file_client_args), dict( type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True, file_client_args=file_client_args) ]) dataset = build_dataset(dataset_cfg) self.pipeline = dataset.pipeline if self.database_save_path is None: self.database_save_path = osp.join( self.data_path, f'{self.info_prefix}_gt_database') if self.db_info_save_path is None: self.db_info_save_path = osp.join( self.data_path, f'{self.info_prefix}_dbinfos_train.pkl') mmcv.mkdir_or_exist(self.database_save_path) if self.with_mask: self.coco = COCO(osp.join(self.data_path, self.mask_anno_path)) imgIds = self.coco.getImgIds() self.file2id = dict() for i in imgIds: info = self.coco.loadImgs([i])[0] self.file2id.update({info['file_name']: i}) def loop_dataset(i): input_dict = dataset.get_data_info(i) dataset.pre_pipeline(input_dict) return input_dict multi_db_infos = mmcv.track_parallel_progress( self.create_single, ((loop_dataset(i) for i in range(len(dataset))), len(dataset)), self.num_worker) print('Make global unique group id') group_counter_offset = 0 
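# The group ids returned by the parallel workers are only unique within a
# single sample, so the merge below re-bases them: each db_info's id is shifted
# by the running group_counter_offset, and after every sample the offset is
# advanced by the largest local id plus one, keeping group ids globally unique.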
all_db_infos = dict() for single_db_infos in track_iter_progress(multi_db_infos): group_id = -1 for name, name_db_infos in single_db_infos.items(): for db_info in name_db_infos: group_id = max(group_id, db_info['group_id']) db_info['group_id'] += group_counter_offset if name not in all_db_infos: all_db_infos[name] = [] all_db_infos[name].extend(name_db_infos) group_counter_offset += (group_id + 1) for k, v in all_db_infos.items(): print(f'load {len(v)} {k} database infos') with open(self.db_info_save_path, 'wb') as f: pickle.dump(all_db_infos, f) import argparse if __name__ == '__main__': parser = argparse.ArgumentParser(description='gap') parser.add_argument('gap', default=0, type=int, help='gap') args = parser.parse_args() create_groundtruth_database('NuScenesDataset', '/mount/data/lsbevv2/data/nuscenes', 'bevdetv2-nuscenes', '/mount/data/lsbevv2/data/nuscenes/bevdetv2-nuscenes_infos_train.pkl', gap=args.gap) ================================================ FILE: tools/data_converter/imgaug_demo.py ================================================ #!usr/bin/python # -*- coding: utf-8 -*- import cv2 import random import os import os.path as osp from matplotlib import pyplot as plt # import albumentations as A from imgaug import augmenters as iaa from nuscenes import NuScenes from nuscenes.utils import splits fog_aug = iaa.Fog() snow_aug = iaa.Snowflakes(flake_size=(0.7, 0.95), speed=(0.001, 0.03)) rain_aug = iaa.Rain(drop_size=(0.10, 0.20)) noise_aug = iaa.imgcorruptlike.GaussianNoise(severity=1) # transform = A.Compose( # [A.RandomSunFlare(flare_roi=(0, 0, 1, 0.5), angle_lower=0.5, p=1)], # ) import mmcv def get_available_scenes(nusc): """Get available scenes from the input nuscenes class. Given the raw data, get the information of available scenes for further info generation. Args: nusc (class): Dataset class in the nuScenes dataset. Returns: available_scenes (list[dict]): List of basic information for the available scenes. """ available_scenes = [] print('total scene num: {}'.format(len(nusc.scene))) for scene in nusc.scene: scene_token = scene['token'] scene_rec = nusc.get('scene', scene_token) sample_rec = nusc.get('sample', scene_rec['first_sample_token']) sd_rec = nusc.get('sample_data', sample_rec['data']['LIDAR_TOP']) has_more_frames = True scene_not_exist = False while has_more_frames: lidar_path, boxes, _ = nusc.get_sample_data(sd_rec['token']) lidar_path = str(lidar_path) if os.getcwd() in lidar_path: # path from lyftdataset is absolute path lidar_path = lidar_path.split(f'{os.getcwd()}/')[-1] # relative path if not mmcv.is_filepath(lidar_path): scene_not_exist = True break else: break if scene_not_exist: continue available_scenes.append(scene) print('exist scene num: {}'.format(len(available_scenes))) return available_scenes VERSION= 'v1.0-trainval' NUSCENES = 'nuscenes' nuscenes_version = VERSION dataroot = f'./data/{NUSCENES}/' nuscenes = NuScenes(nuscenes_version, dataroot) val_scenes = splits.val # filter existing scenes. 
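# The remainder of this demo collects the camera filenames of the validation
# split, then rewrites each matching image under samples_<aug> with one of the
# imgaug corruptions (fog/snow/rain/noise) set up in this file. A minimal
# sketch of the per-image step, with hypothetical file paths (the real loop
# walks ori_sample_path):
#   img = cv2.imread('CAM_FRONT/example.jpg')             # hypothetical input
#   img = cv2.resize(img, (800, 450))                      # shrink before augmenting
#   seq = iaa.Sequential([iaa.Fog()])                      # pick one corruption
#   out = seq.augment_images([img])[0]
#   out = cv2.resize(out, (1600, 900))                     # restore nuScenes size
#   cv2.imwrite('samples_fog/CAM_FRONT/example.jpg', out)  # hypothetical output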
available_scenes = get_available_scenes(nuscenes) available_scene_names = [s['name'] for s in available_scenes] val_scenes = list(filter(lambda x: x in available_scene_names, val_scenes)) val_scenes = set([ available_scenes[available_scene_names.index(s)]['token'] for s in val_scenes ]) val_imgs = set() for sample in mmcv.track_iter_progress(nuscenes.sample): camera_types = [ 'CAM_FRONT', 'CAM_FRONT_RIGHT', 'CAM_FRONT_LEFT', 'CAM_BACK', 'CAM_BACK_LEFT', 'CAM_BACK_RIGHT', ] if sample['scene_token'] in val_scenes: for cam in camera_types: cam_token = sample['data'][cam] cam_path, _, cam_intrinsic = nuscenes.get_sample_data(cam_token) val_imgs.add(cam_path.split('/')[-1]) aug_mapper = dict( fog=iaa.Fog(), snow=iaa.Snowflakes(flake_size=(0.7, 0.95), speed=(0.001, 0.03)), rain=iaa.Rain(drop_size=(0.10, 0.20)), noise=iaa.imgcorruptlike.GaussianNoise(severity=1) ) #imgaug test ori_sample_path = '/mount/data/FBBEV/data/nuscenes/samples' det_sample_path = '/mount/data/FBBEV/data/nuscenes_aug/samples_rain' cams = os.listdir(det_sample_path) for cam in cams: imgs = os.listdir(osp.join(ori_sample_path, cam)) for img_name in imgs: imglist=[] if img_name not in val_imgs: continue img_path = osp.join(ori_sample_path, cam, img_name) print(img_path) img = cv2.imread(img_path) img = cv2.resize(img, (800, 450)) imglist.append(img) augs = ['noise']# ['fog', 'rain', 'snow', 'noise'] for aug_key in augs: seq = iaa.Sequential([ aug_mapper[aug_key] ]) images_aug = seq.augment_images(imglist) images_aug = cv2.resize(images_aug[0], (1600, 900)) # print(f'/mount/data/FBBEV/data/nuscenes_aug/samples_{aug_key}/{cam}/{img_name}') cv2.imwrite(f'/mount/data/FBBEV/data/nuscenes_aug/samples_{aug_key}/{cam}/{img_name}', images_aug) # images_aug = transform(image=img)['image'] # images_aug = cv2.resize(images_aug, (1600, 900)) # cv2.imwrite(f'/mount/data/FBBEV/data/nuscenes_aug/samples_sun/{cam}/{img_name}', images_aug) ================================================ FILE: tools/data_converter/indoor_converter.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import os import mmcv import numpy as np from tools.data_converter.s3dis_data_utils import S3DISData, S3DISSegData from tools.data_converter.scannet_data_utils import ScanNetData, ScanNetSegData from tools.data_converter.sunrgbd_data_utils import SUNRGBDData def create_indoor_info_file(data_path, pkl_prefix='sunrgbd', save_path=None, workers=4, **kwargs): """Create indoor information file. Get information of the raw data and save it to the pkl file. Args: data_path (str): Path of the data. pkl_prefix (str, optional): Prefix of the pkl to be saved. Default: 'sunrgbd'. save_path (str, optional): Path of the pkl to be saved. Default: None. workers (int, optional): Number of threads to be used. Default: 4. kwargs (dict): Additional parameters for dataset-specific Data class. May include `use_v1` for SUN RGB-D and `num_points`. 
""" assert os.path.exists(data_path) assert pkl_prefix in ['sunrgbd', 'scannet', 's3dis'], \ f'unsupported indoor dataset {pkl_prefix}' save_path = data_path if save_path is None else save_path assert os.path.exists(save_path) # generate infos for both detection and segmentation task if pkl_prefix in ['sunrgbd', 'scannet']: train_filename = os.path.join(save_path, f'{pkl_prefix}_infos_train.pkl') val_filename = os.path.join(save_path, f'{pkl_prefix}_infos_val.pkl') if pkl_prefix == 'sunrgbd': # SUN RGB-D has a train-val split num_points = kwargs.get('num_points', -1) use_v1 = kwargs.get('use_v1', False) train_dataset = SUNRGBDData( root_path=data_path, split='train', use_v1=use_v1, num_points=num_points) val_dataset = SUNRGBDData( root_path=data_path, split='val', use_v1=use_v1, num_points=num_points) else: # ScanNet has a train-val-test split train_dataset = ScanNetData(root_path=data_path, split='train') val_dataset = ScanNetData(root_path=data_path, split='val') test_dataset = ScanNetData(root_path=data_path, split='test') test_filename = os.path.join(save_path, f'{pkl_prefix}_infos_test.pkl') infos_train = train_dataset.get_infos( num_workers=workers, has_label=True) mmcv.dump(infos_train, train_filename, 'pkl') print(f'{pkl_prefix} info train file is saved to {train_filename}') infos_val = val_dataset.get_infos(num_workers=workers, has_label=True) mmcv.dump(infos_val, val_filename, 'pkl') print(f'{pkl_prefix} info val file is saved to {val_filename}') if pkl_prefix == 'scannet': infos_test = test_dataset.get_infos( num_workers=workers, has_label=False) mmcv.dump(infos_test, test_filename, 'pkl') print(f'{pkl_prefix} info test file is saved to {test_filename}') # generate infos for the semantic segmentation task # e.g. re-sampled scene indexes and label weights # scene indexes are used to re-sample rooms with different number of points # label weights are used to balance classes with different number of points if pkl_prefix == 'scannet': # label weight computation function is adopted from # https://github.com/charlesq34/pointnet2/blob/master/scannet/scannet_dataset.py#L24 num_points = kwargs.get('num_points', 8192) train_dataset = ScanNetSegData( data_root=data_path, ann_file=train_filename, split='train', num_points=num_points, label_weight_func=lambda x: 1.0 / np.log(1.2 + x)) # TODO: do we need to generate on val set? 
val_dataset = ScanNetSegData( data_root=data_path, ann_file=val_filename, split='val', num_points=num_points, label_weight_func=lambda x: 1.0 / np.log(1.2 + x)) # no need to generate for test set train_dataset.get_seg_infos() val_dataset.get_seg_infos() elif pkl_prefix == 's3dis': # S3DIS doesn't have a fixed train-val split # it has 6 areas instead, so we generate info file for each of them # in training, we will use dataset to wrap different areas splits = [f'Area_{i}' for i in [1, 2, 3, 4, 5, 6]] for split in splits: dataset = S3DISData(root_path=data_path, split=split) info = dataset.get_infos(num_workers=workers, has_label=True) filename = os.path.join(save_path, f'{pkl_prefix}_infos_{split}.pkl') mmcv.dump(info, filename, 'pkl') print(f'{pkl_prefix} info {split} file is saved to {filename}') num_points = kwargs.get('num_points', 4096) seg_dataset = S3DISSegData( data_root=data_path, ann_file=filename, split=split, num_points=num_points, label_weight_func=lambda x: 1.0 / np.log(1.2 + x)) seg_dataset.get_seg_infos() ================================================ FILE: tools/data_converter/kitti_converter.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. from collections import OrderedDict from pathlib import Path import mmcv import numpy as np from nuscenes.utils.geometry_utils import view_points from mmdet3d.core.bbox import box_np_ops, points_cam2img from .kitti_data_utils import WaymoInfoGatherer, get_kitti_image_info from .nuscenes_converter import post_process_coords kitti_categories = ('Pedestrian', 'Cyclist', 'Car') def convert_to_kitti_info_version2(info): """convert kitti info v1 to v2 if possible. Args: info (dict): Info of the input kitti data. - image (dict): image info - calib (dict): calibration info - point_cloud (dict): point cloud info """ if 'image' not in info or 'calib' not in info or 'point_cloud' not in info: info['image'] = { 'image_shape': info['img_shape'], 'image_idx': info['image_idx'], 'image_path': info['img_path'], } info['calib'] = { 'R0_rect': info['calib/R0_rect'], 'Tr_velo_to_cam': info['calib/Tr_velo_to_cam'], 'P2': info['calib/P2'], } info['point_cloud'] = { 'velodyne_path': info['velodyne_path'], } def _read_imageset_file(path): with open(path, 'r') as f: lines = f.readlines() return [int(line) for line in lines] class _NumPointsInGTCalculater: """Calculate the number of points inside the ground truth box. This is the parallel version. For the serialized version, please refer to `_calculate_num_points_in_gt`. Args: data_path (str): Path of the data. relative_path (bool): Whether to use relative path. remove_outside (bool, optional): Whether to remove points which are outside of image. Default: True. num_features (int, optional): Number of features per point. Default: False. num_worker (int, optional): the number of parallel workers to use. Default: 8. 
""" def __init__(self, data_path, relative_path, remove_outside=True, num_features=4, num_worker=8) -> None: self.data_path = data_path self.relative_path = relative_path self.remove_outside = remove_outside self.num_features = num_features self.num_worker = num_worker def calculate_single(self, info): pc_info = info['point_cloud'] image_info = info['image'] calib = info['calib'] if self.relative_path: v_path = str(Path(self.data_path) / pc_info['velodyne_path']) else: v_path = pc_info['velodyne_path'] points_v = np.fromfile( v_path, dtype=np.float32, count=-1).reshape([-1, self.num_features]) rect = calib['R0_rect'] Trv2c = calib['Tr_velo_to_cam'] P2 = calib['P2'] if self.remove_outside: points_v = box_np_ops.remove_outside_points( points_v, rect, Trv2c, P2, image_info['image_shape']) annos = info['annos'] num_obj = len([n for n in annos['name'] if n != 'DontCare']) dims = annos['dimensions'][:num_obj] loc = annos['location'][:num_obj] rots = annos['rotation_y'][:num_obj] gt_boxes_camera = np.concatenate([loc, dims, rots[..., np.newaxis]], axis=1) gt_boxes_lidar = box_np_ops.box_camera_to_lidar( gt_boxes_camera, rect, Trv2c) indices = box_np_ops.points_in_rbbox(points_v[:, :3], gt_boxes_lidar) num_points_in_gt = indices.sum(0) num_ignored = len(annos['dimensions']) - num_obj num_points_in_gt = np.concatenate( [num_points_in_gt, -np.ones([num_ignored])]) annos['num_points_in_gt'] = num_points_in_gt.astype(np.int32) return info def calculate(self, infos): ret_infos = mmcv.track_parallel_progress(self.calculate_single, infos, self.num_worker) for i, ret_info in enumerate(ret_infos): infos[i] = ret_info def _calculate_num_points_in_gt(data_path, infos, relative_path, remove_outside=True, num_features=4): for info in mmcv.track_iter_progress(infos): pc_info = info['point_cloud'] image_info = info['image'] calib = info['calib'] if relative_path: v_path = str(Path(data_path) / pc_info['velodyne_path']) else: v_path = pc_info['velodyne_path'] points_v = np.fromfile( v_path, dtype=np.float32, count=-1).reshape([-1, num_features]) rect = calib['R0_rect'] Trv2c = calib['Tr_velo_to_cam'] P2 = calib['P2'] if remove_outside: points_v = box_np_ops.remove_outside_points( points_v, rect, Trv2c, P2, image_info['image_shape']) # points_v = points_v[points_v[:, 0] > 0] annos = info['annos'] num_obj = len([n for n in annos['name'] if n != 'DontCare']) # annos = kitti.filter_kitti_anno(annos, ['DontCare']) dims = annos['dimensions'][:num_obj] loc = annos['location'][:num_obj] rots = annos['rotation_y'][:num_obj] gt_boxes_camera = np.concatenate([loc, dims, rots[..., np.newaxis]], axis=1) gt_boxes_lidar = box_np_ops.box_camera_to_lidar( gt_boxes_camera, rect, Trv2c) indices = box_np_ops.points_in_rbbox(points_v[:, :3], gt_boxes_lidar) num_points_in_gt = indices.sum(0) num_ignored = len(annos['dimensions']) - num_obj num_points_in_gt = np.concatenate( [num_points_in_gt, -np.ones([num_ignored])]) annos['num_points_in_gt'] = num_points_in_gt.astype(np.int32) def create_kitti_info_file(data_path, pkl_prefix='kitti', with_plane=False, save_path=None, relative_path=True): """Create info file of KITTI dataset. Given the raw data, generate its related info file in pkl format. Args: data_path (str): Path of the data root. pkl_prefix (str, optional): Prefix of the info file to be generated. Default: 'kitti'. with_plane (bool, optional): Whether to use plane information. Default: False. save_path (str, optional): Path to save the info file. Default: None. relative_path (bool, optional): Whether to use relative path. 
Default: True. """ imageset_folder = Path(data_path) / 'ImageSets' train_img_ids = _read_imageset_file(str(imageset_folder / 'train.txt')) val_img_ids = _read_imageset_file(str(imageset_folder / 'val.txt')) test_img_ids = _read_imageset_file(str(imageset_folder / 'test.txt')) print('Generate info. this may take several minutes.') if save_path is None: save_path = Path(data_path) else: save_path = Path(save_path) kitti_infos_train = get_kitti_image_info( data_path, training=True, velodyne=True, calib=True, with_plane=with_plane, image_ids=train_img_ids, relative_path=relative_path) _calculate_num_points_in_gt(data_path, kitti_infos_train, relative_path) filename = save_path / f'{pkl_prefix}_infos_train.pkl' print(f'Kitti info train file is saved to {filename}') mmcv.dump(kitti_infos_train, filename) kitti_infos_val = get_kitti_image_info( data_path, training=True, velodyne=True, calib=True, with_plane=with_plane, image_ids=val_img_ids, relative_path=relative_path) _calculate_num_points_in_gt(data_path, kitti_infos_val, relative_path) filename = save_path / f'{pkl_prefix}_infos_val.pkl' print(f'Kitti info val file is saved to {filename}') mmcv.dump(kitti_infos_val, filename) filename = save_path / f'{pkl_prefix}_infos_trainval.pkl' print(f'Kitti info trainval file is saved to {filename}') mmcv.dump(kitti_infos_train + kitti_infos_val, filename) kitti_infos_test = get_kitti_image_info( data_path, training=False, label_info=False, velodyne=True, calib=True, with_plane=False, image_ids=test_img_ids, relative_path=relative_path) filename = save_path / f'{pkl_prefix}_infos_test.pkl' print(f'Kitti info test file is saved to {filename}') mmcv.dump(kitti_infos_test, filename) def create_waymo_info_file(data_path, pkl_prefix='waymo', save_path=None, relative_path=True, max_sweeps=5, workers=8): """Create info file of waymo dataset. Given the raw data, generate its related info file in pkl format. Args: data_path (str): Path of the data root. pkl_prefix (str, optional): Prefix of the info file to be generated. Default: 'waymo'. save_path (str, optional): Path to save the info file. Default: None. relative_path (bool, optional): Whether to use relative path. Default: True. max_sweeps (int, optional): Max sweeps before the detection frame to be used. Default: 5. """ imageset_folder = Path(data_path) / 'ImageSets' train_img_ids = _read_imageset_file(str(imageset_folder / 'train.txt')) val_img_ids = _read_imageset_file(str(imageset_folder / 'val.txt')) test_img_ids = _read_imageset_file(str(imageset_folder / 'test.txt')) print('Generate info. 
this may take several minutes.') if save_path is None: save_path = Path(data_path) else: save_path = Path(save_path) waymo_infos_gatherer_trainval = WaymoInfoGatherer( data_path, training=True, velodyne=True, calib=True, pose=True, relative_path=relative_path, max_sweeps=max_sweeps, num_worker=workers) waymo_infos_gatherer_test = WaymoInfoGatherer( data_path, training=False, label_info=False, velodyne=True, calib=True, pose=True, relative_path=relative_path, max_sweeps=max_sweeps, num_worker=workers) num_points_in_gt_calculater = _NumPointsInGTCalculater( data_path, relative_path, num_features=6, remove_outside=False, num_worker=workers) waymo_infos_train = waymo_infos_gatherer_trainval.gather(train_img_ids) num_points_in_gt_calculater.calculate(waymo_infos_train) filename = save_path / f'{pkl_prefix}_infos_train.pkl' print(f'Waymo info train file is saved to {filename}') mmcv.dump(waymo_infos_train, filename) waymo_infos_val = waymo_infos_gatherer_trainval.gather(val_img_ids) num_points_in_gt_calculater.calculate(waymo_infos_val) filename = save_path / f'{pkl_prefix}_infos_val.pkl' print(f'Waymo info val file is saved to {filename}') mmcv.dump(waymo_infos_val, filename) filename = save_path / f'{pkl_prefix}_infos_trainval.pkl' print(f'Waymo info trainval file is saved to {filename}') mmcv.dump(waymo_infos_train + waymo_infos_val, filename) waymo_infos_test = waymo_infos_gatherer_test.gather(test_img_ids) filename = save_path / f'{pkl_prefix}_infos_test.pkl' print(f'Waymo info test file is saved to {filename}') mmcv.dump(waymo_infos_test, filename) def _create_reduced_point_cloud(data_path, info_path, save_path=None, back=False, num_features=4, front_camera_id=2): """Create reduced point clouds for given info. Args: data_path (str): Path of original data. info_path (str): Path of data info. save_path (str, optional): Path to save reduced point cloud data. Default: None. back (bool, optional): Whether to flip the points to back. Default: False. num_features (int, optional): Number of point features. Default: 4. front_camera_id (int, optional): The referenced/front camera ID. Default: 2. """ kitti_infos = mmcv.load(info_path) for info in mmcv.track_iter_progress(kitti_infos): pc_info = info['point_cloud'] image_info = info['image'] calib = info['calib'] v_path = pc_info['velodyne_path'] v_path = Path(data_path) / v_path points_v = np.fromfile( str(v_path), dtype=np.float32, count=-1).reshape([-1, num_features]) rect = calib['R0_rect'] if front_camera_id == 2: P2 = calib['P2'] else: P2 = calib[f'P{str(front_camera_id)}'] Trv2c = calib['Tr_velo_to_cam'] # first remove z < 0 points # keep = points_v[:, -1] > 0 # points_v = points_v[keep] # then remove outside. if back: points_v[:, 0] = -points_v[:, 0] points_v = box_np_ops.remove_outside_points(points_v, rect, Trv2c, P2, image_info['image_shape']) if save_path is None: save_dir = v_path.parent.parent / (v_path.parent.stem + '_reduced') if not save_dir.exists(): save_dir.mkdir() save_filename = save_dir / v_path.name # save_filename = str(v_path) + '_reduced' if back: save_filename += '_back' else: save_filename = str(Path(save_path) / v_path.name) if back: save_filename += '_back' with open(save_filename, 'w') as f: points_v.tofile(f) def create_reduced_point_cloud(data_path, pkl_prefix, train_info_path=None, val_info_path=None, test_info_path=None, save_path=None, with_back=False): """Create reduced point clouds for training/validation/testing. Args: data_path (str): Path of original data. pkl_prefix (str): Prefix of info files. 
train_info_path (str, optional): Path of training set info. Default: None. val_info_path (str, optional): Path of validation set info. Default: None. test_info_path (str, optional): Path of test set info. Default: None. save_path (str, optional): Path to save reduced point cloud data. Default: None. with_back (bool, optional): Whether to flip the points to back. Default: False. """ if train_info_path is None: train_info_path = Path(data_path) / f'{pkl_prefix}_infos_train.pkl' if val_info_path is None: val_info_path = Path(data_path) / f'{pkl_prefix}_infos_val.pkl' if test_info_path is None: test_info_path = Path(data_path) / f'{pkl_prefix}_infos_test.pkl' print('create reduced point cloud for training set') _create_reduced_point_cloud(data_path, train_info_path, save_path) print('create reduced point cloud for validation set') _create_reduced_point_cloud(data_path, val_info_path, save_path) print('create reduced point cloud for testing set') _create_reduced_point_cloud(data_path, test_info_path, save_path) if with_back: _create_reduced_point_cloud( data_path, train_info_path, save_path, back=True) _create_reduced_point_cloud( data_path, val_info_path, save_path, back=True) _create_reduced_point_cloud( data_path, test_info_path, save_path, back=True) def export_2d_annotation(root_path, info_path, mono3d=True): """Export 2d annotation from the info file and raw data. Args: root_path (str): Root path of the raw data. info_path (str): Path of the info file. mono3d (bool, optional): Whether to export mono3d annotation. Default: True. """ # get bbox annotations for camera kitti_infos = mmcv.load(info_path) cat2Ids = [ dict(id=kitti_categories.index(cat_name), name=cat_name) for cat_name in kitti_categories ] coco_ann_id = 0 coco_2d_dict = dict(annotations=[], images=[], categories=cat2Ids) from os import path as osp for info in mmcv.track_iter_progress(kitti_infos): coco_infos = get_2d_boxes(info, occluded=[0, 1, 2, 3], mono3d=mono3d) (height, width, _) = mmcv.imread(osp.join(root_path, info['image']['image_path'])).shape coco_2d_dict['images'].append( dict( file_name=info['image']['image_path'], id=info['image']['image_idx'], Tri2v=info['calib']['Tr_imu_to_velo'], Trv2c=info['calib']['Tr_velo_to_cam'], rect=info['calib']['R0_rect'], cam_intrinsic=info['calib']['P2'], width=width, height=height)) for coco_info in coco_infos: if coco_info is None: continue # add an empty key for coco format coco_info['segmentation'] = [] coco_info['id'] = coco_ann_id coco_2d_dict['annotations'].append(coco_info) coco_ann_id += 1 if mono3d: json_prefix = f'{info_path[:-4]}_mono3d' else: json_prefix = f'{info_path[:-4]}' mmcv.dump(coco_2d_dict, f'{json_prefix}.coco.json') def get_2d_boxes(info, occluded, mono3d=True): """Get the 2D annotation records for a given info. Args: info: Information of the given sample data. occluded: Integer (0, 1, 2, 3) indicating occlusion state: 0 = fully visible, 1 = partly occluded, 2 = largely occluded, 3 = unknown, -1 = DontCare mono3d (bool): Whether to get boxes with mono3d annotation. Return: list[dict]: List of 2D annotation record that belongs to the input `sample_data_token`. """ # Get calibration information P2 = info['calib']['P2'] repro_recs = [] # if no annotations in info (test dataset), then return if 'annos' not in info: return repro_recs # Get all the annotation with the specified visibilties. 
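# For each record that passes the occlusion filter below, the camera-frame box
# is re-centered from the KITTI bottom-center convention to the geometric
# center, its eight 3D corners are projected through P2 with view_points, and
# the 2D bbox is taken from the intersection of the projected convex hull with
# the image canvas (post_process_coords); boxes entirely behind the camera or
# outside the image are skipped.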
ann_dicts = info['annos'] mask = [(ocld in occluded) for ocld in ann_dicts['occluded']] for k in ann_dicts.keys(): ann_dicts[k] = ann_dicts[k][mask] # convert dict of list to list of dict ann_recs = [] for i in range(len(ann_dicts['occluded'])): ann_rec = {} for k in ann_dicts.keys(): ann_rec[k] = ann_dicts[k][i] ann_recs.append(ann_rec) for ann_idx, ann_rec in enumerate(ann_recs): # Augment sample_annotation with token information. ann_rec['sample_annotation_token'] = \ f"{info['image']['image_idx']}.{ann_idx}" ann_rec['sample_data_token'] = info['image']['image_idx'] sample_data_token = info['image']['image_idx'] loc = ann_rec['location'][np.newaxis, :] dim = ann_rec['dimensions'][np.newaxis, :] rot = ann_rec['rotation_y'][np.newaxis, np.newaxis] # transform the center from [0.5, 1.0, 0.5] to [0.5, 0.5, 0.5] dst = np.array([0.5, 0.5, 0.5]) src = np.array([0.5, 1.0, 0.5]) loc = loc + dim * (dst - src) offset = (info['calib']['P2'][0, 3] - info['calib']['P0'][0, 3]) \ / info['calib']['P2'][0, 0] loc_3d = np.copy(loc) loc_3d[0, 0] += offset gt_bbox_3d = np.concatenate([loc, dim, rot], axis=1).astype(np.float32) # Filter out the corners that are not in front of the calibrated # sensor. corners_3d = box_np_ops.center_to_corner_box3d( gt_bbox_3d[:, :3], gt_bbox_3d[:, 3:6], gt_bbox_3d[:, 6], [0.5, 0.5, 0.5], axis=1) corners_3d = corners_3d[0].T # (1, 8, 3) -> (3, 8) in_front = np.argwhere(corners_3d[2, :] > 0).flatten() corners_3d = corners_3d[:, in_front] # Project 3d box to 2d. camera_intrinsic = P2 corner_coords = view_points(corners_3d, camera_intrinsic, True).T[:, :2].tolist() # Keep only corners that fall within the image. final_coords = post_process_coords(corner_coords) # Skip if the convex hull of the re-projected corners # does not intersect the image canvas. if final_coords is None: continue else: min_x, min_y, max_x, max_y = final_coords # Generate dictionary record to be included in the .json file. repro_rec = generate_record(ann_rec, min_x, min_y, max_x, max_y, sample_data_token, info['image']['image_path']) # If mono3d=True, add 3D annotations in camera coordinates if mono3d and (repro_rec is not None): repro_rec['bbox_cam3d'] = np.concatenate( [loc_3d, dim, rot], axis=1).astype(np.float32).squeeze().tolist() repro_rec['velo_cam3d'] = -1 # no velocity in KITTI center3d = np.array(loc).reshape([1, 3]) center2d = points_cam2img( center3d, camera_intrinsic, with_depth=True) repro_rec['center2d'] = center2d.squeeze().tolist() # normalized center2D + depth # samples with depth < 0 will be removed if repro_rec['center2d'][2] <= 0: continue repro_rec['attribute_name'] = -1 # no attribute in KITTI repro_rec['attribute_id'] = -1 repro_recs.append(repro_rec) return repro_recs def generate_record(ann_rec, x1, y1, x2, y2, sample_data_token, filename): """Generate one 2D annotation record given various information on top of the 2D bounding box coordinates. Args: ann_rec (dict): Original 3d annotation record. x1 (float): Minimum value of the x coordinate. y1 (float): Minimum value of the y coordinate. x2 (float): Maximum value of the x coordinate. y2 (float): Maximum value of the y coordinate. sample_data_token (str): Sample data token. filename (str):The corresponding image file where the annotation is present. Returns: dict: A sample 2D annotation record. 
- file_name (str): file name - image_id (str): sample data token - area (float): 2d box area - category_name (str): category name - category_id (int): category id - bbox (list[float]): left x, top y, x_size, y_size of 2d box - iscrowd (int): whether the area is crowd """ repro_rec = OrderedDict() repro_rec['sample_data_token'] = sample_data_token coco_rec = dict() key_mapping = { 'name': 'category_name', 'num_points_in_gt': 'num_lidar_pts', 'sample_annotation_token': 'sample_annotation_token', 'sample_data_token': 'sample_data_token', } for key, value in ann_rec.items(): if key in key_mapping.keys(): repro_rec[key_mapping[key]] = value repro_rec['bbox_corners'] = [x1, y1, x2, y2] repro_rec['filename'] = filename coco_rec['file_name'] = filename coco_rec['image_id'] = sample_data_token coco_rec['area'] = (y2 - y1) * (x2 - x1) if repro_rec['category_name'] not in kitti_categories: return None cat_name = repro_rec['category_name'] coco_rec['category_name'] = cat_name coco_rec['category_id'] = kitti_categories.index(cat_name) coco_rec['bbox'] = [x1, y1, x2 - x1, y2 - y1] coco_rec['iscrowd'] = 0 return coco_rec ================================================ FILE: tools/data_converter/kitti_data_utils.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. from collections import OrderedDict from concurrent import futures as futures from os import path as osp from pathlib import Path import mmcv import numpy as np from PIL import Image from skimage import io def get_image_index_str(img_idx, use_prefix_id=False): if use_prefix_id: return '{:07d}'.format(img_idx) else: return '{:06d}'.format(img_idx) def get_kitti_info_path(idx, prefix, info_type='image_2', file_tail='.png', training=True, relative_path=True, exist_check=True, use_prefix_id=False): img_idx_str = get_image_index_str(idx, use_prefix_id) img_idx_str += file_tail prefix = Path(prefix) if training: file_path = Path('training') / info_type / img_idx_str else: file_path = Path('testing') / info_type / img_idx_str if exist_check and not (prefix / file_path).exists(): raise ValueError('file not exist: {}'.format(file_path)) if relative_path: return str(file_path) else: return str(prefix / file_path) def get_image_path(idx, prefix, training=True, relative_path=True, exist_check=True, info_type='image_2', use_prefix_id=False): return get_kitti_info_path(idx, prefix, info_type, '.png', training, relative_path, exist_check, use_prefix_id) def get_label_path(idx, prefix, training=True, relative_path=True, exist_check=True, info_type='label_2', use_prefix_id=False): return get_kitti_info_path(idx, prefix, info_type, '.txt', training, relative_path, exist_check, use_prefix_id) def get_plane_path(idx, prefix, training=True, relative_path=True, exist_check=True, info_type='planes', use_prefix_id=False): return get_kitti_info_path(idx, prefix, info_type, '.txt', training, relative_path, exist_check, use_prefix_id) def get_velodyne_path(idx, prefix, training=True, relative_path=True, exist_check=True, use_prefix_id=False): return get_kitti_info_path(idx, prefix, 'velodyne', '.bin', training, relative_path, exist_check, use_prefix_id) def get_calib_path(idx, prefix, training=True, relative_path=True, exist_check=True, use_prefix_id=False): return get_kitti_info_path(idx, prefix, 'calib', '.txt', training, relative_path, exist_check, use_prefix_id) def get_pose_path(idx, prefix, training=True, relative_path=True, exist_check=True, use_prefix_id=False): return get_kitti_info_path(idx, prefix, 'pose', '.txt', 
training, relative_path, exist_check, use_prefix_id) def get_timestamp_path(idx, prefix, training=True, relative_path=True, exist_check=True, use_prefix_id=False): return get_kitti_info_path(idx, prefix, 'timestamp', '.txt', training, relative_path, exist_check, use_prefix_id) def get_label_anno(label_path): annotations = {} annotations.update({ 'name': [], 'truncated': [], 'occluded': [], 'alpha': [], 'bbox': [], 'dimensions': [], 'location': [], 'rotation_y': [] }) with open(label_path, 'r') as f: lines = f.readlines() # if len(lines) == 0 or len(lines[0]) < 15: # content = [] # else: content = [line.strip().split(' ') for line in lines] num_objects = len([x[0] for x in content if x[0] != 'DontCare']) annotations['name'] = np.array([x[0] for x in content]) num_gt = len(annotations['name']) annotations['truncated'] = np.array([float(x[1]) for x in content]) annotations['occluded'] = np.array([int(x[2]) for x in content]) annotations['alpha'] = np.array([float(x[3]) for x in content]) annotations['bbox'] = np.array([[float(info) for info in x[4:8]] for x in content]).reshape(-1, 4) # dimensions will convert hwl format to standard lhw(camera) format. annotations['dimensions'] = np.array([[float(info) for info in x[8:11]] for x in content ]).reshape(-1, 3)[:, [2, 0, 1]] annotations['location'] = np.array([[float(info) for info in x[11:14]] for x in content]).reshape(-1, 3) annotations['rotation_y'] = np.array([float(x[14]) for x in content]).reshape(-1) if len(content) != 0 and len(content[0]) == 16: # have score annotations['score'] = np.array([float(x[15]) for x in content]) else: annotations['score'] = np.zeros((annotations['bbox'].shape[0], )) index = list(range(num_objects)) + [-1] * (num_gt - num_objects) annotations['index'] = np.array(index, dtype=np.int32) annotations['group_ids'] = np.arange(num_gt, dtype=np.int32) return annotations def _extend_matrix(mat): mat = np.concatenate([mat, np.array([[0., 0., 0., 1.]])], axis=0) return mat def get_kitti_image_info(path, training=True, label_info=True, velodyne=False, calib=False, with_plane=False, image_ids=7481, extend_matrix=True, num_worker=8, relative_path=True, with_imageshape=True): """ KITTI annotation format version 2: { [optional]points: [N, 3+] point cloud [optional, for kitti]image: { image_idx: ... image_path: ... image_shape: ... } point_cloud: { num_features: 4 velodyne_path: ... } [optional, for kitti]calib: { R0_rect: ... Tr_velo_to_cam: ... P2: ... 
} annos: { location: [num_gt, 3] array dimensions: [num_gt, 3] array rotation_y: [num_gt] angle array name: [num_gt] ground truth name array [optional]difficulty: kitti difficulty [optional]group_ids: used for multi-part object } } """ root_path = Path(path) if not isinstance(image_ids, list): image_ids = list(range(image_ids)) def map_func(idx): info = {} pc_info = {'num_features': 4} calib_info = {} image_info = {'image_idx': idx} annotations = None if velodyne: pc_info['velodyne_path'] = get_velodyne_path( idx, path, training, relative_path) image_info['image_path'] = get_image_path(idx, path, training, relative_path) if with_imageshape: img_path = image_info['image_path'] if relative_path: img_path = str(root_path / img_path) image_info['image_shape'] = np.array( io.imread(img_path).shape[:2], dtype=np.int32) if label_info: label_path = get_label_path(idx, path, training, relative_path) if relative_path: label_path = str(root_path / label_path) annotations = get_label_anno(label_path) info['image'] = image_info info['point_cloud'] = pc_info if calib: calib_path = get_calib_path( idx, path, training, relative_path=False) with open(calib_path, 'r') as f: lines = f.readlines() P0 = np.array([float(info) for info in lines[0].split(' ')[1:13] ]).reshape([3, 4]) P1 = np.array([float(info) for info in lines[1].split(' ')[1:13] ]).reshape([3, 4]) P2 = np.array([float(info) for info in lines[2].split(' ')[1:13] ]).reshape([3, 4]) P3 = np.array([float(info) for info in lines[3].split(' ')[1:13] ]).reshape([3, 4]) if extend_matrix: P0 = _extend_matrix(P0) P1 = _extend_matrix(P1) P2 = _extend_matrix(P2) P3 = _extend_matrix(P3) R0_rect = np.array([ float(info) for info in lines[4].split(' ')[1:10] ]).reshape([3, 3]) if extend_matrix: rect_4x4 = np.zeros([4, 4], dtype=R0_rect.dtype) rect_4x4[3, 3] = 1. rect_4x4[:3, :3] = R0_rect else: rect_4x4 = R0_rect Tr_velo_to_cam = np.array([ float(info) for info in lines[5].split(' ')[1:13] ]).reshape([3, 4]) Tr_imu_to_velo = np.array([ float(info) for info in lines[6].split(' ')[1:13] ]).reshape([3, 4]) if extend_matrix: Tr_velo_to_cam = _extend_matrix(Tr_velo_to_cam) Tr_imu_to_velo = _extend_matrix(Tr_imu_to_velo) calib_info['P0'] = P0 calib_info['P1'] = P1 calib_info['P2'] = P2 calib_info['P3'] = P3 calib_info['R0_rect'] = rect_4x4 calib_info['Tr_velo_to_cam'] = Tr_velo_to_cam calib_info['Tr_imu_to_velo'] = Tr_imu_to_velo info['calib'] = calib_info if with_plane: plane_path = get_plane_path(idx, path, training, relative_path) if relative_path: plane_path = str(root_path / plane_path) lines = mmcv.list_from_file(plane_path) info['plane'] = np.array([float(i) for i in lines[3].split()]) if annotations is not None: info['annos'] = annotations add_difficulty_to_annos(info) return info with futures.ThreadPoolExecutor(num_worker) as executor: image_infos = executor.map(map_func, image_ids) return list(image_infos) class WaymoInfoGatherer: """ Parallel version of waymo dataset information gathering. Waymo annotation format version like KITTI: { [optional]points: [N, 3+] point cloud [optional, for kitti]image: { image_idx: ... image_path: ... image_shape: ... } point_cloud: { num_features: 6 velodyne_path: ... } [optional, for kitti]calib: { R0_rect: ... Tr_velo_to_cam0: ... P0: ... 
} annos: { location: [num_gt, 3] array dimensions: [num_gt, 3] array rotation_y: [num_gt] angle array name: [num_gt] ground truth name array [optional]difficulty: kitti difficulty [optional]group_ids: used for multi-part object } } """ def __init__(self, path, training=True, label_info=True, velodyne=False, calib=False, pose=False, extend_matrix=True, num_worker=8, relative_path=True, with_imageshape=True, max_sweeps=5) -> None: self.path = path self.training = training self.label_info = label_info self.velodyne = velodyne self.calib = calib self.pose = pose self.extend_matrix = extend_matrix self.num_worker = num_worker self.relative_path = relative_path self.with_imageshape = with_imageshape self.max_sweeps = max_sweeps def gather_single(self, idx): root_path = Path(self.path) info = {} pc_info = {'num_features': 6} calib_info = {} image_info = {'image_idx': idx} annotations = None if self.velodyne: pc_info['velodyne_path'] = get_velodyne_path( idx, self.path, self.training, self.relative_path, use_prefix_id=True) with open( get_timestamp_path( idx, self.path, self.training, relative_path=False, use_prefix_id=True)) as f: info['timestamp'] = np.int64(f.read()) image_info['image_path'] = get_image_path( idx, self.path, self.training, self.relative_path, info_type='image_0', use_prefix_id=True) if self.with_imageshape: img_path = image_info['image_path'] if self.relative_path: img_path = str(root_path / img_path) # io using PIL is significantly faster than skimage w, h = Image.open(img_path).size image_info['image_shape'] = np.array((h, w), dtype=np.int32) if self.label_info: label_path = get_label_path( idx, self.path, self.training, self.relative_path, info_type='label_all', use_prefix_id=True) if self.relative_path: label_path = str(root_path / label_path) annotations = get_label_anno(label_path) info['image'] = image_info info['point_cloud'] = pc_info if self.calib: calib_path = get_calib_path( idx, self.path, self.training, relative_path=False, use_prefix_id=True) with open(calib_path, 'r') as f: lines = f.readlines() P0 = np.array([float(info) for info in lines[0].split(' ')[1:13] ]).reshape([3, 4]) P1 = np.array([float(info) for info in lines[1].split(' ')[1:13] ]).reshape([3, 4]) P2 = np.array([float(info) for info in lines[2].split(' ')[1:13] ]).reshape([3, 4]) P3 = np.array([float(info) for info in lines[3].split(' ')[1:13] ]).reshape([3, 4]) P4 = np.array([float(info) for info in lines[4].split(' ')[1:13] ]).reshape([3, 4]) if self.extend_matrix: P0 = _extend_matrix(P0) P1 = _extend_matrix(P1) P2 = _extend_matrix(P2) P3 = _extend_matrix(P3) P4 = _extend_matrix(P4) R0_rect = np.array([ float(info) for info in lines[5].split(' ')[1:10] ]).reshape([3, 3]) if self.extend_matrix: rect_4x4 = np.zeros([4, 4], dtype=R0_rect.dtype) rect_4x4[3, 3] = 1. 
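# A minimal, self-contained sketch (not part of the original file) of why the 3x4
# calibration matrices handled here are padded to homogeneous 4x4 form by
# `_extend_matrix`: once every transform is homogeneous, lidar -> rectified camera
# -> image becomes a single chain of matrix products. All numbers below are toy
# values, not a real KITTI/Waymo calibration.
import numpy as np

def _extend_matrix_sketch(mat_3x4):
    # identical padding to _extend_matrix: append the row [0, 0, 0, 1]
    return np.concatenate([mat_3x4, np.array([[0., 0., 0., 1.]])], axis=0)

# toy extrinsics: identity rotation plus a small translation (camera looks along +z here)
Tr_velo_to_cam_toy = _extend_matrix_sketch(
    np.hstack([np.eye(3), np.array([[0.27], [-0.08], [-0.10]])]))
rect_toy = np.eye(4)                                   # stand-in for the extended R0_rect
K_toy = np.array([[720., 0., 600.], [0., 720., 180.], [0., 0., 1.]])
P2_toy = np.hstack([K_toy, np.zeros((3, 1))])          # toy 3x4 projection matrix

point_velo = np.array([1.0, -0.5, 10.0, 1.0])          # homogeneous lidar point
uvw = P2_toy @ rect_toy @ Tr_velo_to_cam_toy @ point_velo
u, v = uvw[:2] / uvw[2]                                # pixel coordinates of the point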
rect_4x4[:3, :3] = R0_rect else: rect_4x4 = R0_rect Tr_velo_to_cam = np.array([ float(info) for info in lines[6].split(' ')[1:13] ]).reshape([3, 4]) if self.extend_matrix: Tr_velo_to_cam = _extend_matrix(Tr_velo_to_cam) calib_info['P0'] = P0 calib_info['P1'] = P1 calib_info['P2'] = P2 calib_info['P3'] = P3 calib_info['P4'] = P4 calib_info['R0_rect'] = rect_4x4 calib_info['Tr_velo_to_cam'] = Tr_velo_to_cam info['calib'] = calib_info if self.pose: pose_path = get_pose_path( idx, self.path, self.training, relative_path=False, use_prefix_id=True) info['pose'] = np.loadtxt(pose_path) if annotations is not None: info['annos'] = annotations info['annos']['camera_id'] = info['annos'].pop('score') add_difficulty_to_annos(info) sweeps = [] prev_idx = idx while len(sweeps) < self.max_sweeps: prev_info = {} prev_idx -= 1 prev_info['velodyne_path'] = get_velodyne_path( prev_idx, self.path, self.training, self.relative_path, exist_check=False, use_prefix_id=True) if_prev_exists = osp.exists( Path(self.path) / prev_info['velodyne_path']) if if_prev_exists: with open( get_timestamp_path( prev_idx, self.path, self.training, relative_path=False, use_prefix_id=True)) as f: prev_info['timestamp'] = np.int64(f.read()) prev_pose_path = get_pose_path( prev_idx, self.path, self.training, relative_path=False, use_prefix_id=True) prev_info['pose'] = np.loadtxt(prev_pose_path) sweeps.append(prev_info) else: break info['sweeps'] = sweeps return info def gather(self, image_ids): if not isinstance(image_ids, list): image_ids = list(range(image_ids)) image_infos = mmcv.track_parallel_progress(self.gather_single, image_ids, self.num_worker) return list(image_infos) def kitti_anno_to_label_file(annos, folder): folder = Path(folder) for anno in annos: image_idx = anno['metadata']['image_idx'] label_lines = [] for j in range(anno['bbox'].shape[0]): label_dict = { 'name': anno['name'][j], 'alpha': anno['alpha'][j], 'bbox': anno['bbox'][j], 'location': anno['location'][j], 'dimensions': anno['dimensions'][j], 'rotation_y': anno['rotation_y'][j], 'score': anno['score'][j], } label_line = kitti_result_line(label_dict) label_lines.append(label_line) label_file = folder / f'{get_image_index_str(image_idx)}.txt' label_str = '\n'.join(label_lines) with open(label_file, 'w') as f: f.write(label_str) def add_difficulty_to_annos(info): min_height = [40, 25, 25] # minimum height for evaluated groundtruth/detections max_occlusion = [ 0, 1, 2 ] # maximum occlusion level of the groundtruth used for evaluation max_trunc = [ 0.15, 0.3, 0.5 ] # maximum truncation level of the groundtruth used for evaluation annos = info['annos'] dims = annos['dimensions'] # lhw format bbox = annos['bbox'] height = bbox[:, 3] - bbox[:, 1] occlusion = annos['occluded'] truncation = annos['truncated'] diff = [] easy_mask = np.ones((len(dims), ), dtype=np.bool) moderate_mask = np.ones((len(dims), ), dtype=np.bool) hard_mask = np.ones((len(dims), ), dtype=np.bool) i = 0 for h, o, t in zip(height, occlusion, truncation): if o > max_occlusion[0] or h <= min_height[0] or t > max_trunc[0]: easy_mask[i] = False if o > max_occlusion[1] or h <= min_height[1] or t > max_trunc[1]: moderate_mask[i] = False if o > max_occlusion[2] or h <= min_height[2] or t > max_trunc[2]: hard_mask[i] = False i += 1 is_easy = easy_mask is_moderate = np.logical_xor(easy_mask, moderate_mask) is_hard = np.logical_xor(hard_mask, moderate_mask) for i in range(len(dims)): if is_easy[i]: diff.append(0) elif is_moderate[i]: diff.append(1) elif is_hard[i]: diff.append(2) else: diff.append(-1) 
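# Note: the resulting codes follow the KITTI convention -- 0 = easy, 1 = moderate,
# 2 = hard, and -1 marks boxes that fail even the 'hard' height / occlusion /
# truncation thresholds and therefore fall into no difficulty bucket.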
annos['difficulty'] = np.array(diff, np.int32) return diff def kitti_result_line(result_dict, precision=4): prec_float = '{' + ':.{}f'.format(precision) + '}' res_line = [] all_field_default = OrderedDict([ ('name', None), ('truncated', -1), ('occluded', -1), ('alpha', -10), ('bbox', None), ('dimensions', [-1, -1, -1]), ('location', [-1000, -1000, -1000]), ('rotation_y', -10), ('score', 0.0), ]) res_dict = [(key, None) for key, val in all_field_default.items()] res_dict = OrderedDict(res_dict) for key, val in result_dict.items(): if all_field_default[key] is None and val is None: raise ValueError('you must specify a value for {}'.format(key)) res_dict[key] = val for key, val in res_dict.items(): if key == 'name': res_line.append(val) elif key in ['truncated', 'alpha', 'rotation_y', 'score']: if val is None: res_line.append(str(all_field_default[key])) else: res_line.append(prec_float.format(val)) elif key == 'occluded': if val is None: res_line.append(str(all_field_default[key])) else: res_line.append('{}'.format(val)) elif key in ['bbox', 'dimensions', 'location']: if val is None: res_line += [str(v) for v in all_field_default[key]] else: res_line += [prec_float.format(v) for v in val] else: raise ValueError('unknown key. supported key:{}'.format( res_dict.keys())) return ' '.join(res_line) ================================================ FILE: tools/data_converter/lyft_converter.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import os from logging import warning from os import path as osp import mmcv import numpy as np from lyft_dataset_sdk.lyftdataset import LyftDataset as Lyft from pyquaternion import Quaternion from mmdet3d.datasets import LyftDataset from .nuscenes_converter import (get_2d_boxes, get_available_scenes, obtain_sensor2top) lyft_categories = ('car', 'truck', 'bus', 'emergency_vehicle', 'other_vehicle', 'motorcycle', 'bicycle', 'pedestrian', 'animal') def create_lyft_infos(root_path, info_prefix, version='v1.01-train', max_sweeps=10): """Create info file of lyft dataset. Given the raw data, generate its related info file in pkl format. Args: root_path (str): Path of the data root. info_prefix (str): Prefix of the info file to be generated. version (str, optional): Version of the data. Default: 'v1.01-train'. max_sweeps (int, optional): Max number of sweeps. Default: 10. """ lyft = Lyft( data_path=osp.join(root_path, version), json_path=osp.join(root_path, version, version), verbose=True) available_vers = ['v1.01-train', 'v1.01-test'] assert version in available_vers if version == 'v1.01-train': train_scenes = mmcv.list_from_file('data/lyft/train.txt') val_scenes = mmcv.list_from_file('data/lyft/val.txt') elif version == 'v1.01-test': train_scenes = mmcv.list_from_file('data/lyft/test.txt') val_scenes = [] else: raise ValueError('unknown') # filter existing scenes. 
available_scenes = get_available_scenes(lyft) available_scene_names = [s['name'] for s in available_scenes] train_scenes = list( filter(lambda x: x in available_scene_names, train_scenes)) val_scenes = list(filter(lambda x: x in available_scene_names, val_scenes)) train_scenes = set([ available_scenes[available_scene_names.index(s)]['token'] for s in train_scenes ]) val_scenes = set([ available_scenes[available_scene_names.index(s)]['token'] for s in val_scenes ]) test = 'test' in version if test: print(f'test scene: {len(train_scenes)}') else: print(f'train scene: {len(train_scenes)}, \ val scene: {len(val_scenes)}') train_lyft_infos, val_lyft_infos = _fill_trainval_infos( lyft, train_scenes, val_scenes, test, max_sweeps=max_sweeps) metadata = dict(version=version) if test: print(f'test sample: {len(train_lyft_infos)}') data = dict(infos=train_lyft_infos, metadata=metadata) info_name = f'{info_prefix}_infos_test' info_path = osp.join(root_path, f'{info_name}.pkl') mmcv.dump(data, info_path) else: print(f'train sample: {len(train_lyft_infos)}, \ val sample: {len(val_lyft_infos)}') data = dict(infos=train_lyft_infos, metadata=metadata) train_info_name = f'{info_prefix}_infos_train' info_path = osp.join(root_path, f'{train_info_name}.pkl') mmcv.dump(data, info_path) data['infos'] = val_lyft_infos val_info_name = f'{info_prefix}_infos_val' info_val_path = osp.join(root_path, f'{val_info_name}.pkl') mmcv.dump(data, info_val_path) def _fill_trainval_infos(lyft, train_scenes, val_scenes, test=False, max_sweeps=10): """Generate the train/val infos from the raw data. Args: lyft (:obj:`LyftDataset`): Dataset class in the Lyft dataset. train_scenes (list[str]): Basic information of training scenes. val_scenes (list[str]): Basic information of validation scenes. test (bool, optional): Whether use the test mode. In the test mode, no annotations can be accessed. Default: False. max_sweeps (int, optional): Max number of sweeps. Default: 10. Returns: tuple[list[dict]]: Information of training set and validation set that will be saved to the info file. 
""" train_lyft_infos = [] val_lyft_infos = [] for sample in mmcv.track_iter_progress(lyft.sample): lidar_token = sample['data']['LIDAR_TOP'] sd_rec = lyft.get('sample_data', sample['data']['LIDAR_TOP']) cs_record = lyft.get('calibrated_sensor', sd_rec['calibrated_sensor_token']) pose_record = lyft.get('ego_pose', sd_rec['ego_pose_token']) abs_lidar_path, boxes, _ = lyft.get_sample_data(lidar_token) # nuScenes devkit returns more convenient relative paths while # lyft devkit returns absolute paths abs_lidar_path = str(abs_lidar_path) # absolute path lidar_path = abs_lidar_path.split(f'{os.getcwd()}/')[-1] # relative path mmcv.check_file_exist(lidar_path) info = { 'lidar_path': lidar_path, 'token': sample['token'], 'sweeps': [], 'cams': dict(), 'lidar2ego_translation': cs_record['translation'], 'lidar2ego_rotation': cs_record['rotation'], 'ego2global_translation': pose_record['translation'], 'ego2global_rotation': pose_record['rotation'], 'timestamp': sample['timestamp'], } l2e_r = info['lidar2ego_rotation'] l2e_t = info['lidar2ego_translation'] e2g_r = info['ego2global_rotation'] e2g_t = info['ego2global_translation'] l2e_r_mat = Quaternion(l2e_r).rotation_matrix e2g_r_mat = Quaternion(e2g_r).rotation_matrix # obtain 6 image's information per frame camera_types = [ 'CAM_FRONT', 'CAM_FRONT_RIGHT', 'CAM_FRONT_LEFT', 'CAM_BACK', 'CAM_BACK_LEFT', 'CAM_BACK_RIGHT', ] for cam in camera_types: cam_token = sample['data'][cam] cam_path, _, cam_intrinsic = lyft.get_sample_data(cam_token) cam_info = obtain_sensor2top(lyft, cam_token, l2e_t, l2e_r_mat, e2g_t, e2g_r_mat, cam) cam_info.update(cam_intrinsic=cam_intrinsic) info['cams'].update({cam: cam_info}) # obtain sweeps for a single key-frame sd_rec = lyft.get('sample_data', sample['data']['LIDAR_TOP']) sweeps = [] while len(sweeps) < max_sweeps: if not sd_rec['prev'] == '': sweep = obtain_sensor2top(lyft, sd_rec['prev'], l2e_t, l2e_r_mat, e2g_t, e2g_r_mat, 'lidar') sweeps.append(sweep) sd_rec = lyft.get('sample_data', sd_rec['prev']) else: break info['sweeps'] = sweeps # obtain annotation if not test: annotations = [ lyft.get('sample_annotation', token) for token in sample['anns'] ] locs = np.array([b.center for b in boxes]).reshape(-1, 3) dims = np.array([b.wlh for b in boxes]).reshape(-1, 3) rots = np.array([b.orientation.yaw_pitch_roll[0] for b in boxes]).reshape(-1, 1) names = [b.name for b in boxes] for i in range(len(names)): if names[i] in LyftDataset.NameMapping: names[i] = LyftDataset.NameMapping[names[i]] names = np.array(names) # we need to convert box size to # the format of our lidar coordinate system # which is x_size, y_size, z_size (corresponding to l, w, h) gt_boxes = np.concatenate([locs, dims[:, [1, 0, 2]], rots], axis=1) assert len(gt_boxes) == len( annotations), f'{len(gt_boxes)}, {len(annotations)}' info['gt_boxes'] = gt_boxes info['gt_names'] = names info['num_lidar_pts'] = np.array( [a['num_lidar_pts'] for a in annotations]) info['num_radar_pts'] = np.array( [a['num_radar_pts'] for a in annotations]) if sample['scene_token'] in train_scenes: train_lyft_infos.append(info) else: val_lyft_infos.append(info) return train_lyft_infos, val_lyft_infos def export_2d_annotation(root_path, info_path, version): """Export 2d annotation from the info file and raw data. Args: root_path (str): Root path of the raw data. info_path (str): Path of the info file. version (str): Dataset version. """ warning.warn('DeprecationWarning: 2D annotations are not used on the ' 'Lyft dataset. 
The function export_2d_annotation will be ' 'deprecated.') # get bbox annotations for camera camera_types = [ 'CAM_FRONT', 'CAM_FRONT_RIGHT', 'CAM_FRONT_LEFT', 'CAM_BACK', 'CAM_BACK_LEFT', 'CAM_BACK_RIGHT', ] lyft_infos = mmcv.load(info_path)['infos'] lyft = Lyft( data_path=osp.join(root_path, version), json_path=osp.join(root_path, version, version), verbose=True) # info_2d_list = [] cat2Ids = [ dict(id=lyft_categories.index(cat_name), name=cat_name) for cat_name in lyft_categories ] coco_ann_id = 0 coco_2d_dict = dict(annotations=[], images=[], categories=cat2Ids) for info in mmcv.track_iter_progress(lyft_infos): for cam in camera_types: cam_info = info['cams'][cam] coco_infos = get_2d_boxes( lyft, cam_info['sample_data_token'], visibilities=['', '1', '2', '3', '4']) (height, width, _) = mmcv.imread(cam_info['data_path']).shape coco_2d_dict['images'].append( dict( file_name=cam_info['data_path'], id=cam_info['sample_data_token'], width=width, height=height)) for coco_info in coco_infos: if coco_info is None: continue # add an empty key for coco format coco_info['segmentation'] = [] coco_info['id'] = coco_ann_id coco_2d_dict['annotations'].append(coco_info) coco_ann_id += 1 mmcv.dump(coco_2d_dict, f'{info_path[:-4]}.coco.json') ================================================ FILE: tools/data_converter/lyft_data_fixer.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import argparse import os import numpy as np def fix_lyft(root_folder='./data/lyft', version='v1.01'): # refer to https://www.kaggle.com/c/3d-object-detection-for-autonomous-vehicles/discussion/110000 # noqa lidar_path = 'lidar/host-a011_lidar1_1233090652702363606.bin' root_folder = os.path.join(root_folder, f'{version}-train') lidar_path = os.path.join(root_folder, lidar_path) assert os.path.isfile(lidar_path), f'Please download the complete Lyft ' \ f'dataset and make sure {lidar_path} is present.' points = np.fromfile(lidar_path, dtype=np.float32, count=-1) try: points.reshape([-1, 5]) print(f'This fix is not required for version {version}.') except ValueError: new_points = np.array(list(points) + [100.0, 1.0], dtype='float32') new_points.tofile(lidar_path) print(f'Appended 100.0 and 1.0 to the end of {lidar_path}.') parser = argparse.ArgumentParser(description='Lyft dataset fixer arg parser') parser.add_argument( '--root-folder', type=str, default='./data/lyft', help='specify the root path of Lyft dataset') parser.add_argument( '--version', type=str, default='v1.01', help='specify Lyft dataset version') args = parser.parse_args() if __name__ == '__main__': fix_lyft(root_folder=args.root_folder, version=args.version) ================================================ FILE: tools/data_converter/nuimage_converter.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. 
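# Self-contained sketch (synthetic buffer, not real Lyft data) of the check that
# fix_lyft in lyft_data_fixer.py above relies on: Lyft lidar .bin files store
# float32 records of five values per point, so a truncated file makes
# reshape(-1, 5) raise ValueError, and the fix appends the two missing floats.
import numpy as np

points = np.arange(13, dtype=np.float32)      # 13 floats cannot form rows of 5
try:
    points.reshape([-1, 5])
except ValueError:
    points = np.array(list(points) + [100.0, 1.0], dtype='float32')
points.reshape([-1, 5])                       # now 3 x 5, as the loader expects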
import argparse import base64 from os import path as osp import mmcv import numpy as np from nuimages import NuImages from nuimages.utils.utils import mask_decode, name_to_index_mapping nus_categories = ('car', 'truck', 'trailer', 'bus', 'construction_vehicle', 'bicycle', 'motorcycle', 'pedestrian', 'traffic_cone', 'barrier') NAME_MAPPING = { 'movable_object.barrier': 'barrier', 'vehicle.bicycle': 'bicycle', 'vehicle.bus.bendy': 'bus', 'vehicle.bus.rigid': 'bus', 'vehicle.car': 'car', 'vehicle.construction': 'construction_vehicle', 'vehicle.motorcycle': 'motorcycle', 'human.pedestrian.adult': 'pedestrian', 'human.pedestrian.child': 'pedestrian', 'human.pedestrian.construction_worker': 'pedestrian', 'human.pedestrian.police_officer': 'pedestrian', 'movable_object.trafficcone': 'traffic_cone', 'vehicle.trailer': 'trailer', 'vehicle.truck': 'truck', } def parse_args(): parser = argparse.ArgumentParser(description='Data converter arg parser') parser.add_argument( '--data-root', type=str, default='./data/nuimages', help='specify the root path of dataset') parser.add_argument( '--version', type=str, nargs='+', default=['v1.0-mini'], required=False, help='specify the dataset version') parser.add_argument( '--out-dir', type=str, default='./data/nuimages/annotations/', required=False, help='path to save the exported json') parser.add_argument( '--nproc', type=int, default=4, required=False, help='workers to process semantic masks') parser.add_argument('--extra-tag', type=str, default='nuimages') args = parser.parse_args() return args def get_img_annos(nuim, img_info, cat2id, out_dir, data_root, seg_root): """Get semantic segmentation map for an image. Args: nuim (obj:`NuImages`): NuImages dataset object img_info (dict): Meta information of img Returns: np.ndarray: Semantic segmentation map of the image """ sd_token = img_info['token'] image_id = img_info['id'] name_to_index = name_to_index_mapping(nuim.category) # Get image data. width, height = img_info['width'], img_info['height'] semseg_mask = np.zeros((height, width)).astype('uint8') # Load stuff / surface regions. surface_anns = [ o for o in nuim.surface_ann if o['sample_data_token'] == sd_token ] # Draw stuff / surface regions. for ann in surface_anns: # Get color and mask. category_token = ann['category_token'] category_name = nuim.get('category', category_token)['name'] if ann['mask'] is None: continue mask = mask_decode(ann['mask']) # Draw mask for semantic segmentation. semseg_mask[mask == 1] = name_to_index[category_name] # Load object instances. object_anns = [ o for o in nuim.object_ann if o['sample_data_token'] == sd_token ] # Sort by token to ensure that objects always appear in the # instance mask in the same order. object_anns = sorted(object_anns, key=lambda k: k['token']) # Draw object instances. # The 0 index is reserved for background; thus, the instances # should start from index 1. annotations = [] for i, ann in enumerate(object_anns, start=1): # Get color, box, mask and name. category_token = ann['category_token'] category_name = nuim.get('category', category_token)['name'] if ann['mask'] is None: continue mask = mask_decode(ann['mask']) # Draw masks for semantic segmentation and instance segmentation. 
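# Toy, self-contained sketch of the mask painting performed here (the class ids
# and shapes are made up; in the real code mask_decode returns a binary mask from
# the nuImages RLE): every decoded mask overwrites the semantic map with its
# class index, surfaces first and object instances second, so instances win ties.
import numpy as np

name_to_index_toy = {'flat.driveable_surface': 24, 'vehicle.car': 17}  # hypothetical ids
semseg_toy = np.zeros((4, 6), dtype=np.uint8)

road = np.zeros((4, 6), dtype=bool)
road[2:, :] = True                                      # bottom rows are road surface
semseg_toy[road] = name_to_index_toy['flat.driveable_surface']

car = np.zeros((4, 6), dtype=bool)
car[1:3, 2:4] = True                                    # a small car overlapping the road
semseg_toy[car] = name_to_index_toy['vehicle.car']
# semseg_toy: 0 = background, 24 = road, 17 wherever the car instance was drawn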
semseg_mask[mask == 1] = name_to_index[category_name] if category_name in NAME_MAPPING: cat_name = NAME_MAPPING[category_name] cat_id = cat2id[cat_name] x_min, y_min, x_max, y_max = ann['bbox'] # encode calibrated instance mask mask_anno = dict() mask_anno['counts'] = base64.b64decode( ann['mask']['counts']).decode() mask_anno['size'] = ann['mask']['size'] data_anno = dict( image_id=image_id, category_id=cat_id, bbox=[x_min, y_min, x_max - x_min, y_max - y_min], area=(x_max - x_min) * (y_max - y_min), segmentation=mask_anno, iscrowd=0) annotations.append(data_anno) # after process, save semantic masks img_filename = img_info['file_name'] seg_filename = img_filename.replace('jpg', 'png') seg_filename = osp.join(seg_root, seg_filename) mmcv.imwrite(semseg_mask, seg_filename) return annotations, np.max(semseg_mask) def export_nuim_to_coco(nuim, data_root, out_dir, extra_tag, version, nproc): print('Process category information') categories = [] categories = [ dict(id=nus_categories.index(cat_name), name=cat_name) for cat_name in nus_categories ] cat2id = {k_v['name']: k_v['id'] for k_v in categories} images = [] print('Process image meta information...') for sample_info in mmcv.track_iter_progress(nuim.sample_data): if sample_info['is_key_frame']: img_idx = len(images) images.append( dict( id=img_idx, token=sample_info['token'], file_name=sample_info['filename'], width=sample_info['width'], height=sample_info['height'])) seg_root = f'{out_dir}semantic_masks' mmcv.mkdir_or_exist(seg_root) mmcv.mkdir_or_exist(osp.join(data_root, 'calibrated')) global process_img_anno def process_img_anno(img_info): single_img_annos, max_cls_id = get_img_annos(nuim, img_info, cat2id, out_dir, data_root, seg_root) return single_img_annos, max_cls_id print('Process img annotations...') if nproc > 1: outputs = mmcv.track_parallel_progress( process_img_anno, images, nproc=nproc) else: outputs = [] for img_info in mmcv.track_iter_progress(images): outputs.append(process_img_anno(img_info)) # Determine the index of object annotation print('Process annotation information...') annotations = [] max_cls_ids = [] for single_img_annos, max_cls_id in outputs: max_cls_ids.append(max_cls_id) for img_anno in single_img_annos: img_anno.update(id=len(annotations)) annotations.append(img_anno) max_cls_id = max(max_cls_ids) print(f'Max ID of class in the semantic map: {max_cls_id}') coco_format_json = dict( images=images, annotations=annotations, categories=categories) mmcv.mkdir_or_exist(out_dir) out_file = osp.join(out_dir, f'{extra_tag}_{version}.json') print(f'Annotation dumped to {out_file}') mmcv.dump(coco_format_json, out_file) def main(): args = parse_args() for version in args.version: nuim = NuImages( dataroot=args.data_root, version=version, verbose=True, lazy=True) export_nuim_to_coco(nuim, args.data_root, args.out_dir, args.extra_tag, version, args.nproc) if __name__ == '__main__': main() ================================================ FILE: tools/data_converter/nuscenes_converter.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. 
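# Standalone sketch (toy values only) of the COCO-style dictionary that
# export_nuim_to_coco above and export_2d_annotation below assemble before
# mmcv.dump(..., '*.coco.json'): category ids are indices into the category
# tuple, image ids are sample-data tokens or running indices, and annotation
# ids are assigned incrementally while iterating over images.
toy_categories = [dict(id=0, name='car'), dict(id=1, name='pedestrian')]
coco_2d_toy = dict(annotations=[], images=[], categories=toy_categories)

coco_2d_toy['images'].append(dict(
    id='sample_data_token_0',                 # hypothetical token
    file_name='samples/CAM_FRONT/xxx.jpg',    # hypothetical path
    width=1600, height=900))

ann_id = 0
for bbox, cat_id in [([600.0, 180.0, 120.0, 80.0], 0)]:   # bbox = [x, y, w, h]
    coco_2d_toy['annotations'].append(dict(
        id=ann_id, image_id='sample_data_token_0', category_id=cat_id,
        bbox=bbox, area=bbox[2] * bbox[3], iscrowd=0, segmentation=[]))
    ann_id += 1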
import os from collections import OrderedDict from os import path as osp from typing import List, Tuple, Union import mmcv import numpy as np from nuscenes.nuscenes import NuScenes from nuscenes.utils.geometry_utils import view_points from nuscenes.prediction import PredictHelper from pyquaternion import Quaternion from shapely.geometry import MultiPoint, box from nuscenes.utils.geometry_utils import transform_matrix import math from mmdet3d.core.bbox import points_cam2img from mmdet3d.datasets import NuScenesDataset from tqdm import tqdm import multiprocessing import copy from multiprocessing import Manager from data_converter.nuscenes_prediction_tools import get_forecasting_annotations from nuscenes.utils.data_classes import Box nus_categories = ('car', 'truck', 'trailer', 'bus', 'construction_vehicle', 'bicycle', 'motorcycle', 'pedestrian', 'traffic_cone', 'barrier') nus_attributes = ('cycle.with_rider', 'cycle.without_rider', 'pedestrian.moving', 'pedestrian.standing', 'pedestrian.sitting_lying_down', 'vehicle.moving', 'vehicle.parked', 'vehicle.stopped', 'None') ego_width, ego_length = 1.85, 4.084 def quart_to_rpy(qua): x, y, z, w = qua roll = math.atan2(2 * (w * x + y * z), 1 - 2 * (x * x + y * y)) pitch = math.asin(2 * (w * y - x * z)) yaw = math.atan2(2 * (w * z + x * y), 1 - 2 * (z * z + y * y)) return roll, pitch, yaw def locate_message(utimes, utime): i = np.searchsorted(utimes, utime) if i == len(utimes) or (i > 0 and utime - utimes[i-1] < utimes[i] - utime): i -= 1 return i def create_nuscenes_infos(root_path, info_prefix, version='v1.0-trainval', max_sweeps=10): """Create info file of nuscene dataset. Given the raw data, generate its related info file in pkl format. Args: root_path (str): Path of the data root. info_prefix (str): Prefix of the info file to be generated. version (str, optional): Version of the data. Default: 'v1.0-trainval'. max_sweeps (int, optional): Max number of sweeps. Default: 10. """ from nuscenes.nuscenes import NuScenes from nuscenes.can_bus.can_bus_api import NuScenesCanBus nusc = NuScenes(version=version, dataroot=root_path, verbose=True) nusc_can_bus = NuScenesCanBus(dataroot=root_path) from nuscenes.utils import splits available_vers = ['v1.0-trainval', 'v1.0-test', 'v1.0-mini'] assert version in available_vers if version == 'v1.0-trainval': train_scenes = splits.train val_scenes = splits.val elif version == 'v1.0-test': train_scenes = splits.test val_scenes = [] elif version == 'v1.0-mini': train_scenes = splits.mini_train val_scenes = splits.mini_val else: raise ValueError('unknown') # filter existing scenes. 
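# Quick self-contained check (toy timestamps) of the nearest-message lookup that
# locate_message above performs on the CAN bus streams: np.searchsorted gives the
# insertion index, which is stepped back when the previous message is closer to
# the query time (or when the query falls past the last message).
import numpy as np

utimes_toy = np.array([1000, 1500, 2000, 2500])
for utime, expected in [(1100, 0), (1800, 2), (2600, 3)]:
    i = np.searchsorted(utimes_toy, utime)
    if i == len(utimes_toy) or (i > 0 and utime - utimes_toy[i - 1] < utimes_toy[i] - utime):
        i -= 1
    assert i == expected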
available_scenes = get_available_scenes(nusc) available_scene_names = [s['name'] for s in available_scenes] train_scenes = list( filter(lambda x: x in available_scene_names, train_scenes)) val_scenes = list(filter(lambda x: x in available_scene_names, val_scenes)) train_scenes = set([ available_scenes[available_scene_names.index(s)]['token'] for s in train_scenes ]) val_scenes = set([ available_scenes[available_scene_names.index(s)]['token'] for s in val_scenes ]) test = 'test' in version if test: print('test scene: {}'.format(len(train_scenes))) else: print('train scene: {}, val scene: {}'.format( len(train_scenes), len(val_scenes))) train_nusc_infos, val_nusc_infos = _fill_trainval_infos( nusc, nusc_can_bus, train_scenes, val_scenes, test, max_sweeps=max_sweeps) metadata = dict(version=version) if test: print('test sample: {}'.format(len(train_nusc_infos))) data = dict(infos=train_nusc_infos, metadata=metadata) info_path = osp.join(root_path, '{}_infos_test.pkl'.format(info_prefix)) mmcv.dump(data, info_path) else: print('train sample: {}, val sample: {}'.format( len(train_nusc_infos), len(val_nusc_infos))) data = dict(infos=train_nusc_infos, metadata=metadata) info_path = osp.join(root_path, '{}_infos_train.pkl'.format(info_prefix)) mmcv.dump(data, info_path) data['infos'] = val_nusc_infos info_val_path = osp.join(root_path, '{}_infos_val.pkl'.format(info_prefix)) mmcv.dump(data, info_val_path) def get_available_scenes(nusc): """Get available scenes from the input nuscenes class. Given the raw data, get the information of available scenes for further info generation. Args: nusc (class): Dataset class in the nuScenes dataset. Returns: available_scenes (list[dict]): List of basic information for the available scenes. """ available_scenes = [] print('total scene num: {}'.format(len(nusc.scene))) for scene in nusc.scene: scene_token = scene['token'] scene_rec = nusc.get('scene', scene_token) sample_rec = nusc.get('sample', scene_rec['first_sample_token']) sd_rec = nusc.get('sample_data', sample_rec['data']['LIDAR_TOP']) has_more_frames = True scene_not_exist = False while has_more_frames: lidar_path, boxes, _ = nusc.get_sample_data(sd_rec['token']) lidar_path = str(lidar_path) if os.getcwd() in lidar_path: # path from lyftdataset is absolute path lidar_path = lidar_path.split(f'{os.getcwd()}/')[-1] # relative path if not mmcv.is_filepath(lidar_path): scene_not_exist = True break else: break if scene_not_exist: continue available_scenes.append(scene) print('exist scene num: {}'.format(len(available_scenes))) return available_scenes def _get_future_traj_info(nusc, sample, predict_steps=8, in_agent_frame=False): sample_token = sample['token'] ann_tokens = np.array(sample['anns']) sd_rec = nusc.get('sample', sample_token) fut_traj_all = [] fut_traj_valid_mask_all = [] _, boxes, _ = nusc.get_sample_data(sd_rec['data']['LIDAR_TOP'], selected_anntokens=ann_tokens) predict_helper = PredictHelper(nusc) for i, ann_token in enumerate(ann_tokens): box = boxes[i] instance_token = nusc.get('sample_annotation', ann_token)['instance_token'] fut_traj_local = predict_helper.get_future_for_agent(instance_token, sample_token, seconds=predict_steps//2, in_agent_frame=in_agent_frame) fut_traj = np.zeros((predict_steps, 2)) fut_traj_valid_mask = np.zeros((predict_steps, 2)) if fut_traj_local.shape[0] > 0: # trans = box.center # trans = np.array([0, 0, 0]) # rot = Quaternion(matrix=box.rotation_matrix) # fut_traj_scence_centric = convert_local_coords_to_global(fut_traj_local, trans, rot) fut_traj_scence_centric = 
fut_traj_local fut_traj[:fut_traj_scence_centric.shape[0], :] = fut_traj_scence_centric fut_traj_valid_mask[:fut_traj_scence_centric.shape[0], :] = 1 fut_traj_all.append(fut_traj) fut_traj_valid_mask_all.append(fut_traj_valid_mask) if len(ann_tokens) > 0: fut_traj_all = np.stack(fut_traj_all, axis=0) fut_traj_valid_mask_all = np.stack(fut_traj_valid_mask_all, axis=0) else: fut_traj_all = np.zeros((0, predict_steps, 2)) fut_traj_valid_mask_all = np.zeros((0, predict_steps, 2)) return fut_traj_all, fut_traj_valid_mask_all def _get_can_bus_info(nusc, nusc_can_bus, sample): scene_name = nusc.get('scene', sample['scene_token'])['name'] sample_timestamp = sample['timestamp'] try: pose_list = nusc_can_bus.get_messages(scene_name, 'pose') except: return np.zeros(18) # server scenes do not have can bus information. can_bus = [] # during each scene, the first timestamp of can_bus may be large than the first sample's timestamp last_pose = pose_list[0] for i, pose in enumerate(pose_list): if pose['utime'] > sample_timestamp: break last_pose = pose _ = last_pose.pop('utime') # useless pos = last_pose.pop('pos') rotation = last_pose.pop('orientation') can_bus.extend(pos) can_bus.extend(rotation) for key in last_pose.keys(): can_bus.extend(pose[key]) # 16 elements can_bus.extend([0., 0.]) return np.array(can_bus) def _fill_trainval_infos(nusc, nusc_can_bus, train_scenes, val_scenes, test=False, max_sweeps=10, forecasting=False, forecasting_length=13, his_ts=2, fut_ts=6, ): """Generate the train/val infos from the raw data. Args: nusc (:obj:`NuScenes`): Dataset class in the nuScenes dataset. train_scenes (list[str]): Basic information of training scenes. val_scenes (list[str]): Basic information of validation scenes. test (bool, optional): Whether use the test mode. In test mode, no annotations can be accessed. Default: False. max_sweeps (int, optional): Max number of sweeps. Default: 10. Returns: tuple[list[dict]]: Information of training set and validation set that will be saved to the info file. 
""" train_nusc_infos = [] val_nusc_infos = [] frame_idx = 0 cat2idx = {} for i, name in enumerate(nus_categories): cat2idx[name] = i for sample in mmcv.track_iter_progress(nusc.sample): lidar_token = sample['data']['LIDAR_TOP'] sd_rec = nusc.get('sample_data', sample['data']['LIDAR_TOP']) cs_record = nusc.get('calibrated_sensor', sd_rec['calibrated_sensor_token']) pose_record = nusc.get('ego_pose', sd_rec['ego_pose_token']) if sample['prev'] != '': sample_prev = nusc.get('sample', sample['prev']) sd_rec_prev = nusc.get('sample_data', sample_prev['data']['LIDAR_TOP']) pose_record_prev = nusc.get('ego_pose', sd_rec_prev['ego_pose_token']) else: pose_record_prev = None if sample['next'] != '': sample_next = nusc.get('sample', sample['next']) sd_rec_next = nusc.get('sample_data', sample_next['data']['LIDAR_TOP']) pose_record_next = nusc.get('ego_pose', sd_rec_next['ego_pose_token']) else: pose_record_next = None lidar_path, boxes, _ = nusc.get_sample_data(lidar_token) mmcv.check_file_exist(lidar_path) can_bus = _get_can_bus_info(nusc, nusc_can_bus, sample) info = { 'lidar_path': lidar_path, 'token': sample['token'], 'prev': sample['prev'], 'next': sample['next'], 'can_bus': can_bus, 'sweeps': [], 'frame_idx': frame_idx, 'cams': dict(), 'scene_token': sample['scene_token'], 'lidar2ego_translation': cs_record['translation'], 'lidar2ego_rotation': cs_record['rotation'], 'ego2global_translation': pose_record['translation'], 'ego2global_rotation': pose_record['rotation'], 'timestamp': sample['timestamp'], } l2e_r = info['lidar2ego_rotation'] l2e_t = info['lidar2ego_translation'] e2g_r = info['ego2global_rotation'] e2g_t = info['ego2global_translation'] l2e_r_mat = Quaternion(l2e_r).rotation_matrix e2g_r_mat = Quaternion(e2g_r).rotation_matrix if sample['next'] == '': frame_idx = 0 else: frame_idx += 1 # obtain 6 image's information per frame camera_types = [ 'CAM_FRONT', 'CAM_FRONT_RIGHT', 'CAM_FRONT_LEFT', 'CAM_BACK', 'CAM_BACK_LEFT', 'CAM_BACK_RIGHT', ] for cam in camera_types: cam_token = sample['data'][cam] cam_path, _, cam_intrinsic = nusc.get_sample_data(cam_token) cam_info = obtain_sensor2top(nusc, cam_token, l2e_t, l2e_r_mat, e2g_t, e2g_r_mat, cam) cam_info.update(cam_intrinsic=cam_intrinsic) info['cams'].update({cam: cam_info}) # obtain sweeps for a single key-frame sd_rec = nusc.get('sample_data', sample['data']['LIDAR_TOP']) sweeps = [] while len(sweeps) < max_sweeps: if not sd_rec['prev'] == '': sweep = obtain_sensor2top(nusc, sd_rec['prev'], l2e_t, l2e_r_mat, e2g_t, e2g_r_mat, 'lidar') sweeps.append(sweep) sd_rec = nusc.get('sample_data', sd_rec['prev']) else: break info['sweeps'] = sweeps # obtain annotation if not test: annotations = [ nusc.get('sample_annotation', token) for token in sample['anns'] ] locs = np.array([b.center for b in boxes]).reshape(-1, 3) dims = np.array([b.wlh for b in boxes]).reshape(-1, 3) rots = np.array([b.orientation.yaw_pitch_roll[0] for b in boxes]).reshape(-1, 1) velocity = np.array( [nusc.box_velocity(token)[:2] for token in sample['anns']]) valid_flag = np.array( [(anno['num_lidar_pts'] + anno['num_radar_pts']) > 0 for anno in annotations], dtype=bool).reshape(-1) # add instance_ids instance_inds = [nusc.getind('instance', ann['instance_token']) for ann in annotations] future_traj_all, future_traj_valid_mask_all = _get_future_traj_info(nusc, sample, in_agent_frame=False) # from IPython import embed # embed() # exit() # future_traj_all_rel, future_traj_valid_mask_all = _get_future_traj_info(nusc, sample, in_agent_frame=True) instance_tokens = 
[ann['instance_token'] for ann in annotations] # dtype('= 2: command = np.array([1, 0, 0]) # Turn Right elif ego_fut_trajs[-1][1] <= -2: command = np.array([0, 1, 0]) # Turn Left else: command = np.array([0, 0, 1]) # Go Straight # offset from lcf -> per-step offset ego_fut_trajs = ego_fut_trajs[1:] - ego_fut_trajs[:-1] ### ego lcf feat (vx, vy, ax, ay, w, length, width, vel, steer), w: yaw角速度 ego_lcf_feat = np.zeros(9) # 根据odom推算自车速度及加速度 _, _, ego_yaw = quart_to_rpy(pose_record['rotation']) ego_pos = np.array(pose_record['translation']) if pose_record_prev is not None: _, _, ego_yaw_prev = quart_to_rpy(pose_record_prev['rotation']) ego_pos_prev = np.array(pose_record_prev['translation']) if pose_record_next is not None: _, _, ego_yaw_next = quart_to_rpy(pose_record_next['rotation']) ego_pos_next = np.array(pose_record_next['translation']) assert (pose_record_prev is not None) or (pose_record_next is not None), 'prev token and next token all empty' if pose_record_prev is not None: ego_w = (ego_yaw - ego_yaw_prev) / 0.5 ego_v = np.linalg.norm(ego_pos[:2] - ego_pos_prev[:2]) / 0.5 ego_vy, ego_vx = ego_v * math.cos(ego_yaw + np.pi/2), ego_v * math.sin(ego_yaw + np.pi/2) else: ego_w = (ego_yaw_next - ego_yaw) / 0.5 ego_v = np.linalg.norm(ego_pos_next[:2] - ego_pos[:2]) / 0.5 ego_vy, ego_vx = ego_v * math.cos(ego_yaw + np.pi/2), ego_v * math.sin(ego_yaw + np.pi/2) ref_scene = nusc.get("scene", sample['scene_token']) try: pose_msgs = nusc_can_bus.get_messages(ref_scene['name'],'pose') steer_msgs = nusc_can_bus.get_messages(ref_scene['name'], 'steeranglefeedback') pose_uts = [msg['utime'] for msg in pose_msgs] steer_uts = [msg['utime'] for msg in steer_msgs] ref_utime = sample['timestamp'] pose_index = locate_message(pose_uts, ref_utime) pose_data = pose_msgs[pose_index] steer_index = locate_message(steer_uts, ref_utime) steer_data = steer_msgs[steer_index] # initial speed v0 = pose_data["vel"][0] # [0] means longitudinal velocity m/s # curvature (positive: turn left) steering = steer_data["value"] # flip x axis if in left-hand traffic (singapore) map_location = nusc.get('log', nusc.get('scene', sample['scene_token'])['log_token'])['location'] flip_flag = True if map_location.startswith('singapore') else False if flip_flag: steering *= -1 Kappa = 2 * steering / 2.588 except: delta_x = ego_his_trajs[-1, 0] + ego_fut_trajs[0, 0] delta_y = ego_his_trajs[-1, 1] + ego_fut_trajs[0, 1] v0 = np.sqrt(delta_x**2 + delta_y**2) Kappa = 0 ego_lcf_feat[:2] = np.array([ego_vx, ego_vy]) # can_bus[13:15] ego_lcf_feat[2:4] = can_bus[7:9] ego_lcf_feat[4] = ego_w # can_bus[12] ego_lcf_feat[5:7] = np.array([ego_length, ego_width]) ego_lcf_feat[7] = v0 ego_lcf_feat[8] = Kappa info['gt_boxes'] = gt_boxes info['gt_names'] = names info['gt_velocity'] = velocity.reshape(-1, 2) info['num_lidar_pts'] = np.array( [a['num_lidar_pts'] for a in annotations]) info['num_radar_pts'] = np.array( [a['num_radar_pts'] for a in annotations]) info['valid_flag'] = valid_flag info['gt_agent_fut_trajs'] = gt_fut_trajs.reshape(-1, fut_ts*2).astype(np.float32) info['gt_agent_fut_masks'] = gt_fut_masks.reshape(-1, fut_ts).astype(np.float32) info['gt_agent_lcf_feat'] = agent_lcf_feat.astype(np.float32) info['gt_agent_fut_yaw'] = gt_fut_yaw.astype(np.float32) info['gt_agent_fut_goal'] = gt_fut_goal.astype(np.float32) info['gt_ego_his_trajs'] = ego_his_trajs[:, :2].astype(np.float32) info['gt_ego_fut_trajs'] = ego_fut_trajs[:, :2].astype(np.float32) info['gt_ego_fut_masks'] = ego_fut_masks[1:].astype(np.float32) info['gt_ego_fut_cmd'] = 
command.astype(np.float32) info['gt_ego_lcf_feat'] = ego_lcf_feat.astype(np.float32) info['instance_inds'] = instance_inds info['gt_ins_tokens'] = np.array(instance_tokens) info['fut_traj'] = future_traj_all info['fut_traj_valid_mask'] = future_traj_valid_mask_all # add visibility_tokens visibility_tokens = [int(anno['visibility_token']) for anno in annotations] info['visibility_tokens'] = np.array(visibility_tokens) # if forecasting: # fboxes, fannotations, fmasks, ftypes = get_forecasting_annotations(nusc, annotations, forecasting_length) # locs = [np.array([b.center for b in boxes]).reshape(-1, 3) for boxes in fboxes] # tokens = [np.array([b.token for b in boxes]) for boxes in fboxes] # info['forecasting_locs'] = np.array(locs) # info['forecasting_tokens'] = np.array(tokens) # info['forecasting_masks'] = np.array(fmasks) # info['forecasting_types'] = np.array(ftypes) gt_2dbboxes_cams = [] gt_3dbboxes_cams = [] centers2d_cams = [] gt_2dbboxes_ignore_cams = [] gt_2dlabels_cams = [] depths_cams = [] visibilities = [] for cam_type, cam_info in info['cams'].items(): gt_3dbboxes = [] gt_2dbboxes = [] centers2d = [] gt_2dbboxes_ignore = [] gt_2dlabels = [] depths = [] visibility = [] (height, width, _) = mmcv.imread(cam_info['data_path']).shape annos_cam = get_2d_boxes(nusc, cam_info['sample_data_token'], visibilities= ['', '1', '2', '3', '4'], mono3d=True) for i, ann in enumerate(annos_cam): if ann is None: continue if ann.get('ignore', False): continue x1, y1, w, h = ann['bbox'] inter_w = max(0, min(x1 + w, width) - max(x1, 0)) inter_h = max(0, min(y1 + h, height) - max(y1, 0)) if inter_w * inter_h == 0: continue if ann['area'] <= 0 or w < 1 or h < 1: continue if ann['category_name'] not in nus_categories: continue bbox = [x1, y1, x1 + w, y1 + h] if ann.get('iscrowd', False): gt_2dbboxes_ignore.append(bbox) else: gt_2dbboxes.append(bbox) gt_2dlabels.append(ann['category_id']) center2d = ann['center2d'][:2] depth = ann['center2d'][2] centers2d.append(center2d) depths.append(depth) visibility.append(ann['visibility_token']) gt_3dbboxes.append(ann['bbox_cam3d']) gt_2dbboxes = np.array(gt_2dbboxes, dtype=np.float32) gt_3dbboxes_cam = np.array(gt_3dbboxes, dtype=np.float32) gt_2dlabels = np.array(gt_2dlabels, dtype=np.int64) centers2d = np.array(centers2d, dtype=np.float32) depths = np.array(depths, dtype=np.float32) gt_2dbboxes_ignore = np.array(gt_2dbboxes_ignore, dtype=np.float32) gt_2dbboxes_cams.append(gt_2dbboxes) gt_2dlabels_cams.append(gt_2dlabels) centers2d_cams.append(centers2d) gt_3dbboxes_cams.append(gt_3dbboxes_cam) depths_cams.append(depths) gt_2dbboxes_ignore_cams.append(gt_2dbboxes_ignore) visibilities.append(visibility) info.update( dict( bboxes2d=gt_2dbboxes_cams, bboxes3d_cams=gt_3dbboxes_cams, labels2d=gt_2dlabels_cams, centers2d=centers2d_cams, depths=depths_cams, bboxes_ignore=gt_2dbboxes_ignore_cams, visibilities = visibilities,) ) if sample['scene_token'] in train_scenes: train_nusc_infos.append(info) else: val_nusc_infos.append(info) return train_nusc_infos, val_nusc_infos def get_global_sensor_pose(rec, nusc, inverse=False): lidar_sample_data = nusc.get('sample_data', rec['data']['LIDAR_TOP']) sd_ep = nusc.get("ego_pose", lidar_sample_data["ego_pose_token"]) sd_cs = nusc.get("calibrated_sensor", lidar_sample_data["calibrated_sensor_token"]) if inverse is False: global_from_ego = transform_matrix(sd_ep["translation"], Quaternion(sd_ep["rotation"]), inverse=False) ego_from_sensor = transform_matrix(sd_cs["translation"], Quaternion(sd_cs["rotation"]), inverse=False) pose = 
global_from_ego.dot(ego_from_sensor) # translation equivalent writing # pose_translation = np.array(sd_cs["translation"]) # rot_mat = Quaternion(sd_ep['rotation']).rotation_matrix # pose_translation = np.dot(rot_mat, pose_translation) # # pose_translation = pose[:3, 3] # pose_translation = pose_translation + np.array(sd_ep["translation"]) else: sensor_from_ego = transform_matrix(sd_cs["translation"], Quaternion(sd_cs["rotation"]), inverse=True) ego_from_global = transform_matrix(sd_ep["translation"], Quaternion(sd_ep["rotation"]), inverse=True) pose = sensor_from_ego.dot(ego_from_global) return pose def obtain_sensor2top(nusc, sensor_token, l2e_t, l2e_r_mat, e2g_t, e2g_r_mat, sensor_type='lidar'): """Obtain the info with RT matric from general sensor to Top LiDAR. Args: nusc (class): Dataset class in the nuScenes dataset. sensor_token (str): Sample data token corresponding to the specific sensor type. l2e_t (np.ndarray): Translation from lidar to ego in shape (1, 3). l2e_r_mat (np.ndarray): Rotation matrix from lidar to ego in shape (3, 3). e2g_t (np.ndarray): Translation from ego to global in shape (1, 3). e2g_r_mat (np.ndarray): Rotation matrix from ego to global in shape (3, 3). sensor_type (str, optional): Sensor to calibrate. Default: 'lidar'. Returns: sweep (dict): Sweep information after transformation. """ sd_rec = nusc.get('sample_data', sensor_token) cs_record = nusc.get('calibrated_sensor', sd_rec['calibrated_sensor_token']) pose_record = nusc.get('ego_pose', sd_rec['ego_pose_token']) data_path = str(nusc.get_sample_data_path(sd_rec['token'])) if os.getcwd() in data_path: # path from lyftdataset is absolute path data_path = data_path.split(f'{os.getcwd()}/')[-1] # relative path sweep = { 'data_path': data_path, 'type': sensor_type, 'sample_data_token': sd_rec['token'], 'sensor2ego_translation': cs_record['translation'], 'sensor2ego_rotation': cs_record['rotation'], 'ego2global_translation': pose_record['translation'], 'ego2global_rotation': pose_record['rotation'], 'timestamp': sd_rec['timestamp'] } l2e_r_s = sweep['sensor2ego_rotation'] l2e_t_s = sweep['sensor2ego_translation'] e2g_r_s = sweep['ego2global_rotation'] e2g_t_s = sweep['ego2global_translation'] # obtain the RT from sensor to Top LiDAR # sweep->ego->global->ego'->lidar l2e_r_s_mat = Quaternion(l2e_r_s).rotation_matrix e2g_r_s_mat = Quaternion(e2g_r_s).rotation_matrix R = (l2e_r_s_mat.T @ e2g_r_s_mat.T) @ ( np.linalg.inv(e2g_r_mat).T @ np.linalg.inv(l2e_r_mat).T) T = (l2e_t_s @ e2g_r_s_mat.T + e2g_t_s) @ ( np.linalg.inv(e2g_r_mat).T @ np.linalg.inv(l2e_r_mat).T) T -= e2g_t @ (np.linalg.inv(e2g_r_mat).T @ np.linalg.inv(l2e_r_mat).T ) + l2e_t @ np.linalg.inv(l2e_r_mat).T sweep['sensor2lidar_rotation'] = R.T # points @ R.T + T sweep['sensor2lidar_translation'] = T return sweep def export_2d_annotation(root_path, info_path, version, mono3d=True): """Export 2d annotation from the info file and raw data. Args: root_path (str): Root path of the raw data. info_path (str): Path of the info file. version (str): Dataset version. mono3d (bool, optional): Whether to export mono3d annotation. Default: True. 
""" # get bbox annotations for camera camera_types = [ 'CAM_FRONT', 'CAM_FRONT_RIGHT', 'CAM_FRONT_LEFT', 'CAM_BACK', 'CAM_BACK_LEFT', 'CAM_BACK_RIGHT', ] nusc_infos = mmcv.load(info_path)['infos'] nusc = NuScenes(version=version, dataroot=root_path, verbose=True) # info_2d_list = [] cat2Ids = [ dict(id=nus_categories.index(cat_name), name=cat_name) for cat_name in nus_categories ] coco_ann_id = 0 coco_2d_dict = dict(annotations=[], images=[], categories=cat2Ids) for info in mmcv.track_iter_progress(nusc_infos): for cam in camera_types: cam_info = info['cams'][cam] coco_infos = get_2d_boxes( nusc, cam_info['sample_data_token'], visibilities=['', '1', '2', '3', '4'], mono3d=mono3d) (height, width, _) = mmcv.imread(cam_info['data_path']).shape coco_2d_dict['images'].append( dict( file_name=cam_info['data_path'].split('data/nuscenes/') [-1], id=cam_info['sample_data_token'], token=info['token'], cam2ego_rotation=cam_info['sensor2ego_rotation'], cam2ego_translation=cam_info['sensor2ego_translation'], ego2global_rotation=info['ego2global_rotation'], ego2global_translation=info['ego2global_translation'], cam_intrinsic=cam_info['cam_intrinsic'], width=width, height=height)) for coco_info in coco_infos: if coco_info is None: continue # add an empty key for coco format coco_info['segmentation'] = [] coco_info['id'] = coco_ann_id coco_2d_dict['annotations'].append(coco_info) coco_ann_id += 1 if mono3d: json_prefix = f'{info_path[:-4]}_mono3d' else: json_prefix = f'{info_path[:-4]}' mmcv.dump(coco_2d_dict, f'{json_prefix}.coco.json') def get_2d_boxes(nusc, sample_data_token: str, visibilities: List[str], mono3d=True): """Get the 2D annotation records for a given `sample_data_token`. Args: sample_data_token (str): Sample data token belonging to a camera keyframe. visibilities (list[str]): Visibility filter. mono3d (bool): Whether to get boxes with mono3d annotation. Return: list[dict]: List of 2D annotation record that belongs to the input `sample_data_token`. """ # Get the sample data and the sample corresponding to that sample data. sd_rec = nusc.get('sample_data', sample_data_token) assert sd_rec[ 'sensor_modality'] == 'camera', 'Error: get_2d_boxes only works' \ ' for camera sample_data!' if not sd_rec['is_key_frame']: raise ValueError( 'The 2D re-projections are available only for keyframes.') s_rec = nusc.get('sample', sd_rec['sample_token']) # Get the calibrated sensor and ego pose # record to get the transformation matrices. cs_rec = nusc.get('calibrated_sensor', sd_rec['calibrated_sensor_token']) pose_rec = nusc.get('ego_pose', sd_rec['ego_pose_token']) camera_intrinsic = np.array(cs_rec['camera_intrinsic']) # Get all the annotation with the specified visibilties. ann_recs = [ nusc.get('sample_annotation', token) for token in s_rec['anns'] ] ann_recs = [ ann_rec for ann_rec in ann_recs if (ann_rec['visibility_token'] in visibilities) ] repro_recs = [] for ann_rec in ann_recs: # Augment sample_annotation with token information. ann_rec['sample_annotation_token'] = ann_rec['token'] ann_rec['sample_data_token'] = sample_data_token # Get the box in global coordinates. box = nusc.get_box(ann_rec['token']) # Move them to the ego-pose frame. box.translate(-np.array(pose_rec['translation'])) box.rotate(Quaternion(pose_rec['rotation']).inverse) # Move them to the calibrated sensor frame. box.translate(-np.array(cs_rec['translation'])) box.rotate(Quaternion(cs_rec['rotation']).inverse) # Filter out the corners that are not in front of the calibrated # sensor. 
corners_3d = box.corners() in_front = np.argwhere(corners_3d[2, :] > 0).flatten() corners_3d = corners_3d[:, in_front] # Project 3d box to 2d. corner_coords = view_points(corners_3d, camera_intrinsic, True).T[:, :2].tolist() # Keep only corners that fall within the image. final_coords = post_process_coords(corner_coords) # Skip if the convex hull of the re-projected corners # does not intersect the image canvas. if final_coords is None: continue else: min_x, min_y, max_x, max_y = final_coords # Generate dictionary record to be included in the .json file. repro_rec = generate_record(ann_rec, min_x, min_y, max_x, max_y, sample_data_token, sd_rec['filename']) if repro_rec is None: continue # If mono3d=True, add 3D annotations in camera coordinates if mono3d and (repro_rec is not None): loc = box.center.tolist() dim = box.wlh dim[[0, 1, 2]] = dim[[1, 2, 0]] # convert wlh to our lhw dim = dim.tolist() rot = box.orientation.yaw_pitch_roll[0] rot = [-rot] # convert the rot to our cam coordinate global_velo2d = nusc.box_velocity(box.token)[:2] global_velo3d = np.array([*global_velo2d, 0.0]) e2g_r_mat = Quaternion(pose_rec['rotation']).rotation_matrix c2e_r_mat = Quaternion(cs_rec['rotation']).rotation_matrix cam_velo3d = global_velo3d @ np.linalg.inv( e2g_r_mat).T @ np.linalg.inv(c2e_r_mat).T velo = cam_velo3d[0::2].tolist() repro_rec['bbox_cam3d'] = loc + dim + rot repro_rec['velo_cam3d'] = velo center3d = np.array(loc).reshape([1, 3]) center2d = points_cam2img( center3d, camera_intrinsic, with_depth=True) repro_rec['center2d'] = center2d.squeeze().tolist() # normalized center2D + depth # if samples with depth < 0 will be removed if repro_rec['center2d'][2] <= 0: continue ann_token = nusc.get('sample_annotation', box.token)['attribute_tokens'] if len(ann_token) == 0: attr_name = 'None' else: attr_name = nusc.get('attribute', ann_token[0])['name'] attr_id = nus_attributes.index(attr_name) repro_rec['attribute_name'] = attr_name repro_rec['attribute_id'] = attr_id repro_recs.append(repro_rec) return repro_recs def post_process_coords( corner_coords: List, imsize: Tuple[int, int] = (1600, 900) ) -> Union[Tuple[float, float, float, float], None]: """Get the intersection of the convex hull of the reprojected bbox corners and the image canvas, return None if no intersection. Args: corner_coords (list[int]): Corner coordinates of reprojected bounding box. imsize (tuple[int]): Size of the image canvas. Return: tuple [float]: Intersection of the convex hull of the 2D box corners and the image canvas. """ polygon_from_2d_box = MultiPoint(corner_coords).convex_hull img_canvas = box(0, 0, imsize[0], imsize[1]) if polygon_from_2d_box.intersects(img_canvas): img_intersection = polygon_from_2d_box.intersection(img_canvas) intersection_coords = np.array( [coord for coord in img_intersection.exterior.coords]) min_x = min(intersection_coords[:, 0]) min_y = min(intersection_coords[:, 1]) max_x = max(intersection_coords[:, 0]) max_y = max(intersection_coords[:, 1]) return min_x, min_y, max_x, max_y else: return None def generate_record(ann_rec: dict, x1: float, y1: float, x2: float, y2: float, sample_data_token: str, filename: str) -> OrderedDict: """Generate one 2D annotation record given various information on top of the 2D bounding box coordinates. Args: ann_rec (dict): Original 3d annotation record. x1 (float): Minimum value of the x coordinate. y1 (float): Minimum value of the y coordinate. x2 (float): Maximum value of the x coordinate. y2 (float): Maximum value of the y coordinate. 
sample_data_token (str): Sample data token. filename (str): The corresponding image file where the annotation is present. Returns: dict: A sample 2D annotation record. - file_name (str): file name - image_id (str): sample data token - area (float): 2d box area - category_name (str): category name - category_id (int): category id - bbox (list[float]): left x, top y, dx, dy of 2d box - iscrowd (int): whether the area is crowd """ repro_rec = OrderedDict() repro_rec['sample_data_token'] = sample_data_token coco_rec = dict() relevant_keys = [ 'attribute_tokens', 'category_name', 'instance_token', 'next', 'num_lidar_pts', 'num_radar_pts', 'prev', 'sample_annotation_token', 'sample_data_token', 'visibility_token', ] for key, value in ann_rec.items(): if key in relevant_keys: repro_rec[key] = value repro_rec['bbox_corners'] = [x1, y1, x2, y2] repro_rec['filename'] = filename coco_rec['file_name'] = filename coco_rec['image_id'] = sample_data_token coco_rec['area'] = (y2 - y1) * (x2 - x1) if repro_rec['category_name'] not in NuScenesDataset.NameMapping: return None cat_name = NuScenesDataset.NameMapping[repro_rec['category_name']] # filter useless categories for tracking if cat_name not in nus_categories: return None coco_rec['category_name'] = cat_name coco_rec['category_id'] = nus_categories.index(cat_name) coco_rec['bbox'] = [x1, y1, x2 - x1, y2 - y1] coco_rec['iscrowd'] = 0 coco_rec['visibility_token'] = repro_rec['visibility_token'] return coco_rec if __name__ == '__main__': export_2d_annotation('/mount/data/lsbevv2/data/nuscenes', '/mount/data/lsbevv2/data/nuscenes/bevdetv2-nuscenes_infos_val.pkl', 'v1.0-trainval', False) ================================================ FILE: tools/data_converter/nuscenes_prediction_tools.py ================================================ # ------------------------------------------------------------------------ # Copyright (c) 2023 toyota research institute. 
# ------------------------------------------------------------------------ # Modified from FutureDet (https://github.com/neeharperi/FutureDet) # ------------------------------------------------------------------------ import numpy as np from pyquaternion import Quaternion from nuscenes import NuScenes from nuscenes.utils.data_classes import Box from itertools import tee from copy import deepcopy def get_forecasting_annotations(nusc: NuScenes, annotations, length): """Acquire the trajectories for each box """ forecast_annotations = [] forecast_boxes = [] forecast_trajectory_type = [] forecast_visibility_mask = [] sample_tokens = [s["token"] for s in nusc.sample] for annotation in annotations: tracklet_box = [] tracklet_annotation = [] tracklet_visiblity_mask = [] tracklet_trajectory_type = [] token = nusc.sample[sample_tokens.index(annotation["sample_token"])]["data"]["LIDAR_TOP"] sd_record = nusc.get("sample_data", token) cs_record = nusc.get("calibrated_sensor", sd_record["calibrated_sensor_token"]) pose_record = nusc.get("ego_pose", sd_record["ego_pose_token"]) visibility = True for step in range(length): box = Box(center = annotation["translation"], size = annotation["size"], orientation = Quaternion(annotation["rotation"]), velocity = nusc.box_velocity(annotation["token"]), name = annotation["category_name"], token = annotation["token"]) # move box to the ego-system when the prediction is made box.translate(-np.array(pose_record["translation"])) box.rotate(Quaternion(pose_record["rotation"]).inverse) # Move box to sensor coord system box.translate(-np.array(cs_record["translation"])) box.rotate(Quaternion(cs_record["rotation"]).inverse) tracklet_box.append(box) tracklet_annotation.append(annotation) tracklet_visiblity_mask.append(visibility) next_token = annotation['next'] if next_token != '': annotation = nusc.get('sample_annotation', next_token) else: # if the trajectory cannot be prolonged anymore, # use the last one to pad and set the visibility flag annotation = annotation visibility = False tokens = [b["sample_token"] for b in tracklet_annotation] time = [get_time(nusc, src, dst) for src, dst in window(tokens, 2)] tracklet_trajectory_type = trajectory_type(nusc, tracklet_box, time, length) # same as FutureDet forecast_boxes.append(tracklet_box) forecast_annotations.append(tracklet_annotation) forecast_trajectory_type.append(length * [tracklet_trajectory_type]) forecast_visibility_mask.append(tracklet_visiblity_mask) return forecast_boxes, forecast_annotations, forecast_visibility_mask, forecast_trajectory_type def window(iterable, size): iters = tee(iterable, size) for i in range(1, size): for each in iters[i:]: next(each, None) return zip(*iters) def get_time(nusc, src_token, dst_token): time_last = 1e-6 * nusc.get('sample', src_token)["timestamp"] time_first = 1e-6 * nusc.get('sample', dst_token)["timestamp"] time_diff = time_first - time_last return time_diff def center_distance(gt_box, pred_box) -> float: """ L2 distance between the box centers (xy only). :param gt_box: GT annotation sample. :param pred_box: Predicted sample. :return: L2 distance. 
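    Example (illustrative sketch only; the boxes below are constructed by hand
    and are not taken from any dataset shipped with this repository):
        >>> from pyquaternion import Quaternion
        >>> from nuscenes.utils.data_classes import Box
        >>> gt = Box(center=[0.0, 0.0, 0.0], size=[1.0, 1.0, 1.5],
        ...          orientation=Quaternion(axis=[0, 0, 1], angle=0.0))
        >>> pred = Box(center=[3.0, 4.0, 2.0], size=[1.0, 1.0, 1.5],
        ...            orientation=Quaternion(axis=[0, 0, 1], angle=0.0))
        >>> center_distance(gt, pred)  # sqrt(3**2 + 4**2); the z offset is ignored
        5.0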
""" return np.linalg.norm(np.array(pred_box.center[:2]) - np.array(gt_box.center[:2])) def trajectory_type(nusc, boxes, time, timesteps=7, past=False): target = boxes[-1] static_forecast = deepcopy(boxes[0]) linear_forecast = deepcopy(boxes[0]) vel = linear_forecast.velocity[:2] disp = np.sum(list(map(lambda x: np.array(list(vel) + [0]) * x, time)), axis=0) if past: linear_forecast.center = linear_forecast.center - disp else: linear_forecast.center = linear_forecast.center + disp if center_distance(target, static_forecast) < max(target.wlh[0], target.wlh[1]): # return "static" return 0 elif center_distance(target, linear_forecast) < max(target.wlh[0], target.wlh[1]): # return "linear" return 1 else: # return "nonlinear" return 2 ================================================ FILE: tools/data_converter/nuscenes_track_converter.py ================================================ # ------------------------------------------------------------------------ # Copyright (c) 2023 toyota research instutute. # ------------------------------------------------------------------------ # Modified from MUTR3D (https://github.com/a1600012888/MUTR3D) # Copyright (c) 2022 Tianyuan Zhang # ------------------------------------------------------------------------ # Modified from mmdetection3d (https://github.com/open-mmlab/mmdetection3d) # Copyright (c) OpenMMLab. All rights reserved. # ------------------------------------------------------------------------ import mmcv import numpy as np import os from collections import OrderedDict from nuscenes.nuscenes import NuScenes from nuscenes.utils.geometry_utils import view_points from os import path as osp from pyquaternion import Quaternion from shapely.geometry import MultiPoint, box from typing import List, Tuple, Union from mmdet3d.core.bbox.box_np_ops import points_cam2img from projects.tracking_plugin.datasets.nuscenes_tracking_dataset import NuScenesTrackingDataset as NuScenesDataset from data_converter.nuscenes_prediction_tools import get_forecasting_annotations # remove the classes barrier, trafficcone and construction_vehicle nus_categories = ( 'car', 'truck', 'bus', 'trailer', 'motorcycle', 'bicycle', 'pedestrian', 'construction_vehicle', 'traffic_cone', 'barrier') nus_attributes = ('cycle.with_rider', 'cycle.without_rider', 'pedestrian.moving', 'pedestrian.standing', 'pedestrian.sitting_lying_down', 'vehicle.moving', 'vehicle.parked', 'vehicle.stopped', 'None') def create_nuscenes_infos(root_path, out_dir, info_prefix, version='v1.0-trainval', max_sweeps=10, forecasting=False, forecasting_length=13): """Create info file of nuscene dataset. Given the raw data, generate its related info file in pkl format. Args: root_path (str): Path of the data root. info_prefix (str): Prefix of the info file to be generated. version (str): Version of the data. Default: 'v1.0-trainval' max_sweeps (int): Max number of sweeps. Default: 10 forecasting (bool): If prepare for forecasting data forecasting_length (int): Max frame number for forecasting. 
Default: 13 (6 seconds + current frame) """ from nuscenes.nuscenes import NuScenes nusc = NuScenes(version=version, dataroot=root_path, verbose=True) from nuscenes.utils import splits available_vers = ['v1.0-trainval', 'v1.0-test', 'v1.0-mini'] assert version in available_vers if version == 'v1.0-trainval': train_scenes = splits.train val_scenes = splits.val elif version == 'v1.0-test': train_scenes = splits.test val_scenes = [] elif version == 'v1.0-mini': train_scenes = splits.mini_train val_scenes = splits.mini_val info_prefix = info_prefix + '-mini' else: raise ValueError('unknown') # filter existing scenes. available_scenes = get_available_scenes(nusc) available_scene_names = [s['name'] for s in available_scenes] train_scenes = list( filter(lambda x: x in available_scene_names, train_scenes)) val_scenes = list(filter(lambda x: x in available_scene_names, val_scenes)) train_scenes = set([ available_scenes[available_scene_names.index(s)]['token'] for s in train_scenes ]) val_scenes = set([ available_scenes[available_scene_names.index(s)]['token'] for s in val_scenes ]) test = 'test' in version if test: print('test scene: {}'.format(len(train_scenes))) else: print('train scene: {}, val scene: {}'.format( len(train_scenes), len(val_scenes))) train_nusc_infos, val_nusc_infos = _fill_trainval_infos( nusc, train_scenes, val_scenes, test, max_sweeps=max_sweeps, forecasting=forecasting, forecasting_length=forecasting_length) metadata = dict(version=version) if test: print('test sample: {}'.format(len(train_nusc_infos))) data = dict(infos=train_nusc_infos, metadata=metadata) info_path = osp.join(out_dir, '{}_infos_test.pkl'.format(info_prefix)) mmcv.dump(data, info_path) else: print('train sample: {}, val sample: {}'.format( len(train_nusc_infos), len(val_nusc_infos))) data = dict(infos=train_nusc_infos, metadata=metadata) info_path = osp.join(out_dir, '{}_infos_train.pkl'.format(info_prefix)) mmcv.dump(data, info_path) data['infos'] = val_nusc_infos info_val_path = osp.join(out_dir, '{}_infos_val.pkl'.format(info_prefix)) mmcv.dump(data, info_val_path) def get_available_scenes(nusc): """Get available scenes from the input nuscenes class. Given the raw data, get the information of available scenes for further info generation. Args: nusc (class): Dataset class in the nuScenes dataset. Returns: available_scenes (list[dict]): List of basic information for the available scenes. """ available_scenes = [] print('total scene num: {}'.format(len(nusc.scene))) for scene in nusc.scene: scene_token = scene['token'] scene_rec = nusc.get('scene', scene_token) sample_rec = nusc.get('sample', scene_rec['first_sample_token']) sd_rec = nusc.get('sample_data', sample_rec['data']['LIDAR_TOP']) has_more_frames = True scene_not_exist = False while has_more_frames: lidar_path, boxes, _ = nusc.get_sample_data(sd_rec['token']) lidar_path = str(lidar_path) if os.getcwd() in lidar_path: # path from lyftdataset is absolute path lidar_path = lidar_path.split(f'{os.getcwd()}/')[-1] # relative path if not mmcv.is_filepath(lidar_path): scene_not_exist = True break else: break if scene_not_exist: continue available_scenes.append(scene) print('exist scene num: {}'.format(len(available_scenes))) return available_scenes def _fill_trainval_infos(nusc, train_scenes, val_scenes, test=False, max_sweeps=10, forecasting=False, forecasting_length=13): """Generate the train/val infos from the raw data. Args: nusc (:obj:`NuScenes`): Dataset class in the nuScenes dataset. train_scenes (list[str]): Basic information of training scenes. 
val_scenes (list[str]): Basic information of validation scenes. test (bool): Whether use the test mode. In the test mode, no annotations can be accessed. Default: False. max_sweeps (int): Max number of sweeps. Default: 10. forecasting (bool): If prepare for forecasting data forecasting_length (int): Max frame number for forecasting. Default: 13 (6 seconds + current frame) Returns: tuple[list[dict]]: Information of training set and validation set that will be saved to the info file. """ train_nusc_infos = [] val_nusc_infos = [] frame_idx = 0 for sample in mmcv.track_iter_progress(nusc.sample): lidar_token = sample['data']['LIDAR_TOP'] sd_rec = nusc.get('sample_data', sample['data']['LIDAR_TOP']) cs_record = nusc.get('calibrated_sensor', sd_rec['calibrated_sensor_token']) pose_record = nusc.get('ego_pose', sd_rec['ego_pose_token']) lidar_path, boxes, _ = nusc.get_sample_data(lidar_token) mmcv.check_file_exist(lidar_path) info = { 'lidar_path': lidar_path, 'token': sample['token'], 'sweeps': [], 'cams': dict(), 'radars': dict(), 'lidar2ego_translation': cs_record['translation'], 'lidar2ego_rotation': cs_record['rotation'], 'ego2global_translation': pose_record['translation'], 'ego2global_rotation': pose_record['rotation'], 'timestamp': sample['timestamp'], 'scene_token': sample['scene_token'], 'frame_idx': frame_idx } if sample['next'] == '': frame_idx = 0 else: frame_idx += 1 l2e_r = info['lidar2ego_rotation'] l2e_t = info['lidar2ego_translation'] e2g_r = info['ego2global_rotation'] e2g_t = info['ego2global_translation'] l2e_r_mat = Quaternion(l2e_r).rotation_matrix e2g_r_mat = Quaternion(e2g_r).rotation_matrix # obtain 6 image's information per frame camera_types = [ 'CAM_FRONT', 'CAM_FRONT_RIGHT', 'CAM_FRONT_LEFT', 'CAM_BACK', 'CAM_BACK_LEFT', 'CAM_BACK_RIGHT', ] for cam in camera_types: cam_token = sample['data'][cam] cam_path, _, cam_intrinsic = nusc.get_sample_data(cam_token) cam_info = obtain_sensor2top(nusc, cam_token, l2e_t, l2e_r_mat, e2g_t, e2g_r_mat, cam) cam_info.update(cam_intrinsic=cam_intrinsic) info['cams'].update({cam: cam_info}) # obtain sweeps for a single key-frame sd_rec = nusc.get('sample_data', sample['data']['LIDAR_TOP']) sweeps = [] while len(sweeps) < max_sweeps: if not sd_rec['prev'] == '': sweep = obtain_sensor2top(nusc, sd_rec['prev'], l2e_t, l2e_r_mat, e2g_t, e2g_r_mat, 'lidar') sweeps.append(sweep) sd_rec = nusc.get('sample_data', sd_rec['prev']) else: break info['sweeps'] = sweeps # obtain annotation if not test: annotations = [ nusc.get('sample_annotation', token) for token in sample['anns'] ] locs = np.array([b.center for b in boxes]).reshape(-1, 3) dims = np.array([b.wlh for b in boxes]).reshape(-1, 3) rots = np.array([b.orientation.yaw_pitch_roll[0] for b in boxes]).reshape(-1, 1) velocity = np.array( [nusc.box_velocity(token)[:2] for token in sample['anns']]) valid_flag = np.array( [(anno['num_lidar_pts'] + anno['num_radar_pts']) > 0 for anno in annotations], dtype=bool).reshape(-1) # convert velo from global to lidar for i in range(len(boxes)): velo = np.array([*velocity[i], 0.0]) velo = velo @ np.linalg.inv(e2g_r_mat).T @ np.linalg.inv( l2e_r_mat).T velocity[i] = velo[:2] names = [b.name for b in boxes] for i in range(len(names)): if names[i] in NuScenesDataset.NameMapping: names[i] = NuScenesDataset.NameMapping[names[i]] names = np.array(names) # update valid now name_in_track = [_a in nus_categories for _a in names] name_in_track = np.array(name_in_track) valid_flag = np.logical_and(valid_flag, name_in_track) # add instance_ids instance_inds = 
[nusc.getind('instance', ann['instance_token']) for ann in annotations] # we need to convert rot to SECOND format. gt_boxes = np.concatenate([locs, dims, -rots - np.pi / 2], axis=1) assert len(gt_boxes) == len( annotations), f'{len(gt_boxes)}, {len(annotations)}' info['gt_boxes'] = gt_boxes info['gt_names'] = names info['gt_velocity'] = velocity.reshape(-1, 2) info['num_lidar_pts'] = np.array( [a['num_lidar_pts'] for a in annotations]) info['num_radar_pts'] = np.array( [a['num_radar_pts'] for a in annotations]) info['valid_flag'] = valid_flag info['instance_inds'] = instance_inds if forecasting: fboxes, fannotations, fmasks, ftypes = get_forecasting_annotations(nusc, annotations, forecasting_length) locs = [np.array([b.center for b in boxes]).reshape(-1, 3) for boxes in fboxes] tokens = [np.array([b.token for b in boxes]) for boxes in fboxes] info['forecasting_locs'] = np.array(locs) info['forecasting_tokens'] = np.array(tokens) info['forecasting_masks'] = np.array(fmasks) info['forecasting_types'] = np.array(ftypes) if sample['scene_token'] in train_scenes: train_nusc_infos.append(info) else: val_nusc_infos.append(info) return train_nusc_infos, val_nusc_infos def obtain_sensor2top(nusc, sensor_token, l2e_t, l2e_r_mat, e2g_t, e2g_r_mat, sensor_type='lidar'): """Obtain the info with RT matric from general sensor to Top LiDAR. Args: nusc (class): Dataset class in the nuScenes dataset. sensor_token (str): Sample data token corresponding to the specific sensor type. l2e_t (np.ndarray): Translation from lidar to ego in shape (1, 3). l2e_r_mat (np.ndarray): Rotation matrix from lidar to ego in shape (3, 3). e2g_t (np.ndarray): Translation from ego to global in shape (1, 3). e2g_r_mat (np.ndarray): Rotation matrix from ego to global in shape (3, 3). sensor_type (str): Sensor to calibrate. Default: 'lidar'. Returns: sweep (dict): Sweep information after transformation. """ sd_rec = nusc.get('sample_data', sensor_token) cs_record = nusc.get('calibrated_sensor', sd_rec['calibrated_sensor_token']) pose_record = nusc.get('ego_pose', sd_rec['ego_pose_token']) data_path = str(nusc.get_sample_data_path(sd_rec['token'])) if os.getcwd() in data_path: # path from lyftdataset is absolute path data_path = data_path.split(f'{os.getcwd()}/')[-1] # relative path sweep = { 'data_path': data_path, 'type': sensor_type, 'sample_data_token': sd_rec['token'], 'sensor2ego_translation': cs_record['translation'], 'sensor2ego_rotation': cs_record['rotation'], 'ego2global_translation': pose_record['translation'], 'ego2global_rotation': pose_record['rotation'], 'timestamp': sd_rec['timestamp'] } l2e_r_s = sweep['sensor2ego_rotation'] l2e_t_s = sweep['sensor2ego_translation'] e2g_r_s = sweep['ego2global_rotation'] e2g_t_s = sweep['ego2global_translation'] # obtain the RT from sensor to Top LiDAR # sweep->ego->global->ego'->lidar l2e_r_s_mat = Quaternion(l2e_r_s).rotation_matrix e2g_r_s_mat = Quaternion(e2g_r_s).rotation_matrix R = (l2e_r_s_mat.T @ e2g_r_s_mat.T) @ ( np.linalg.inv(e2g_r_mat).T @ np.linalg.inv(l2e_r_mat).T) T = (l2e_t_s @ e2g_r_s_mat.T + e2g_t_s) @ ( np.linalg.inv(e2g_r_mat).T @ np.linalg.inv(l2e_r_mat).T) T -= e2g_t @ (np.linalg.inv(e2g_r_mat).T @ np.linalg.inv(l2e_r_mat).T ) + l2e_t @ np.linalg.inv(l2e_r_mat).T sweep['sensor2lidar_rotation'] = R.T # points @ R.T + T sweep['sensor2lidar_translation'] = T return sweep def export_2d_annotation(root_path, info_path, version, mono3d=True): """Export 2d annotation from the info file and raw data. Args: root_path (str): Root path of the raw data. 
info_path (str): Path of the info file. version (str): Dataset version. mono3d (bool): Whether to export mono3d annotation. Default: True. """ # get bbox annotations for camera camera_types = [ 'CAM_FRONT', 'CAM_FRONT_RIGHT', 'CAM_FRONT_LEFT', 'CAM_BACK', 'CAM_BACK_LEFT', 'CAM_BACK_RIGHT', ] nusc_infos = mmcv.load(info_path)['infos'] nusc = NuScenes(version=version, dataroot=root_path, verbose=True) # info_2d_list = [] cat2Ids = [ dict(id=nus_categories.index(cat_name), name=cat_name) for cat_name in nus_categories ] coco_ann_id = 0 coco_2d_dict = dict(annotations=[], images=[], categories=cat2Ids) for info in mmcv.track_iter_progress(nusc_infos): for cam in camera_types: cam_info = info['cams'][cam] coco_infos = get_2d_boxes( nusc, cam_info['sample_data_token'], visibilities=['', '1', '2', '3', '4'], mono3d=mono3d) (height, width, _) = mmcv.imread(cam_info['data_path']).shape coco_2d_dict['images'].append( dict( file_name=cam_info['data_path'].split('data/nuscenes/') [-1], id=cam_info['sample_data_token'], token=info['token'], cam2ego_rotation=cam_info['sensor2ego_rotation'], cam2ego_translation=cam_info['sensor2ego_translation'], ego2global_rotation=info['ego2global_rotation'], ego2global_translation=info['ego2global_translation'], cam_intrinsic=cam_info['cam_intrinsic'], width=width, height=height)) for coco_info in coco_infos: if coco_info is None: continue # add an empty key for coco format coco_info['segmentation'] = [] coco_info['id'] = coco_ann_id coco_2d_dict['annotations'].append(coco_info) coco_ann_id += 1 if mono3d: json_prefix = f'{info_path[:-4]}_mono3d' else: json_prefix = f'{info_path[:-4]}' mmcv.dump(coco_2d_dict, f'{json_prefix}.coco.json') def get_2d_boxes(nusc, sample_data_token: str, visibilities: List[str], mono3d=True): """Get the 2D annotation records for a given `sample_data_token`. Args: sample_data_token (str): Sample data token belonging to a camera \ keyframe. visibilities (list[str]): Visibility filter. mono3d (bool): Whether to get boxes with mono3d annotation. Return: list[dict]: List of 2D annotation record that belongs to the input `sample_data_token`. """ # Get the sample data and the sample corresponding to that sample data. sd_rec = nusc.get('sample_data', sample_data_token) assert sd_rec[ 'sensor_modality'] == 'camera', 'Error: get_2d_boxes only works' \ ' for camera sample_data!' if not sd_rec['is_key_frame']: raise ValueError( 'The 2D re-projections are available only for keyframes.') s_rec = nusc.get('sample', sd_rec['sample_token']) # Get the calibrated sensor and ego pose # record to get the transformation matrices. cs_rec = nusc.get('calibrated_sensor', sd_rec['calibrated_sensor_token']) pose_rec = nusc.get('ego_pose', sd_rec['ego_pose_token']) camera_intrinsic = np.array(cs_rec['camera_intrinsic']) # Get all the annotation with the specified visibilties. ann_recs = [ nusc.get('sample_annotation', token) for token in s_rec['anns'] ] ann_recs = [ ann_rec for ann_rec in ann_recs if (ann_rec['visibility_token'] in visibilities) ] repro_recs = [] for ann_rec in ann_recs: # Augment sample_annotation with token information. ann_rec['sample_annotation_token'] = ann_rec['token'] ann_rec['sample_data_token'] = sample_data_token # Get the box in global coordinates. box = nusc.get_box(ann_rec['token']) # Move them to the ego-pose frame. box.translate(-np.array(pose_rec['translation'])) box.rotate(Quaternion(pose_rec['rotation']).inverse) # Move them to the calibrated sensor frame. 
box.translate(-np.array(cs_rec['translation'])) box.rotate(Quaternion(cs_rec['rotation']).inverse) # Filter out the corners that are not in front of the calibrated # sensor. corners_3d = box.corners() in_front = np.argwhere(corners_3d[2, :] > 0).flatten() corners_3d = corners_3d[:, in_front] # Project 3d box to 2d. corner_coords = view_points(corners_3d, camera_intrinsic, True).T[:, :2].tolist() # Keep only corners that fall within the image. final_coords = post_process_coords(corner_coords) # Skip if the convex hull of the re-projected corners # does not intersect the image canvas. if final_coords is None: continue else: min_x, min_y, max_x, max_y = final_coords # Generate dictionary record to be included in the .json file. repro_rec = generate_record(ann_rec, min_x, min_y, max_x, max_y, sample_data_token, sd_rec['filename']) # If mono3d=True, add 3D annotations in camera coordinates if mono3d and (repro_rec is not None): loc = box.center.tolist() dim = box.wlh.tolist() rot = [box.orientation.yaw_pitch_roll[0]] global_velo2d = nusc.box_velocity(box.token)[:2] global_velo3d = np.array([*global_velo2d, 0.0]) e2g_r_mat = Quaternion(pose_rec['rotation']).rotation_matrix c2e_r_mat = Quaternion(cs_rec['rotation']).rotation_matrix cam_velo3d = global_velo3d @ np.linalg.inv( e2g_r_mat).T @ np.linalg.inv(c2e_r_mat).T velo = cam_velo3d[0::2].tolist() repro_rec['bbox_cam3d'] = loc + dim + rot repro_rec['velo_cam3d'] = velo center3d = np.array(loc).reshape([1, 3]) center2d = points_cam2img( center3d, camera_intrinsic, with_depth=True) repro_rec['center2d'] = center2d.squeeze().tolist() # normalized center2D + depth # if samples with depth < 0 will be removed if repro_rec['center2d'][2] <= 0: continue ann_token = nusc.get('sample_annotation', box.token)['attribute_tokens'] if len(ann_token) == 0: attr_name = 'None' else: attr_name = nusc.get('attribute', ann_token[0])['name'] attr_id = nus_attributes.index(attr_name) repro_rec['attribute_name'] = attr_name repro_rec['attribute_id'] = attr_id repro_recs.append(repro_rec) return repro_recs def post_process_coords( corner_coords: List, imsize: Tuple[int, int] = (1600, 900) ) -> Union[Tuple[float, float, float, float], None]: """Get the intersection of the convex hull of the reprojected bbox corners and the image canvas, return None if no intersection. Args: corner_coords (list[int]): Corner coordinates of reprojected bounding box. imsize (tuple[int]): Size of the image canvas. Return: tuple [float]: Intersection of the convex hull of the 2D box corners and the image canvas. """ polygon_from_2d_box = MultiPoint(corner_coords).convex_hull img_canvas = box(0, 0, imsize[0], imsize[1]) if polygon_from_2d_box.intersects(img_canvas): img_intersection = polygon_from_2d_box.intersection(img_canvas) intersection_coords = np.array( [coord for coord in img_intersection.exterior.coords]) min_x = min(intersection_coords[:, 0]) min_y = min(intersection_coords[:, 1]) max_x = max(intersection_coords[:, 0]) max_y = max(intersection_coords[:, 1]) return min_x, min_y, max_x, max_y else: return None def generate_record(ann_rec: dict, x1: float, y1: float, x2: float, y2: float, sample_data_token: str, filename: str) -> OrderedDict: """Generate one 2D annotation record given various informations on top of the 2D bounding box coordinates. Args: ann_rec (dict): Original 3d annotation record. x1 (float): Minimum value of the x coordinate. y1 (float): Minimum value of the y coordinate. x2 (float): Maximum value of the x coordinate. y2 (float): Maximum value of the y coordinate. 
sample_data_token (str): Sample data token. filename (str):The corresponding image file where the annotation is present. Returns: dict: A sample 2D annotation record. - file_name (str): flie name - image_id (str): sample data token - area (float): 2d box area - category_name (str): category name - category_id (int): category id - bbox (list[float]): left x, top y, dx, dy of 2d box - iscrowd (int): whether the area is crowd """ repro_rec = OrderedDict() repro_rec['sample_data_token'] = sample_data_token coco_rec = dict() relevant_keys = [ 'attribute_tokens', 'category_name', 'instance_token', 'next', 'num_lidar_pts', 'num_radar_pts', 'prev', 'sample_annotation_token', 'sample_data_token', 'visibility_token', ] for key, value in ann_rec.items(): if key in relevant_keys: repro_rec[key] = value repro_rec['bbox_corners'] = [x1, y1, x2, y2] repro_rec['filename'] = filename coco_rec['file_name'] = filename coco_rec['image_id'] = sample_data_token coco_rec['area'] = (y2 - y1) * (x2 - x1) if repro_rec['category_name'] not in NuScenesDataset.NameMapping: return None cat_name = NuScenesDataset.NameMapping[repro_rec['category_name']] coco_rec['category_name'] = cat_name coco_rec['category_id'] = nus_categories.index(cat_name) coco_rec['bbox'] = [x1, y1, x2 - x1, y2 - y1] coco_rec['iscrowd'] = 0 return coco_rec if __name__ == '__main__': # generate .pkl for train, and val create_nuscenes_infos('data/nuscenes/', 'track') # generate .pkl for test set # create_nuscenes_infos('data/nuscenes/', 'track_test', version='v1.0-test') ================================================ FILE: tools/data_converter/s3dis_data_utils.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import os from concurrent import futures as futures from os import path as osp import mmcv import numpy as np class S3DISData(object): """S3DIS data. Generate s3dis infos for s3dis_converter. Args: root_path (str): Root path of the raw data. split (str, optional): Set split type of the data. Default: 'Area_1'. """ def __init__(self, root_path, split='Area_1'): self.root_dir = root_path self.split = split self.data_dir = osp.join(root_path, 'Stanford3dDataset_v1.2_Aligned_Version') # Following `GSDN `_, use 5 furniture # classes for detection: table, chair, sofa, bookcase, board. self.cat_ids = np.array([7, 8, 9, 10, 11]) self.cat_ids2class = { cat_id: i for i, cat_id in enumerate(list(self.cat_ids)) } assert split in [ 'Area_1', 'Area_2', 'Area_3', 'Area_4', 'Area_5', 'Area_6' ] self.sample_id_list = os.listdir(osp.join(self.data_dir, split)) # conferenceRoom_1 for sample_id in self.sample_id_list: if os.path.isfile(osp.join(self.data_dir, split, sample_id)): self.sample_id_list.remove(sample_id) def __len__(self): return len(self.sample_id_list) def get_infos(self, num_workers=4, has_label=True, sample_id_list=None): """Get data infos. This method gets information from the raw data. Args: num_workers (int, optional): Number of threads to be used. Default: 4. has_label (bool, optional): Whether the data has label. Default: True. sample_id_list (list[int], optional): Index list of the sample. Default: None. Returns: infos (list[dict]): Information of the raw data. 
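        Example (illustrative sketch; the root path and worker count are
        assumptions, not values taken from this repository):
            >>> s3dis = S3DISData(root_path='./data/s3dis', split='Area_1')
            >>> infos = s3dis.get_infos(num_workers=4, has_label=True)
            >>> len(infos) == len(s3dis)
            True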
""" def process_single_scene(sample_idx): print(f'{self.split} sample_idx: {sample_idx}') info = dict() pc_info = { 'num_features': 6, 'lidar_idx': f'{self.split}_{sample_idx}' } info['point_cloud'] = pc_info pts_filename = osp.join(self.root_dir, 's3dis_data', f'{self.split}_{sample_idx}_point.npy') pts_instance_mask_path = osp.join( self.root_dir, 's3dis_data', f'{self.split}_{sample_idx}_ins_label.npy') pts_semantic_mask_path = osp.join( self.root_dir, 's3dis_data', f'{self.split}_{sample_idx}_sem_label.npy') points = np.load(pts_filename).astype(np.float32) pts_instance_mask = np.load(pts_instance_mask_path).astype(np.int) pts_semantic_mask = np.load(pts_semantic_mask_path).astype(np.int) mmcv.mkdir_or_exist(osp.join(self.root_dir, 'points')) mmcv.mkdir_or_exist(osp.join(self.root_dir, 'instance_mask')) mmcv.mkdir_or_exist(osp.join(self.root_dir, 'semantic_mask')) points.tofile( osp.join(self.root_dir, 'points', f'{self.split}_{sample_idx}.bin')) pts_instance_mask.tofile( osp.join(self.root_dir, 'instance_mask', f'{self.split}_{sample_idx}.bin')) pts_semantic_mask.tofile( osp.join(self.root_dir, 'semantic_mask', f'{self.split}_{sample_idx}.bin')) info['pts_path'] = osp.join('points', f'{self.split}_{sample_idx}.bin') info['pts_instance_mask_path'] = osp.join( 'instance_mask', f'{self.split}_{sample_idx}.bin') info['pts_semantic_mask_path'] = osp.join( 'semantic_mask', f'{self.split}_{sample_idx}.bin') info['annos'] = self.get_bboxes(points, pts_instance_mask, pts_semantic_mask) return info sample_id_list = sample_id_list if sample_id_list is not None \ else self.sample_id_list with futures.ThreadPoolExecutor(num_workers) as executor: infos = executor.map(process_single_scene, sample_id_list) return list(infos) def get_bboxes(self, points, pts_instance_mask, pts_semantic_mask): """Convert instance masks to axis-aligned bounding boxes. Args: points (np.array): Scene points of shape (n, 6). pts_instance_mask (np.ndarray): Instance labels of shape (n,). pts_semantic_mask (np.ndarray): Semantic labels of shape (n,). Returns: dict: A dict containing detection infos with following keys: - gt_boxes_upright_depth (np.ndarray): Bounding boxes of shape (n, 6) - class (np.ndarray): Box labels of shape (n,) - gt_num (int): Number of boxes. """ bboxes, labels = [], [] for i in range(1, pts_instance_mask.max() + 1): ids = pts_instance_mask == i # check if all instance points have same semantic label assert pts_semantic_mask[ids].min() == pts_semantic_mask[ids].max() label = pts_semantic_mask[ids][0] # keep only furniture objects if label in self.cat_ids2class: labels.append(self.cat_ids2class[pts_semantic_mask[ids][0]]) pts = points[:, :3][ids] min_pts = pts.min(axis=0) max_pts = pts.max(axis=0) locations = (min_pts + max_pts) / 2 dimensions = max_pts - min_pts bboxes.append(np.concatenate((locations, dimensions))) annotation = dict() # follow ScanNet and SUN RGB-D keys annotation['gt_boxes_upright_depth'] = np.array(bboxes) annotation['class'] = np.array(labels) annotation['gt_num'] = len(labels) return annotation class S3DISSegData(object): """S3DIS dataset used to generate infos for semantic segmentation task. Args: data_root (str): Root path of the raw data. ann_file (str): The generated scannet infos. split (str, optional): Set split type of the data. Default: 'train'. num_points (int, optional): Number of points in each data input. Default: 8192. label_weight_func (function, optional): Function to compute the label weight. Default: None. 
""" def __init__(self, data_root, ann_file, split='Area_1', num_points=4096, label_weight_func=None): self.data_root = data_root self.data_infos = mmcv.load(ann_file) self.split = split self.num_points = num_points self.all_ids = np.arange(13) # all possible ids self.cat_ids = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]) # used for seg task self.ignore_index = len(self.cat_ids) self.cat_id2class = np.ones((self.all_ids.shape[0],), dtype=np.int) * \ self.ignore_index for i, cat_id in enumerate(self.cat_ids): self.cat_id2class[cat_id] = i # label weighting function is taken from # https://github.com/charlesq34/pointnet2/blob/master/scannet/scannet_dataset.py#L24 self.label_weight_func = (lambda x: 1.0 / np.log(1.2 + x)) if \ label_weight_func is None else label_weight_func def get_seg_infos(self): scene_idxs, label_weight = self.get_scene_idxs_and_label_weight() save_folder = osp.join(self.data_root, 'seg_info') mmcv.mkdir_or_exist(save_folder) np.save( osp.join(save_folder, f'{self.split}_resampled_scene_idxs.npy'), scene_idxs) np.save( osp.join(save_folder, f'{self.split}_label_weight.npy'), label_weight) print(f'{self.split} resampled scene index and label weight saved') def _convert_to_label(self, mask): """Convert class_id in loaded segmentation mask to label.""" if isinstance(mask, str): if mask.endswith('npy'): mask = np.load(mask) else: mask = np.fromfile(mask, dtype=np.int64) label = self.cat_id2class[mask] return label def get_scene_idxs_and_label_weight(self): """Compute scene_idxs for data sampling and label weight for loss calculation. We sample more times for scenes with more points. Label_weight is inversely proportional to number of class points. """ num_classes = len(self.cat_ids) num_point_all = [] label_weight = np.zeros((num_classes + 1, )) # ignore_index for data_info in self.data_infos: label = self._convert_to_label( osp.join(self.data_root, data_info['pts_semantic_mask_path'])) num_point_all.append(label.shape[0]) class_count, _ = np.histogram(label, range(num_classes + 2)) label_weight += class_count # repeat scene_idx for num_scene_point // num_sample_point times sample_prob = np.array(num_point_all) / float(np.sum(num_point_all)) num_iter = int(np.sum(num_point_all) / float(self.num_points)) scene_idxs = [] for idx in range(len(self.data_infos)): scene_idxs.extend([idx] * int(round(sample_prob[idx] * num_iter))) scene_idxs = np.array(scene_idxs).astype(np.int32) # calculate label weight, adopted from PointNet++ label_weight = label_weight[:-1].astype(np.float32) label_weight = label_weight / label_weight.sum() label_weight = self.label_weight_func(label_weight).astype(np.float32) return scene_idxs, label_weight ================================================ FILE: tools/data_converter/scannet_data_utils.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import os from concurrent import futures as futures from os import path as osp import mmcv import numpy as np class ScanNetData(object): """ScanNet data. Generate scannet infos for scannet_converter. Args: root_path (str): Root path of the raw data. split (str, optional): Set split type of the data. Default: 'train'. 
""" def __init__(self, root_path, split='train'): self.root_dir = root_path self.split = split self.split_dir = osp.join(root_path) self.classes = [ 'cabinet', 'bed', 'chair', 'sofa', 'table', 'door', 'window', 'bookshelf', 'picture', 'counter', 'desk', 'curtain', 'refrigerator', 'showercurtrain', 'toilet', 'sink', 'bathtub', 'garbagebin' ] self.cat2label = {cat: self.classes.index(cat) for cat in self.classes} self.label2cat = {self.cat2label[t]: t for t in self.cat2label} self.cat_ids = np.array( [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 16, 24, 28, 33, 34, 36, 39]) self.cat_ids2class = { nyu40id: i for i, nyu40id in enumerate(list(self.cat_ids)) } assert split in ['train', 'val', 'test'] split_file = osp.join(self.root_dir, 'meta_data', f'scannetv2_{split}.txt') mmcv.check_file_exist(split_file) self.sample_id_list = mmcv.list_from_file(split_file) self.test_mode = (split == 'test') def __len__(self): return len(self.sample_id_list) def get_aligned_box_label(self, idx): box_file = osp.join(self.root_dir, 'scannet_instance_data', f'{idx}_aligned_bbox.npy') mmcv.check_file_exist(box_file) return np.load(box_file) def get_unaligned_box_label(self, idx): box_file = osp.join(self.root_dir, 'scannet_instance_data', f'{idx}_unaligned_bbox.npy') mmcv.check_file_exist(box_file) return np.load(box_file) def get_axis_align_matrix(self, idx): matrix_file = osp.join(self.root_dir, 'scannet_instance_data', f'{idx}_axis_align_matrix.npy') mmcv.check_file_exist(matrix_file) return np.load(matrix_file) def get_images(self, idx): paths = [] path = osp.join(self.root_dir, 'posed_images', idx) for file in sorted(os.listdir(path)): if file.endswith('.jpg'): paths.append(osp.join('posed_images', idx, file)) return paths def get_extrinsics(self, idx): extrinsics = [] path = osp.join(self.root_dir, 'posed_images', idx) for file in sorted(os.listdir(path)): if file.endswith('.txt') and not file == 'intrinsic.txt': extrinsics.append(np.loadtxt(osp.join(path, file))) return extrinsics def get_intrinsics(self, idx): matrix_file = osp.join(self.root_dir, 'posed_images', idx, 'intrinsic.txt') mmcv.check_file_exist(matrix_file) return np.loadtxt(matrix_file) def get_infos(self, num_workers=4, has_label=True, sample_id_list=None): """Get data infos. This method gets information from the raw data. Args: num_workers (int, optional): Number of threads to be used. Default: 4. has_label (bool, optional): Whether the data has label. Default: True. sample_id_list (list[int], optional): Index list of the sample. Default: None. Returns: infos (list[dict]): Information of the raw data. 
""" def process_single_scene(sample_idx): print(f'{self.split} sample_idx: {sample_idx}') info = dict() pc_info = {'num_features': 6, 'lidar_idx': sample_idx} info['point_cloud'] = pc_info pts_filename = osp.join(self.root_dir, 'scannet_instance_data', f'{sample_idx}_vert.npy') points = np.load(pts_filename) mmcv.mkdir_or_exist(osp.join(self.root_dir, 'points')) points.tofile( osp.join(self.root_dir, 'points', f'{sample_idx}.bin')) info['pts_path'] = osp.join('points', f'{sample_idx}.bin') # update with RGB image paths if exist if os.path.exists(osp.join(self.root_dir, 'posed_images')): info['intrinsics'] = self.get_intrinsics(sample_idx) all_extrinsics = self.get_extrinsics(sample_idx) all_img_paths = self.get_images(sample_idx) # some poses in ScanNet are invalid extrinsics, img_paths = [], [] for extrinsic, img_path in zip(all_extrinsics, all_img_paths): if np.all(np.isfinite(extrinsic)): img_paths.append(img_path) extrinsics.append(extrinsic) info['extrinsics'] = extrinsics info['img_paths'] = img_paths if not self.test_mode: pts_instance_mask_path = osp.join( self.root_dir, 'scannet_instance_data', f'{sample_idx}_ins_label.npy') pts_semantic_mask_path = osp.join( self.root_dir, 'scannet_instance_data', f'{sample_idx}_sem_label.npy') pts_instance_mask = np.load(pts_instance_mask_path).astype( np.int64) pts_semantic_mask = np.load(pts_semantic_mask_path).astype( np.int64) mmcv.mkdir_or_exist(osp.join(self.root_dir, 'instance_mask')) mmcv.mkdir_or_exist(osp.join(self.root_dir, 'semantic_mask')) pts_instance_mask.tofile( osp.join(self.root_dir, 'instance_mask', f'{sample_idx}.bin')) pts_semantic_mask.tofile( osp.join(self.root_dir, 'semantic_mask', f'{sample_idx}.bin')) info['pts_instance_mask_path'] = osp.join( 'instance_mask', f'{sample_idx}.bin') info['pts_semantic_mask_path'] = osp.join( 'semantic_mask', f'{sample_idx}.bin') if has_label: annotations = {} # box is of shape [k, 6 + class] aligned_box_label = self.get_aligned_box_label(sample_idx) unaligned_box_label = self.get_unaligned_box_label(sample_idx) annotations['gt_num'] = aligned_box_label.shape[0] if annotations['gt_num'] != 0: aligned_box = aligned_box_label[:, :-1] # k, 6 unaligned_box = unaligned_box_label[:, :-1] classes = aligned_box_label[:, -1] # k annotations['name'] = np.array([ self.label2cat[self.cat_ids2class[classes[i]]] for i in range(annotations['gt_num']) ]) # default names are given to aligned bbox for compatibility # we also save unaligned bbox info with marked names annotations['location'] = aligned_box[:, :3] annotations['dimensions'] = aligned_box[:, 3:6] annotations['gt_boxes_upright_depth'] = aligned_box annotations['unaligned_location'] = unaligned_box[:, :3] annotations['unaligned_dimensions'] = unaligned_box[:, 3:6] annotations[ 'unaligned_gt_boxes_upright_depth'] = unaligned_box annotations['index'] = np.arange( annotations['gt_num'], dtype=np.int32) annotations['class'] = np.array([ self.cat_ids2class[classes[i]] for i in range(annotations['gt_num']) ]) axis_align_matrix = self.get_axis_align_matrix(sample_idx) annotations['axis_align_matrix'] = axis_align_matrix # 4x4 info['annos'] = annotations return info sample_id_list = sample_id_list if sample_id_list is not None \ else self.sample_id_list with futures.ThreadPoolExecutor(num_workers) as executor: infos = executor.map(process_single_scene, sample_id_list) return list(infos) class ScanNetSegData(object): """ScanNet dataset used to generate infos for semantic segmentation task. Args: data_root (str): Root path of the raw data. 
ann_file (str): The generated scannet infos. split (str, optional): Set split type of the data. Default: 'train'. num_points (int, optional): Number of points in each data input. Default: 8192. label_weight_func (function, optional): Function to compute the label weight. Default: None. """ def __init__(self, data_root, ann_file, split='train', num_points=8192, label_weight_func=None): self.data_root = data_root self.data_infos = mmcv.load(ann_file) self.split = split assert split in ['train', 'val', 'test'] self.num_points = num_points self.all_ids = np.arange(41) # all possible ids self.cat_ids = np.array([ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 16, 24, 28, 33, 34, 36, 39 ]) # used for seg task self.ignore_index = len(self.cat_ids) self.cat_id2class = np.ones((self.all_ids.shape[0],), dtype=np.int) * \ self.ignore_index for i, cat_id in enumerate(self.cat_ids): self.cat_id2class[cat_id] = i # label weighting function is taken from # https://github.com/charlesq34/pointnet2/blob/master/scannet/scannet_dataset.py#L24 self.label_weight_func = (lambda x: 1.0 / np.log(1.2 + x)) if \ label_weight_func is None else label_weight_func def get_seg_infos(self): if self.split == 'test': return scene_idxs, label_weight = self.get_scene_idxs_and_label_weight() save_folder = osp.join(self.data_root, 'seg_info') mmcv.mkdir_or_exist(save_folder) np.save( osp.join(save_folder, f'{self.split}_resampled_scene_idxs.npy'), scene_idxs) np.save( osp.join(save_folder, f'{self.split}_label_weight.npy'), label_weight) print(f'{self.split} resampled scene index and label weight saved') def _convert_to_label(self, mask): """Convert class_id in loaded segmentation mask to label.""" if isinstance(mask, str): if mask.endswith('npy'): mask = np.load(mask) else: mask = np.fromfile(mask, dtype=np.int64) label = self.cat_id2class[mask] return label def get_scene_idxs_and_label_weight(self): """Compute scene_idxs for data sampling and label weight for loss calculation. We sample more times for scenes with more points. Label_weight is inversely proportional to number of class points. """ num_classes = len(self.cat_ids) num_point_all = [] label_weight = np.zeros((num_classes + 1, )) # ignore_index for data_info in self.data_infos: label = self._convert_to_label( osp.join(self.data_root, data_info['pts_semantic_mask_path'])) num_point_all.append(label.shape[0]) class_count, _ = np.histogram(label, range(num_classes + 2)) label_weight += class_count # repeat scene_idx for num_scene_point // num_sample_point times sample_prob = np.array(num_point_all) / float(np.sum(num_point_all)) num_iter = int(np.sum(num_point_all) / float(self.num_points)) scene_idxs = [] for idx in range(len(self.data_infos)): scene_idxs.extend([idx] * int(round(sample_prob[idx] * num_iter))) scene_idxs = np.array(scene_idxs).astype(np.int32) # calculate label weight, adopted from PointNet++ label_weight = label_weight[:-1].astype(np.float32) label_weight = label_weight / label_weight.sum() label_weight = self.label_weight_func(label_weight).astype(np.float32) return scene_idxs, label_weight ================================================ FILE: tools/data_converter/sunrgbd_data_utils.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. from concurrent import futures as futures from os import path as osp import mmcv import numpy as np from scipy import io as sio def random_sampling(points, num_points, replace=None): """Random sampling. Sampling point cloud to a certain number of points. 
Args: points (ndarray): Point cloud. num_points (int): The number of samples. replace (bool): Whether the sample is with or without replacement. Returns: points (ndarray): Point cloud after sampling. """ if num_points < 0: return points if replace is None: replace = (points.shape[0] < num_points) choices = np.random.choice(points.shape[0], num_points, replace=replace) return points[choices] class SUNRGBDInstance(object): def __init__(self, line): data = line.split(' ') data[1:] = [float(x) for x in data[1:]] self.classname = data[0] self.xmin = data[1] self.ymin = data[2] self.xmax = data[1] + data[3] self.ymax = data[2] + data[4] self.box2d = np.array([self.xmin, self.ymin, self.xmax, self.ymax]) self.centroid = np.array([data[5], data[6], data[7]]) self.width = data[8] self.length = data[9] self.height = data[10] # data[9] is x_size (length), data[8] is y_size (width), data[10] is # z_size (height) in our depth coordinate system, # l corresponds to the size along the x axis self.size = np.array([data[9], data[8], data[10]]) * 2 self.orientation = np.zeros((3, )) self.orientation[0] = data[11] self.orientation[1] = data[12] self.heading_angle = np.arctan2(self.orientation[1], self.orientation[0]) self.box3d = np.concatenate( [self.centroid, self.size, self.heading_angle[None]]) class SUNRGBDData(object): """SUNRGBD data. Generate scannet infos for sunrgbd_converter. Args: root_path (str): Root path of the raw data. split (str, optional): Set split type of the data. Default: 'train'. use_v1 (bool, optional): Whether to use v1. Default: False. num_points (int, optional): Number of points to sample. Set to -1 to utilize all points. Defaults to -1. """ def __init__(self, root_path, split='train', use_v1=False, num_points=-1): self.root_dir = root_path self.split = split self.split_dir = osp.join(root_path, 'sunrgbd_trainval') self.num_points = num_points self.classes = [ 'bed', 'table', 'sofa', 'chair', 'toilet', 'desk', 'dresser', 'night_stand', 'bookshelf', 'bathtub' ] self.cat2label = {cat: self.classes.index(cat) for cat in self.classes} self.label2cat = { label: self.classes[label] for label in range(len(self.classes)) } assert split in ['train', 'val', 'test'] split_file = osp.join(self.split_dir, f'{split}_data_idx.txt') mmcv.check_file_exist(split_file) self.sample_id_list = map(int, mmcv.list_from_file(split_file)) self.image_dir = osp.join(self.split_dir, 'image') self.calib_dir = osp.join(self.split_dir, 'calib') self.depth_dir = osp.join(self.split_dir, 'depth') if use_v1: self.label_dir = osp.join(self.split_dir, 'label_v1') else: self.label_dir = osp.join(self.split_dir, 'label') def __len__(self): return len(self.sample_id_list) def get_image(self, idx): img_filename = osp.join(self.image_dir, f'{idx:06d}.jpg') return mmcv.imread(img_filename) def get_image_shape(self, idx): image = self.get_image(idx) return np.array(image.shape[:2], dtype=np.int32) def get_depth(self, idx): depth_filename = osp.join(self.depth_dir, f'{idx:06d}.mat') depth = sio.loadmat(depth_filename)['instance'] return depth def get_calibration(self, idx): calib_filepath = osp.join(self.calib_dir, f'{idx:06d}.txt') lines = [line.rstrip() for line in open(calib_filepath)] Rt = np.array([float(x) for x in lines[0].split(' ')]) Rt = np.reshape(Rt, (3, 3), order='F').astype(np.float32) K = np.array([float(x) for x in lines[1].split(' ')]) K = np.reshape(K, (3, 3), order='F').astype(np.float32) return K, Rt def get_label_objects(self, idx): label_filename = osp.join(self.label_dir, f'{idx:06d}.txt') lines = 
[line.rstrip() for line in open(label_filename)] objects = [SUNRGBDInstance(line) for line in lines] return objects def get_infos(self, num_workers=4, has_label=True, sample_id_list=None): """Get data infos. This method gets information from the raw data. Args: num_workers (int, optional): Number of threads to be used. Default: 4. has_label (bool, optional): Whether the data has label. Default: True. sample_id_list (list[int], optional): Index list of the sample. Default: None. Returns: infos (list[dict]): Information of the raw data. """ def process_single_scene(sample_idx): print(f'{self.split} sample_idx: {sample_idx}') # convert depth to points pc_upright_depth = self.get_depth(sample_idx) pc_upright_depth_subsampled = random_sampling( pc_upright_depth, self.num_points) info = dict() pc_info = {'num_features': 6, 'lidar_idx': sample_idx} info['point_cloud'] = pc_info mmcv.mkdir_or_exist(osp.join(self.root_dir, 'points')) pc_upright_depth_subsampled.tofile( osp.join(self.root_dir, 'points', f'{sample_idx:06d}.bin')) info['pts_path'] = osp.join('points', f'{sample_idx:06d}.bin') img_path = osp.join('image', f'{sample_idx:06d}.jpg') image_info = { 'image_idx': sample_idx, 'image_shape': self.get_image_shape(sample_idx), 'image_path': img_path } info['image'] = image_info K, Rt = self.get_calibration(sample_idx) calib_info = {'K': K, 'Rt': Rt} info['calib'] = calib_info if has_label: obj_list = self.get_label_objects(sample_idx) annotations = {} annotations['gt_num'] = len([ obj.classname for obj in obj_list if obj.classname in self.cat2label.keys() ]) if annotations['gt_num'] != 0: annotations['name'] = np.array([ obj.classname for obj in obj_list if obj.classname in self.cat2label.keys() ]) annotations['bbox'] = np.concatenate([ obj.box2d.reshape(1, 4) for obj in obj_list if obj.classname in self.cat2label.keys() ], axis=0) annotations['location'] = np.concatenate([ obj.centroid.reshape(1, 3) for obj in obj_list if obj.classname in self.cat2label.keys() ], axis=0) annotations['dimensions'] = 2 * np.array([ [obj.length, obj.width, obj.height] for obj in obj_list if obj.classname in self.cat2label.keys() ]) # lwh (depth) format annotations['rotation_y'] = np.array([ obj.heading_angle for obj in obj_list if obj.classname in self.cat2label.keys() ]) annotations['index'] = np.arange( len(obj_list), dtype=np.int32) annotations['class'] = np.array([ self.cat2label[obj.classname] for obj in obj_list if obj.classname in self.cat2label.keys() ]) annotations['gt_boxes_upright_depth'] = np.stack( [ obj.box3d for obj in obj_list if obj.classname in self.cat2label.keys() ], axis=0) # (K,8) info['annos'] = annotations return info sample_id_list = sample_id_list if \ sample_id_list is not None else self.sample_id_list with futures.ThreadPoolExecutor(num_workers) as executor: infos = executor.map(process_single_scene, sample_id_list) return list(infos) ================================================ FILE: tools/data_converter/waymo_converter.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. r"""Adapted from `Waymo to KITTI converter `_. 
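A minimal usage sketch (the directories and prefix below are assumptions, not
paths defined by this repository):

    >>> converter = Waymo2KITTI(
    ...     load_dir='./data/waymo/waymo_format/training',
    ...     save_dir='./data/waymo/kitti_format/training',
    ...     prefix='0',
    ...     workers=8,
    ...     test_mode=False)
    >>> converter.convert()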
""" try: from waymo_open_dataset import dataset_pb2 except ImportError: raise ImportError( 'Please run "pip install waymo-open-dataset-tf-2-1-0==1.2.0" ' 'to install the official devkit first.') from glob import glob from os.path import join import mmcv import numpy as np import tensorflow as tf from waymo_open_dataset.utils import range_image_utils, transform_utils from waymo_open_dataset.utils.frame_utils import \ parse_range_image_and_camera_projection class Waymo2KITTI(object): """Waymo to KITTI converter. This class serves as the converter to change the waymo raw data to KITTI format. Args: load_dir (str): Directory to load waymo raw data. save_dir (str): Directory to save data in KITTI format. prefix (str): Prefix of filename. In general, 0 for training, 1 for validation and 2 for testing. workers (int, optional): Number of workers for the parallel process. test_mode (bool, optional): Whether in the test_mode. Default: False. """ def __init__(self, load_dir, save_dir, prefix, workers=64, test_mode=False): self.filter_empty_3dboxes = True self.filter_no_label_zone_points = True self.selected_waymo_classes = ['VEHICLE', 'PEDESTRIAN', 'CYCLIST'] # Only data collected in specific locations will be converted # If set None, this filter is disabled # Available options: location_sf (main dataset) self.selected_waymo_locations = None self.save_track_id = False # turn on eager execution for older tensorflow versions if int(tf.__version__.split('.')[0]) < 2: tf.enable_eager_execution() self.lidar_list = [ '_FRONT', '_FRONT_RIGHT', '_FRONT_LEFT', '_SIDE_RIGHT', '_SIDE_LEFT' ] self.type_list = [ 'UNKNOWN', 'VEHICLE', 'PEDESTRIAN', 'SIGN', 'CYCLIST' ] self.waymo_to_kitti_class_map = { 'UNKNOWN': 'DontCare', 'PEDESTRIAN': 'Pedestrian', 'VEHICLE': 'Car', 'CYCLIST': 'Cyclist', 'SIGN': 'Sign' # not in kitti } self.load_dir = load_dir self.save_dir = save_dir self.prefix = prefix self.workers = int(workers) self.test_mode = test_mode self.tfrecord_pathnames = sorted( glob(join(self.load_dir, '*.tfrecord'))) self.label_save_dir = f'{self.save_dir}/label_' self.label_all_save_dir = f'{self.save_dir}/label_all' self.image_save_dir = f'{self.save_dir}/image_' self.calib_save_dir = f'{self.save_dir}/calib' self.point_cloud_save_dir = f'{self.save_dir}/velodyne' self.pose_save_dir = f'{self.save_dir}/pose' self.timestamp_save_dir = f'{self.save_dir}/timestamp' self.create_folder() def convert(self): """Convert action.""" print('Start converting ...') mmcv.track_parallel_progress(self.convert_one, range(len(self)), self.workers) print('\nFinished ...') def convert_one(self, file_idx): """Convert action for single file. Args: file_idx (int): Index of the file to be converted. """ pathname = self.tfrecord_pathnames[file_idx] dataset = tf.data.TFRecordDataset(pathname, compression_type='') for frame_idx, data in enumerate(dataset): frame = dataset_pb2.Frame() frame.ParseFromString(bytearray(data.numpy())) if (self.selected_waymo_locations is not None and frame.context.stats.location not in self.selected_waymo_locations): continue self.save_image(frame, file_idx, frame_idx) self.save_calib(frame, file_idx, frame_idx) self.save_lidar(frame, file_idx, frame_idx) self.save_pose(frame, file_idx, frame_idx) self.save_timestamp(frame, file_idx, frame_idx) if not self.test_mode: self.save_label(frame, file_idx, frame_idx) def __len__(self): """Length of the filename list.""" return len(self.tfrecord_pathnames) def save_image(self, frame, file_idx, frame_idx): """Parse and save the images in png format. 
Args: frame (:obj:`Frame`): Open dataset frame proto. file_idx (int): Current file index. frame_idx (int): Current frame index. """ for img in frame.images: img_path = f'{self.image_save_dir}{str(img.name - 1)}/' + \ f'{self.prefix}{str(file_idx).zfill(3)}' + \ f'{str(frame_idx).zfill(3)}.png' img = mmcv.imfrombytes(img.image) mmcv.imwrite(img, img_path) def save_calib(self, frame, file_idx, frame_idx): """Parse and save the calibration data. Args: frame (:obj:`Frame`): Open dataset frame proto. file_idx (int): Current file index. frame_idx (int): Current frame index. """ # waymo front camera to kitti reference camera T_front_cam_to_ref = np.array([[0.0, -1.0, 0.0], [0.0, 0.0, -1.0], [1.0, 0.0, 0.0]]) camera_calibs = [] R0_rect = [f'{i:e}' for i in np.eye(3).flatten()] Tr_velo_to_cams = [] calib_context = '' for camera in frame.context.camera_calibrations: # extrinsic parameters T_cam_to_vehicle = np.array(camera.extrinsic.transform).reshape( 4, 4) T_vehicle_to_cam = np.linalg.inv(T_cam_to_vehicle) Tr_velo_to_cam = \ self.cart_to_homo(T_front_cam_to_ref) @ T_vehicle_to_cam if camera.name == 1: # FRONT = 1, see dataset.proto for details self.T_velo_to_front_cam = Tr_velo_to_cam.copy() Tr_velo_to_cam = Tr_velo_to_cam[:3, :].reshape((12, )) Tr_velo_to_cams.append([f'{i:e}' for i in Tr_velo_to_cam]) # intrinsic parameters camera_calib = np.zeros((3, 4)) camera_calib[0, 0] = camera.intrinsic[0] camera_calib[1, 1] = camera.intrinsic[1] camera_calib[0, 2] = camera.intrinsic[2] camera_calib[1, 2] = camera.intrinsic[3] camera_calib[2, 2] = 1 camera_calib = list(camera_calib.reshape(12)) camera_calib = [f'{i:e}' for i in camera_calib] camera_calibs.append(camera_calib) # all camera ids are saved as id-1 in the result because # camera 0 is unknown in the proto for i in range(5): calib_context += 'P' + str(i) + ': ' + \ ' '.join(camera_calibs[i]) + '\n' calib_context += 'R0_rect' + ': ' + ' '.join(R0_rect) + '\n' for i in range(5): calib_context += 'Tr_velo_to_cam_' + str(i) + ': ' + \ ' '.join(Tr_velo_to_cams[i]) + '\n' with open( f'{self.calib_save_dir}/{self.prefix}' + f'{str(file_idx).zfill(3)}{str(frame_idx).zfill(3)}.txt', 'w+') as fp_calib: fp_calib.write(calib_context) fp_calib.close() def save_lidar(self, frame, file_idx, frame_idx): """Parse and save the lidar data in psd format. Args: frame (:obj:`Frame`): Open dataset frame proto. file_idx (int): Current file index. frame_idx (int): Current frame index. 
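        Note (added for clarity, based on the code below): each point is saved
        as six float32 values -- x, y, z, intensity, elongation and the
        range-image mask index (the inline comment mentioning a timestamp
        refers to a commented-out variant). A sketch for reading a saved file
        back, where the file name is an assumption:
            >>> pts = np.fromfile('velodyne/0000000.bin', dtype=np.float32)
            >>> pts = pts.reshape(-1, 6)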
""" range_images, camera_projections, range_image_top_pose = \ parse_range_image_and_camera_projection(frame) # First return points_0, cp_points_0, intensity_0, elongation_0, mask_indices_0 = \ self.convert_range_image_to_point_cloud( frame, range_images, camera_projections, range_image_top_pose, ri_index=0 ) points_0 = np.concatenate(points_0, axis=0) intensity_0 = np.concatenate(intensity_0, axis=0) elongation_0 = np.concatenate(elongation_0, axis=0) mask_indices_0 = np.concatenate(mask_indices_0, axis=0) # Second return points_1, cp_points_1, intensity_1, elongation_1, mask_indices_1 = \ self.convert_range_image_to_point_cloud( frame, range_images, camera_projections, range_image_top_pose, ri_index=1 ) points_1 = np.concatenate(points_1, axis=0) intensity_1 = np.concatenate(intensity_1, axis=0) elongation_1 = np.concatenate(elongation_1, axis=0) mask_indices_1 = np.concatenate(mask_indices_1, axis=0) points = np.concatenate([points_0, points_1], axis=0) intensity = np.concatenate([intensity_0, intensity_1], axis=0) elongation = np.concatenate([elongation_0, elongation_1], axis=0) mask_indices = np.concatenate([mask_indices_0, mask_indices_1], axis=0) # timestamp = frame.timestamp_micros * np.ones_like(intensity) # concatenate x,y,z, intensity, elongation, timestamp (6-dim) point_cloud = np.column_stack( (points, intensity, elongation, mask_indices)) pc_path = f'{self.point_cloud_save_dir}/{self.prefix}' + \ f'{str(file_idx).zfill(3)}{str(frame_idx).zfill(3)}.bin' point_cloud.astype(np.float32).tofile(pc_path) def save_label(self, frame, file_idx, frame_idx): """Parse and save the label data in txt format. The relation between waymo and kitti coordinates is noteworthy: 1. x, y, z correspond to l, w, h (waymo) -> l, h, w (kitti) 2. x-y-z: front-left-up (waymo) -> right-down-front(kitti) 3. bbox origin at volumetric center (waymo) -> bottom center (kitti) 4. rotation: +x around y-axis (kitti) -> +x around z-axis (waymo) Args: frame (:obj:`Frame`): Open dataset frame proto. file_idx (int): Current file index. frame_idx (int): Current frame index. 
""" fp_label_all = open( f'{self.label_all_save_dir}/{self.prefix}' + f'{str(file_idx).zfill(3)}{str(frame_idx).zfill(3)}.txt', 'w+') id_to_bbox = dict() id_to_name = dict() for labels in frame.projected_lidar_labels: name = labels.name for label in labels.labels: # TODO: need a workaround as bbox may not belong to front cam bbox = [ label.box.center_x - label.box.length / 2, label.box.center_y - label.box.width / 2, label.box.center_x + label.box.length / 2, label.box.center_y + label.box.width / 2 ] id_to_bbox[label.id] = bbox id_to_name[label.id] = name - 1 for obj in frame.laser_labels: bounding_box = None name = None id = obj.id for lidar in self.lidar_list: if id + lidar in id_to_bbox: bounding_box = id_to_bbox.get(id + lidar) name = str(id_to_name.get(id + lidar)) break if bounding_box is None or name is None: name = '0' bounding_box = (0, 0, 0, 0) my_type = self.type_list[obj.type] if my_type not in self.selected_waymo_classes: continue if self.filter_empty_3dboxes and obj.num_lidar_points_in_box < 1: continue my_type = self.waymo_to_kitti_class_map[my_type] height = obj.box.height width = obj.box.width length = obj.box.length x = obj.box.center_x y = obj.box.center_y z = obj.box.center_z - height / 2 # project bounding box to the virtual reference frame pt_ref = self.T_velo_to_front_cam @ \ np.array([x, y, z, 1]).reshape((4, 1)) x, y, z, _ = pt_ref.flatten().tolist() rotation_y = -obj.box.heading - np.pi / 2 track_id = obj.id # not available truncated = 0 occluded = 0 alpha = -10 line = my_type + \ ' {} {} {} {} {} {} {} {} {} {} {} {} {} {}\n'.format( round(truncated, 2), occluded, round(alpha, 2), round(bounding_box[0], 2), round(bounding_box[1], 2), round(bounding_box[2], 2), round(bounding_box[3], 2), round(height, 2), round(width, 2), round(length, 2), round(x, 2), round(y, 2), round(z, 2), round(rotation_y, 2)) if self.save_track_id: line_all = line[:-1] + ' ' + name + ' ' + track_id + '\n' else: line_all = line[:-1] + ' ' + name + '\n' fp_label = open( f'{self.label_save_dir}{name}/{self.prefix}' + f'{str(file_idx).zfill(3)}{str(frame_idx).zfill(3)}.txt', 'a') fp_label.write(line) fp_label.close() fp_label_all.write(line_all) fp_label_all.close() def save_pose(self, frame, file_idx, frame_idx): """Parse and save the pose data. Note that SDC's own pose is not included in the regular training of KITTI dataset. KITTI raw dataset contains ego motion files but are not often used. Pose is important for algorithms that take advantage of the temporal information. Args: frame (:obj:`Frame`): Open dataset frame proto. file_idx (int): Current file index. frame_idx (int): Current frame index. """ pose = np.array(frame.pose.transform).reshape(4, 4) np.savetxt( join(f'{self.pose_save_dir}/{self.prefix}' + f'{str(file_idx).zfill(3)}{str(frame_idx).zfill(3)}.txt'), pose) def save_timestamp(self, frame, file_idx, frame_idx): """Save the timestamp data in a separate file instead of the pointcloud. Note that SDC's own pose is not included in the regular training of KITTI dataset. KITTI raw dataset contains ego motion files but are not often used. Pose is important for algorithms that take advantage of the temporal information. Args: frame (:obj:`Frame`): Open dataset frame proto. file_idx (int): Current file index. frame_idx (int): Current frame index. 
""" with open( join(f'{self.timestamp_save_dir}/{self.prefix}' + f'{str(file_idx).zfill(3)}{str(frame_idx).zfill(3)}.txt'), 'w') as f: f.write(str(frame.timestamp_micros)) def create_folder(self): """Create folder for data preprocessing.""" if not self.test_mode: dir_list1 = [ self.label_all_save_dir, self.calib_save_dir, self.point_cloud_save_dir, self.pose_save_dir, self.timestamp_save_dir ] dir_list2 = [self.label_save_dir, self.image_save_dir] else: dir_list1 = [ self.calib_save_dir, self.point_cloud_save_dir, self.pose_save_dir, self.timestamp_save_dir ] dir_list2 = [self.image_save_dir] for d in dir_list1: mmcv.mkdir_or_exist(d) for d in dir_list2: for i in range(5): mmcv.mkdir_or_exist(f'{d}{str(i)}') def convert_range_image_to_point_cloud(self, frame, range_images, camera_projections, range_image_top_pose, ri_index=0): """Convert range images to point cloud. Args: frame (:obj:`Frame`): Open dataset frame. range_images (dict): Mapping from laser_name to list of two range images corresponding with two returns. camera_projections (dict): Mapping from laser_name to list of two camera projections corresponding with two returns. range_image_top_pose (:obj:`Transform`): Range image pixel pose for top lidar. ri_index (int, optional): 0 for the first return, 1 for the second return. Default: 0. Returns: tuple[list[np.ndarray]]: (List of points with shape [N, 3], camera projections of points with shape [N, 6], intensity with shape [N, 1], elongation with shape [N, 1], points' position in the depth map (element offset if points come from the main lidar otherwise -1) with shape[N, 1]). All the lists have the length of lidar numbers (5). """ calibrations = sorted( frame.context.laser_calibrations, key=lambda c: c.name) points = [] cp_points = [] intensity = [] elongation = [] mask_indices = [] frame_pose = tf.convert_to_tensor( value=np.reshape(np.array(frame.pose.transform), [4, 4])) # [H, W, 6] range_image_top_pose_tensor = tf.reshape( tf.convert_to_tensor(value=range_image_top_pose.data), range_image_top_pose.shape.dims) # [H, W, 3, 3] range_image_top_pose_tensor_rotation = \ transform_utils.get_rotation_matrix( range_image_top_pose_tensor[..., 0], range_image_top_pose_tensor[..., 1], range_image_top_pose_tensor[..., 2]) range_image_top_pose_tensor_translation = \ range_image_top_pose_tensor[..., 3:] range_image_top_pose_tensor = transform_utils.get_transform( range_image_top_pose_tensor_rotation, range_image_top_pose_tensor_translation) for c in calibrations: range_image = range_images[c.name][ri_index] if len(c.beam_inclinations) == 0: beam_inclinations = range_image_utils.compute_inclination( tf.constant( [c.beam_inclination_min, c.beam_inclination_max]), height=range_image.shape.dims[0]) else: beam_inclinations = tf.constant(c.beam_inclinations) beam_inclinations = tf.reverse(beam_inclinations, axis=[-1]) extrinsic = np.reshape(np.array(c.extrinsic.transform), [4, 4]) range_image_tensor = tf.reshape( tf.convert_to_tensor(value=range_image.data), range_image.shape.dims) pixel_pose_local = None frame_pose_local = None if c.name == dataset_pb2.LaserName.TOP: pixel_pose_local = range_image_top_pose_tensor pixel_pose_local = tf.expand_dims(pixel_pose_local, axis=0) frame_pose_local = tf.expand_dims(frame_pose, axis=0) range_image_mask = range_image_tensor[..., 0] > 0 if self.filter_no_label_zone_points: nlz_mask = range_image_tensor[..., 3] != 1.0 # 1.0: in NLZ range_image_mask = range_image_mask & nlz_mask range_image_cartesian = \ range_image_utils.extract_point_cloud_from_range_image( 
tf.expand_dims(range_image_tensor[..., 0], axis=0), tf.expand_dims(extrinsic, axis=0), tf.expand_dims(tf.convert_to_tensor( value=beam_inclinations), axis=0), pixel_pose=pixel_pose_local, frame_pose=frame_pose_local) mask_index = tf.where(range_image_mask) range_image_cartesian = tf.squeeze(range_image_cartesian, axis=0) points_tensor = tf.gather_nd(range_image_cartesian, mask_index) cp = camera_projections[c.name][ri_index] cp_tensor = tf.reshape( tf.convert_to_tensor(value=cp.data), cp.shape.dims) cp_points_tensor = tf.gather_nd(cp_tensor, mask_index) points.append(points_tensor.numpy()) cp_points.append(cp_points_tensor.numpy()) intensity_tensor = tf.gather_nd(range_image_tensor[..., 1], mask_index) intensity.append(intensity_tensor.numpy()) elongation_tensor = tf.gather_nd(range_image_tensor[..., 2], mask_index) elongation.append(elongation_tensor.numpy()) if c.name == 1: mask_index = (ri_index * range_image_mask.shape[0] + mask_index[:, 0] ) * range_image_mask.shape[1] + mask_index[:, 1] mask_index = mask_index.numpy().astype(elongation[-1].dtype) else: mask_index = np.full_like(elongation[-1], -1) mask_indices.append(mask_index) return points, cp_points, intensity, elongation, mask_indices def cart_to_homo(self, mat): """Convert transformation matrix in Cartesian coordinates to homogeneous format. Args: mat (np.ndarray): Transformation matrix in Cartesian. The input matrix shape is 3x3 or 3x4. Returns: np.ndarray: Transformation matrix in homogeneous format. The matrix shape is 4x4. """ ret = np.eye(4) if mat.shape == (3, 3): ret[:3, :3] = mat elif mat.shape == (3, 4): ret[:3, :] = mat else: raise ValueError(mat.shape) return ret ================================================ FILE: tools/deployment/mmdet3d2torchserve.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. from argparse import ArgumentParser, Namespace from pathlib import Path from tempfile import TemporaryDirectory import mmcv try: from model_archiver.model_packaging import package_model from model_archiver.model_packaging_utils import ModelExportUtils except ImportError: package_model = None def mmdet3d2torchserve( config_file: str, checkpoint_file: str, output_folder: str, model_name: str, model_version: str = '1.0', force: bool = False, ): """Converts MMDetection3D model (config + checkpoint) to TorchServe `.mar`. Args: config_file (str): In MMDetection3D config format. The contents vary for each task repository. checkpoint_file (str): In MMDetection3D checkpoint format. The contents vary for each task repository. output_folder (str): Folder where `{model_name}.mar` will be created. The file created will be in TorchServe archive format. model_name (str): If not None, used for naming the `{model_name}.mar` file that will be created under `output_folder`. If None, `{Path(checkpoint_file).stem}` will be used. model_version (str, optional): Model's version. Default: '1.0'. force (bool, optional): If True, if there is an existing `{model_name}.mar` file under `output_folder` it will be overwritten. Default: False. 
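Example (illustrative; file names are placeholders): mmdet3d2torchserve('config.py', 'checkpoint.pth', './model-store', 'my_model') produces ./model-store/my_model.mar.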
""" mmcv.mkdir_or_exist(output_folder) config = mmcv.Config.fromfile(config_file) with TemporaryDirectory() as tmpdir: config.dump(f'{tmpdir}/config.py') args = Namespace( **{ 'model_file': f'{tmpdir}/config.py', 'serialized_file': checkpoint_file, 'handler': f'{Path(__file__).parent}/mmdet3d_handler.py', 'model_name': model_name or Path(checkpoint_file).stem, 'version': model_version, 'export_path': output_folder, 'force': force, 'requirements_file': None, 'extra_files': None, 'runtime': 'python', 'archive_format': 'default' }) manifest = ModelExportUtils.generate_manifest_json(args) package_model(args, manifest) def parse_args(): parser = ArgumentParser( description='Convert MMDetection models to TorchServe `.mar` format.') parser.add_argument('config', type=str, help='config file path') parser.add_argument('checkpoint', type=str, help='checkpoint file path') parser.add_argument( '--output-folder', type=str, required=True, help='Folder where `{model_name}.mar` will be created.') parser.add_argument( '--model-name', type=str, default=None, help='If not None, used for naming the `{model_name}.mar`' 'file that will be created under `output_folder`.' 'If None, `{Path(checkpoint_file).stem}` will be used.') parser.add_argument( '--model-version', type=str, default='1.0', help='Number used for versioning.') parser.add_argument( '-f', '--force', action='store_true', help='overwrite the existing `{model_name}.mar`') args = parser.parse_args() return args if __name__ == '__main__': args = parse_args() if package_model is None: raise ImportError('`torch-model-archiver` is required.' 'Try: pip install torch-model-archiver') mmdet3d2torchserve(args.config, args.checkpoint, args.output_folder, args.model_name, args.model_version, args.force) ================================================ FILE: tools/deployment/mmdet3d_handler.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import base64 import os import numpy as np import torch from ts.torch_handler.base_handler import BaseHandler from mmdet3d.apis import inference_detector, init_model from mmdet3d.core.points import get_points_type class MMdet3dHandler(BaseHandler): """MMDetection3D Handler used in TorchServe. Handler to load models in MMDetection3D, and it will process data to get predicted results. For now, it only supports SECOND. """ threshold = 0.5 load_dim = 4 use_dim = [0, 1, 2, 3] coord_type = 'LIDAR' attribute_dims = None def initialize(self, context): """Initialize function loads the model in MMDetection3D. Args: context (context): It is a JSON Object containing information pertaining to the model artifacts parameters. """ properties = context.system_properties self.map_location = 'cuda' if torch.cuda.is_available() else 'cpu' self.device = torch.device(self.map_location + ':' + str(properties.get('gpu_id')) if torch.cuda. is_available() else self.map_location) self.manifest = context.manifest model_dir = properties.get('model_dir') serialized_file = self.manifest['model']['serializedFile'] checkpoint = os.path.join(model_dir, serialized_file) self.config_file = os.path.join(model_dir, 'config.py') self.model = init_model(self.config_file, checkpoint, self.device) self.initialized = True def preprocess(self, data): """Preprocess function converts data into LiDARPoints class. Args: data (List): Input data from the request. Returns: `LiDARPoints` : The preprocess function returns the input point cloud data as LiDARPoints class. 
""" for row in data: # Compat layer: normally the envelope should just return the data # directly, but older versions of Torchserve didn't have envelope. pts = row.get('data') or row.get('body') if isinstance(pts, str): pts = base64.b64decode(pts) points = np.frombuffer(pts, dtype=np.float32) points = points.reshape(-1, self.load_dim) points = points[:, self.use_dim] points_class = get_points_type(self.coord_type) points = points_class( points, points_dim=points.shape[-1], attribute_dims=self.attribute_dims) return points def inference(self, data): """Inference Function. This function is used to make a prediction call on the given input request. Args: data (`LiDARPoints`): LiDARPoints class passed to make the inference request. Returns: List(dict) : The predicted result is returned in this function. """ results, _ = inference_detector(self.model, data) return results def postprocess(self, data): """Postprocess function. This function makes use of the output from the inference and converts it into a torchserve supported response output. Args: data (List[dict]): The data received from the prediction output of the model. Returns: List: The post process function returns a list of the predicted output. """ output = [] for pts_index, result in enumerate(data): output.append([]) if 'pts_bbox' in result.keys(): pred_bboxes = result['pts_bbox']['boxes_3d'].tensor.numpy() pred_scores = result['pts_bbox']['scores_3d'].numpy() else: pred_bboxes = result['boxes_3d'].tensor.numpy() pred_scores = result['scores_3d'].numpy() index = pred_scores > self.threshold bbox_coords = pred_bboxes[index].tolist() score = pred_scores[index].tolist() output[pts_index].append({'3dbbox': bbox_coords, 'score': score}) return output ================================================ FILE: tools/deployment/test_torchserver.py ================================================ from argparse import ArgumentParser import numpy as np import requests from mmdet3d.apis import inference_detector, init_model def parse_args(): parser = ArgumentParser() parser.add_argument('pcd', help='Point cloud file') parser.add_argument('config', help='Config file') parser.add_argument('checkpoint', help='Checkpoint file') parser.add_argument('model_name', help='The model name in the server') parser.add_argument( '--inference-addr', default='127.0.0.1:8080', help='Address and port of the inference server') parser.add_argument( '--device', default='cuda:0', help='Device used for inference') parser.add_argument( '--score-thr', type=float, default=0.5, help='3d bbox score threshold') args = parser.parse_args() return args def parse_result(input): bbox = input[0]['3dbbox'] result = np.array(bbox) return result def main(args): # build the model from a config file and a checkpoint file model = init_model(args.config, args.checkpoint, device=args.device) # test a single point cloud file model_result, _ = inference_detector(model, args.pcd) # filter the 3d bboxes whose scores > 0.5 if 'pts_bbox' in model_result[0].keys(): pred_bboxes = model_result[0]['pts_bbox']['boxes_3d'].tensor.numpy() pred_scores = model_result[0]['pts_bbox']['scores_3d'].numpy() else: pred_bboxes = model_result[0]['boxes_3d'].tensor.numpy() pred_scores = model_result[0]['scores_3d'].numpy() model_result = pred_bboxes[pred_scores > 0.5] url = 'http://' + args.inference_addr + '/predictions/' + args.model_name with open(args.pcd, 'rb') as points: response = requests.post(url, points) server_result = parse_result(response.json()) assert np.allclose(model_result, server_result) if 
__name__ == '__main__': args = parse_args() main(args) ================================================ FILE: tools/dist_test.sh ================================================ #!/usr/bin/env bash CONFIG=$1 CHECKPOINT=$2 GPUS=$3 NNODES=${NNODES:-1} NODE_RANK=${NODE_RANK:-0} PORT=${PORT:-29500} MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"} PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \ python -m torch.distributed.launch \ --nnodes=$NNODES \ --node_rank=$NODE_RANK \ --master_addr=$MASTER_ADDR \ --nproc_per_node=$GPUS \ --master_port=$PORT \ $(dirname "$0")/test.py \ $CONFIG \ $CHECKPOINT \ --launcher pytorch \ ${@:4} ================================================ FILE: tools/dist_train.sh ================================================ #!/usr/bin/env bash CONFIG=$1 GPUS=$2 NNODES=${NNODES:-1} NODE_RANK=${NODE_RANK:-0} PORT=${PORT:-29500} MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"} PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \ python -m torch.distributed.launch \ --nnodes=$NNODES \ --node_rank=$NODE_RANK \ --master_addr=$MASTER_ADDR \ --nproc_per_node=$GPUS \ --master_port=$PORT \ $(dirname "$0")/train.py \ $CONFIG \ --seed 0 \ --launcher pytorch ${@:3} ================================================ FILE: tools/eval.py ================================================ import numpy as np import mmcv # data = mmcv.load('/mount/data/jiahan/fbbev/test/Sun_Oct_15_11_35/results_nusc_planning.json') data = mmcv.load('/mount/data/FBBEV/test/planner_r50_8x4_12ep_102x102_4f_S111_fix2_/Tue_Oct_24_03_58/results_nusc_planning.json') # sort keys = list(data.keys()) # print(keys) new_keys = [] for key in keys: s =key.split("-") new_keys.append([int(s[1]),int(s[2])]) new_keys=sorted(new_keys,key=(lambda x:(x[0], x[1]))) sorted_keys = [] for key in new_keys: v = ['scene', str(key[0]).zfill(4), str(key[1]) ] k='-'.join(v) sorted_keys.append(k) print(len(data)) all_scene_keys=[] key='-'.join(sorted_keys[0].split("-")[:2]) # print(key) scene=[] for k in sorted_keys: if(key in k): # print(True) scene.append(k) else: s =k.split("-") key='-'.join(s[:2]) if len(scene)<39: print(scene) all_scene_keys.append(scene) scene=[k] # print(all_scene_keys) len(all_scene_keys) # transform raw data new_data={} for keys in all_scene_keys: l = len(keys) for i in range(l): val = [] index = i for j in range(i+1): if index>6: index-=1 else: val.append(data[keys[j]][index]) index-=1 new_data[keys[i]]=val # compute mean and var stable_dist_with_gt=[] stable_mean_distance=[] stable_variance_distance=[] for key, value in new_data.items(): # filter unstable data if(len(value)!=7): continue assert len(value)==7 # compute mean gt = value[-1] pred = value[:-1] coor_mean= np.mean(pred, axis=0) # L2 dist = np.linalg.norm(coor_mean - gt) stable_dist_with_gt.append(dist) # compute var data_array = np.array(pred) distances = np.linalg.norm(data_array - coor_mean, axis=1) mean_distance = np.mean(distances) variance_distance = np.var(distances) stable_mean_distance.append(mean_distance) stable_variance_distance.append(variance_distance) print('stable_dist_with_gt: {}'.format(np.mean(stable_dist_with_gt))) print('stable_mean_distance: {}'.format(np.mean(stable_mean_distance))) print('stable_variance_distance: {}'.format(np.mean(stable_variance_distance))) import random import math import matplotlib.pyplot as plt # generate a list of 40 random colors colors = ['#%02X%02X%02X' % (random.randint(0, 255), random.randint(0, 255), random.randint(0, 255) ) for _ in range(40)] # colors = ['g', 'b', 'c', 'm', 'y', 'k', 'purple', 'orange', 'pink', 'brown', 'lime', 'teal', 'gold', 'indigo',
'slategray', 'violet', 'darkred', 'maroon', 'orchid'] # 20 different color options markers = ['x', 's', 'D', '^', 'v', 'p'] # different marker options for keys in all_scene_keys: all_coor=[] l = len(keys) for i in range(l): coordinates=data[keys[i]] all_coor.extend(coordinates) min_x=min(coor[0] for coor in all_coor) min_y=min(coor[1] for coor in all_coor) max_x=max(coor[0] for coor in all_coor) max_y=max(coor[1] for coor in all_coor) ratio=math.ceil((max_y-min_y)/(max_x-min_x)) plt.figure(figsize=(8, 8*ratio), dpi=300) plt.gca().invert_yaxis() # invert the y-axis so the origin is at the top-left corner gt_traj=[] for i in range(l): coordinates=data[keys[i]] x_coords, y_coords = zip(*coordinates) gt_traj.append(coordinates[0]) color = colors[i % len(colors)] plt.scatter(x_coords[0], y_coords[0], s=15, marker='o',c='r') # plt.scatter(x_coords[1:], y_coords[1:], s=15, marker='o',c=color) for j in range(len(coordinates) - 1): if i+j > l-2: break plt.plot([x_coords[j], x_coords[j + 1]], [y_coords[j], y_coords[j + 1]], '-',c=color, linewidth=0.5) x_gt_coords, y_gt_coords = zip(*gt_traj) for i in range(len(gt_traj) - 1): plt.plot([x_gt_coords[i], x_gt_coords[i + 1]], [y_gt_coords[i], y_gt_coords[i + 1]], '-',c='r', linewidth=1) plt.axis('equal') for i in range(l): col_coordinates=new_data[keys[i]] x_coords, y_coords = zip(*col_coordinates) color = colors[i % len(colors)] for j in range(len(col_coordinates)-1): marker = markers[j % len(markers) ] # plt.plot([x_coords[j], x_coords[j + 1]], [y_coords[j], y_coords[j + 1]], '-',c=color, linewidth=0.5) plt.scatter(x_coords[j], y_coords[j], s=10, marker=marker,c=color) plt.xlabel('X') plt.ylabel('Y') s =keys[0].split("-") key='-'.join(s[:2]) plt.savefig(f'../{key}_111_fix2.png') print(key) plt.close() ================================================ FILE: tools/misc/browse_dataset.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import argparse import warnings from os import path as osp from pathlib import Path import mmcv import numpy as np from mmcv import Config, DictAction, mkdir_or_exist from mmdet3d.core.bbox import (Box3DMode, CameraInstance3DBoxes, Coord3DMode, DepthInstance3DBoxes, LiDARInstance3DBoxes) from mmdet3d.core.visualizer import (show_multi_modality_result, show_result, show_seg_result) from mmdet3d.datasets import build_dataset def parse_args(): parser = argparse.ArgumentParser(description='Browse a dataset') parser.add_argument('config', help='train config file path') parser.add_argument( '--skip-type', type=str, nargs='+', default=['Normalize'], help='skip some useless pipeline') parser.add_argument( '--output-dir', default=None, type=str, help='If there is no display interface, you can save it') parser.add_argument( '--task', type=str, choices=['det', 'seg', 'multi_modality-det', 'mono-det'], help='Determine the visualization method depending on the task.') parser.add_argument( '--aug', action='store_true', help='Whether to visualize augmented datasets or original dataset.') parser.add_argument( '--online', action='store_true', help='Whether to perform online visualization. Note that you often ' 'need a monitor to do so.') parser.add_argument( '--cfg-options', nargs='+', action=DictAction, help='override some settings in the used config, the key-value pair ' 'in xxx=yyy format will be merged into config file. If the value to ' 'be overwritten is a list, it should be like key="[a,b]" or key=a,b ' 'It also allows nested list/tuple values, e.g.
key="[(a,b),(c,d)]" ' 'Note that the quotation marks are necessary and that no white space ' 'is allowed.') args = parser.parse_args() return args def build_data_cfg(config_path, skip_type, aug, cfg_options): """Build data config for loading visualization data.""" cfg = Config.fromfile(config_path) if cfg_options is not None: cfg.merge_from_dict(cfg_options) # extract inner dataset of `RepeatDataset` as `cfg.data.train` # so we don't need to worry about it later if cfg.data.train['type'] == 'RepeatDataset': cfg.data.train = cfg.data.train.dataset # use only first dataset for `ConcatDataset` if cfg.data.train['type'] == 'ConcatDataset': cfg.data.train = cfg.data.train.datasets[0] train_data_cfg = cfg.data.train if aug: show_pipeline = cfg.train_pipeline else: show_pipeline = cfg.eval_pipeline for i in range(len(cfg.train_pipeline)): if cfg.train_pipeline[i]['type'] == 'LoadAnnotations3D': show_pipeline.insert(i, cfg.train_pipeline[i]) # Collect points as well as labels if cfg.train_pipeline[i]['type'] == 'Collect3D': if show_pipeline[-1]['type'] == 'Collect3D': show_pipeline[-1] = cfg.train_pipeline[i] else: show_pipeline.append(cfg.train_pipeline[i]) train_data_cfg['pipeline'] = [ x for x in show_pipeline if x['type'] not in skip_type ] return cfg def to_depth_mode(points, bboxes): """Convert points and bboxes to Depth Coord and Depth Box mode.""" if points is not None: points = Coord3DMode.convert_point(points.copy(), Coord3DMode.LIDAR, Coord3DMode.DEPTH) if bboxes is not None: bboxes = Box3DMode.convert(bboxes.clone(), Box3DMode.LIDAR, Box3DMode.DEPTH) return points, bboxes def show_det_data(input, out_dir, show=False): """Visualize 3D point cloud and 3D bboxes.""" img_metas = input['img_metas']._data points = input['points']._data.numpy() gt_bboxes = input['gt_bboxes_3d']._data.tensor if img_metas['box_mode_3d'] != Box3DMode.DEPTH: points, gt_bboxes = to_depth_mode(points, gt_bboxes) filename = osp.splitext(osp.basename(img_metas['pts_filename']))[0] show_result( points, gt_bboxes.clone(), None, out_dir, filename, show=show, snapshot=True) def show_seg_data(input, out_dir, show=False): """Visualize 3D point cloud and segmentation mask.""" img_metas = input['img_metas']._data points = input['points']._data.numpy() gt_seg = input['pts_semantic_mask']._data.numpy() filename = osp.splitext(osp.basename(img_metas['pts_filename']))[0] show_seg_result( points, gt_seg.copy(), None, out_dir, filename, np.array(img_metas['PALETTE']), img_metas['ignore_index'], show=show, snapshot=True) def show_proj_bbox_img(input, out_dir, show=False, is_nus_mono=False): """Visualize 3D bboxes on 2D image by projection.""" gt_bboxes = input['gt_bboxes_3d']._data img_metas = input['img_metas']._data img = input['img']._data.numpy() # need to transpose channel to first dim img = img.transpose(1, 2, 0) # no 3D gt bboxes, just show img if gt_bboxes.tensor.shape[0] == 0: gt_bboxes = None filename = Path(img_metas['filename']).name if isinstance(gt_bboxes, DepthInstance3DBoxes): show_multi_modality_result( img, gt_bboxes, None, None, out_dir, filename, box_mode='depth', img_metas=img_metas, show=show) elif isinstance(gt_bboxes, LiDARInstance3DBoxes): show_multi_modality_result( img, gt_bboxes, None, img_metas['lidar2img'], out_dir, filename, box_mode='lidar', img_metas=img_metas, show=show) elif isinstance(gt_bboxes, CameraInstance3DBoxes): show_multi_modality_result( img, gt_bboxes, None, img_metas['cam2img'], out_dir, filename, box_mode='camera', img_metas=img_metas, show=show) else: # can't project, just show img 
warnings.warn( f'unrecognized gt box type {type(gt_bboxes)}, only show image') show_multi_modality_result( img, None, None, None, out_dir, filename, show=show) def main(): args = parse_args() if args.output_dir is not None: mkdir_or_exist(args.output_dir) cfg = build_data_cfg(args.config, args.skip_type, args.aug, args.cfg_options) try: dataset = build_dataset( cfg.data.train, default_args=dict(filter_empty_gt=False)) except TypeError: # seg dataset doesn't have `filter_empty_gt` key dataset = build_dataset(cfg.data.train) dataset_type = cfg.dataset_type # configure visualization mode vis_task = args.task # 'det', 'seg', 'multi_modality-det', 'mono-det' progress_bar = mmcv.ProgressBar(len(dataset)) for input in dataset: if vis_task in ['det', 'multi_modality-det']: # show 3D bboxes on 3D point clouds show_det_data(input, args.output_dir, show=args.online) if vis_task in ['multi_modality-det', 'mono-det']: # project 3D bboxes to 2D image show_proj_bbox_img( input, args.output_dir, show=args.online, is_nus_mono=(dataset_type == 'NuScenesMonoDataset')) elif vis_task in ['seg']: # show 3D segmentation mask on 3D point clouds show_seg_data(input, args.output_dir, show=args.online) progress_bar.update() if __name__ == '__main__': main() ================================================ FILE: tools/misc/download.sh ================================================ # Download a checkpoint file from Google Drive filename='dd3d_det_final.pth' # https://drive.google.com/file/d/158ltbC_wjRoe3uBnktbwCgeIByadwxTY/view?usp=share_link # https://drive.google.com/file/d/1gQkhWERCzAosBwG5bh2BKkt1k0TJZt-A/view?usp=share_link fileid='1gQkhWERCzAosBwG5bh2BKkt1k0TJZt-A' wget --load-cookies /tmp/cookies.txt "https://drive.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate 'https://drive.google.com/uc?export=download&id=${fileid}' -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=${fileid}" -O ${filename} ================================================ FILE: tools/misc/fuse_conv_bn.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import argparse import torch from mmcv.runner import save_checkpoint from torch import nn as nn from mmdet3d.apis import init_model def fuse_conv_bn(conv, bn): """During inference, the functionality of batch norm layers is turned off; only the mean and var along channels are used, which exposes the chance to fuse it with the preceding conv layers to save computations and simplify network structures.""" conv_w = conv.weight conv_b = conv.bias if conv.bias is not None else torch.zeros_like( bn.running_mean) factor = bn.weight / torch.sqrt(bn.running_var + bn.eps) conv.weight = nn.Parameter(conv_w * factor.reshape([conv.out_channels, 1, 1, 1])) conv.bias = nn.Parameter((conv_b - bn.running_mean) * factor + bn.bias) return conv def fuse_module(m): last_conv = None last_conv_name = None for name, child in m.named_children(): if isinstance(child, (nn.BatchNorm2d, nn.SyncBatchNorm)): if last_conv is None: # only fuse BN that is after Conv continue fused_conv = fuse_conv_bn(last_conv, child) m._modules[last_conv_name] = fused_conv # To reduce changes, set BN as Identity instead of deleting it.
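# Folding the BN statistics and affine parameters into the preceding conv
# (as done in fuse_conv_bn above) gives
#   w_fused = w_conv * gamma / sqrt(running_var + eps)
#   b_fused = (b_conv - running_mean) * gamma / sqrt(running_var + eps) + beta
# so replacing the BN with nn.Identity() leaves the module output unchanged.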
m._modules[name] = nn.Identity() last_conv = None elif isinstance(child, nn.Conv2d): last_conv = child last_conv_name = name else: fuse_module(child) return m def parse_args(): parser = argparse.ArgumentParser( description='fuse Conv and BN layers in a model') parser.add_argument('config', help='config file path') parser.add_argument('checkpoint', help='checkpoint file path') parser.add_argument('out', help='output path of the converted model') args = parser.parse_args() return args def main(): args = parse_args() # build the model from a config file and a checkpoint file model = init_model(args.config, args.checkpoint) # fuse conv and bn layers of the model fused_model = fuse_module(model) save_checkpoint(fused_model, args.out) if __name__ == '__main__': main() ================================================ FILE: tools/misc/print_config.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import argparse from mmcv import Config, DictAction def parse_args(): parser = argparse.ArgumentParser(description='Print the whole config') parser.add_argument('config', help='config file path') parser.add_argument( '--options', nargs='+', action=DictAction, help='arguments in dict') args = parser.parse_args() return args def main(): args = parse_args() cfg = Config.fromfile(args.config) if args.options is not None: cfg.merge_from_dict(args.options) print(f'Config:\n{cfg.pretty_text}') if __name__ == '__main__': main() ================================================ FILE: tools/misc/tmp.txt ================================================ # Netscape HTTP Cookie File # http://curl.haxx.se/rfc/cookie_spec.html # This file was generated by Cookie-Editor .google.com TRUE / TRUE 1717919270 SAPISID fDOISBW6egQFC6aK/AYuRwhTHDp55K6ccn .google.com TRUE / TRUE 1717919270 __Secure-3PAPISID fDOISBW6egQFC6aK/AYuRwhTHDp55K6ccn .google.com TRUE / TRUE 1693385071 AEC AUEFqZfwniMLFUHoIt6OD4mKbetRVudbGNtaLrck-T7gPkWVxz9BodiWk14 .google.com TRUE /drive FALSE 1713617653 _ga_3WTQFP9ECQ GS1.1-2.1679055684.1.1.1679057653.0.0.0 .google.com TRUE /drive FALSE 1713617653 _ga GA1.2-2.1901804985.1679055685 .google.com TRUE / TRUE 1700296889 NID 511=rk4NeUDX9SGFBufJUHAaorRIa-Fl4MPmY6B2hUaho_L1KK2SQD1dc-0w1MGV-Z_BMFkIES9xOzD8JHz-ywW2j-f_l-9wjuMTrTW2p2Ykge9XU01HugAoZSiR3fK4G_D7maxP3AwWIskdRFLMM3yiGO8EjvAN8V_Q89eoSresXbjSpNV0n4RFNrWq4bIaaDasSU4mmEqoREXGeULS15PcX2Hx9aF9C8FIh0A5tqSJ1ClkxhNouUh5pf8jTPdeqKNB9lBy6qbYM9wUy968rwt-pOetIumwWB7x0Q .google.com TRUE / FALSE 1717919270 APISID pPIDTbOXJnT7lKlc/A4fGqNyULYJeeVvtd .google.com TRUE / TRUE 1716025323 __Secure-1PSIDTS sidts-CjIBLFra0u27E7T2JqfS2rhsCoQmeceLn796H-Ut4GZqXKQg5v5SyYKGSL-OhwScp5CHNxAA drive.google.com TRUE /drive TRUE 1685353314 COMPASS drive=CgAQ8qSdowYaWQAJa4lXUmoMUbVs5gaqovaTKg_TrPkHDxURXheR7ig5ALHoe4GC0baVX590fz95pSKf606cWNPRQvdlqawQDUYeobek7OFDtYNM5LnYCdhOn7Y7vMSvcahE drive.google.com TRUE / TRUE 1717073695 __Secure-OSID VwgfZ96E2CAZyjiD6u_Nx_5J9Y5hb917w3tcSp-Fvp5kgVIQ2Lv5E8fIgvtXiYExVyDZZg. .google.com TRUE / TRUE 1717919270 __Secure-1PAPISID fDOISBW6egQFC6aK/AYuRwhTHDp55K6ccn .google.com TRUE / TRUE 1717919270 __Secure-3PSID WQgfZxy61xHIQI6nrH63KOv8Nt7LeDdOB65vZO17iNbB0VQvKtzPHNMwpPDZZtXpUlYDdA. drive.google.com TRUE / TRUE 1684576399 OTZ 6994673_24_24__24_ .google.com TRUE / FALSE 1687006233 1P_JAR 2023-5-18-12 .google.com TRUE / TRUE 1717919270 __Secure-1PSID WQgfZxy61xHIQI6nrH63KOv8Nt7LeDdOB65vZO17iNbB0VQvIgq7ABf39EliH1-4MFh0bw. 
.google.com TRUE / TRUE 1716025497 __Secure-1PSIDCC AP8dLtz1xdzlPTgBciw5RLQBNR2VfYf_cq5yiAdUv6uH6W0QLclkU5DOa5CsffyvO_hJrNJiKmm7 .google.com TRUE / TRUE 1716025505 __Secure-3PSIDCC AP8dLtzwXzEPEVW8QkHFSVWbGpToU60VUdNUFYqTaqwa76HMtku_K1Mx5WnEFYRSLYSS-eVTalK6 .google.com TRUE / TRUE 1716025323 __Secure-3PSIDTS sidts-CjIBLFra0u27E7T2JqfS2rhsCoQmeceLn796H-Ut4GZqXKQg5v5SyYKGSL-OhwScp5CHNxAA .google.com TRUE / TRUE 1698201757 __Secure-ENID 10.SE=RAec2IGzq1Xo3xWELDpWGbcO3L7Vjn-ZuxxpsVg7wVU3TdXnOGr2p6zVusjvIqoHmYPQiK3hWtXIuFqBoJSf_sOEL46i922mvjDWB_eoSQBBK8yCyrkB1jtzlr_nfZjW0ZDtJUTA7UaDHXiDxRTVOyBKrTdY-k9ZTWeLzWp9LxjzsI8L_Aur09UvApRT01Ycsb1H_LGzQcPYThN3NEhfELdBApDvGJT5w9EH0sFgz7RIz42x5QBNJy4zJlWdeNFJwdgkbA-l4h6q1RpGVq4z .google.com TRUE / TRUE 1699694398 CONSENT PENDING+442 .google.com TRUE / FALSE 1717919270 HSID A8wA95lYkUAKA7rDu drive.google.com TRUE / TRUE 1717073695 OSID VwgfZ96E2CAZyjiD6u_Nx_5J9Y5hb917w3tcSp-Fvp5kgVIQZmJpCHob8u2FFlJTXPv3mA. .google.com TRUE / TRUE 1684575936 S billing-ui-v3=lmhpo1c6WCSMkIRa_Js-b_qLtL8W9dQ6:billing-ui-v3-efe=lmhpo1c6WCSMkIRa_Js-b_qLtL8W9dQ6 .google.com TRUE / FALSE 1697607299 SEARCH_SAMESITE CgQIjJgB .google.com TRUE / FALSE 1717919270 SID WQgfZxy61xHIQI6nrH63KOv8Nt7LeDdOB65vZO17iNbB0VQv7-kmUUio1TuXFFn2qkHV6g. .google.com TRUE / FALSE 1716025497 SIDCC AP8dLtzAhi_Ib-NgPFkfzLkT3mT9_Pn7dhRyf5YdQWjsj6mYNL1-3JSY2BSLd6fLFKDJBYoKuhM .google.com TRUE / TRUE 1717919270 SSID AkJid5vjB9OBQaw4W ================================================ FILE: tools/misc/visualize_results.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import argparse import mmcv from mmcv import Config from mmdet3d.datasets import build_dataset def parse_args(): parser = argparse.ArgumentParser( description='MMDet3D visualize the results') parser.add_argument('config', help='test config file path') parser.add_argument('--result', help='results file in pickle format') parser.add_argument( '--show-dir', help='directory where visualize results will be saved') args = parser.parse_args() return args def main(): args = parse_args() if args.result is not None and \ not args.result.endswith(('.pkl', '.pickle')): raise ValueError('The results file must be a pkl file.') cfg = Config.fromfile(args.config) cfg.data.test.test_mode = True # build the dataset dataset = build_dataset(cfg.data.test) results = mmcv.load(args.result) if getattr(dataset, 'show', None) is not None: # data loading pipeline for showing eval_pipeline = cfg.get('eval_pipeline', {}) if eval_pipeline: dataset.show(results, args.show_dir, pipeline=eval_pipeline) else: dataset.show(results, args.show_dir) # use default pipeline else: raise NotImplementedError( 'Show is not implemented for dataset {}!'.format( type(dataset).__name__)) if __name__ == '__main__': main() ================================================ FILE: tools/model_converters/convert_h3dnet_checkpoints.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import argparse import tempfile import torch from mmcv import Config from mmcv.runner import load_state_dict from mmdet3d.models import build_detector def parse_args(): parser = argparse.ArgumentParser( description='MMDet3D upgrade model version(before v0.6.0) of H3DNet') parser.add_argument('checkpoint', help='checkpoint file') parser.add_argument('--out', help='path of the output checkpoint file') args = parser.parse_args() return args def parse_config(config_strings): """Parse config from strings. 
Args: config_strings (string): strings of model config. Returns: Config: model config """ temp_file = tempfile.NamedTemporaryFile() config_path = f'{temp_file.name}.py' with open(config_path, 'w') as f: f.write(config_strings) config = Config.fromfile(config_path) # Update backbone config if 'pool_mod' in config.model.backbone.backbones: config.model.backbone.backbones.pop('pool_mod') if 'sa_cfg' not in config.model.backbone: config.model.backbone['sa_cfg'] = dict( type='PointSAModule', pool_mod='max', use_xyz=True, normalize_xyz=True) if 'type' not in config.model.rpn_head.vote_aggregation_cfg: config.model.rpn_head.vote_aggregation_cfg['type'] = 'PointSAModule' # Update rpn_head config if 'pred_layer_cfg' not in config.model.rpn_head: config.model.rpn_head['pred_layer_cfg'] = dict( in_channels=128, shared_conv_channels=(128, 128), bias=True) if 'feat_channels' in config.model.rpn_head: config.model.rpn_head.pop('feat_channels') if 'vote_moudule_cfg' in config.model.rpn_head: config.model.rpn_head['vote_module_cfg'] = config.model.rpn_head.pop( 'vote_moudule_cfg') if config.model.rpn_head.vote_aggregation_cfg.use_xyz: config.model.rpn_head.vote_aggregation_cfg.mlp_channels[0] -= 3 for cfg in config.model.roi_head.primitive_list: cfg['vote_module_cfg'] = cfg.pop('vote_moudule_cfg') cfg.vote_aggregation_cfg.mlp_channels[0] -= 3 if 'type' not in cfg.vote_aggregation_cfg: cfg.vote_aggregation_cfg['type'] = 'PointSAModule' if 'type' not in config.model.roi_head.bbox_head.suface_matching_cfg: config.model.roi_head.bbox_head.suface_matching_cfg[ 'type'] = 'PointSAModule' if config.model.roi_head.bbox_head.suface_matching_cfg.use_xyz: config.model.roi_head.bbox_head.suface_matching_cfg.mlp_channels[ 0] -= 3 if 'type' not in config.model.roi_head.bbox_head.line_matching_cfg: config.model.roi_head.bbox_head.line_matching_cfg[ 'type'] = 'PointSAModule' if config.model.roi_head.bbox_head.line_matching_cfg.use_xyz: config.model.roi_head.bbox_head.line_matching_cfg.mlp_channels[0] -= 3 if 'proposal_module_cfg' in config.model.roi_head.bbox_head: config.model.roi_head.bbox_head.pop('proposal_module_cfg') temp_file.close() return config def main(): """Convert keys in checkpoints for VoteNet. There can be some breaking changes during the development of mmdetection3d, and this tool is used for upgrading checkpoints trained with old versions (before v0.6.0) to the latest one. 
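Example (illustrative; file names are placeholders): python tools/model_converters/convert_h3dnet_checkpoints.py old_h3dnet.pth --out h3dnet_upgraded.pth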
""" args = parse_args() checkpoint = torch.load(args.checkpoint) cfg = parse_config(checkpoint['meta']['config']) # Build the model and load checkpoint model = build_detector( cfg.model, train_cfg=cfg.get('train_cfg'), test_cfg=cfg.get('test_cfg')) orig_ckpt = checkpoint['state_dict'] converted_ckpt = orig_ckpt.copy() if cfg['dataset_type'] == 'ScanNetDataset': NUM_CLASSES = 18 elif cfg['dataset_type'] == 'SUNRGBDDataset': NUM_CLASSES = 10 else: raise NotImplementedError RENAME_PREFIX = { 'rpn_head.conv_pred.0': 'rpn_head.conv_pred.shared_convs.layer0', 'rpn_head.conv_pred.1': 'rpn_head.conv_pred.shared_convs.layer1' } DEL_KEYS = [ 'rpn_head.conv_pred.0.bn.num_batches_tracked', 'rpn_head.conv_pred.1.bn.num_batches_tracked' ] EXTRACT_KEYS = { 'rpn_head.conv_pred.conv_cls.weight': ('rpn_head.conv_pred.conv_out.weight', [(0, 2), (-NUM_CLASSES, -1)]), 'rpn_head.conv_pred.conv_cls.bias': ('rpn_head.conv_pred.conv_out.bias', [(0, 2), (-NUM_CLASSES, -1)]), 'rpn_head.conv_pred.conv_reg.weight': ('rpn_head.conv_pred.conv_out.weight', [(2, -NUM_CLASSES)]), 'rpn_head.conv_pred.conv_reg.bias': ('rpn_head.conv_pred.conv_out.bias', [(2, -NUM_CLASSES)]) } # Delete some useless keys for key in DEL_KEYS: converted_ckpt.pop(key) # Rename keys with specific prefix RENAME_KEYS = dict() for old_key in converted_ckpt.keys(): for rename_prefix in RENAME_PREFIX.keys(): if rename_prefix in old_key: new_key = old_key.replace(rename_prefix, RENAME_PREFIX[rename_prefix]) RENAME_KEYS[new_key] = old_key for new_key, old_key in RENAME_KEYS.items(): converted_ckpt[new_key] = converted_ckpt.pop(old_key) # Extract weights and rename the keys for new_key, (old_key, indices) in EXTRACT_KEYS.items(): cur_layers = orig_ckpt[old_key] converted_layers = [] for (start, end) in indices: if end != -1: converted_layers.append(cur_layers[start:end]) else: converted_layers.append(cur_layers[start:]) converted_layers = torch.cat(converted_layers, 0) converted_ckpt[new_key] = converted_layers if old_key in converted_ckpt.keys(): converted_ckpt.pop(old_key) # Check the converted checkpoint by loading to the model load_state_dict(model, converted_ckpt, strict=True) checkpoint['state_dict'] = converted_ckpt torch.save(checkpoint, args.out) if __name__ == '__main__': main() ================================================ FILE: tools/model_converters/convert_votenet_checkpoints.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import argparse import tempfile import torch from mmcv import Config from mmcv.runner import load_state_dict from mmdet3d.models import build_detector def parse_args(): parser = argparse.ArgumentParser( description='MMDet3D upgrade model version(before v0.6.0) of VoteNet') parser.add_argument('checkpoint', help='checkpoint file') parser.add_argument('--out', help='path of the output checkpoint file') args = parser.parse_args() return args def parse_config(config_strings): """Parse config from strings. Args: config_strings (string): strings of model config. 
Returns: Config: model config """ temp_file = tempfile.NamedTemporaryFile() config_path = f'{temp_file.name}.py' with open(config_path, 'w') as f: f.write(config_strings) config = Config.fromfile(config_path) # Update backbone config if 'pool_mod' in config.model.backbone: config.model.backbone.pop('pool_mod') if 'sa_cfg' not in config.model.backbone: config.model.backbone['sa_cfg'] = dict( type='PointSAModule', pool_mod='max', use_xyz=True, normalize_xyz=True) if 'type' not in config.model.bbox_head.vote_aggregation_cfg: config.model.bbox_head.vote_aggregation_cfg['type'] = 'PointSAModule' # Update bbox_head config if 'pred_layer_cfg' not in config.model.bbox_head: config.model.bbox_head['pred_layer_cfg'] = dict( in_channels=128, shared_conv_channels=(128, 128), bias=True) if 'feat_channels' in config.model.bbox_head: config.model.bbox_head.pop('feat_channels') if 'vote_moudule_cfg' in config.model.bbox_head: config.model.bbox_head['vote_module_cfg'] = config.model.bbox_head.pop( 'vote_moudule_cfg') if config.model.bbox_head.vote_aggregation_cfg.use_xyz: config.model.bbox_head.vote_aggregation_cfg.mlp_channels[0] -= 3 temp_file.close() return config def main(): """Convert keys in checkpoints for VoteNet. There can be some breaking changes during the development of mmdetection3d, and this tool is used for upgrading checkpoints trained with old versions (before v0.6.0) to the latest one. """ args = parse_args() checkpoint = torch.load(args.checkpoint) cfg = parse_config(checkpoint['meta']['config']) # Build the model and load checkpoint model = build_detector( cfg.model, train_cfg=cfg.get('train_cfg'), test_cfg=cfg.get('test_cfg')) orig_ckpt = checkpoint['state_dict'] converted_ckpt = orig_ckpt.copy() if cfg['dataset_type'] == 'ScanNetDataset': NUM_CLASSES = 18 elif cfg['dataset_type'] == 'SUNRGBDDataset': NUM_CLASSES = 10 else: raise NotImplementedError RENAME_PREFIX = { 'bbox_head.conv_pred.0': 'bbox_head.conv_pred.shared_convs.layer0', 'bbox_head.conv_pred.1': 'bbox_head.conv_pred.shared_convs.layer1' } DEL_KEYS = [ 'bbox_head.conv_pred.0.bn.num_batches_tracked', 'bbox_head.conv_pred.1.bn.num_batches_tracked' ] EXTRACT_KEYS = { 'bbox_head.conv_pred.conv_cls.weight': ('bbox_head.conv_pred.conv_out.weight', [(0, 2), (-NUM_CLASSES, -1)]), 'bbox_head.conv_pred.conv_cls.bias': ('bbox_head.conv_pred.conv_out.bias', [(0, 2), (-NUM_CLASSES, -1)]), 'bbox_head.conv_pred.conv_reg.weight': ('bbox_head.conv_pred.conv_out.weight', [(2, -NUM_CLASSES)]), 'bbox_head.conv_pred.conv_reg.bias': ('bbox_head.conv_pred.conv_out.bias', [(2, -NUM_CLASSES)]) } # Delete some useless keys for key in DEL_KEYS: converted_ckpt.pop(key) # Rename keys with specific prefix RENAME_KEYS = dict() for old_key in converted_ckpt.keys(): for rename_prefix in RENAME_PREFIX.keys(): if rename_prefix in old_key: new_key = old_key.replace(rename_prefix, RENAME_PREFIX[rename_prefix]) RENAME_KEYS[new_key] = old_key for new_key, old_key in RENAME_KEYS.items(): converted_ckpt[new_key] = converted_ckpt.pop(old_key) # Extract weights and rename the keys for new_key, (old_key, indices) in EXTRACT_KEYS.items(): cur_layers = orig_ckpt[old_key] converted_layers = [] for (start, end) in indices: if end != -1: converted_layers.append(cur_layers[start:end]) else: converted_layers.append(cur_layers[start:]) converted_layers = torch.cat(converted_layers, 0) converted_ckpt[new_key] = converted_layers if old_key in converted_ckpt.keys(): converted_ckpt.pop(old_key) # Check the converted checkpoint by loading to the model load_state_dict(model, 
converted_ckpt, strict=True) checkpoint['state_dict'] = converted_ckpt torch.save(checkpoint, args.out) if __name__ == '__main__': main() ================================================ FILE: tools/model_converters/publish_model.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import argparse import subprocess import torch def parse_args(): parser = argparse.ArgumentParser( description='Process a checkpoint to be published') parser.add_argument('in_file', help='input checkpoint filename') parser.add_argument('out_file', help='output checkpoint filename') args = parser.parse_args() return args def process_checkpoint(in_file, out_file): checkpoint = torch.load(in_file, map_location='cpu') # remove optimizer for smaller file size if 'optimizer' in checkpoint: del checkpoint['optimizer'] # if it is necessary to remove some sensitive data in checkpoint['meta'], # add the code here. torch.save(checkpoint, out_file) sha = subprocess.check_output(['sha256sum', out_file]).decode() final_file = out_file.rstrip('.pth') + '-{}.pth'.format(sha[:8]) subprocess.Popen(['mv', out_file, final_file]) def main(): args = parse_args() process_checkpoint(args.in_file, args.out_file) if __name__ == '__main__': main() ================================================ FILE: tools/model_converters/regnet2mmdet.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import argparse from collections import OrderedDict import torch def convert_stem(model_key, model_weight, state_dict, converted_names): new_key = model_key.replace('stem.conv', 'conv1') new_key = new_key.replace('stem.bn', 'bn1') state_dict[new_key] = model_weight converted_names.add(model_key) print(f'Convert {model_key} to {new_key}') def convert_head(model_key, model_weight, state_dict, converted_names): new_key = model_key.replace('head.fc', 'fc') state_dict[new_key] = model_weight converted_names.add(model_key) print(f'Convert {model_key} to {new_key}') def convert_reslayer(model_key, model_weight, state_dict, converted_names): split_keys = model_key.split('.') layer, block, module = split_keys[:3] block_id = int(block[1:]) layer_name = f'layer{int(layer[1:])}' block_name = f'{block_id - 1}' if block_id == 1 and module == 'bn': new_key = f'{layer_name}.{block_name}.downsample.1.{split_keys[-1]}' elif block_id == 1 and module == 'proj': new_key = f'{layer_name}.{block_name}.downsample.0.{split_keys[-1]}' elif module == 'f': if split_keys[3] == 'a_bn': module_name = 'bn1' elif split_keys[3] == 'b_bn': module_name = 'bn2' elif split_keys[3] == 'c_bn': module_name = 'bn3' elif split_keys[3] == 'a': module_name = 'conv1' elif split_keys[3] == 'b': module_name = 'conv2' elif split_keys[3] == 'c': module_name = 'conv3' new_key = f'{layer_name}.{block_name}.{module_name}.{split_keys[-1]}' else: raise ValueError(f'Unsupported conversion of key {model_key}') print(f'Convert {model_key} to {new_key}') state_dict[new_key] = model_weight converted_names.add(model_key) def convert(src, dst): """Convert keys in pycls pretrained RegNet models to mmdet style.""" # load caffe model regnet_model = torch.load(src) blobs = regnet_model['model_state'] # convert to pytorch style state_dict = OrderedDict() converted_names = set() for key, weight in blobs.items(): if 'stem' in key: convert_stem(key, weight, state_dict, converted_names) elif 'head' in key: convert_head(key, weight, state_dict, converted_names) elif key.startswith('s'): convert_reslayer(key, weight, state_dict, 
converted_names) # check if all layers are converted for key in blobs: if key not in converted_names: print(f'not converted: {key}') # save checkpoint checkpoint = dict() checkpoint['state_dict'] = state_dict torch.save(checkpoint, dst) def main(): parser = argparse.ArgumentParser(description='Convert model keys') parser.add_argument('src', help='src detectron model path') parser.add_argument('dst', help='save path') args = parser.parse_args() convert(args.src, args.dst) if __name__ == '__main__': main() ================================================ FILE: tools/slurm_test.sh ================================================ #!/usr/bin/env bash set -x PARTITION=$1 JOB_NAME=$2 CONFIG=$3 CHECKPOINT=$4 GPUS=${GPUS:-8} GPUS_PER_NODE=${GPUS_PER_NODE:-8} CPUS_PER_TASK=${CPUS_PER_TASK:-5} PY_ARGS=${@:5} SRUN_ARGS=${SRUN_ARGS:-""} PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \ srun -p ${PARTITION} \ --job-name=${JOB_NAME} \ --gres=gpu:${GPUS_PER_NODE} \ --ntasks=${GPUS} \ --ntasks-per-node=${GPUS_PER_NODE} \ --cpus-per-task=${CPUS_PER_TASK} \ --kill-on-bad-exit=1 \ ${SRUN_ARGS} \ python -u tools/test.py ${CONFIG} ${CHECKPOINT} --launcher="slurm" ${PY_ARGS} ================================================ FILE: tools/slurm_train.sh ================================================ #!/usr/bin/env bash set -x PARTITION=$1 JOB_NAME=$2 CONFIG=$3 WORK_DIR=$4 GPUS=${GPUS:-8} GPUS_PER_NODE=${GPUS_PER_NODE:-8} CPUS_PER_TASK=${CPUS_PER_TASK:-5} SRUN_ARGS=${SRUN_ARGS:-""} PY_ARGS=${@:5} PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \ srun -p ${PARTITION} \ --job-name=${JOB_NAME} \ --gres=gpu:${GPUS_PER_NODE} \ --ntasks=${GPUS} \ --ntasks-per-node=${GPUS_PER_NODE} \ --cpus-per-task=${CPUS_PER_TASK} \ --kill-on-bad-exit=1 \ ${SRUN_ARGS} \ python -u tools/train.py ${CONFIG} --work-dir=${WORK_DIR} --launcher="slurm" ${PY_ARGS} ================================================ FILE: tools/test.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import argparse import os import warnings import mmcv import torch from mmcv import Config, DictAction from mmcv.cnn import fuse_conv_bn from mmcv.parallel import MMDataParallel, MMDistributedDataParallel from mmcv.runner import (get_dist_info, init_dist, load_checkpoint, wrap_fp16_model) import mmdet from mmdet3d.apis import single_gpu_test from mmdet3d.datasets import build_dataloader, build_dataset from mmdet3d.models import build_model from mmdet.apis import multi_gpu_test, set_random_seed from mmdet3d.apis.test import custom_multi_gpu_test from mmdet.datasets import replace_ImageToTensor import os.path as osp import time if mmdet.__version__ > '2.23.0': # If mmdet version > 2.23.0, setup_multi_processes would be imported and # used from mmdet instead of mmdet3d. from mmdet.utils import setup_multi_processes else: from mmdet3d.utils import setup_multi_processes try: # If mmdet version > 2.23.0, compat_cfg would be imported and # used from mmdet instead of mmdet3d. 
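# Roughly speaking, compat_cfg rewrites deprecated config fields (e.g.
# imgs_per_gpu and top-level loader settings) into the newer *_dataloader
# layout so that older configs keep working.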
from mmdet.utils import compat_cfg except ImportError: from mmdet3d.utils import compat_cfg def parse_args(): parser = argparse.ArgumentParser( description='MMDet test (and eval) a model') parser.add_argument('config', help='test config file path') parser.add_argument('checkpoint', help='checkpoint file') parser.add_argument('--out', help='output result file in pickle format') parser.add_argument( '--fuse-conv-bn', action='store_true', help='Whether to fuse conv and bn, this will slightly increase' 'the inference speed') parser.add_argument( '--gpu-ids', type=int, nargs='+', help='(Deprecated, please use --gpu-id) ids of gpus to use ' '(only applicable to non-distributed training)') parser.add_argument( '--gpu-id', type=int, default=0, help='id of gpu to use ' '(only applicable to non-distributed testing)') parser.add_argument( '--format-only', action='store_true', help='Format the output results without perform evaluation. It is' 'useful when you want to format the result to a specific format and ' 'submit it to the test server') parser.add_argument( '--eval', type=str, nargs='+', help='evaluation metrics, which depends on the dataset, e.g., "bbox",' ' "segm", "proposal" for COCO, and "mAP", "recall" for PASCAL VOC') parser.add_argument( '--save', action='store_true', help='save occupancy_data') parser.add_argument('--show', action='store_true', help='show results') parser.add_argument( '--show-dir', help='directory where results will be saved') parser.add_argument( '--tag', default='', help='tags') parser.add_argument( '--gpu-collect', action='store_true', help='whether to use gpu to collect results.') parser.add_argument( '--no-aavt', action='store_true', help='Do not align after view transformer.') parser.add_argument( '--tmpdir', help='tmp directory used for collecting results from multiple ' 'workers, available when gpu-collect is not specified') parser.add_argument('--seed', type=int, default=0, help='random seed') parser.add_argument( '--deterministic', action='store_true', help='whether to set deterministic options for CUDNN backend.') parser.add_argument( '--cfg-options', nargs='+', action=DictAction, help='override some settings in the used config, the key-value pair ' 'in xxx=yyy format will be merged into config file. If the value to ' 'be overwritten is a list, it should be like key="[a,b]" or key=a,b ' 'It also allows nested list/tuple values, e.g. 
key="[(a,b),(c,d)]" ' 'Note that the quotation marks are necessary and that no white space ' 'is allowed.') parser.add_argument( '--options', nargs='+', action=DictAction, help='custom options for evaluation, the key-value pair in xxx=yyy ' 'format will be kwargs for dataset.evaluate() function (deprecate), ' 'change to --eval-options instead.') parser.add_argument( '--eval-options', nargs='+', action=DictAction, help='custom options for evaluation, the key-value pair in xxx=yyy ' 'format will be kwargs for dataset.evaluate() function') parser.add_argument( '--launcher', choices=['none', 'pytorch', 'slurm', 'mpi'], default='none', help='job launcher') parser.add_argument('--local_rank', type=int, default=0) args = parser.parse_args() if 'LOCAL_RANK' not in os.environ: os.environ['LOCAL_RANK'] = str(args.local_rank) if args.options and args.eval_options: raise ValueError( '--options and --eval-options cannot be both specified, ' '--options is deprecated in favor of --eval-options') if args.options: warnings.warn('--options is deprecated in favor of --eval-options') args.eval_options = args.options return args def main(): args = parse_args() # assert args.out or args.eval or args.format_only or args.show \ # or args.show_dir, \ # ('Please specify at least one operation (save/eval/format/show the ' # 'results / save the results) with the argument "--out", "--eval"' # ', "--format-only", "--show" or "--show-dir"') if args.eval and args.format_only: raise ValueError('--eval and --format_only cannot be both specified') if args.out is not None and not args.out.endswith(('.pkl', '.pickle')): raise ValueError('The output file must be a pkl file.') cfg = Config.fromfile(args.config) if args.cfg_options is not None: cfg.merge_from_dict(args.cfg_options) cfg = compat_cfg(cfg) # set multi-process settings setup_multi_processes(cfg) # set cudnn_benchmark if cfg.get('cudnn_benchmark', False): torch.backends.cudnn.benchmark = True # each process may have different time out_dir = osp.join('test', args.config.split('/')[-1][:-3]+ '_' + str(args.tag), time.ctime().replace(' ','_').replace(':','_'))[:-8] if args.save: cfg.model.occupancy_save_path = out_dir mmcv.mkdir_or_exist(out_dir) mmcv.mkdir_or_exist(os.path.join(out_dir, 'occupancy_pred')) cfg.model.pretrained = None if args.gpu_ids is not None: cfg.gpu_ids = args.gpu_ids[0:1] warnings.warn('`--gpu-ids` is deprecated, please use `--gpu-id`. ' 'Because we only support single GPU mode in ' 'non-distributed testing. Use the first GPU ' 'in `gpu_ids` now.') else: cfg.gpu_ids = [args.gpu_id] # init distributed env first, since logger depends on the dist info. 
if args.launcher == 'none': distributed = False else: distributed = True init_dist(args.launcher, **cfg.dist_params) test_dataloader_default_args = dict( samples_per_gpu=1, workers_per_gpu=2, dist=distributed, shuffle=False) # in case the test dataset is concatenated if isinstance(cfg.data.test, dict): cfg.data.test.test_mode = True if cfg.data.test_dataloader.get('samples_per_gpu', 1) > 1: # Replace 'ImageToTensor' to 'DefaultFormatBundle' cfg.data.test.pipeline = replace_ImageToTensor( cfg.data.test.pipeline) elif isinstance(cfg.data.test, list): for ds_cfg in cfg.data.test: ds_cfg.test_mode = True if cfg.data.test_dataloader.get('samples_per_gpu', 1) > 1: for ds_cfg in cfg.data.test: ds_cfg.pipeline = replace_ImageToTensor(ds_cfg.pipeline) test_loader_cfg = { **test_dataloader_default_args, **cfg.data.get('test_dataloader', {}) } # set random seeds if args.seed is not None: set_random_seed(args.seed, deterministic=args.deterministic) # build the dataloader dataset = build_dataset(cfg.data.test) data_loader = build_dataloader(dataset, **test_loader_cfg) # build the model and load checkpoint if not args.no_aavt: if '4D' in cfg.model.type: cfg.model.align_after_view_transfromation=True cfg.model.train_cfg = None model = build_model(cfg.model, test_cfg=cfg.get('test_cfg')) fp16_cfg = cfg.get('fp16', None) if fp16_cfg is not None: wrap_fp16_model(model) checkpoint = load_checkpoint(model, args.checkpoint, map_location='cpu', revise_keys=[(r'^module\.', ''), (r'^teacher\.', '')]) if args.fuse_conv_bn: model = fuse_conv_bn(model) # old versions did not save class info in checkpoints, this walkaround is # for backward compatibility sync_bn = cfg.get('sync_bn', False) if distributed and sync_bn: model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model) print('Convert to SyncBatchNorm') if 'CLASSES' in checkpoint.get('meta', {}): model.CLASSES = checkpoint['meta']['CLASSES'] else: model.CLASSES = dataset.CLASSES # palette for visualization in segmentation tasks if 'PALETTE' in checkpoint.get('meta', {}): model.PALETTE = checkpoint['meta']['PALETTE'] elif hasattr(dataset, 'PALETTE'): # segmentation dataset has `PALETTE` attribute model.PALETTE = dataset.PALETTE if not distributed: model = MMDataParallel(model, device_ids=cfg.gpu_ids) outputs = single_gpu_test(model, data_loader, args.show, args.show_dir) else: model = MMDistributedDataParallel( model.cuda(), device_ids=[torch.cuda.current_device()], broadcast_buffers=False) if cfg.get('use_custom_gpu_test', True): outputs = custom_multi_gpu_test(model, data_loader, args.tmpdir, args.gpu_collect) else: outputs = multi_gpu_test(model, data_loader, args.tmpdir, args.gpu_collect) rank, _ = get_dist_info() if rank == 0: if args.out: print(f'\nwriting results to {args.out}') mmcv.dump(outputs, args.out) kwargs = {} if args.eval_options is None else args.eval_options kwargs['jsonfile_prefix'] = out_dir if args.format_only: dataset.format_results(outputs, **kwargs) if True: eval_kwargs = cfg.get('evaluation', {}).copy() # kwargs['save'] = args.save # hard-code way to remove EvalHook args for key in [ 'interval', 'tmpdir', 'start', 'gpu_collect', 'save_best', 'rule' ]: eval_kwargs.pop(key, None) eval_kwargs.update(dict(metric=args.eval, **kwargs)) print(dataset.evaluate(outputs, **eval_kwargs)) if __name__ == '__main__': main() ================================================ FILE: tools/train.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. 
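# Training entry point: parses the config, sets up logging and random seeds,
# builds the model and dataset(s), and delegates to mmdet3d.apis.train_model.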
from __future__ import division import argparse import copy import os import time import warnings from os import path as osp import mmcv import torch import torch.distributed as dist from mmcv import Config, DictAction from mmcv.runner import get_dist_info, init_dist from mmdet import __version__ as mmdet_version from mmdet3d import __version__ as mmdet3d_version from mmdet3d.apis import init_random_seed, train_model from mmdet3d.datasets import build_dataset from mmdet3d.models import build_model from mmdet3d.utils import collect_env, get_root_logger from mmdet.apis import set_random_seed from mmseg import __version__ as mmseg_version from collections import OrderedDict import torch.multiprocessing as mp try: # If mmdet version > 2.20.0, setup_multi_processes would be imported and # used from mmdet instead of mmdet3d. from mmdet.utils import setup_multi_processes except ImportError: from mmdet3d.utils import setup_multi_processes def parse_args(): parser = argparse.ArgumentParser(description='Train a detector') parser.add_argument('config', help='train config file path') parser.add_argument('--work-dir', help='the dir to save logs and models') parser.add_argument( '--resume-from', help='the checkpoint file to resume from') parser.add_argument( '--auto-resume', action='store_true', help='resume from the latest checkpoint automatically') parser.add_argument( '--no-validate', action='store_true', help='whether not to evaluate the checkpoint during training') group_gpus = parser.add_mutually_exclusive_group() group_gpus.add_argument( '--gpus', type=int, help='(Deprecated, please use --gpu-id) number of gpus to use ' '(only applicable to non-distributed training)') group_gpus.add_argument( '--gpu-ids', type=int, nargs='+', help='(Deprecated, please use --gpu-id) ids of gpus to use ' '(only applicable to non-distributed training)') group_gpus.add_argument( '--gpu-id', type=int, default=0, help='number of gpus to use ' '(only applicable to non-distributed training)') parser.add_argument('--seed', type=int, default=0, help='random seed') parser.add_argument( '--diff-seed', action='store_true', help='Whether or not set different seeds for different ranks') parser.add_argument( '--deterministic', action='store_true', help='whether to set deterministic options for CUDNN backend.') parser.add_argument( '--options', nargs='+', action=DictAction, help='override some settings in the used config, the key-value pair ' 'in xxx=yyy format will be merged into config file (deprecate), ' 'change to --cfg-options instead.') parser.add_argument( '--cfg-options', nargs='+', action=DictAction, help='override some settings in the used config, the key-value pair ' 'in xxx=yyy format will be merged into config file. If the value to ' 'be overwritten is a list, it should be like key="[a,b]" or key=a,b ' 'It also allows nested list/tuple values, e.g. 
key="[(a,b),(c,d)]" ' 'Note that the quotation marks are necessary and that no white space ' 'is allowed.') parser.add_argument( '--launcher', choices=['none', 'pytorch', 'slurm', 'mpi'], default='none', help='job launcher') parser.add_argument('--local_rank', type=int, default=0) parser.add_argument( '--autoscale-lr', action='store_true', help='automatically scale lr with the number of gpus') args = parser.parse_args() if 'LOCAL_RANK' not in os.environ: os.environ['LOCAL_RANK'] = str(args.local_rank) if args.options and args.cfg_options: raise ValueError( '--options and --cfg-options cannot be both specified, ' '--options is deprecated in favor of --cfg-options') if args.options: warnings.warn('--options is deprecated in favor of --cfg-options') args.cfg_options = args.options return args def main(): args = parse_args() cfg = Config.fromfile(args.config) if args.cfg_options is not None: cfg.merge_from_dict(args.cfg_options) # set multi-process settings setup_multi_processes(cfg) # set cudnn_benchmark if cfg.get('cudnn_benchmark', False): torch.backends.cudnn.benchmark = True # work_dir is determined in this priority: CLI > segment in file > filename if args.work_dir is not None: # update configs according to CLI args if args.work_dir is not None cfg.work_dir = args.work_dir elif cfg.get('work_dir', None) is None: # use config filename as default work_dir if cfg.work_dir is None cfg.work_dir = osp.join('./work_dirs', osp.splitext(osp.basename(args.config))[0]) if osp.isfile(osp.join(cfg.work_dir, 'done.txt')): print('job has finished, designed for NVIDIA ORD') exit(1) if args.resume_from is not None and osp.isfile(args.resume_from): cfg.resume_from = args.resume_from if cfg.resume_from is not None and not osp.isfile(cfg.resume_from): cfg.resume_from = None if args.auto_resume: cfg.auto_resume = args.auto_resume warnings.warn('`--auto-resume` is only supported when mmdet' 'version >= 2.20.0 for 3D detection model or' 'mmsegmentation verision >= 0.21.0 for 3D' 'segmentation model') if args.gpus is not None: cfg.gpu_ids = range(1) warnings.warn('`--gpus` is deprecated because we only support ' 'single GPU mode in non-distributed training. ' 'Use `gpus=1` now.') if args.gpu_ids is not None: cfg.gpu_ids = args.gpu_ids[0:1] warnings.warn('`--gpu-ids` is deprecated, please use `--gpu-id`. ' 'Because we only support single GPU mode in ' 'non-distributed training. Use the first GPU ' 'in `gpu_ids` now.') if args.gpus is None and args.gpu_ids is None: cfg.gpu_ids = [args.gpu_id] if args.autoscale_lr: # apply the linear scaling rule (https://arxiv.org/abs/1706.02677) cfg.optimizer['lr'] = cfg.optimizer['lr'] * len(cfg.gpu_ids) / 8 # init distributed env first, since logger depends on the dist info. 
if args.launcher == 'none': distributed = False rank = 0 else: distributed = True init_dist(args.launcher, **cfg.dist_params) # re-set gpu_ids with distributed training mode rank, world_size = get_dist_info() cfg.gpu_ids = range(world_size) gpu = rank % torch.cuda.device_count() os.environ['LOCAL_RANK'] = str(gpu) for each in cfg.log_config['hooks']: if each['type'] == 'WandbLoggerHook': each['init_kwargs']['name'] = args.config.split('/')[-1] each['init_kwargs']['config'] = dict() each['init_kwargs']['resume'] = 'allow' each['init_kwargs']['config']['job_id'] = os.environ.get('HOSTNAME','None') each['init_kwargs']['config']['link'] = dict() for key in ['model', 'lr_config', 'load_from', 'fp16', 'optimizer', 'data', 'train_pipeline', 'data_config']: each['init_kwargs']['config'][key] = dict(cfg._cfg_dict).get(key, 'None') break # create work_dir mmcv.mkdir_or_exist(osp.abspath(cfg.work_dir)) # dump config cfg.dump(osp.join(cfg.work_dir, osp.basename(args.config))) # init the logger before other steps timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime()) log_file = osp.join(cfg.work_dir, f'{timestamp}.log') # specify logger name, if we still use 'mmdet', the output info will be # filtered and won't be saved in the log_file # TODO: ugly workaround to judge whether we are training det or seg model if cfg.model.type in ['EncoderDecoder3D']: logger_name = 'mmseg' else: logger_name = 'mmdet' logger = get_root_logger( log_file=log_file, log_level=cfg.log_level, name=logger_name) # init the meta dict to record some important information such as # environment info and seed, which will be logged meta = dict() # log env info env_info_dict = collect_env() env_info = '\n'.join([(f'{k}: {v}') for k, v in env_info_dict.items()]) dash_line = '-' * 60 + '\n' logger.info('Environment info:\n' + dash_line + env_info + '\n' + dash_line) meta['env_info'] = env_info meta['config'] = cfg.pretty_text # log some basic info logger.info(f'Distributed training: {distributed}') logger.info(f'Config:\n{cfg.pretty_text}') # set random seeds seed = init_random_seed(args.seed) seed = seed + dist.get_rank() if args.diff_seed else seed logger.info(f'Set random seed to {seed}, ' f'deterministic: {args.deterministic}') set_random_seed(seed, deterministic=args.deterministic) cfg.seed = seed meta['seed'] = seed meta['exp_name'] = osp.basename(args.config) model = build_model( cfg.model, train_cfg=cfg.get('train_cfg'), test_cfg=cfg.get('test_cfg')) model.init_weights() sync_bn = cfg.get('sync_bn', False) if distributed and sync_bn: model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model) print('Convert to SyncBatchNorm') if 'trainable_components' in cfg: logger.info(f"param need to update:", cfg['trainable_components']) for key in cfg['trainable_components']: for name, param in model.named_parameters(): if key not in name: param.requires_grad = False from torch import nn def fix_bn(m): if isinstance(m, nn.BatchNorm1d) or isinstance(m, nn.BatchNorm2d) or isinstance(m, nn.SyncBatchNorm): m.track_running_stats = False model.img_backbone.apply(fix_bn) model.img_neck.apply(fix_bn) model.depth_net.apply(fix_bn) model.forward_projection.apply(fix_bn) model.img_bev_encoder_backbone.apply(fix_bn) model.img_bev_encoder_neck.apply(fix_bn) model.pts_bbox_head.apply(fix_bn) # model.pts_bbox_head.ego_fut_decoder.apply(fix_bn) # model.pts_voxel_layer.apply(fix_bn) # model.pts_voxel_encoder.apply(fix_bn) # model.pts_middle_encoder.apply(fix_bn) # model.pts_backbone.apply(fix_bn) # model.pts_neck.apply(fix_bn) 
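# With `trainable_components` set, parameters outside the listed components are
# frozen and the BatchNorm layers of the image/BEV modules above stop tracking
# running statistics, i.e. a partial fine-tuning setup.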
logger.info(f'Model:\n{model}') datasets = [build_dataset(cfg.data.train)] if len(cfg.workflow) == 2: val_dataset = copy.deepcopy(cfg.data.val) # in case we use a dataset wrapper if 'dataset' in cfg.data.train: val_dataset.pipeline = cfg.data.train.dataset.pipeline else: val_dataset.pipeline = cfg.data.train.pipeline # set test_mode=False here in deep copied config # which do not affect AP/AR calculation later # refer to https://mmdetection3d.readthedocs.io/en/latest/tutorials/customize_runtime.html#customize-workflow # noqa val_dataset.test_mode = False datasets.append(build_dataset(val_dataset)) if cfg.checkpoint_config is not None: # save mmdet version, config file content and class names in # checkpoints as meta data cfg.checkpoint_config.meta = dict( mmdet_version=mmdet_version, mmseg_version=mmseg_version, mmdet3d_version=mmdet3d_version, config=cfg.pretty_text, CLASSES=datasets[0].CLASSES, PALETTE=datasets[0].PALETTE # for segmentors if hasattr(datasets[0], 'PALETTE') else None) # add an attribute for visualization convenience model.CLASSES = datasets[0].CLASSES torch.backends.cuda.matmul.allow_tf32 = True torch.backends.cudnn.allow_tf32 = True train_model( model, datasets, cfg, distributed=distributed, validate=(not args.no_validate), timestamp=timestamp, meta=meta) with open(osp.join(cfg.work_dir, 'done.txt'), 'w') as f: f.write('done: ' + time.ctime()) if __name__ == '__main__': mp.set_start_method('spawn') main() ================================================ FILE: tools/update_data_coords.py ================================================ import argparse import time from os import path as osp import mmcv import numpy as np from mmdet3d.core.bbox import limit_period def update_sunrgbd_infos(root_dir, out_dir, pkl_files): print(f'{pkl_files} will be modified because ' f'of the refactor of the Depth coordinate system.') if root_dir == out_dir: print(f'Warning, you are overwriting ' f'the original data under {root_dir}.') time.sleep(3) for pkl_file in pkl_files: in_path = osp.join(root_dir, pkl_file) print(f'Reading from input file: {in_path}.') a = mmcv.load(in_path) print('Start updating:') for item in mmcv.track_iter_progress(a): if 'rotation_y' in item['annos']: item['annos']['rotation_y'] = -item['annos']['rotation_y'] item['annos']['gt_boxes_upright_depth'][:, -1:] = \ -item['annos']['gt_boxes_upright_depth'][:, -1:] out_path = osp.join(out_dir, pkl_file) print(f'Writing to output file: {out_path}.') mmcv.dump(a, out_path, 'pkl') def update_outdoor_dbinfos(root_dir, out_dir, pkl_files): print(f'{pkl_files} will be modified because ' f'of the refactor of the LIDAR coordinate system.') if root_dir == out_dir: print(f'Warning, you are overwriting ' f'the original data under {root_dir}.') time.sleep(3) for pkl_file in pkl_files: in_path = osp.join(root_dir, pkl_file) print(f'Reading from input file: {in_path}.') a = mmcv.load(in_path) print('Start updating:') for k in a.keys(): print(f'Updating samples of class {k}:') for item in mmcv.track_iter_progress(a[k]): boxes = item['box3d_lidar'].copy() # swap l, w (or dx, dy) item['box3d_lidar'][3] = boxes[4] item['box3d_lidar'][4] = boxes[3] # change yaw item['box3d_lidar'][6] = -boxes[6] - np.pi / 2 item['box3d_lidar'][6] = limit_period( item['box3d_lidar'][6], period=np.pi * 2) out_path = osp.join(out_dir, pkl_file) print(f'Writing to output file: {out_path}.') mmcv.dump(a, out_path, 'pkl') def update_nuscenes_or_lyft_infos(root_dir, out_dir, pkl_files): print(f'{pkl_files} will be modified because ' f'of the refactor of the LIDAR 
coordinate system.') if root_dir == out_dir: print(f'Warning, you are overwriting ' f'the original data under {root_dir}.') time.sleep(3) for pkl_file in pkl_files: in_path = osp.join(root_dir, pkl_file) print(f'Reading from input file: {in_path}.') a = mmcv.load(in_path) print('Start updating:') for item in mmcv.track_iter_progress(a['infos']): boxes = item['gt_boxes'].copy() # swap l, w (or dx, dy) item['gt_boxes'][:, 3] = boxes[:, 4] item['gt_boxes'][:, 4] = boxes[:, 3] # change yaw item['gt_boxes'][:, 6] = -boxes[:, 6] - np.pi / 2 item['gt_boxes'][:, 6] = limit_period( item['gt_boxes'][:, 6], period=np.pi * 2) out_path = osp.join(out_dir, pkl_file) print(f'Writing to output file: {out_path}.') mmcv.dump(a, out_path, 'pkl') parser = argparse.ArgumentParser(description='Arg parser for data coords ' 'update due to coords sys refactor.') parser.add_argument('dataset', metavar='kitti', help='name of the dataset') parser.add_argument( '--root-dir', type=str, default='./data/kitti', help='specify the root dir of dataset') parser.add_argument( '--version', type=str, default='v1.0', required=False, help='specify the dataset version, no need for kitti') parser.add_argument( '--out-dir', type=str, default=None, required=False, help='name of info pkl') args = parser.parse_args() if __name__ == '__main__': if args.out_dir is None: args.out_dir = args.root_dir if args.dataset == 'kitti': # KITTI infos is in CAM coord sys (unchanged) # KITTI dbinfos is in LIDAR coord sys (changed) # so we only update dbinfos pkl_files = ['kitti_dbinfos_train.pkl'] update_outdoor_dbinfos( root_dir=args.root_dir, out_dir=args.out_dir, pkl_files=pkl_files) elif args.dataset == 'nuscenes': # nuScenes infos is in LIDAR coord sys (changed) # nuScenes dbinfos is in LIDAR coord sys (changed) # so we update both infos and dbinfos pkl_files = ['nuscenes_infos_val.pkl'] if args.version != 'v1.0-mini': pkl_files.append('nuscenes_infos_train.pkl') else: pkl_files.append('nuscenes_infos_train_tiny.pkl') update_nuscenes_or_lyft_infos( root_dir=args.root_dir, out_dir=args.out_dir, pkl_files=pkl_files) if args.version != 'v1.0-mini': pkl_files = ['nuscenes_dbinfos_train.pkl'] update_outdoor_dbinfos( root_dir=args.root_dir, out_dir=args.out_dir, pkl_files=pkl_files) elif args.dataset == 'lyft': # Lyft infos is in LIDAR coord sys (changed) # Lyft has no dbinfos # so we update infos pkl_files = ['lyft_infos_train.pkl', 'lyft_infos_val.pkl'] update_nuscenes_or_lyft_infos( root_dir=args.root_dir, out_dir=args.out_dir, pkl_files=pkl_files) elif args.dataset == 'waymo': # Waymo infos is in CAM coord sys (unchanged) # Waymo dbinfos is in LIDAR coord sys (changed) # so we only update dbinfos pkl_files = ['waymo_dbinfos_train.pkl'] update_outdoor_dbinfos( root_dir=args.root_dir, out_dir=args.out_dir, pkl_files=pkl_files) elif args.dataset == 'scannet': # ScanNet infos is in DEPTH coord sys (changed) # but bbox is without yaw # so ScanNet is unaffected pass elif args.dataset == 's3dis': # Segmentation datasets are not affected pass elif args.dataset == 'sunrgbd': # SUNRGBD infos is in DEPTH coord sys (changed) # and bbox is with yaw # so we update infos pkl_files = ['sunrgbd_infos_train.pkl', 'sunrgbd_infos_val.pkl'] update_sunrgbd_infos( root_dir=args.root_dir, out_dir=args.out_dir, pkl_files=pkl_files) ================================================ FILE: tools/update_data_coords.sh ================================================ #!/usr/bin/env bash set -x export PYTHONPATH=`pwd`:$PYTHONPATH PARTITION=$1 DATASET=$2 GPUS=${GPUS:-1} 
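# The conversion below only rewrites pkl files, so the default of one task/GPU
# is enough; override GPUS / GPUS_PER_NODE via the environment if needed.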
GPUS_PER_NODE=${GPUS_PER_NODE:-1}
SRUN_ARGS=${SRUN_ARGS:-""}
JOB_NAME=update_data_coords

srun -p ${PARTITION} \
    --job-name=${JOB_NAME} \
    --gres=gpu:${GPUS_PER_NODE} \
    --ntasks=${GPUS} \
    --ntasks-per-node=${GPUS_PER_NODE} \
    --kill-on-bad-exit=1 \
    ${SRUN_ARGS} \
    python -u tools/update_data_coords.py ${DATASET} \
    --root-dir ./data/${DATASET} \
    --out-dir ./data/${DATASET}
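The Slurm wrappers above read GPUS, GPUS_PER_NODE and SRUN_ARGS (plus CPUS_PER_TASK for the train/test wrappers) from the environment, and the train/test wrappers forward any remaining arguments to the underlying Python entry point. A minimal usage sketch, assuming a Slurm cluster, is given below; the partition, work directory and checkpoint path are placeholders, and the config is one of the files under configs/bev_next/.

# Train on 8 GPUs of a single node.
GPUS=8 GPUS_PER_NODE=8 bash tools/slurm_train.sh <partition> bev_planner \
    configs/bev_next/bev_planner.py work_dirs/bev_planner

# Evaluate a checkpoint from the run above (the metric name depends on the dataset).
GPUS=8 GPUS_PER_NODE=8 bash tools/slurm_test.sh <partition> bev_planner_eval \
    configs/bev_next/bev_planner.py work_dirs/bev_planner/latest.pth --eval bbox

# Rewrite pre-refactor nuScenes info/dbinfo pkls under ./data/nuscenes in place.
bash tools/update_data_coords.sh <partition> nuscenes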