Repository: Nicholasli1995/EgoNet Branch: master Commit: 13e3758388ab Files: 56 Total size: 467.4 KB Directory structure: gitextract_zura5gg8/ ├── .gitignore ├── LICENSE ├── README.md ├── configs/ │ ├── KITTI_inference:demo.yml │ ├── KITTI_inference:test_submission.yml │ ├── KITTI_train_IGRs.yml │ ├── KITTI_train_IGRs_Ped.yml │ └── KITTI_train_lifting.yml ├── docs/ │ ├── demo.md │ ├── inference.md │ ├── preparation.md │ ├── spec-list.txt │ └── training.md ├── libs/ │ ├── arguments/ │ │ ├── __init__.py │ │ └── parse.py │ ├── common/ │ │ ├── __init__.py │ │ ├── format.py │ │ ├── img_proc.py │ │ ├── transformation.py │ │ └── utils.py │ ├── dataset/ │ │ ├── KITTI/ │ │ │ ├── __init__.py │ │ │ └── car_instance.py │ │ ├── __init__.py │ │ ├── basic/ │ │ │ ├── __init__.py │ │ │ └── basic_classes.py │ │ └── normalization/ │ │ ├── __init__.py │ │ └── operations.py │ ├── logger/ │ │ ├── __init__.py │ │ └── logger.py │ ├── loss/ │ │ ├── __init__.py │ │ └── function.py │ ├── metric/ │ │ └── criterions.py │ ├── model/ │ │ ├── FCmodel.py │ │ ├── __init__.py │ │ ├── egonet.py │ │ └── heatmapModel/ │ │ ├── __init__.py │ │ ├── hrnet.py │ │ └── resnet.py │ ├── optimizer/ │ │ ├── __init__.py │ │ └── optimizer.py │ ├── trainer/ │ │ ├── __init__.py │ │ ├── accuracy.py │ │ └── trainer.py │ └── visualization/ │ ├── __init__.py │ ├── debug.py │ ├── egonet_utils.py │ └── points.py └── tools/ ├── inference.py ├── inference_legacy.py ├── kitti-eval/ │ ├── README.md │ ├── evaluate_object_3d.cpp │ ├── evaluate_object_3d_offline.cpp │ ├── evaluate_object_3d_offline_r40.cpp │ └── mail.h ├── train_IGRs.py └── train_lifting.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: .gitignore ================================================ #/* **/__pycache__ .spyproject/ *.log *.ini *.bak *.pth *.csv *.jpg *.png *.pdf /tools/kitti-eval/evaluate_object_3d_offline /outputs 
================================================ FILE: LICENSE ================================================ MIT License Copyright (c) 2022 Shichao Li Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ================================================ FILE: README.md ================================================ [![PWC](https://img.shields.io/endpoint.svg?url=https://paperswithcode.com/badge/exploring-intermediate-representation-for/vehicle-pose-estimation-on-kitti-cars-hard)](https://paperswithcode.com/sota/vehicle-pose-estimation-on-kitti-cars-hard?p=exploring-intermediate-representation-for) # EgoNet Official project website for the CVPR 2021 paper "Exploring intermediate representation for monocular vehicle pose estimation". This repo includes an implementation that performs vehicle orientation estimation on the KITTI dataset from a single RGB image. News: (2022-??-??): v-1.1 will be released, which will include pre-trained models for other object classes (Pedestrian and Cyclist in KITTI). 
(2021-08-16): v-1.0 is released. The training documentation is added. (2021-06-21): v-0.9 (beta version) is released. **The inference utility is here!** For Q&A, go to [discussions](https://github.com/Nicholasli1995/EgoNet/discussions). If you believe there is a technical problem, submit to [issues](https://github.com/Nicholasli1995/EgoNet/issues). (2021-06-16): This repo is under final code cleaning and documentation preparation. Stay tuned and come back in a week! **Check our 5-min video ([Youtube](https://www.youtube.com/watch?v=isKo0F3MU68), [爱奇艺](https://www.iqiyi.com/v_y6lrdy33kg.html)) for an introduction.** **中文详解**:[哔哩哔哩](https://www.bilibili.com/video/BV1jP4y1t7ee)

## Run a demo with a one-line command! Check instructions [here](https://github.com/Nicholasli1995/EgoNet/blob/master/docs/demo.md).

## Performance: APBEV@R40 on KITTI val set for Car (monocular RGB) The validation results in the paper were based on R11; the results using R40 are attached here. | Method | Reference|Easy|Moderate|Hard| | ------------------------- | ---------------| --------------| --------------| --------------| |[M3D-RPN](https://arxiv.org/abs/1907.06038)|ICCV 2019|20.85| 15.62| 11.88| |[MonoDIS](https://openaccess.thecvf.com/content_ICCV_2019/papers/Simonelli_Disentangling_Monocular_3D_Object_Detection_ICCV_2019_paper.pdf)|ICCV 2019|18.45 |12.58 |10.66| |[MonoPair](https://arxiv.org/abs/2003.00504)|CVPR 2020|24.12| 18.17| 15.76| |[D4LCN](https://github.com/dingmyu/D4LCN)|CVPR 2020|31.53 |22.58 |17.87| |[Kinematic3D](https://arxiv.org/abs/2007.09548)|ECCV 2020|27.83| 19.72| 15.10| |[GrooMeD-NMS](https://github.com/abhi1kumar/groomed_nms)|CVPR 2021 |27.38|19.75|15.92| |[MonoDLE](https://github.com/xinzhuma/monodle)|CVPR 2021|24.97| 19.33| 17.01| |Ours (@R11) |CVPR 2021 |**33.60**|**25.38**|**22.80**| |Ours (@R40) |CVPR 2021 |**34.31**|**24.80**|**20.16**| ## Performance: AOS@R40 on KITTI test set for Car (RGB) | Method | Reference|Configuration|Easy|Moderate|Hard| | ------------------------- | ---------------| --------------| --------------| --------------| --------------| |[M3D-RPN](https://arxiv.org/abs/1907.06038)|ICCV 2019|Monocular|88.38 |82.81| 67.08| |[DSGN](https://github.com/Jia-Research-Lab/DSGN)|CVPR 2020|Stereo|95.42|86.03| 78.27| |[Disp-RCNN](https://github.com/zju3dv/disprcnn)|CVPR 2020|Stereo |93.02 | 81.70 | 67.16| |[MonoPair](https://arxiv.org/abs/2003.00504)|CVPR 2020|Monocular|91.65 |86.11 |76.45| |[D4LCN](https://github.com/dingmyu/D4LCN)|CVPR 2020|Monocular|90.01|82.08| 63.98| |[Kinematic3D](https://arxiv.org/abs/2007.09548)|ECCV 2020|Monocular|58.33 | 45.50 | 34.81| |[MonoDLE](https://github.com/xinzhuma/monodle)|CVPR 2021|Monocular|93.46| 90.23| 80.11| 
|[Ours](http://www.cvlibs.net/datasets/kitti/eval_object_detail.php?&result=e5233225fd5ef36fa63eb00252d9c00024961f2c) |CVPR 2021 |Monocular|**96.11**|**91.23**|**80.96**| ## Inference/Deployment Check instructions [here](https://github.com/Nicholasli1995/EgoNet/blob/master/docs/inference.md) to **reproduce** the above quantitative results. ## Training Check instructions [here](https://github.com/Nicholasli1995/EgoNet/blob/master/docs/training.md) to train Ego-Net and learn how to prepare your own training dataset other than KITTI. ## Citation Please star this repository and cite the following paper in your publications if it helps your research: @InProceedings{Li_2021_CVPR, author = {Li, Shichao and Yan, Zengqiang and Li, Hongyang and Cheng, Kwang-Ting}, title = {Exploring intermediate representation for monocular vehicle pose estimation}, booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, month = {June}, year = {2021}, pages = {1873-1883} } ## License This repository can be used freely for non-commercial purposes. Contact me if you are interested in a commercial license. 
## Links Link to the paper: [Exploring intermediate representation for monocular vehicle pose estimation](https://arxiv.org/abs/2011.08464) Link to the presentation video: [Youtube](https://www.youtube.com/watch?v=isKo0F3MU68), [爱奇艺](https://www.iqiyi.com/v_y6lrdy33kg.html) Relevant ECCV 2020 work: [GSNet](https://github.com/lkeab/gsnet) ================================================ FILE: configs/KITTI_inference:demo.yml ================================================ # This is a YAML file storing experimental configurations for KITTI dataset ## general settings name: 'refine a given set of predictions from D4LCN' exp_type: 'inference' model_type: 'heatmapModel' use_gpu: True use_pred_box: True use_gt_box: True gpu_id: [0] ## operations train: False save: False evaluate: False inference: True ## used directories dirs: # output directory output: 'YOUR_OURPUT_DIR' ckpt: 'YOUR_PRETRAINED_DIR' load_prediction_file: '../resources/D4LCN/data' ## CUDNN settings cudnn: enabled: True deterministic: True benchmark: False ## dataset settings dataset: name: 'KITTI' split: 'valid' detect_classes: ['Car'] 3d_kpt_sample_style: 'bbox9' # interpolate the 3D bbox interpolate: flag: True style: 'bbox12' coef: [0.332, 0.667] # do some pre-processing pre-process: False root: 'YOUR_KITTI_DIR' # augmentation parameters scaling_factor: 0.2 rotation_factor: 30. # degrees # pytorch image transformation setting pth_transform: # mean: [0.485, 0.456, 0.406, 0., 0.] # std: [0.229, 0.224, 0.225, 1., 1.] 
mean: [0.485, 0.456, 0.406] std: [0.229, 0.224, 0.225] # annotation style for 2d key-point 2d_kpt_style: 'bbox9' # projected 3d bounding box corner and center points # input-output representation for 2d-to-3d lifting lft_in_rep: 'coordinates2d' # 2d coordinates on screen lft_out_rep: 'R3d+T' # 3d coordinates relative to centroid plus translation vector ## model settings for a fully-connected network if used FCModel: name: 'lifter' refine_3d: False norm_twoD: False num_blocks: 2 input_size: 66 output_size: 96 num_neurons: 1024 dropout: 0.5 leaky: False ## settings for a fully-convolutional heatmap/coordinate regression model heatmapModel: name: hrnet # here a high-resolution (hr) model is used add_xy: False # concatenate xy coodrinate maps along with the input jitter_bbox: False jitter_params: shift: - 0.1 - 0.1 scaling: - 0.4 - 0.4 input_size: - 256 - 256 # rotate and scaling and input images augment_input: False # one can choose to regress dense semantic heatmaps or coordinates head_type: 'coordinates' # up-sampling with pixel-shuffle pixel_shuffle: False # if an intermediate heatmap is produced heatmap_size: - 64 - 64 init_weights: true num_joints: 33 extra: pretrained_layers: - 'conv1' - 'bn1' - 'conv2' - 'bn2' - 'layer1' - 'transition1' - 'stage2' - 'transition2' - 'stage3' - 'transition3' - 'stage4' final_conv_kernel: 1 stage2: num_modules: 1 num_branches: 2 block: basic num_blocks: - 4 - 4 num_channels: - 48 - 96 fuse_method: sum stage3: num_modules: 4 num_branches: 3 block: basic num_blocks: - 4 - 4 - 4 num_channels: - 48 - 96 - 192 fuse_method: sum stage4: num_modules: 3 num_branches: 4 block: basic num_blocks: - 4 - 4 - 4 - 4 num_channels: - 48 - 96 - 192 - 384 fuse_method: sum ## testing settings testing_settings: batch_size: 1 num_threads: 0 shuffle: True pin_memory: False apply_dropout: False unnormalize: False alpha_mode: 'proj' ================================================ FILE: configs/KITTI_inference:test_submission.yml 
================================================ # YAML file storing experimental configurations for KITTI dataset ## general settings name: 'produce vehicle pose predictions on KITTI test split given detected bounding boxes' exp_type: 'inference' model_type: 'heatmapModel' use_gpu: True use_pred_box: True use_gt_box: False gpu_id: [0] ## operations train: False save: False visualize: False # visualize during inference batch_to_show: 1000000 # how many batches to visualize if needed evaluate: False inference: True conf_thres: 0.1 # discard low score boxes ## used directories dirs: # output directory output: 'YOUR_OURPUT_DIR' ckpt: 'YOUR_PRETRAINED_DIR' # raw detection results on test set by using RRC-Net load_prediction_file: '../resources/test_boxes' ## CUDNN settings cudnn: enabled: True deterministic: True benchmark: False ## dataset settings dataset: name: 'KITTI' split: 'test' detect_classes: ['Car'] 3d_kpt_sample_style: 'bbox9' # interpolate the 3D bbox interpolate: flag: True style: 'bbox12' coef: [0.332, 0.667] # do some pre-processing pre-process: False root: 'YOUR_KITTI_DIR' # augmentation parameters scaling_factor: 0.2 rotation_factor: 30. 
# degrees # pytorch image transformation setting pth_transform: mean: [0.485, 0.456, 0.406] # TODO re-calculate this: R, G, B, X, Y std: [0.229, 0.224, 0.225] # annotation style for 2d key-point 2d_kpt_style: 'bbox9' # projected 3d bounding box corner and center points # input-output representation for 2d-to-3d lifting lft_in_rep: 'coordinates2d' # 2d coordinates on screen lft_out_rep: 'R3d+T' # 3d coordinates relative to centroid plus translation vector ## model settings for a fully-connected network if used FCModel: name: 'lifter' refine_3d: False norm_twoD: False num_blocks: 2 input_size: 66 output_size: 96 num_neurons: 1024 dropout: 0.5 leaky: False ## settings for a fully-convolutional heatmap regression model heatmapModel: name: hrnet # here a high-resolution (hr) model is used add_xy: False # concatenate xy coodrinate maps along with the input jitter_bbox: True jitter_params: shift: - 0.1 - 0.1 scaling: - 0.4 - 0.4 input_size: - 256 - 256 # rotate and scaling and input images augment_input: True head_type: 'coordinates' pixel_shuffle: False # if an intermediate heatmap is produced heatmap_size: - 64 - 64 init_weights: true num_joints: 33 use_different_joints_weight: False extra: pretrained_layers: - 'conv1' - 'bn1' - 'conv2' - 'bn2' - 'layer1' - 'transition1' - 'stage2' - 'transition2' - 'stage3' - 'transition3' - 'stage4' final_conv_kernel: 1 stage2: num_modules: 1 num_branches: 2 block: basic num_blocks: - 4 - 4 num_channels: - 48 - 96 fuse_method: sum stage3: num_modules: 4 num_branches: 3 block: basic num_blocks: - 4 - 4 - 4 num_channels: - 48 - 96 - 192 fuse_method: sum stage4: num_modules: 3 num_branches: 4 block: basic num_blocks: - 4 - 4 - 4 - 4 num_channels: - 48 - 96 - 192 - 384 fuse_method: sum ## testing settings testing_settings: batch_size: 1 num_threads: 0 shuffle: True pin_memory: False apply_dropout: False unnormalize: False alpha_mode: 'proj' ================================================ FILE: configs/KITTI_train_IGRs.yml 
================================================ # YAML file storing experimental configurations for training on KITTI dataset ## general settings name: 'kitti_kpt_loc' exp_type: 'instanceto2d' model_type: 'heatmapModel' use_gpu: True gpu_id: [0,1,2] # MODIFY this to the GPU/GPUs ids in your computer ## operations train: True save: True visualize: False evaluate: False ## output directories dirs: # MODIFY them to your preferred directories output: '../outputs/training_record' # This directory save intermediate training results (optional) debug: '../outputs/training_record/debug' ## CUDNN settings cudnn: enabled: True deterministic: False benchmark: False ## dataset settings dataset: name: 'KITTI' detect_classes: ['Car'] 3d_kpt_sample_style: 'bbox9' interpolate: flag: True style: 'bbox12' coef: [0.332, 0.667] # do some pre-processing pre-process: False # MODIFY this to your KITTI directory root: '$YOUR_DIR/KITTI' # augmentation parameters scaling_factor: 0.2 rotation_factor: 30. # degrees # pytorch image transformation setting pth_transform: # mean: [0.485, 0.456, 0.406, 0., 0.] # std: [0.229, 0.224, 0.225, 1., 1.] 
mean: [0.485, 0.456, 0.406] std: [0.229, 0.224, 0.225] 2d_kpt_style: 'bbox9' ## self-supervision settings ss: flag: False # MODIFY this to your unlabeled image record if you enable self-supervised representation learning record_path: '$YOUR_DIR/Apollo_ss_record.npy' img_root: '$YOUR_DIR/ApolloScape/images' max_per_img: 6 ## settings for a fully-convolutional heatmap/coordinate regression model heatmapModel: name: hrnet # here a high-resolution (hr) model is used add_xy: False # concatenate xy coodrinate maps along with the input # data augmentation by adding noise to bounding box location jitter_bbox: True jitter_params: shift: - 0.1 - 0.1 scaling: - 0.4 - 0.4 input_size: - 256 - 256 # rotate and scaling and input images augment_input: True head_type: 'coordinates' # up-sampling with pixel-shuffle pixel_shuffle: False # if an intermediate heatmap is produced heatmap_size: - 64 - 64 loss_type: JointsCompositeLoss # the following two settings are only valid for JointsCompositeLoss loss_spec_list: ['mse', 'l1', 'sl1'] loss_weight_list: [1.0, 0.1, 'None'] cr_loss_threshold: 0.15 init_weights: true num_joints: 33 #use_different_joints_weight: False # use a pre-trained checkpoint to initialize the model # MODIFY it to your own checkpoint directory pretrained: '../resources/start_point.pth' target_type: gaussian sigma: 1 extra: pretrained_layers: - 'conv1' - 'bn1' - 'conv2' - 'bn2' - 'layer1' - 'transition1' - 'stage2' - 'transition2' - 'stage3' - 'transition3' - 'stage4' final_conv_kernel: 1 stage2: num_modules: 1 num_branches: 2 block: basic num_blocks: - 4 - 4 num_channels: - 48 - 96 fuse_method: sum stage3: num_modules: 4 num_branches: 3 block: basic num_blocks: - 4 - 4 - 4 num_channels: - 48 - 96 - 192 fuse_method: sum stage4: num_modules: 3 num_branches: 4 block: basic num_blocks: - 4 - 4 - 4 - 4 num_channels: - 48 - 96 - 192 - 384 fuse_method: sum ## training settings training_settings: total_epochs: 45 resume: False batch_size: 24 num_threads: 16 # MODIFY this 
accordingly based on your machine shuffle: True pin_memory: False # weighted loss computation use_target_weight: False report_every: 30 eval_every: 130 eval_during: False # set this to True if you want to evaluate during training eval_metrics: ['JointDistance2DSIP'] plot_loss: False # debugging configurations debug: save: True # save some intermediate images with keypoint prediction save_images_kpts: True save_hms_gt: True save_hms_pred: True ## testing settings testing_settings: batch_size: 2 num_threads: 4 shuffle: False pin_memory: False apply_dropout: False unnormalize: False eval_metrics: ['JointDistance2DSIP'] ## optimizer settings optimizer: # for ADAM optim_type: 'adam' lr: 0.001 weight_decay: 0.0 # for SGD momentum: 0.9 # learning rate decay milestones: [10, 20, 30, 40] gamma: 0.5 ================================================ FILE: configs/KITTI_train_IGRs_Ped.yml ================================================ # YAML file storing experimental configurations for training on KITTI dataset for the Pedestrian class ## general settings name: 'kitti_kpt_loc_pedestrian' exp_type: 'instanceto2d' # baseline model_type: 'heatmapModel' use_gpu: True gpu_id: [0,1,] ## operations train: True save: True visualize: False evaluate: False ## output directories dirs: # MODIFY them to your preferred directories output: '../outputs/training_record' # This directory saves intermediate training results (optional) debug: '../outputs/training_record/debug' ## CUDNN settings cudnn: enabled: True deterministic: False benchmark: False ## dataset settings dataset: name: 'KITTI' detect_classes: ['Pedestrian'] 3d_kpt_sample_style: 'bbox9' interpolate: flag: True style: 'bbox12' coef: [0.332, 0.667] enlarge_factor: 1.05 # patch size parameter # do some pre-processing pre-process: True root: '/media/nicholas/Database/datasets/KITTI' # augmentation parameters scaling_factor: 0.2 rotation_factor: 30. 
# degrees # pytorch image transformation setting pth_transform: mean: [0.485, 0.456, 0.406] std: [0.229, 0.224, 0.225] 2d_kpt_style: 'bbox9' ## self-supervision settings ss: flag: False # MODIFY this to your unlabeled image record if you enable self-supervised representation learning record_path: '$YOUR_DIR/Apollo_ss_record.npy' img_root: '$YOUR_DIR/ApolloScape/images' max_per_img: 6 ## settings for a fully-convolutional heatmap regression model heatmapModel: name: hrnet # here a high-resolution (hr) model is used add_xy: False # concatenate xy coodrinate maps along with the input jitter_bbox: True jitter_params: shift: - 0.1 - 0.1 scaling: - 0.2 - 0.2 input_size: - 192 - 256 # rotate and scaling and input images augment_input: True head_type: 'coordinates' # up-sampling with pixel-shuffle pixel_shuffle: False # if an intermediate heatmap is produced heatmap_size: - 48 - 64 loss_type: JointsCompositeLoss # the following two settings are only valid for JointsCompositeLoss loss_spec_list: ['mse', 'l1', 'None'] loss_weight_list: [1.0, 0.1, 0.] 
cr_loss_threshold: 0.1 init_weights: true num_joints: 33 use_different_joints_weight: False # use a pre-trained checkpoint to initialize the model # MODIFY it to your own checkpoint directory pretrained: '../resources/start_point.pth' target_type: gaussian sigma: 2 extra: pretrained_layers: - 'conv1' - 'bn1' - 'conv2' - 'bn2' - 'layer1' - 'transition1' - 'stage2' - 'transition2' - 'stage3' - 'transition3' - 'stage4' freeze_layers: - 'conv1' - 'bn1' - 'conv2' - 'bn2' - 'layer1' - 'transition1' - 'stage2' final_conv_kernel: 1 stage2: num_modules: 1 num_branches: 2 block: basic num_blocks: - 4 - 4 num_channels: - 32 - 64 fuse_method: sum stage3: num_modules: 4 num_branches: 3 block: basic num_blocks: - 4 - 4 - 4 num_channels: - 32 - 64 - 128 fuse_method: sum stage4: num_modules: 3 num_branches: 4 block: basic num_blocks: - 4 - 4 - 4 - 4 num_channels: - 32 - 64 - 128 - 256 fuse_method: sum ## training settings training_settings: total_epochs: 40 resume: False begin_epoch: 1 end_epoch: 10 snapshot_epochs: [20, 30, 40] batch_size: 2 num_threads: 0 shuffle: True pin_memory: False # weighted loss computation use_target_weight: False report_every: 100 eval_every: 1000 eval_during: True eval_metrics: ['JointDistance2DSIP'] plot_loss: False # debugging configurations debug: save: True # save some intermeadiate results save_images_kpts: True save_hms_gt: True save_hms_pred: True ## testing settings testing_settings: batch_size: 2 num_threads: 0 shuffle: False pin_memory: False apply_dropout: False unnormalize: False eval_metrics: ['JointDistance2DSIP'] # save_debug: True save_debug: False ## optimizer settings optimizer: # for ADAM optim_type: 'adam' lr: 0.001 weight_decay: 0.0 # for SGD momentum: 0.9 # learning rate decay milestones: [10, 20, 30] gamma: 0.5 ================================================ FILE: configs/KITTI_train_lifting.yml ================================================ # YAML file storing experimental configurations for KITTI dataset ## general settings 
name: 'lifter' exp_type: '2dto3d' model_type: 'FCModel' use_gpu: True gpu_id: [1] # modify this to the GPU ids that you use ## operations train: True # perform training save: True # save the trained model visualize: False # visualize the training results evaluate: False # perform evaluation ## paths to the relevant directories dirs: # output directory output: '../outputs/training_record' debug: '../outputs/training_record/debug' data_vis: '../outputs/training_record/data_vis' ## CUDNN settings cudnn: enabled: True deterministic: False benchmark: False ## evaluation metrics metrics: R3D: T_style: 'direct' R_style: 'euler' ## dataset settings dataset: name: 'KITTI' detect_classes: ['Car'] # used class for training 3d_kpt_sample_style: 'bbox9' # construct a cuboid for each 3D bounding box # interpolate the 3D bbox interpolate: flag: True style: 'bbox12' coef: [0.332, 0.667] # do some pre-processing pre-process: False root: '$YOUR_DIR/KITTI' # MODIFY this to your own path # input-output representation for 2d-to-3d lifting lft_in_rep: 'coordinates2d' # 2d coordinates on screen lft_out_rep: 'R3d' # 3d coordinates relative to centroid ## optional cascaded regression cascade: num_stages: 1 # the default is simply no cascade ## model settings for a fully-connected network if used FCModel: name: 'lifter' refine_3d: False norm_twoD: False num_blocks: 2 num_neurons: 1024 dropout: 0.5 leaky: False loss_type: MSELoss1D loss_reduction: 'mean' ## training settings training_settings: # total_epochs: 300 total_epochs: 1 eval_start_epoch: 250 # start evaluation after this epoch resume: False batch_size: 2048 num_threads: 4 # set the number of workers that works for your machine shuffle: True pin_memory: False # report_every: 500 # report every 500 batches # eval_every: 500 # test on the evaluation set every 500 batches report_every: 5 # report every 5 batches eval_every: 5 # test on the evaluation set every 5 batches eval_during: False # MODIFY this to 
True if you want to evaluate during the training process # how many times to augment data for 2D-to-3D lifting lft_aug: True lft_aug_times: 100 # what evaluation metrics to use eval_metrics: ['RError3D'] plot_loss: False # visualize the loss function during training ## testing settings if used testing_settings: apply_dropout: False unnormalize: True batch_size: 1024 num_threads: 4 shuffle: False # vis_epoch: 290 # start plotting after this epoch ## optimizer settings optimizer: # for ADAM optim_type: 'adam' lr: 0.001 weight_decay: 0.0 # for SGD momentum: 0.9 # learning rate will decay at each milestone epoch milestones: [50, 100, 150, 250] gamma: 0.5 ================================================ FILE: docs/demo.md ================================================ First, you need to prepare the dataset and pre-trained models as described [here](https://github.com/Nicholasli1995/EgoNet/blob/master/docs/preparation.md). Then modify the directories by ```bash cd ${EgoNet_DIR}/configs && vim KITTI_inference:demo.yml ``` Edit dirs:ckpt to your pre-trained model directory. Edit dataset:root to your KITTI directory. Finally, go to ${EgoNet_DIR}/tools and run ```bash python inference.py --cfg "../configs/KITTI_inference:demo.yml" --visualize True --batch_to_show 2 ``` You can set --batch_to_show to other integers to see more results. The visualized 3D bounding boxes are distinguished by their colors: 1. Black indicates ground truth 3D boxes. 2. Magenta indicates 3D bounding boxes predicted by another 3D object detector ([D4LCN](https://github.com/dingmyu/D4LCN)). 3. Red indicates the predictions of Ego-Net, using the 2D bounding boxes from [D4LCN](https://github.com/dingmyu/D4LCN). 4. Yellow indicates the predictions of Ego-Net, using the ground truth 2D bounding boxes. 
================================================ FILE: docs/inference.md ================================================ Firstly you need to prepare the dataset and pre-trained models as described [here](https://github.com/Nicholasli1995/EgoNet/blob/master/docs/preparation.md). ## Reproduce D4LCN + EgoNet on the val split You need to modify the directories by ```bash cd ${EgoNet_DIR}/configs && vim KITTI_inference:demo.yml ``` Edit dirs:output to where you want to save the predictions. Edit dirs:ckpt to your pre-trained model directory. Edit dataset:root to your KITTI directory. Finally, go to ${EgoNet_DIR}/tools and run ```bash python inference.py --cfg "../configs/KITTI_inference:demo.yml" ``` This will load D4LCN predictions, refine their vehicle orientation predictions and save the results. The official evaluation program will automatically run to produce quantitative performance. ## Reproduce results on the test split You need to modify the directories by ```bash cd ${EgoNet_DIR}/configs && vim KITTI_inference:test_submission.yml ``` Edit dirs:output to where you want to save the predictions. Edit dirs:ckpt to your pre-trained model directory. Edit dataset:root to your KITTI directory. Finally, go to ${EgoNet_DIR}/tools and run ```bash python inference.py --cfg "../configs/KITTI_inference:test_submission.yml" ``` This will load prepared 2D bounding boxes, predict the vehicle orientation and save the predictions. Now you can zip the results and submit it to the [official server](http://www.cvlibs.net/datasets/kitti/eval_object.php?obj_benchmark=2d)! You can hit [91.23% AOS](http://www.cvlibs.net/datasets/kitti/eval_object_detail.php?&result=e5233225fd5ef36fa63eb00252d9c00024961f2c) for the moderate setting! This is the **most important** metric for joint vehicle detection and pose estimation on KITTI. You achieved this with a single RGB image without extra training data. 
================================================ FILE: docs/preparation.md ================================================ ## Data Preparation You need to download the KITTI dataset [here](http://www.cvlibs.net/datasets/kitti/eval_object.php?obj_benchmark=3d). Download left images, calibration files and labels. Download the split files [here](https://drive.google.com/drive/folders/1YLtptqspOFw08QG2MsxewDT9tjF2O45g?usp=sharing) and place them at ${YOUR_KITTI_DIR}/SPLIT/ImageSets. Your data folder should look like this: ``` ${YOUR_KITTI_DIR} ├── training ├── calib ├── xxxxxx.txt (Camera parameters for image xxxxxx) ├── image_2 ├── xxxxxx.png (image xxxxxx) ├── label_2 ├── xxxxxx.txt (object labels for image xxxxxx) ├── ImageSets ├── train.txt ├── val.txt ├── trainval.txt ├── testing ├── calib ├── xxxxxx.txt (Camera parameters for image xxxxxx) ├── image_2 ├── xxxxxx.png (image xxxxxx) ├── ImageSets ├── test.txt ``` ## Download pre-trained model You need to download the pre-trained checkpoints [here](https://drive.google.com/file/d/1JsVzw7HMfchxOXoXgvWG1I_bPRD1ierE/view?usp=sharing) in order to use Ego-Net. Unzip it to ${YOUR_MODEL_DIR}. ## Compile the official evaluator Go to the folder storing the source code ```bash cd ${EgoNet_DIR}/tools/kitti-eval ``` Compile the source code ```bash g++ -o evaluate_object_3d_offline evaluate_object_3d_offline.cpp -O3 ``` ## Download the input bounding boxes Download the [resources folder](https://drive.google.com/drive/folders/1atfXLmsLFG6XEtNnwZuEYLydKqjr7Icf?usp=sharing) and unzip its contents. Place the resource folder at ${EgoNet_DIR}/resources ## Environment You need to create an environment that meets the following dependencies. The versions listed in parentheses are **tested**. Other versions may also work but are **not tested**. 
- Python (3.7.9) - Numpy (1.19.2) - PyTorch (1.6.0, GPU required) - Scipy (1.5.2) - Matplotlib (3.3.4) - OpenCV (3.4.2) - pyyaml (5.4.1) For more details of my tested local environment, refer to [spec-list.txt](https://github.com/Nicholasli1995/EgoNet/blob/master/docs/spec-list.txt). The recommended environment manager is [Anaconda](https://www.anaconda.com/), which can create an environment using this provided spec-list. For debugging using an IDE, I personally use and recommend Spyder 4.2 which you can get by ```bash conda install spyder ``` ================================================ FILE: docs/spec-list.txt ================================================ # This file may be used to create an environment using: # $ conda create --name <env> --file <this file> # platform: linux-64 @EXPLICIT https://repo.anaconda.com/pkgs/main/linux-64/_libgcc_mutex-0.1-main.tar.bz2 https://repo.anaconda.com/pkgs/main/linux-64/blas-1.0-mkl.tar.bz2 https://repo.anaconda.com/pkgs/main/linux-64/ca-certificates-2021.1.19-h06a4308_1.tar.bz2 https://repo.anaconda.com/pkgs/main/linux-64/ld_impl_linux-64-2.33.1-h53a641e_7.tar.bz2 https://repo.anaconda.com/pkgs/main/linux-64/libgfortran-ng-7.3.0-hdf63c60_0.tar.bz2 https://repo.anaconda.com/pkgs/main/linux-64/libstdcxx-ng-9.1.0-hdf63c60_0.tar.bz2 https://repo.anaconda.com/pkgs/main/linux-64/pandoc-2.12-h06a4308_0.tar.bz2 https://repo.anaconda.com/pkgs/main/linux-64/cudatoolkit-9.2-0.tar.bz2 https://repo.anaconda.com/pkgs/main/linux-64/libgcc-ng-9.1.0-hdf63c60_0.tar.bz2 https://repo.anaconda.com/pkgs/main/linux-64/bzip2-1.0.8-h7b6447c_0.tar.bz2 https://repo.anaconda.com/pkgs/main/linux-64/expat-2.2.10-he6710b0_2.tar.bz2 https://repo.anaconda.com/pkgs/main/linux-64/freeglut-3.0.0-hf484d3e_5.tar.bz2 https://repo.anaconda.com/pkgs/main/linux-64/graphite2-1.3.14-h23475e2_0.tar.bz2 https://repo.anaconda.com/pkgs/main/linux-64/icu-58.2-he6710b0_3.tar.bz2 https://repo.anaconda.com/pkgs/main/linux-64/jpeg-9b-h024ee3a_2.tar.bz2 
https://repo.anaconda.com/pkgs/main/linux-64/libffi-3.3-he6710b0_2.tar.bz2 https://repo.anaconda.com/pkgs/main/linux-64/libglu-9.0.0-hf484d3e_1.tar.bz2 https://repo.anaconda.com/pkgs/main/linux-64/libopus-1.3.1-h7b6447c_0.tar.bz2 https://repo.anaconda.com/pkgs/main/linux-64/libsodium-1.0.18-h7b6447c_0.tar.bz2 https://repo.anaconda.com/pkgs/main/linux-64/libspatialindex-1.9.3-h2531618_0.tar.bz2 https://repo.anaconda.com/pkgs/main/linux-64/libuuid-1.0.3-h1bed415_2.tar.bz2 https://repo.anaconda.com/pkgs/main/linux-64/libvpx-1.7.0-h439df22_0.tar.bz2 https://repo.anaconda.com/pkgs/main/linux-64/libwebp-base-1.2.0-h27cfd23_0.tar.bz2 https://repo.anaconda.com/pkgs/main/linux-64/libxcb-1.14-h7b6447c_0.tar.bz2 https://repo.anaconda.com/pkgs/main/linux-64/lz4-c-1.9.3-h2531618_0.tar.bz2 https://repo.anaconda.com/pkgs/main/linux-64/ncurses-6.2-he6710b0_1.tar.bz2 https://repo.anaconda.com/pkgs/main/linux-64/openssl-1.1.1k-h27cfd23_0.tar.bz2 https://repo.anaconda.com/pkgs/main/linux-64/pcre-8.44-he6710b0_0.tar.bz2 https://repo.anaconda.com/pkgs/main/linux-64/pixman-0.40.0-h7b6447c_0.tar.bz2 https://repo.anaconda.com/pkgs/main/linux-64/xz-5.2.5-h7b6447c_0.tar.bz2 https://repo.anaconda.com/pkgs/main/linux-64/yaml-0.2.5-h7b6447c_0.tar.bz2 https://repo.anaconda.com/pkgs/main/linux-64/zlib-1.2.11-h7b6447c_3.tar.bz2 https://repo.anaconda.com/pkgs/main/linux-64/glib-2.68.0-h36276a3_0.tar.bz2 https://repo.anaconda.com/pkgs/main/linux-64/hdf5-1.10.2-hba1933b_1.tar.bz2 https://repo.anaconda.com/pkgs/main/linux-64/jasper-2.0.14-h07fcdf6_1.tar.bz2 https://repo.anaconda.com/pkgs/main/linux-64/libpng-1.6.37-hbc83047_0.tar.bz2 https://repo.anaconda.com/pkgs/main/linux-64/libxml2-2.9.10-hb55368b_3.tar.bz2 https://repo.anaconda.com/pkgs/main/linux-64/readline-8.1-h27cfd23_0.tar.bz2 https://repo.anaconda.com/pkgs/main/linux-64/tk-8.6.10-hbc83047_0.tar.bz2 https://repo.anaconda.com/pkgs/main/linux-64/zeromq-4.3.4-h2531618_0.tar.bz2 
https://repo.anaconda.com/pkgs/main/linux-64/zstd-1.4.5-h9ceee32_0.tar.bz2 https://repo.anaconda.com/pkgs/main/linux-64/dbus-1.13.18-hb2f20db_0.tar.bz2 https://repo.anaconda.com/pkgs/main/linux-64/freetype-2.10.4-h5ab3b9f_0.tar.bz2 https://repo.anaconda.com/pkgs/main/linux-64/gstreamer-1.14.0-h28cd5cc_2.tar.bz2 https://repo.anaconda.com/pkgs/main/linux-64/libtiff-4.2.0-h85742a9_0.tar.bz2 https://repo.anaconda.com/pkgs/main/linux-64/sqlite-3.35.2-hdfb4753_0.tar.bz2 https://repo.anaconda.com/pkgs/main/linux-64/ffmpeg-4.0-hcdf2ecd_0.tar.bz2 https://repo.anaconda.com/pkgs/main/linux-64/fontconfig-2.13.1-h6c09931_0.tar.bz2 https://repo.anaconda.com/pkgs/main/linux-64/gst-plugins-base-1.14.0-h8213a91_2.tar.bz2 https://repo.anaconda.com/pkgs/main/linux-64/lcms2-2.11-h396b838_0.tar.bz2 https://repo.anaconda.com/pkgs/main/linux-64/python-3.7.9-h7579374_0.tar.bz2 https://repo.anaconda.com/pkgs/main/noarch/alabaster-0.7.12-pyhd3eb1b0_0.tar.bz2 https://repo.anaconda.com/pkgs/main/noarch/appdirs-1.4.4-py_0.tar.bz2 https://repo.anaconda.com/pkgs/main/linux-64/argh-0.26.2-py37_0.tar.bz2 https://repo.anaconda.com/pkgs/main/noarch/atomicwrites-1.4.0-py_0.tar.bz2 https://repo.anaconda.com/pkgs/main/noarch/attrs-20.3.0-pyhd3eb1b0_0.tar.bz2 https://repo.anaconda.com/pkgs/main/noarch/backcall-0.2.0-pyhd3eb1b0_0.tar.bz2 https://repo.anaconda.com/pkgs/main/linux-64/cairo-1.16.0-hf32fb01_1.tar.bz2 https://repo.anaconda.com/pkgs/main/linux-64/certifi-2020.12.5-py37h06a4308_0.tar.bz2 https://repo.anaconda.com/pkgs/main/linux-64/chardet-4.0.0-py37h06a4308_1003.tar.bz2 https://repo.anaconda.com/pkgs/main/noarch/click-7.1.2-pyhd3eb1b0_0.tar.bz2 https://repo.anaconda.com/pkgs/main/noarch/cloudpickle-1.6.0-py_0.tar.bz2 https://repo.anaconda.com/pkgs/main/noarch/colorama-0.4.4-pyhd3eb1b0_0.tar.bz2 https://repo.anaconda.com/pkgs/main/noarch/decorator-4.4.2-pyhd3eb1b0_0.tar.bz2 https://repo.anaconda.com/pkgs/main/noarch/defusedxml-0.7.1-pyhd3eb1b0_0.tar.bz2 
https://repo.anaconda.com/pkgs/main/noarch/diff-match-patch-20200713-py_0.tar.bz2 https://repo.anaconda.com/pkgs/main/linux-64/docutils-0.16-py37_1.tar.bz2 https://repo.anaconda.com/pkgs/main/linux-64/entrypoints-0.3-py37_0.tar.bz2 https://repo.anaconda.com/pkgs/main/linux-64/future-0.18.2-py37_1.tar.bz2 https://repo.anaconda.com/pkgs/main/noarch/idna-2.10-pyhd3eb1b0_0.tar.bz2 https://repo.anaconda.com/pkgs/main/noarch/imagesize-1.2.0-pyhd3eb1b0_0.tar.bz2 https://repo.anaconda.com/pkgs/main/noarch/ipython_genutils-0.2.0-pyhd3eb1b0_1.tar.bz2 https://repo.anaconda.com/pkgs/main/noarch/jeepney-0.6.0-pyhd3eb1b0_0.tar.bz2 https://repo.anaconda.com/pkgs/main/linux-64/kiwisolver-1.3.1-py37h2531618_0.tar.bz2 https://repo.anaconda.com/pkgs/main/linux-64/lazy-object-proxy-1.6.0-py37h27cfd23_0.tar.bz2 https://repo.anaconda.com/pkgs/main/linux-64/markupsafe-1.1.1-py37h14c3975_1.tar.bz2 https://repo.anaconda.com/pkgs/main/linux-64/mccabe-0.6.1-py37_1.tar.bz2 https://repo.anaconda.com/pkgs/main/linux-64/mistune-0.8.4-py37h14c3975_1001.tar.bz2 https://repo.anaconda.com/pkgs/main/linux-64/mypy_extensions-0.4.3-py37_0.tar.bz2 https://repo.anaconda.com/pkgs/main/linux-64/ninja-1.10.2-py37hff7bd54_0.tar.bz2 https://repo.anaconda.com/pkgs/main/noarch/olefile-0.46-py_0.tar.bz2 https://repo.anaconda.com/pkgs/main/linux-64/pandocfilters-1.4.3-py37h06a4308_1.tar.bz2 https://repo.anaconda.com/pkgs/main/noarch/parso-0.7.0-py_0.tar.bz2 https://repo.anaconda.com/pkgs/main/noarch/pathspec-0.7.0-py_0.tar.bz2 https://repo.anaconda.com/pkgs/main/noarch/pickleshare-0.7.5-pyhd3eb1b0_1003.tar.bz2 https://repo.anaconda.com/pkgs/main/linux-64/psutil-5.8.0-py37h27cfd23_1.tar.bz2 https://repo.anaconda.com/pkgs/main/noarch/ptyprocess-0.7.0-pyhd3eb1b0_2.tar.bz2 https://repo.anaconda.com/pkgs/main/noarch/pycodestyle-2.6.0-pyhd3eb1b0_0.tar.bz2 https://repo.anaconda.com/pkgs/main/noarch/pycparser-2.20-py_2.tar.bz2 https://repo.anaconda.com/pkgs/main/noarch/pyflakes-2.2.0-pyhd3eb1b0_0.tar.bz2 
https://repo.anaconda.com/pkgs/main/noarch/pyparsing-2.4.7-pyhd3eb1b0_0.tar.bz2 https://repo.anaconda.com/pkgs/main/linux-64/pyrsistent-0.17.3-py37h7b6447c_0.tar.bz2 https://repo.anaconda.com/pkgs/main/linux-64/pysocks-1.7.1-py37_1.tar.bz2 https://repo.anaconda.com/pkgs/main/noarch/pytz-2021.1-pyhd3eb1b0_0.tar.bz2 https://repo.anaconda.com/pkgs/main/noarch/pyxdg-0.27-pyhd3eb1b0_0.tar.bz2 https://repo.anaconda.com/pkgs/main/linux-64/pyyaml-5.4.1-py37h27cfd23_1.tar.bz2 https://repo.anaconda.com/pkgs/main/linux-64/pyzmq-20.0.0-py37h2531618_1.tar.bz2 https://repo.anaconda.com/pkgs/main/noarch/qdarkstyle-2.8.1-py_0.tar.bz2 https://repo.anaconda.com/pkgs/main/linux-64/qt-5.9.7-h5867ecd_1.tar.bz2 https://repo.anaconda.com/pkgs/main/noarch/qtpy-1.9.0-py_0.tar.bz2 https://repo.anaconda.com/pkgs/main/linux-64/regex-2021.3.17-py37h27cfd23_0.tar.bz2 https://repo.anaconda.com/pkgs/main/noarch/rope-0.18.0-py_0.tar.bz2 https://repo.anaconda.com/pkgs/main/linux-64/rtree-0.9.4-py37_1.tar.bz2 https://repo.anaconda.com/pkgs/main/linux-64/sip-4.19.8-py37hf484d3e_0.tar.bz2 https://repo.anaconda.com/pkgs/main/noarch/six-1.15.0-pyhd3eb1b0_0.tar.bz2 https://repo.anaconda.com/pkgs/main/noarch/snowballstemmer-2.1.0-pyhd3eb1b0_0.tar.bz2 https://repo.anaconda.com/pkgs/main/noarch/sortedcontainers-2.3.0-pyhd3eb1b0_0.tar.bz2 https://repo.anaconda.com/pkgs/main/noarch/sphinxcontrib-applehelp-1.0.2-pyhd3eb1b0_0.tar.bz2 https://repo.anaconda.com/pkgs/main/noarch/sphinxcontrib-devhelp-1.0.2-pyhd3eb1b0_0.tar.bz2 https://repo.anaconda.com/pkgs/main/noarch/sphinxcontrib-htmlhelp-1.0.3-pyhd3eb1b0_0.tar.bz2 https://repo.anaconda.com/pkgs/main/noarch/sphinxcontrib-jsmath-1.0.1-pyhd3eb1b0_0.tar.bz2 https://repo.anaconda.com/pkgs/main/noarch/sphinxcontrib-qthelp-1.0.3-pyhd3eb1b0_0.tar.bz2 https://repo.anaconda.com/pkgs/main/noarch/sphinxcontrib-serializinghtml-1.1.4-pyhd3eb1b0_0.tar.bz2 https://repo.anaconda.com/pkgs/main/noarch/testpath-0.4.4-pyhd3eb1b0_0.tar.bz2 
https://repo.anaconda.com/pkgs/main/noarch/textdistance-4.2.1-pyhd3eb1b0_0.tar.bz2 https://repo.anaconda.com/pkgs/main/noarch/toml-0.10.2-pyhd3eb1b0_0.tar.bz2 https://repo.anaconda.com/pkgs/main/linux-64/tornado-6.1-py37h27cfd23_0.tar.bz2 https://repo.anaconda.com/pkgs/main/linux-64/typed-ast-1.4.2-py37h27cfd23_1.tar.bz2 https://repo.anaconda.com/pkgs/main/noarch/typing_extensions-3.7.4.3-pyha847dfd_0.tar.bz2 https://repo.anaconda.com/pkgs/main/linux-64/ujson-4.0.2-py37h2531618_0.tar.bz2 https://repo.anaconda.com/pkgs/main/noarch/wcwidth-0.2.5-py_0.tar.bz2 https://repo.anaconda.com/pkgs/main/linux-64/webencodings-0.5.1-py37_1.tar.bz2 https://repo.anaconda.com/pkgs/main/noarch/wheel-0.36.2-pyhd3eb1b0_0.tar.bz2 https://repo.anaconda.com/pkgs/main/linux-64/wrapt-1.12.1-py37h7b6447c_1.tar.bz2 https://repo.anaconda.com/pkgs/main/linux-64/wurlitzer-2.0.1-py37_0.tar.bz2 https://repo.anaconda.com/pkgs/main/noarch/yapf-0.31.0-pyhd3eb1b0_0.tar.bz2 https://repo.anaconda.com/pkgs/main/noarch/zipp-3.4.1-pyhd3eb1b0_0.tar.bz2 https://repo.anaconda.com/pkgs/main/noarch/autopep8-1.5.6-pyhd3eb1b0_0.tar.bz2 https://repo.anaconda.com/pkgs/main/noarch/babel-2.9.0-pyhd3eb1b0_0.tar.bz2 https://repo.anaconda.com/pkgs/main/noarch/black-19.10b0-py_0.tar.bz2 https://repo.anaconda.com/pkgs/main/linux-64/cffi-1.14.5-py37h261ae71_0.tar.bz2 https://repo.anaconda.com/pkgs/main/linux-64/cycler-0.10.0-py37_0.tar.bz2 https://repo.anaconda.com/pkgs/main/linux-64/harfbuzz-1.8.8-hffaf4a1_0.tar.bz2 https://repo.anaconda.com/pkgs/main/linux-64/importlib-metadata-3.7.3-py37h06a4308_1.tar.bz2 https://repo.anaconda.com/pkgs/main/noarch/intervaltree-3.1.0-py_0.tar.bz2 https://repo.anaconda.com/pkgs/main/linux-64/jedi-0.17.2-py37h06a4308_1.tar.bz2 https://repo.anaconda.com/pkgs/main/noarch/packaging-20.9-pyhd3eb1b0_0.tar.bz2 https://repo.anaconda.com/pkgs/main/noarch/pexpect-4.8.0-pyhd3eb1b0_3.tar.bz2 https://repo.anaconda.com/pkgs/main/linux-64/pillow-8.1.2-py37he98fc37_0.tar.bz2 
https://repo.anaconda.com/pkgs/main/noarch/prompt-toolkit-3.0.17-pyh06a4308_0.tar.bz2 https://repo.anaconda.com/pkgs/main/noarch/pydocstyle-6.0.0-pyhd3eb1b0_0.tar.bz2 https://repo.anaconda.com/pkgs/main/linux-64/pyqt-5.9.2-py37h05f1152_2.tar.bz2 https://repo.anaconda.com/pkgs/main/noarch/python-dateutil-2.8.1-pyhd3eb1b0_0.tar.bz2 https://repo.anaconda.com/pkgs/main/noarch/python-jsonrpc-server-0.4.0-py_0.tar.bz2 https://repo.anaconda.com/pkgs/main/noarch/qtawesome-1.0.2-pyhd3eb1b0_0.tar.bz2 https://repo.anaconda.com/pkgs/main/linux-64/setuptools-52.0.0-py37h06a4308_0.tar.bz2 https://repo.anaconda.com/pkgs/main/noarch/three-merge-0.1.1-pyhd3eb1b0_0.tar.bz2 https://repo.anaconda.com/pkgs/main/noarch/traitlets-5.0.5-pyhd3eb1b0_0.tar.bz2 https://repo.anaconda.com/pkgs/main/linux-64/watchdog-1.0.2-py37h06a4308_1.tar.bz2 https://repo.anaconda.com/pkgs/main/linux-64/astroid-2.5-py37h06a4308_1.tar.bz2 https://repo.anaconda.com/pkgs/main/noarch/bleach-3.3.0-pyhd3eb1b0_0.tar.bz2 https://repo.anaconda.com/pkgs/main/linux-64/brotlipy-0.7.0-py37h27cfd23_1003.tar.bz2 https://repo.anaconda.com/pkgs/main/linux-64/cryptography-3.4.6-py37hd23ed53_0.tar.bz2 https://repo.anaconda.com/pkgs/main/noarch/flake8-3.9.0-pyhd3eb1b0_0.tar.bz2 https://repo.anaconda.com/pkgs/main/noarch/importlib_metadata-3.7.3-hd3eb1b0_1.tar.bz2 https://repo.anaconda.com/pkgs/main/noarch/isort-5.8.0-pyhd3eb1b0_0.tar.bz2 https://repo.anaconda.com/pkgs/main/noarch/jinja2-2.11.3-pyhd3eb1b0_0.tar.bz2 https://repo.anaconda.com/pkgs/main/linux-64/jupyter_core-4.7.1-py37h06a4308_0.tar.bz2 https://repo.anaconda.com/pkgs/main/linux-64/libopencv-3.4.2-hb342d67_1.tar.bz2 https://repo.anaconda.com/pkgs/main/linux-64/pip-21.0.1-py37h06a4308_0.tar.bz2 https://repo.anaconda.com/pkgs/main/noarch/pygments-2.8.1-pyhd3eb1b0_0.tar.bz2 https://repo.anaconda.com/pkgs/main/linux-64/ipython-7.21.0-py37hb070fc8_0.tar.bz2 https://repo.anaconda.com/pkgs/main/noarch/jsonschema-3.2.0-py_2.tar.bz2 
https://repo.anaconda.com/pkgs/main/noarch/jupyter_client-6.1.7-py_0.tar.bz2 https://repo.anaconda.com/pkgs/main/linux-64/pluggy-0.13.1-py37h06a4308_0.tar.bz2 https://repo.anaconda.com/pkgs/main/linux-64/pylint-2.7.2-py37h06a4308_1.tar.bz2 https://repo.anaconda.com/pkgs/main/noarch/pyopenssl-20.0.1-pyhd3eb1b0_1.tar.bz2 https://repo.anaconda.com/pkgs/main/linux-64/secretstorage-3.3.1-py37h06a4308_0.tar.bz2 https://repo.anaconda.com/pkgs/main/linux-64/ipykernel-5.3.4-py37h5ca1d4c_0.tar.bz2 https://repo.anaconda.com/pkgs/main/linux-64/keyring-22.3.0-py37h06a4308_0.tar.bz2 https://repo.anaconda.com/pkgs/main/noarch/nbformat-5.1.2-pyhd3eb1b0_1.tar.bz2 https://repo.anaconda.com/pkgs/main/noarch/python-language-server-0.36.2-pyhd3eb1b0_0.tar.bz2 https://repo.anaconda.com/pkgs/main/noarch/urllib3-1.26.4-pyhd3eb1b0_0.tar.bz2 https://repo.anaconda.com/pkgs/main/linux-64/nbconvert-5.6.1-py37_1.tar.bz2 https://repo.anaconda.com/pkgs/main/noarch/pyls-black-0.4.6-hd3eb1b0_0.tar.bz2 https://repo.anaconda.com/pkgs/main/noarch/pyls-spyder-0.3.2-pyhd3eb1b0_0.tar.bz2 https://repo.anaconda.com/pkgs/main/noarch/qtconsole-5.0.3-pyhd3eb1b0_0.tar.bz2 https://repo.anaconda.com/pkgs/main/noarch/requests-2.25.1-pyhd3eb1b0_0.tar.bz2 https://repo.anaconda.com/pkgs/main/linux-64/spyder-kernels-1.10.2-py37h06a4308_0.tar.bz2 https://repo.anaconda.com/pkgs/main/noarch/sphinx-3.5.3-pyhd3eb1b0_0.tar.bz2 https://repo.anaconda.com/pkgs/main/noarch/numpydoc-1.1.0-pyhd3eb1b0_1.tar.bz2 https://repo.anaconda.com/pkgs/main/linux-64/spyder-4.2.4-py37h06a4308_0.tar.bz2 https://repo.anaconda.com/pkgs/main/noarch/imageio-2.9.0-py_0.tar.bz2 https://repo.anaconda.com/pkgs/main/linux-64/intel-openmp-2020.2-254.tar.bz2 https://repo.anaconda.com/pkgs/main/linux-64/mkl-2020.2-256.tar.bz2 https://repo.anaconda.com/pkgs/main/linux-64/mkl-service-2.3.0-py37he8ac12f_0.tar.bz2 https://repo.anaconda.com/pkgs/main/linux-64/matplotlib-3.3.4-py37h06a4308_0.tar.bz2 
https://repo.anaconda.com/pkgs/main/linux-64/matplotlib-base-3.3.4-py37h62a2d02_0.tar.bz2 https://repo.anaconda.com/pkgs/main/linux-64/mkl_fft-1.3.0-py37h54f3939_0.tar.bz2 https://repo.anaconda.com/pkgs/main/linux-64/mkl_random-1.1.1-py37h0573a6f_0.tar.bz2 https://repo.anaconda.com/pkgs/main/linux-64/numpy-1.19.2-py37h54aff64_0.tar.bz2 https://repo.anaconda.com/pkgs/main/linux-64/numpy-base-1.19.2-py37hfa32c7d_0.tar.bz2 https://repo.anaconda.com/pkgs/main/linux-64/py-opencv-3.4.2-py37hb342d67_1.tar.bz2 https://conda.anaconda.org/pytorch/linux-64/pytorch-1.6.0-py3.7_cuda9.2.148_cudnn7.6.3_0.tar.bz2 https://repo.anaconda.com/pkgs/main/linux-64/scipy-1.5.2-py37h0b6359f_0.tar.bz2 https://repo.anaconda.com/pkgs/main/linux-64/opencv-3.4.2-py37h6fd60c2_1.tar.bz2 https://conda.anaconda.org/pytorch/linux-64/torchvision-0.7.0-py37_cu92.tar.bz2 ================================================ FILE: docs/training.md ================================================ Firstly you need to prepare the dataset as described [here](https://github.com/Nicholasli1995/EgoNet/blob/master/docs/preparation.md). Then download a start point model [here](https://drive.google.com/file/d/1VFtMGgBG0cLGnbr3brrnPnJii2xGYj-9/view?usp=sharing) and place it at ${EgoNet_DIR}/resources. The training phase consists of two stages which are described as follows. For training on other datasets. You need to prepare the training images and camera parameters accordingly. ## Stage 1: train a lifter (L.pth) You need to modify the configuration by ```bash cd ${EgoNet_DIR}/configs && vim KITTI_train_lifting.yml ``` Edit dataset:root to your KITTI directory. (Optional) Edit dirs:output to where you want to save the output model. (Optional) You can evaluate during training by setting eval_during to True. 
Finally, run ```bash cd tools python train_lifting.py --cfg "../configs/KITTI_train_lifting.yml" ``` ## Stage 2: train the remaining part (HC.pth) You need to modify the configuration by ```bash cd ${EgoNet_DIR}/configs && vim KITTI_train_IGRs.yml ``` Edit dataset:root to your KITTI directory. Edit gpu_id according to your local machine and set batch_size based on how much GPU memory you have. (Optional) Edit dirs:output to where you want to save the output model. (Optional) You can evaluate during training by setting eval_during to True. (Optional) Edit ss to enable self-supervised representation learning. You need to prepare unlabeled ApolloScape images and download record [here](https://drive.google.com/file/d/1uPdOC7LioomMF5DieUNrx3aZKsgobP5U/view?usp=sharing). (Optional) Edit training_settings:debug to disable saveing intermediate training results. Finally, run ```bash cd tools python train_IGRs.py --cfg "../configs/KITTI_train_IGRs.yml" ``` ================================================ FILE: libs/arguments/__init__.py ================================================ #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ Empty file. """ ================================================ FILE: libs/arguments/parse.py ================================================ """ Argument parser for command line inputs and experiment configuration file. Author: Shichao Li Contact: nicholas.li@connect.ust.hk """ import yaml import argparse def read_yaml_file(path): """ Read a .yml file. """ try: with open (path, 'r') as file: configs = yaml.safe_load(file) except Exception as e: print('Error reading the config file: ', e) return configs def parse_args(): """ Read a .yml experiment configuration file whose path is provided by the user. You can add more arguments and modify configs accordingly. 
""" parser = argparse.ArgumentParser(description='a general parser') # path to the configuration file parser.add_argument('--cfg', help='experiment configuration file path', type=str ) parser.add_argument('--visualize', default=False, type=bool ) parser.add_argument('--batch_to_show', default=1000000, type=int ) args, unknown = parser.parse_known_args() configs = read_yaml_file(args.cfg) configs['config_path'] = args.cfg configs['visualize'] = args.visualize configs['batch_to_show'] = args.batch_to_show return configs ================================================ FILE: libs/common/__init__.py ================================================ #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ Empty file. """ ================================================ FILE: libs/common/format.py ================================================ """ Methods for formatted output. Author: Shichao Li Contact: nicholas.li@connect.ust.hk """ import os from copy import deepcopy def format_str_submission(roll, pitch, yaw, x, y, z, score): """ Get a prediction string in ApolloScape style. """ tempt_str = "{pitch:.3f} {yaw:.3f} {roll:.3f} {x:.3f} {y:.3f} {z:.3f} {score:.3f}".format( pitch=pitch, yaw=yaw, roll=roll, x=x, y=y, z=z, score=score) return tempt_str def get_instance_str(dic): """ Produce KITTI style prediction string for one instance. 
""" string = "" string += dic['class'] + " " string += "{:.1f} ".format(dic['truncation']) string += "{:.1f} ".format(dic['occlusion']) string += "{:.6f} ".format(dic['alpha']) string += "{:.6f} {:.6f} {:.6f} {:.6f} ".format(dic['bbox'][0], dic['bbox'][1], dic['bbox'][2], dic['bbox'][3]) string += "{:.6f} {:.6f} {:.6f} ".format(dic['dimensions'][1], dic['dimensions'][2], dic['dimensions'][0]) string += "{:.6f} {:.6f} {:.6f} ".format(dic['locations'][0], dic['locations'][1], dic['locations'][2]) string += "{:.6f} ".format(dic['rot_y']) if 'score' in dic: string += "{:.8f} ".format(dic['score']) else: string += "{:.8f} ".format(1.0) return string def get_pred_str(record): """ Produce KITTI style prediction string for a record dictionary. """ # replace the rotation predictions of input bounding boxes updated_txt = deepcopy(record['raw_txt_format']) for instance_id in range(len(record['euler_angles'])): updated_txt[instance_id]['rot_y'] = record['euler_angles'][instance_id, 1] updated_txt[instance_id]['alpha'] = record['alphas'][instance_id] pred_str = "" angles = record['euler_angles'] for instance_id in range(len(angles)): # format a string for submission tempt_str = get_instance_str(updated_txt[instance_id]) if instance_id != len(angles) - 1: tempt_str += '\n' pred_str += tempt_str return pred_str def save_txt_file(img_path, prediction, params): """ Save a txt file for predictions of an image. """ if not params['flag']: return file_name = img_path.split('/')[-1][:-3] + 'txt' save_path = os.path.join(params['save_dir'], file_name) with open(save_path, 'w') as f: f.write(prediction['pred_str']) print('Wrote prediction file at {:s}'.format(save_path)) return ================================================ FILE: libs/common/img_proc.py ================================================ """ Image processing utilities. 
Author: Shichao Li Contact: nicholas.li@connect.ust.hk """ import cv2 import numpy as np import torch import torch.nn.functional as F import os SIZE = 200.0 def transform_preds(coords, center, scale, output_size): """ Transform local coordinates within a patch to screen coordinates. """ target_coords = np.zeros(coords.shape) trans = get_affine_transform(center, scale, 0, output_size, inv=1) for p in range(coords.shape[0]): target_coords[p, 0:2] = affine_transform(coords[p, 0:2], trans) return target_coords def get_affine_transform(center, scale, rot, output_size, shift=np.array([0, 0], dtype=np.float32), inv=0 ): """ Estimate an affine transformation given crop parameters (center, scale and rotation) and output resolution. """ if isinstance(scale, list): scale = np.array(scale) if isinstance(center, list): center = np.array(center) scale_tmp = scale * SIZE src_w = scale_tmp[0] dst_h, dst_w = output_size rot_rad = np.pi * rot / 180 src_dir = get_dir([0, src_w * -0.5], rot_rad) dst_dir = np.array([0, dst_w * -0.5], np.float32) src = np.zeros((3, 2), dtype=np.float32) dst = np.zeros((3, 2), dtype=np.float32) src[0, :] = center + scale_tmp * shift src[1, :] = center + src_dir + scale_tmp * shift dst[0, :] = [dst_w * 0.5, dst_h * 0.5] dst[1, :] = np.array([dst_w * 0.5, dst_h * 0.5]) + dst_dir src[2:, :] = get_3rd_point(src[0, :], src[1, :]) dst[2:, :] = get_3rd_point(dst[0, :], dst[1, :]) if inv: trans = cv2.getAffineTransform(np.float32(dst), np.float32(src)) else: trans = cv2.getAffineTransform(np.float32(src), np.float32(dst)) return trans def affine_transform(pt, t): new_pt = np.array([pt[0], pt[1], 1.]).T new_pt = np.dot(t, new_pt) return new_pt[:2] def affine_transform_modified(pts, t): """ Apply affine transformation with homogeneous coordinates. 
""" # pts of shape [n, 2] new_pts = np.hstack([pts, np.ones((len(pts), 1))]).T new_pts = t @ new_pts return new_pts[:2, :].T def get_3rd_point(a, b): direct = a - b return b + np.array([-direct[1], direct[0]], dtype=np.float32) def get_dir(src_point, rot_rad): sn, cs = np.sin(rot_rad), np.cos(rot_rad) src_result = [0, 0] src_result[0] = src_point[0] * cs - src_point[1] * sn src_result[1] = src_point[0] * sn + src_point[1] * cs return src_result def crop(img, center, scale, output_size, rot=0): """ A cropping function implemented as warping. """ trans = get_affine_transform(center, scale, rot, output_size) dst_img = cv2.warpAffine(img, trans, (int(output_size[0]), int(output_size[1])), flags=cv2.INTER_LINEAR ) return dst_img def simple_crop(input_image, center, crop_size): """ A simple cropping function without warping. """ assert len(input_image.shape) == 3, 'Unsupported image format.' channel = input_image.shape[2] # crop a rectangular region around the center in the image start_x = int(center[0] - crop_size[0]) end_x = int(center[0] + crop_size[0]) start_y = int(center[1] - crop_size[1]) end_y = int(center[1] + crop_size[1]) cropped = np.zeros((end_y - start_y, end_x - start_x, channel), dtype = input_image.dtype) # new bounding box index new_start_x = max(-start_x, 0) new_end_x = min(input_image.shape[1], end_x) - start_x new_start_y = max(-start_y, 0) new_end_y = min(input_image.shape[0], end_y) - start_y # clamped old bounding box index old_start_x = max(start_x, 0) old_end_x = min(end_x, input_image.shape[1]) old_start_y = max(start_y, 0) old_end_y = min(end_y, input_image.shape[0]) try: cropped[new_start_y:new_end_y, new_start_x:new_end_x,:] = input_image[ old_start_y:old_end_y, old_start_x:old_end_x,:] except ValueError: print('Error: cropping fails') return cropped def np_random(): """ Return a random number sampled uniformly from [-1, 1] """ return np.random.rand()*2 - 1 def jitter_bbox_with_kpts(old_bbox, joints, parameters): """ Randomly shifting and 
resizeing a bounding box and mask out occluded joints. Used as data augmentation to improve robustness to detector noise. bbox: [x1, y1, x2, y2] joints: [N, 3] """ new_joints = joints.copy() width, height = old_bbox[2] - old_bbox[0], old_bbox[3] - old_bbox[1] old_center = [0.5*(old_bbox[0] + old_bbox[2]), 0.5*(old_bbox[1] + old_bbox[3])] horizontal_shift = parameters['shift'][0]*width*np_random() vertical_shift = parameters['shift'][1]*height*np_random() new_center = [old_center[0] + horizontal_shift, old_center[1] + vertical_shift] horizontal_scaling = parameters['scaling'][0]*np_random() + 1 vertical_scaling = parameters['scaling'][1]*np_random() + 1 new_width = width*horizontal_scaling new_height = height*vertical_scaling new_bbox = [new_center[0] - 0.5*new_width, new_center[1] - 0.5*new_height, new_center[0] + 0.5*new_width, new_center[1] + 0.5*new_height] # predicate from upper left corner predicate1 = joints[:, :2] - np.array([[new_bbox[0], new_bbox[1]]]) predicate1 = (predicate1 > 0.).prod(axis=1) # predicate from lower right corner predicate2 = joints[:, :2] - np.array([[new_bbox[2], new_bbox[3]]]) predicate2 = (predicate2 < 0.).prod(axis=1) new_joints[:, 2] *= predicate1*predicate2 return new_bbox, new_joints def jitter_bbox_with_kpts_no_occlu(old_bbox, joints, parameters): """ Similar to the function above, but does not produce occluded joints """ width, height = old_bbox[2] - old_bbox[0], old_bbox[3] - old_bbox[1] old_center = [0.5 * (old_bbox[0] + old_bbox[2]), 0.5 * (old_bbox[1] + old_bbox[3])] horizontal_scaling = parameters['scaling'][0] * np.random.rand() + 1 vertical_scaling = parameters['scaling'][1] * np.random.rand() + 1 horizontal_shift = 0.5 * (horizontal_scaling - 1) * width * np_random() vertical_shift = 0.5 * (vertical_scaling - 1) * height * np_random() new_center = [old_center[0] + horizontal_shift, old_center[1] + vertical_shift] new_width = width * horizontal_scaling new_height = height * vertical_scaling new_bbox = [new_center[0] - 0.5 
* new_width, new_center[1] - 0.5 * new_height, new_center[0] + 0.5 * new_width, new_center[1] + 0.5 * new_height] return new_bbox, joints def generate_xy_map(bbox, resolution, global_size): """ Generate the normalized coordinates as 2D maps which encodes location information. bbox: [x1, y1, x2, y2] the local region resolution (height, width): target resolution global_size (height, width): the size of original image """ map_width, map_height = resolution g_height, g_width = global_size x_start, x_end = 2*bbox[0]/g_width - 1, 2*bbox[2]/g_width - 1 y_start, y_end = 2*bbox[1]/g_height - 1, 2*bbox[3]/g_height - 1 x_map = np.tile(np.linspace(x_start, x_end, map_width), (map_height, 1)) x_map = x_map.reshape(map_height, map_width, 1) y_map = np.linspace(y_start, y_end, map_height).reshape(map_height, 1) y_map = np.tile(y_map, (1, map_width)) y_map = y_map.reshape(map_height, map_width, 1) return np.concatenate([x_map, y_map], axis=2) def crop_single_instance(data_numpy, bbox, joints, parameters, pth_trans=None): """ Crop an instance from an image given the bounding box and part coordinates. 
""" reso = parameters['input_size'] # (height, width) transformed_joints = joints.copy() if parameters['jitter_bbox']: bbox, joints = jitter_bbox_with_kpts_no_occlu(bbox, joints, parameters['jitter_params'] ) joints_vis = joints[:, 2] if parameters['resize']: ret = resize_bbox(bbox[0], bbox[1], bbox[2], bbox[3], target_ar=reso[0]/reso[1]) c, s = ret['c'], ret['s'] else: c, s = bbox2cs(bbox) trans = get_affine_transform(c, s, 0.0, reso) input = cv2.warpAffine(data_numpy, trans, (int(reso[1]), int(reso[0])), flags=cv2.INTER_LINEAR ) # add two more channels to encode object location if parameters['add_xy']: xymap = generate_xy_map(ret['bbox'], reso, parameters['global_size']) input = np.concatenate([input, xymap.astype(np.float32)], axis=2) #cv2.imwrite('test.jpg', input) #input = torch.from_numpy(input.transpose(2,0,1)) input = input if pth_trans is None else pth_trans(input) for i in range(len(joints)): if joints_vis[i] > 0.0: transformed_joints[i, 0:2] = affine_transform(joints[i, 0:2], trans) c = c.reshape(1, 2) s = s.reshape(1, 2) return input.unsqueeze(0), transformed_joints, c, s def get_tensor_from_img(path, parameters, sf=0.2, rf=30., r_prob=0.6, aug=False, rgb=True, joints=None, global_box=None, pth_trans=None, generate_hm=False, max_cnt=None ): """ Read image and apply data augmentation to obtain a tensor. Keypoints are also transformed if given. 
path: image path c: cropping center s: cropping scale r: rotation reso: resolution of output image sf: scaling factor rf: rotation factor aug: apply data augmentation joints: key-point locations with optional visibility [N_instance, N_joint, 3] generate_hm: whether to generate heatmap based on joint locations """ # data_numpy = cv2.imread( # path, cv2.IMREAD_COLOR | cv2.IMREAD_IGNORE_ORIENTATION # ) data_numpy = cv2.imread( path, 1 | 128 ) if data_numpy is None: raise ValueError('Fail to read {}'.format(path)) if rgb: data_numpy = cv2.cvtColor(data_numpy, cv2.COLOR_BGR2RGB) all_inputs = [] all_target = [] all_centers = [] all_scales = [] all_target_weight = [] # the dimension of the image parameters['global_size'] = data_numpy.shape[:-1] all_transformed_joints = [] if parameters['reference'] == 'bbox': # crop around the given bounding boxes # bbox = [0, 0, data_numpy.shape[1] - 1, data_numpy.shape[0] - 1] \ # if 'bbox' not in parameters else parameters['bbox'] bboxes = parameters['boxes'] # [N_instance, 4] for idx, bbox in enumerate(bboxes): input, transformed_joints, c, s = crop_single_instance(data_numpy, bbox, joints[idx], parameters, pth_trans ) all_inputs.append(input) all_centers.append(c) all_scales.append(s) # s = s * np.clip(np.random.randn() * sf + 1, 1 - sf, 1 + sf) # r = np.clip(np.random.randn() * rf, -rf, rf) if np.random.rand() <= r_prob else 0 target = target_weight = 1. 
if generate_hm: target, target_weight = generate_target(transformed_joints, transformed_joints[:,2], parameters) target = torch.unsqueeze(torch.from_numpy(target), 0) target_weight = torch.unsqueeze(torch.from_numpy(target_weight), 0) all_target.append(target) all_target_weight.append(target_weight) all_transformed_joints.append(np.expand_dims(transformed_joints,0)) all_transformed_joints = np.concatenate(all_transformed_joints) if max_cnt is not None and max_cnt < len(all_inputs): end = max_cnt else: end = len(all_inputs) end_indices = list(range(end)) meta = { 'path': path, 'original_joints': joints[end_indices], 'transformed_joints': all_transformed_joints[end_indices], 'center': np.vstack(all_centers[:end]), 'scale': np.vstack(all_scales[:end]), 'joints_vis': all_transformed_joints[end_indices][:,:,2] # 'rotation': r, } inputs = torch.cat(all_inputs[:end], dim=0) if generate_hm: targets = torch.cat(all_target[:end], dim=0) target_weights = torch.cat(all_target_weight[:end], dim=0) else: targets, target_weights = None, None return inputs, targets, target_weights, meta def generate_target(joints, joints_vis, parameters): """ Generate heatmap targets by drawing Gaussian dots. joints: [num_joints, 3] joints_vis: [num_joints] return: target, target_weight (1: visible, 0: invisible) """ num_joints = parameters['num_joints'] target_type = parameters['target_type'] input_size = parameters['input_size'] heatmap_size = parameters['heatmap_size'] sigma = parameters['sigma'] target_weight = np.ones((num_joints, 1), dtype=np.float32) target_weight[:, 0] = joints_vis assert target_type == 'gaussian', 'Only support gaussian map now!' 
if target_type == 'gaussian': target = np.zeros((num_joints, heatmap_size[0], heatmap_size[1]), dtype=np.float32) tmp_size = sigma * 3 for joint_id in range(num_joints): if target_weight[joint_id] <= 0.5: continue feat_stride = input_size / heatmap_size mu_x = int(joints[joint_id][0] / feat_stride[0] + 0.5) mu_y = int(joints[joint_id][1] / feat_stride[1] + 0.5) # Check that any part of the gaussian is in-bounds ul = [int(mu_x - tmp_size), int(mu_y - tmp_size)] br = [int(mu_x + tmp_size + 1), int(mu_y + tmp_size + 1)] if ul[0] >= heatmap_size[1] or ul[1] >= heatmap_size[0] \ or br[0] < 0 or br[1] < 0: # If not, just return the image as is target_weight[joint_id] = 0 continue # # Generate gaussian size = 2 * tmp_size + 1 x = np.arange(0, size, 1, np.float32) y = x[:, np.newaxis] x0 = y0 = size // 2 # The gaussian is not normalized, we want the center value to equal 1 g = np.exp(- ((x - x0) ** 2 + (y - y0) ** 2) / (2 * sigma ** 2)) # Usable gaussian range g_x = max(0, -ul[0]), min(br[0], heatmap_size[1]) - ul[0] g_y = max(0, -ul[1]), min(br[1], heatmap_size[0]) - ul[1] # Image range img_x = max(0, ul[0]), min(br[0], heatmap_size[1]) img_y = max(0, ul[1]), min(br[1], heatmap_size[0]) target[joint_id][img_y[0]:img_y[1], img_x[0]:img_x[1]] = \ g[g_y[0]:g_y[1], g_x[0]:g_x[1]] if parameters['use_different_joints_weight']: target_weight = np.multiply(target_weight, parameters['joints_weight']) return target, target_weight def resize_bbox(left, top, right, bottom, target_ar=1.): """ Resize a bounding box to pre-defined aspect ratio. 
""" width = right - left height = bottom - top aspect_ratio = height/width center_x = (left + right)/2 center_y = (top + bottom)/2 if aspect_ratio > target_ar: new_width = height*(1/target_ar) new_left = center_x - 0.5*new_width new_right = center_x + 0.5*new_width new_top = top new_bottom = bottom else: new_height = width*target_ar new_left = left new_right = right new_top = center_y - 0.5*new_height new_bottom = center_y + 0.5*new_height return {'bbox': [new_left, new_top, new_right, new_bottom], 'c': np.array([center_x, center_y]), 's': np.array([(new_right - new_left)/SIZE, (new_bottom - new_top)/SIZE]) } def enlarge_bbox(left, top, right, bottom, enlarge): """ Enlarge a bounding box. """ width = right - left height = bottom - top new_width = width * enlarge[0] new_height = height * enlarge[1] center_x = (left + right) / 2 center_y = (top + bottom) / 2 new_left = center_x - 0.5 * new_width new_right = center_x + 0.5 * new_width new_top = center_y - 0.5 * new_height new_bottom = center_y + 0.5 * new_height return [new_left, new_top, new_right, new_bottom] def modify_bbox(bbox, target_ar, enlarge=1.1): """ Modify a bounding box by enlarging/resizing. """ lbbox = enlarge_bbox(bbox[0], bbox[1], bbox[2], bbox[3], [enlarge, enlarge]) ret = resize_bbox(lbbox[0], lbbox[1], lbbox[2], lbbox[3], target_ar=target_ar) return ret def resize_crop(crop_size, target_ar=None): """ Resize a crop size to a pre-defined aspect ratio. """ if target_ar is None: return crop_size width = crop_size[0] height = crop_size[1] aspect_ratio = height / width if aspect_ratio > target_ar: new_width = height * (1 / target_ar) new_height = height else: new_height = width*target_ar new_width = width return [new_width, new_height] def bbox2cs(bbox): """ Convert bounding box annotation to center and scale. """ return [(bbox[0] + bbox[2]/2), (bbox[1] + bbox[3]/2)], \ [(bbox[2] - bbox[0]/SIZE), (bbox[3] - bbox[1]/SIZE)] def cs2bbox(center, size): """ Convert center/scale to a bounding box annotation. 
""" x1 = center[0] - size[0] y1 = center[1] - size[1] x2 = center[0] + size[0] y2 = center[1] + size[1] return [x1, y1, x2, y2] def kpts2cs(keypoints, enlarge=1.1, method='boundary', target_ar=None, use_visibility=True ): """ Convert instance screen coordinates to cropping center and size keypoints of shape [n_joints, 2/3] """ assert keypoints.shape[1] in [2, 3], 'Unsupported input.' if keypoints.shape[1] == 2: visible_keypoints = keypoints vis_rate = 1.0 elif keypoints.shape[1] == 3 and use_visibility: visible_indices = keypoints[:, 2].nonzero()[0] visible_keypoints = keypoints[visible_indices, :2] vis_rate = len(visible_keypoints)/len(keypoints) else: visible_keypoints = keypoints[:, :2] visible_indices = np.array(range(len(keypoints))) vis_rate = 1.0 if method == 'centroid': center = np.ceil(visible_keypoints.mean(axis=0, keepdims=True)) dif = np.abs(visible_keypoints - center).max(axis=0, keepdims=True) crop_size = np.ceil(dif*enlarge).squeeze() center = center.squeeze() elif method == 'boundary': left_top = visible_keypoints.min(axis=0, keepdims=True) right_bottom = visible_keypoints.max(axis=0, keepdims=True) center = ((left_top + right_bottom) / 2).squeeze() crop_size = ((right_bottom - left_top)*enlarge/2).squeeze() else: raise NotImplementedError # resize the bounding box to a specified aspect ratio crop_size = resize_crop(crop_size, target_ar) x1, y1, x2, y2 = cs2bbox(center, crop_size) new_origin = np.array([[x1, y1]], dtype=keypoints.dtype) new_keypoints = keypoints.copy() if keypoints.shape[1] == 2: new_keypoints = visible_keypoints - new_origin elif keypoints.shape[1] == 3: new_keypoints[visible_indices, :2] = visible_keypoints - new_origin return center, crop_size, new_keypoints, vis_rate def draw_bboxes(img_path, bboxes_dict, save_path=None): """ Draw bounding boxes with OpenCV. 
""" data_numpy = cv2.imread(img_path, 1 | 128) for name, (color, bboxes) in bboxes_dict.items(): for bbox in bboxes: start_point = (bbox[0], bbox[1]) end_point = (bbox[2], bbox[3]) cv2.rectangle(data_numpy, start_point, end_point, color, 2) if save_path is not None: cv2.imwrite(save_path, data_numpy) return data_numpy def imread_rgb(img_path): """ Read image with OpenCV. """ data_numpy = cv2.imread(img_path, 1 | 128) data_numpy = cv2.cvtColor(data_numpy, cv2.COLOR_BGR2RGB) return data_numpy def save_cropped_patches(img_path, keypoints, save_dir="./", threshold=0.25, enlarge=1.4, target_ar=None ): """ Crop instances from a image given part screen coordinates and save them. """ # data_numpy = cv2.imread( # img_path, cv2.IMREAD_COLOR | cv2.IMREAD_IGNORE_ORIENTATION # ) data_numpy = cv2.imread(img_path, 1 | 128) # data_numpy = cv2.cvtColor(data_numpy, cv2.COLOR_BGR2RGB) # debug # import matplotlib.pyplot as plt # plt.imshow(data_numpy[:,:,::-1]) # plt.plot(keypoints[0][:,0], keypoints[0][:,1], 'ro') # plt.pause(0.1) if not os.path.exists(save_dir): os.makedirs(save_dir) new_paths = [] all_new_keypoints = [] all_bbox = [] for i in range(len(keypoints)): center, crop_size, new_keypoints, vis_rate = kpts2cs(keypoints[i], enlarge, target_ar=target_ar) all_bbox.append(list(map(int, cs2bbox(center, crop_size)))) if vis_rate < threshold: continue all_new_keypoints.append(new_keypoints.reshape(1, keypoints.shape[1], -1)) cropped = simple_crop(data_numpy, center, crop_size) save_path = os.path.join(save_dir, "instance_{:d}.jpg".format(i)) new_paths.append(save_path) cv2.imwrite(save_path, cropped) del cropped if len(new_paths) == 0: # No instances cropped return new_paths, np.zeros((0, keypoints.shape[1], 3)), all_bbox else: return new_paths, np.concatenate(all_new_keypoints, axis=0), all_bbox def get_max_preds(batch_heatmaps): """ Get predictions from heatmaps with hard arg-max. 
def soft_arg_max_np(batch_heatmaps):
    """
    Soft-argmax instead of hard-argmax considering quantization errors.

    batch_heatmaps: numpy.ndarray([batch_size, num_joints, height, width]);
        the input array is not modified.
    return: preds [batch_size, num_joints, 2] holding the expected (x, y)
        location of each heatmap, and maxvals [batch_size, num_joints, 1]
        holding the maximum raw response of each heatmap. Predictions of
        heatmaps whose maximum is not positive are set to zero.
    """
    assert isinstance(batch_heatmaps, np.ndarray), \
        'batch_heatmaps should be numpy.ndarray'
    assert batch_heatmaps.ndim == 4, 'batch_images should be 4-ndim'

    batch_size, num_joints, height, width = batch_heatmaps.shape

    # get score/confidence for each joint from the raw responses
    maxvals = batch_heatmaps.reshape((batch_size, num_joints, -1)).max(axis=2)
    maxvals = maxvals.reshape((batch_size, num_joints, 1))

    # Clip negative responses BEFORE flattening so the clipping takes effect;
    # np.clip returns a new array, so the caller's data stays untouched.
    clipped = np.clip(batch_heatmaps, a_min=0.0, a_max=None)
    flat = clipped.reshape((batch_size, num_joints, -1))

    # normalize the heatmaps so that they sum to 1
    total = flat.sum(axis=2, keepdims=True)
    # A map without positive mass would give 0/0; divide by 1 instead, its
    # prediction is zeroed by the mask below anyway.
    total = np.where(total > 0, total, 1.0)
    probs = (flat / total).reshape(batch_size, num_joints, height, width)

    x_indices = np.arange(width).astype(np.float32).reshape(1, 1, width)
    y_indices = np.arange(height).astype(np.float32).reshape(1, 1, height)
    # marginalize over rows / columns, then take the expectation
    x = (probs.sum(axis=2) * x_indices).sum(axis=2, keepdims=True)
    y = (probs.sum(axis=3) * y_indices).sum(axis=2, keepdims=True)
    preds = np.concatenate([x, y], axis=2)

    pred_mask = np.greater(maxvals, 0.0).astype(preds.dtype)
    preds = preds * pred_mask
    return preds, maxvals


def soft_arg_max(batch_heatmaps):
    """
    A pytorch version of soft-argmax

    batch_heatmaps: tensor of shape [batch_size, num_joints, height, width]
        on any device.
    return: preds [batch_size, num_joints, 2] with the expected (x, y)
        location under a spatial soft-max, and maxvals
        [batch_size, num_joints, 1] with the maximum raw response.
    """
    assert len(batch_heatmaps.shape) == 4, 'batch_images should be 4-ndim'
    batch_size, num_joints, height, width = batch_heatmaps.shape
    heatmaps_reshaped = batch_heatmaps.reshape(batch_size, num_joints, -1)

    # get score/confidence for each joint
    maxvals = heatmaps_reshaped.max(dim=2)[0]
    maxvals = maxvals.view((batch_size, num_joints, 1))

    # normalize the heatmaps so that they sum to 1
    probs = F.softmax(heatmaps_reshaped, dim=2)
    probs = probs.view(batch_size, num_joints, height, width)

    # Build the index grids on the heatmaps' own device and dtype, so the
    # function also works on CPU tensors.
    x_indices = torch.arange(width, dtype=probs.dtype, device=probs.device)
    x_indices = x_indices.view(1, 1, width)
    y_indices = torch.arange(height, dtype=probs.dtype, device=probs.device)
    y_indices = y_indices.view(1, 1, height)

    x = (probs.sum(dim=2) * x_indices).sum(dim=2, keepdim=True)
    y = (probs.sum(dim=3) * y_indices).sum(dim=2, keepdim=True)
    preds = torch.cat([x, y], dim=2)
    return preds, maxvals
def to_npy(tensor):
    """
    Convert PyTorch tensor to numpy array.

    A numpy array is returned unchanged (same object). A tensor is detached
    from the autograd graph and moved to host memory before conversion. Any
    other array-like input is converted with np.asarray.
    """
    if isinstance(tensor, np.ndarray):
        return tensor
    if hasattr(tensor, 'detach'):
        # detach() is the supported replacement for the legacy .data access
        return tensor.detach().cpu().numpy()
    return np.asarray(tensor)
def compute_rigid_transform(X, Y, W=None, verbose=False):
    """
    A least-square estimate of rigid transformation by SVD.

    Finds the rotation R and translation t minimizing
    sum_i w_i * ||R @ X[:, i] + t - Y[:, i]||^2.

    Reference: https://content.sakai.rutgers.edu/access/content/group/
    7bee3f05-9013-4fc2-8743-3c5078742791/material/svd_ls_rotation.pdf

    X, Y: [d, N] N data points of dimension d
    W: [N, ] optional weight (importance) vector for the N data points, or an
       [N, N] weight matrix whose diagonal holds the per-point weights
    verbose: print a notice when a reflection has to be corrected
    return: R [d, d], t [d, 1]
    """
    assert len(X) == len(Y)
    assert (W is None) or (len(W.shape) in [1, 2])

    if W is None:
        centroid_X = np.mean(X, axis=1, keepdims=True)
        centroid_Y = np.mean(Y, axis=1, keepdims=True)
    else:
        weights = W if len(W.shape) == 1 else np.diag(W)
        W = np.diag(W) if len(W.shape) == 1 else W
        total_weight = weights.sum()
        if total_weight != 0:
            # The weighted problem is minimized around the WEIGHTED
            # centroids; unweighted means let down-weighted points bias the
            # estimate.
            centroid_X = (X * weights).sum(axis=1, keepdims=True) / total_weight
            centroid_Y = (Y * weights).sum(axis=1, keepdims=True) / total_weight
        else:
            # no usable weights: fall back to the plain means
            centroid_X = np.mean(X, axis=1, keepdims=True)
            centroid_Y = np.mean(Y, axis=1, keepdims=True)

    # subtract mean
    Xm = X - centroid_X
    Ym = Y - centroid_Y
    H = Xm @ Ym.T if W is None else Xm @ W @ Ym.T

    # find rotation
    U, S, Vt = np.linalg.svd(H)
    R = Vt.T @ U.T
    if np.linalg.det(R) < 0:
        # special reflection case
        if verbose:
            print("det(R) < 0, reflection detected!, correcting for it ...\n")
        # the global minimizer with an orthogonal transformation is not
        # possible, the next best transformation is chosen
        Vt[-1, :] *= -1
        R = Vt.T @ U.T
    t = -R @ centroid_X + centroid_Y
    return R, t
""" if not os.path.exists(os.path.dirname(name)): try: os.makedirs(os.path.dirname(name)) except OSError as exc: print('make_dir failed.') raise exc return def save_checkpoint(states, is_best, output_dir, filename='checkpoint.pth'): torch.save(states, pjoin(output_dir, filename)) if is_best and 'state_dict' in states: torch.save(states['best_state_dict'], pjoin(output_dir, 'model_best.pth')) def get_model_summary(model, *input_tensors, item_length=26, verbose=False): """ Summarize a model. For now only convolution, batch normalization and linear layers are considered for parameters and FLOPs. """ summary = [] ModuleDetails = namedtuple( "Layer", ["name", "input_size", "output_size", "num_parameters", "multiply_adds"] ) hooks = [] layer_instances = {} def hook(module, input, output): class_name = str(module.__class__.__name__) instance_index = 1 if class_name not in layer_instances: layer_instances[class_name] = instance_index else: instance_index = layer_instances[class_name] + 1 layer_instances[class_name] = instance_index layer_name = class_name + "_" + str(instance_index) params = 0 if class_name.find("Conv") != -1 or class_name.find("BatchNorm") != -1 or \ class_name.find("Linear") != -1: for param_ in module.parameters(): params += param_.view(-1).size(0) flops = "Not Available" if class_name.find("Conv") != -1 and hasattr(module, "weight"): flops = ( torch.prod( torch.LongTensor(list(module.weight.data.size()))) * torch.prod( torch.LongTensor(list(output.size())[2:]))).item() elif isinstance(module, nn.Linear): flops = (torch.prod(torch.LongTensor(list(output.size()))) \ * input[0].size(1)).item() if isinstance(input[0], list): input = input[0] if isinstance(output, list): output = output[0] summary.append( ModuleDetails( name=layer_name, input_size=list(input[0].size()), output_size=list(output.size()), num_parameters=params, multiply_adds=flops) ) def add_hooks(module): if not isinstance(module, nn.ModuleList) \ and not isinstance(module, nn.Sequential) \ 
and module != model: hooks.append(module.register_forward_hook(hook)) model.eval() model.apply(add_hooks) space_len = item_length model(*input_tensors) for h in hooks: h.remove() details = '' if verbose: details = "Model Summary" + \ os.linesep + \ "Name{}Input Size{}Output Size{}Parameters{}Multiply Adds (Flops){}".format( ' ' * (space_len - len("Name")), ' ' * (space_len - len("Input Size")), ' ' * (space_len - len("Output Size")), ' ' * (space_len - len("Parameters")), ' ' * (space_len - len("Multiply Adds (Flops)"))) \ + os.linesep + '-' * space_len * 5 + os.linesep params_sum = 0 flops_sum = 0 for layer in summary: params_sum += layer.num_parameters if layer.multiply_adds != "Not Available": flops_sum += layer.multiply_adds if verbose: details += "{}{}{}{}{}{}{}{}{}{}".format( layer.name, ' ' * (space_len - len(layer.name)), layer.input_size, ' ' * (space_len - len(str(layer.input_size))), layer.output_size, ' ' * (space_len - len(str(layer.output_size))), layer.num_parameters, ' ' * (space_len - len(str(layer.num_parameters))), layer.multiply_adds, ' ' * (space_len - len(str(layer.multiply_adds)))) \ + os.linesep + '-' * space_len * 5 + os.linesep details += os.linesep \ + "Total Parameters: {:,}".format(params_sum) \ + os.linesep + '-' * space_len * 5 + os.linesep details += "Total Multiply Adds (For Convolution and Linear Layers only): {:,} GFLOPs".format(flops_sum/(1024**3)) \ + os.linesep + '-' * space_len * 5 + os.linesep details += "Number of Layers" + os.linesep for layer in layer_instances: details += "{} : {} layers ".format(layer, layer_instances[layer]) return details class AverageMeter(object): """ An averaege meter object that computes and stores the average and current value. 
""" def __init__(self): self.reset() self.PCK_stats = {} def reset(self): self.val = 0 self.avg = 0 self.sum = 0 self.count = 0 return def update(self, val, n=1, others=None): self.val = val self.sum += val * n self.count += n self.avg = self.sum / self.count if self.count != 0 else 0 if others is not None and'correct_cnt' in others: if 'sum' not in self.PCK_stats: self.PCK_stats['sum'] = np.zeros(len(others['correct_cnt'])) self.PCK_stats['sum'] += others['correct_cnt'] if 'total' not in self.PCK_stats: self.PCK_stats['total'] = 0. self.PCK_stats['total'] += n return def print_content(self): if 'sum' in self.PCK_stats: for idx, value in enumerate(self.PCK_stats['sum']): PCK = value / self.PCK_stats['total'] print('Average PCK at threshold {:.2f}: {:.3f}'.format(PCK_THRES[idx], PCK)) return ================================================ FILE: libs/dataset/KITTI/__init__.py ================================================ #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ Empty file. """ ================================================ FILE: libs/dataset/KITTI/car_instance.py ================================================ """ KITTI dataset implemented as PyTorch dataset object. 
def get_cr_indices():
    """
    Helper function to define the indices used in computing the cross-ratio.

    Returns an array with one row per line of the 'bbox12' style. Each row
    lists four point indices: the line's first end point, two indices that
    follow the nine base points (one per line in each of two consecutive
    groups of twelve), and the line's last end point.
    """
    base_cnt = 9
    line_cnt = 12
    first_pts, last_pts = interp_dict['bbox12']
    rows = [
        [first_pts[k], base_cnt + k, base_cnt + line_cnt + k, last_pts[k]]
        for k in range(line_cnt)
    ]
    return np.array(rows)
= cfgs['ss'] self._initialize_unlabeled_data(cfgs) self.logger.info("Initialization finished for KITTI {:s} set".format(split)) # self.show_statistics() # debugging code if you need # test = self[10] # test = self.extract_ss_sample(1) def _get_image_path_list(self): """ Prepare list of image paths for the used split. """ assert 'image_name_list' in self._data_config image_path_list = [] for name in self._data_config['image_name_list']: img_path = pjoin(self._data_config['image_dir'], name) image_path_list.append(img_path) self._data_config['image_path_list'] = image_path_list return def _initialize_unlabeled_data(self, cfgs): """ Initialize unlabeled data for self-supervision experiment. """ self.ss_record = np.load(cfgs['ss']['record_path'], allow_pickle=True).item() self.logger.info('Found prepared self-supervision record at: ' + cfgs['ss']['record_path']) return def _load_image_list(self): """ Prepare list of image names for the used split. """ path = self._data_config[self.split + '_list'] with open(path, "r") as f: image_name_list = f.read().splitlines() for idx, line in enumerate(image_name_list): base_name = line.replace("\n", "") image_name = base_name + ".png" image_name_list[idx] = image_name self._data_config['image_name_list'] = image_name_list self._get_image_path_list() return def _check_precomputed_file(self, path, name): """ Check if a pre-computed numpy file exists or not. """ if exists(path): self.logger.info('Found prepared {0:s} at {1:s}'.format(name, path)) value = np.load(path, allow_pickle=True).item() setattr(self, name, value) return True else: return False def _save_precomputed_file(self, data_dic, pre_computed_path, name): """ Save a pre-computed numpy file. 
""" setattr(self, name, data_dic) make_dir(pre_computed_path) np.save(pre_computed_path, data_dic) self.logger.info('Save prepared {0:s} at {1:s}'.format(name, pre_computed_path)) return def _prepare_key_points_custom(self, style, interp_params, vis_thresh=0.25): """ Project 3D bounding boxes to image planes to prepare screen coordinates. """ assert 'keypoint_dir' in self._data_config kpt_dir = self._data_config['keypoint_dir'] if interp_params['flag']: style += str(interp_params['coef']) pre_computed_path_kpts = pjoin(kpt_dir, '{0:s}_{1:s}_{2:s}.npy'.format(style, self.split, str(self._classes))) pre_computed_path_ids = pjoin(kpt_dir, '{0:s}_{1:s}_{2:s}_ids.npy'.format(style, self.split, str(self._classes))) pre_computed_path_rots = pjoin(kpt_dir, '{0:s}_{1:s}_{2:s}_rots.npy'.format(style, self.split, str(self._classes))) if self._check_precomputed_file(pre_computed_path_kpts, 'keypoints'): pass if self._check_precomputed_file(pre_computed_path_ids, 'instance_ids'): pass if self._check_precomputed_file(pre_computed_path_rots, 'rotations'): return path_list = self._data_config['image_path_list'] data_dic_kpts = {} data_dic_ids = {} data_dic_rots = {} for path in path_list: image_name = path.split(osep)[-1] # instances that lie out of the image plane will be discarded list_2d, _, list_id, _, list_rots = self.get_2d_3d_pair(path, style=style, augment=False, add_visibility=True, filter_outlier=True, add_rotation=True ) if len(list_2d) == 0: continue for idx, kpts in enumerate(list_2d): list_2d[idx] = kpts.reshape(1, -1, 3) data_dic_kpts[image_name] = np.concatenate(list_2d, axis=0) data_dic_ids[image_name] = list_id data_dic_rots[image_name] = np.concatenate(list_rots, axis=0) self._save_precomputed_file(data_dic_kpts, pre_computed_path_kpts, 'keypoints') self._save_precomputed_file(data_dic_ids, pre_computed_path_ids, 'instance_ids') self._save_precomputed_file(data_dic_rots, pre_computed_path_rots, 'rotations') return def _prepare_key_points(self, cfgs): 
def _prepare_2d_pose_annot(self, threshold=4):
    """
    Prepare annotation for training the coordinate regression model.

    Walks over self.keypoints image by image. An instance is kept when the
    sum of its third keypoint column reaches threshold. For every kept
    instance the first two keypoint columns, an integer box derived from
    them via lip.kpts2cs / lip.cs2bbox and its rotation are recorded. Images
    without any kept instance are left out.

    return: dictionary of per-image lists with the keys 'paths', 'boxes',
        'rots', 'kpts' and 'raw_kpts'
    """
    annot = {'paths': [],
             'boxes': [],
             'rots': [],
             'kpts': [],
             'raw_kpts': []
             }
    for image_name, keypoints in self.keypoints.items():
        image_path = pjoin(self._data_config['image_dir'], image_name)
        rotations = self.rotations[image_name]
        kept_boxes, kept_rots, kept_kpts = [], [], []
        for ins_idx, ins_kpts in enumerate(keypoints):
            # severely-occluded instances are ignored in the training data
            if np.sum(ins_kpts[:, 2]) < threshold:
                continue
            # from here on all keypoints are treated as visible
            xy = ins_kpts[:, :2]
            kept_kpts.append(np.expand_dims(xy, 0))
            center, crop_size, _, _ = lip.kpts2cs(xy,
                                                  enlarge=self.enlarge_factor)
            box = np.array([int(v) for v in lip.cs2bbox(center, crop_size)])
            kept_boxes.append(box.reshape(1, 4))
            kept_rots.append(rotations[ins_idx].reshape(1, 2))
        if len(kept_boxes) == 0:
            continue
        annot['paths'].append(image_path)
        annot['boxes'].append(np.concatenate(kept_boxes))
        annot['rots'].append(np.concatenate(kept_rots))
        annot['kpts'].append(np.concatenate(kept_kpts))
        annot['raw_kpts'].append(keypoints)
    return annot
""" path_list = self._data_config['image_path_list'] record_dict = {} for img_path in path_list: image_name = img_path.split(osep)[-1] if self.split != 'test': # default: use gt label and calibration label_path = pjoin(self._data_config['label_dir'], image_name[:-4] + '.txt' ) self.read_single_file(image_name, record_dict, label_path=label_path, fieldnames=FIELDNAMES, add_gt=add_gt, use_raw_bbox=use_raw_bbox, filter_outlier=filter_outlier ) else: record_dict[image_name] = {} self.annot_dict = record_dict return def read_single_file(self, image_name, record_dict, label_path=None, calib_path=None, threshold=0.1, fieldnames=FIELDNAMES_P, add_gt=False, use_raw_bbox=True, filter_outlier=False, bbox_only=False ): """ Read labels and prepare annotation for a single image. """ style = self._data_config['3d_kpt_sample_style'] image_path = pjoin(self._data_config['image_dir'], image_name) if label_path is None: # default is ground truth annotation label_path = pjoin(self._data_config['label_dir'], image_name[:-3] + 'txt') if calib_path is None: calib_path = pjoin(self._data_config['calib_dir'], image_name[:-3] + 'txt') list_2d, list_3d, list_id, pv, raw_bboxes = self.get_2d_3d_pair(image_path, label_path=label_path, calib_path=calib_path, style=style, augment=False, add_raw_bbox=True, bbox_only=bbox_only, filter_outlier=filter_outlier, fieldnames=fieldnames # also load the confidence score ) if len(raw_bboxes) == 0: return False if image_name not in record_dict: record_dict[image_name] = {} raw_annot, P = self.load_annotations(label_path, calib_path, fieldnames=fieldnames) # use different (slightly) intrinsic parameters for different images K = P[:, :3] if len(list_2d) != 0: for idx, kpts in enumerate(list_2d): list_2d[idx] = kpts.reshape(1, -1, 3) list_3d[idx] = list_3d[idx].reshape(1, -1, 3) all_keypoints_2d = np.concatenate(list_2d, axis=0) all_keypoints_3d = np.concatenate(list_3d, axis=0) # compute 2D bounding box based on the projected 3D boxes bboxes_kpt = [] for idx, 
def read_predictions(self, path):
    """
    Read the prediction files in the same format as the ground truth.

    Every '.txt' file found in path is passed to read_single_file together
    with the matching '.png' image name. Raw bounding boxes are requested
    only for the 'test' split.

    return: the record dictionary filled by read_single_file
    """
    self.logger.info("Reading predictions from {:s}".format(path))
    records = {}
    raw_bbox_flag = self.split == 'test'
    txt_files = (entry for entry in listdir(path) if entry.endswith(".txt"))
    for txt_name in txt_files:
        self.read_single_file(txt_name[:-4] + ".png",
                              records,
                              label_path=pjoin(path, txt_name),
                              use_raw_bbox=raw_bbox_flag
                              )
    self.logger.info("Reading predictions finished.")
    return records
""" self._data_config = {} self._data_config['image_size_raw'] = NotImplemented if self.exp_type in ['2dto3d', 'inference', 'finetune']: # parameters relevant to input/output representation for key in ['3d_kpt_sample_style', 'lft_in_rep', 'lft_out_rep']: self._data_config[key] = cfgs['dataset'][key] if self.exp_type in ['2dto3d']: # parameters relevant to data augmentation for key in ['lft_aug','lft_aug_times']: self._data_config[key] = cfgs['training_settings'][key] # parameters relevant to cuboid interpolation self.interp_params = cfgs['dataset']['interpolate'] # parameters relevant to heatmap regression model and image data augmentation if 'heatmapModel' in cfgs: hm = cfgs['heatmapModel'] jitter_flag = hm['jitter_bbox'] and self.split=='train' and cfgs['train'] self.hm_para = {'reference': 'bbox', 'resize': True, 'add_xy': hm['add_xy'], 'jitter_bbox': jitter_flag, 'jitter_params': hm['jitter_params'], # (height, width) 'input_size': np.array([hm['input_size'][1], hm['input_size'][0]]), 'heatmap_size': np.array([hm['heatmap_size'][1], hm['heatmap_size'][0]]), 'target_ar': hm['heatmap_size'][1]/hm['heatmap_size'][0], 'augment': hm['augment_input'], 'sf': cfgs['dataset']['scaling_factor'], 'rf': cfgs['dataset']['rotation_factor'], 'num_joints': hm['num_joints'], 'sigma': hm['sigma'] if 'sigma' in hm else None, 'target_type': hm['target_type'] if 'target_type' in hm else None, 'use_different_joints_weight': hm['use_different_joints_weight'] if 'use_different_joints_weight' in hm else None } self.num_joints = hm['num_joints'] # parameters relevant to PyTorch image transformation operations if 'pth_transform' in cfgs['dataset']: pth_transform = cfgs['dataset']['pth_transform'] normalize = transforms.Normalize( mean=pth_transform['mean'], std=pth_transform['std'] ) transform_list = [transforms.ToTensor(), normalize] if self.exp_type == 'detect2D' and self.split == 'train': transform_list.append(transforms.RandomHorizontalFlip(0.5)) self.pth_trans = 
transforms.Compose(transform_list) def _set_paths(self): """ Initialize relevant directories. """ ROOT = self.root split = self.split # validation set is a sub-set of the official training split # train/val/test: 3712/3769/7518 split = 'train' if self.split == 'valid' else split split += 'ing' self._data_config['image_dir'] = pjoin(ROOT, split, 'image_2') self._data_config['cropped_dir'] = pjoin(ROOT, split, 'cropped') self._data_config['drawn_dir'] = pjoin(ROOT, split, 'drawn') self._data_config['label_dir'] = pjoin(ROOT, split, 'label_2') self._data_config['calib_dir'] = pjoin(ROOT, split, 'calib') self._data_config['keypoint_dir'] = pjoin(ROOT, split, 'keypoints') self._data_config['stats_dir'] = pjoin(ROOT, 'instance_stats.npy') # list of images for each sub-set self._data_config['train_list'] = pjoin(ROOT, 'training/ImageSets/train.txt') self._data_config['valid_list'] = pjoin(ROOT, 'training/ImageSets/val.txt') self._data_config['test_list'] = pjoin(ROOT, 'testing/ImageSets/test.txt') self._data_config['trainvalid_list'] = pjoin(ROOT, 'training/ImageSets/trainval.txt') return def project_3d_to_2d(self, points, K): """ Get 2D projection of 3D points in the camera coordinate system. 
def augment_pose_vector(self,
                        locs,
                        rot_y,
                        obj_class,
                        dimension,
                        augment,
                        augment_times,
                        std_rot = np.array([15., 50., 15.])*np.pi/180.,
                        std_trans = np.array([0.2, 0.01, 0.2]),
                        ):
    """
    Data augmentation used for training the lifter sub-model.

    The annotated pose always comes first in the returned lists. When
    augment is true, augment_times randomly perturbed copies follow.

    locs: array of 3 values, the annotated translation
    rot_y: annotated rotation around the y-axis (yaw)
    obj_class, dimension: stored together as the id of every returned pose
    std_rot: standard deviation of rotation around x, y and z axis
    std_trans: standard deviation of the multiplicative translation noise
               along x, y and z axis
    Returns
        (ids, pose_vecs): ids is a list of (obj_class, dimension) tuples and
        pose_vecs a list of [1, 6] arrays laid out as (x, y, z, rx, ry, rz)
    """
    identity = (obj_class, dimension)
    # KITTI only annotates rotation around y-axis (yaw)
    annotated = np.concatenate([locs, np.array([0., rot_y, 0.])]).reshape(1, 6)
    if not augment:
        return [identity], [annotated]
    # rotations are drawn before translations; keep this order so that a
    # seeded random generator yields the same samples
    rot_samples = np.random.randn(augment_times, 3) * std_rot.reshape(1, 3)
    rot_samples[:, 1] += rot_y
    trans_samples = 1 + np.random.randn(augment_times, 3) * std_trans.reshape(1, 3)
    trans_samples *= locs.reshape(1, 3)
    ids = [identity]
    pose_vecs = [annotated]
    for trans, rot in zip(trans_samples, rot_samples):
        # augmented 6DoF pose
        ids.append((obj_class, dimension))
        pose_vecs.append(np.concatenate([trans, rot]).reshape(1, 6))
    return ids, pose_vecs
def get_input_output_size(self):
    """
    Get the input/output size for 2d-to-3d lifting.

    The input holds two values per joint. The output holds three values
    per joint for the 'R3d+T' representation and three values per
    non-root joint for 'R3d'.

    Returns
        (input_size, output_size)
    Raises
        NotImplementedError for any other input or output representation
    """
    joint_cnt = self.num_joints
    config = self._data_config
    if config['lft_in_rep'] != 'coordinates2d':
        raise NotImplementedError
    input_size = joint_cnt * 2
    out_rep = config['lft_out_rep']
    if out_rep == 'R3d+T':
        return input_size, joint_cnt * 3
    if out_rep == 'R3d':
        # the root joint is not part of the relative shape
        return input_size, (joint_cnt - 1) * 3
    raise NotImplementedError
def construct_box_3d(self, l, h, w, interp_params):
    """
    Construct 3D bounding box corners in the canonical pose.

    Nine points are produced: the box centre followed by the eight
    corners. The origin sits at the centre of the bottom face, so x and z
    span [-l/2, l/2] and [-w/2, w/2] while y spans [-h, 0].

    l, h, w: box size along the x, y and z axis
    interp_params: dictionary with key 'flag'; when it is true the keys
                   'style' and 'coef' are forwarded to self.interpolate
                   to append interpolated points
    Returns
        array of shape [3, 9] (more columns when interpolation is enabled)
    """
    # centre first, then the eight corners, measured from the corner at
    # the origin
    xs = np.array([0.5*l, l, l, l, l, 0, 0, 0, 0])
    ys = np.array([0.5*h, 0, h, 0, h, 0, h, 0, h])
    zs = np.array([0.5*w, w, w, 0, 0, w, w, 0, 0])
    # move the origin to the centre of the bottom face; the offsets are
    # rounded to float32 exactly as before
    xs = xs - np.float32(l) / 2
    ys = ys - np.float32(h)
    zs = zs - np.float32(w) / 2
    corners_3d = np.array([xs, ys, zs])
    if not interp_params['flag']:
        return corners_3d
    return self.interpolate(corners_3d,
                            interp_params['style'],
                            interp_params['coef'],
                            )
def csv_read_calib(self, file_path):
    """
    Read camera projection matrix in the KITTI format.

    The file is scanned for the row labelled 'P2:' and the twelve numbers
    following the label are returned as a matrix.

    file_path: path of the calibration text file
    Returns
        float32 array of shape [3, 4]
    Raises
        ValueError if the file has no 'P2:' row, or if that row does not
        hold twelve numbers
    """
    with open(file_path, 'r') as csv_file:
        reader = csv.reader(csv_file, delimiter=' ')
        for row in reader:
            # blank lines give an empty row and repeated spaces give empty
            # fields; drop them instead of failing on row[0] or float('')
            fields = [field for field in row if field]
            if not fields or fields[0] != 'P2:':
                continue
            values = [float(field) for field in fields[1:]]
            return np.array(values, dtype=np.float32).reshape(3, 4)
    # previously this fell through to an unbound local variable
    raise ValueError("No 'P2:' entry found in calibration file {}".format(file_path))
def get_inlier_indices(self, p_2d, threshold=0.3):
    """
    Get indices of instances that are visible 'enough'.

    p_2d: list of [num_joints, 3] arrays whose last column is the
          visibility of each joint
    threshold: minimum fraction of visible joints for an instance to be
               kept
    Returns
        list of indices into p_2d; empty when p_2d is empty
    """
    if len(p_2d) == 0:
        # nothing to inspect; indexing p_2d[0] below would fail
        return []
    # all instances are assumed to share the joint count of the first one
    num_joints = p_2d[0].shape[0]
    return [idx for idx, kpts in enumerate(p_2d)
            if kpts[:, 2].sum() / num_joints >= threshold]

def filter_outlier(self, p_2d, p_3d, threshold=0.3):
    """
    Keep instances that are visible 'enough'.

    Uses the same visibility criterion as get_inlier_indices and applies
    the selection to both lists.

    p_2d: list of [num_joints, 3] arrays whose last column is the
          visibility of each joint
    p_3d: list aligned with p_2d
    Returns
        (p_2d_filtered, p_3d_filtered)
    """
    indices = self.get_inlier_indices(p_2d, threshold)
    p_2d_filtered = [p_2d[idx] for idx in indices]
    p_3d_filtered = [p_3d[idx] for idx in indices]
    return p_2d_filtered, p_3d_filtered
def get_2d_3d_pair(self,
                   image_path,
                   label_path=None,
                   calib_path=None,
                   style='null',
                   in_rep = 'coordinates2d',
                   out_rep = 'R3d+T',
                   augment=False,
                   augment_times=1,
                   add_visibility=True,
                   add_raw_bbox=False, # add original bbox annotation from KITTI
                   add_rotation=False, # add orientation angles
                   bbox_only=False, # only returns raw bounding box
                   filter_outlier=True,
                   fieldnames=FIELDNAMES
                   ):
    """
    Get (input, output) pair used for training a lifter sub-model from a
    single image.

    Every annotated instance is turned into a set of 3D points in the
    camera coordinate system (optionally with augmented poses), the points
    are projected to the image, and both are converted to the requested
    input/output representations.

    image_path: path of the image; its file name also selects the default
                label and calibration files
    label_path, calib_path: override the default annotation files
    style: not used in this method
    in_rep, out_rep: representations forwarded to get_representation
    augment, augment_times: forwarded to augment_pose_vector
    add_visibility: append a visibility column to the projected points
    filter_outlier: drop instances selected out by get_inlier_indices
    fieldnames: column names forwarded to load_annotations
    Returns
        (list_2d, list_3d, ids, pose_vecs), followed by the raw boxes when
        add_raw_bbox is set and by the (alpha, rot_y) pairs when
        add_rotation is set
    """
    image_name = image_path.split(osep)[-1]
    if label_path is None:
        # default is ground truth annotation
        label_path = pjoin(self._data_config['label_dir'], image_name[:-3] + 'txt')
    if calib_path is None:
        calib_path = pjoin(self._data_config['calib_dir'], image_name[:-3] + 'txt')
    anns, P = self.load_annotations(label_path, calib_path, fieldnames=fieldnames)
    # The intrinsics may vary slightly for different images
    # Yet one may convert them to a fixed one by applying a homography
    K = P[:, :3]
    # Debug: use pre-defined intrinsic parameters
    # K = np.array([[707.0493,   0.    , 604.0814],
    #               [  0.    , 707.0493, 180.5066],
    #               [  0.    ,   0.    ,   1.    ]], dtype=np.float32)
    shift = np.linalg.inv(K) @ P[:, 3].reshape(3,1)
    # P contains intrinsics and extrinsics; it is factorized as K[I|K^-1t]
    # and the extrinsic part is used to compute the camera coordinates.
    # Here the extrinsics represent the shift between the current camera
    # and the reference grayscale camera.
    # For more calibration details, refer to "Vision meets Robotics: The KITTI Dataset"
    camera_coordinates = []
    pose_vecs = []
    # id includes the class and size of the object
    ids = []
    if add_raw_bbox:
        bboxes = []
    if add_rotation:
        rotations = []
    for i, a in enumerate(anns):
        a = a.copy()
        obj_class = a["label"]
        dimension = a["dimensions"]
        locs = np.array(a["locations"])
        rot_y = np.array(a["rot_y"])
        if add_raw_bbox:
            bboxes.append(np.array(a["bbox"]).reshape(1,4))
        if add_rotation:
            rotations.append(np.array([a["alpha"], a["rot_y"]]).reshape(1,2))
        # apply data augmentation to represent a larger variation of
        # 3D pose and translation
        if bbox_only:
            # boxes/rotations were collected above; skip the 3D part
            continue
        aug_ids, aug_pose_vecs = self.augment_pose_vector(locs,
                                                          rot_y,
                                                          obj_class,
                                                          dimension,
                                                          augment,
                                                          augment_times
                                                          )
        # appends one point set per pose to camera_coordinates
        self.get_cam_cord(camera_coordinates,
                          shift,
                          aug_ids,
                          aug_pose_vecs
                          )
        ids += aug_ids
        pose_vecs += aug_pose_vecs
    num_instances = len(camera_coordinates)
    # get 2D projections
    if len(camera_coordinates) != 0:
        camera_coordinates = np.vstack(camera_coordinates)
        projected = self.project_3d_to_2d(camera_coordinates, K)[:2, :].T
        # target is camera coordinates
        # split the stacked points back into one array per instance
        p_2d = np.split(projected, num_instances, axis=0)
        p_3d = np.split(camera_coordinates, num_instances, axis=0)
        # set visibility to 0 if the projected keypoints lie out of the image plane
        if add_visibility:
            width, height = self.get_img_size(image_path)
            for idx, joints in enumerate(p_2d):
                p_2d[idx] = self.add_visibility(joints, width, height)
        # filter out the instances that lie outside of the image
        # NOTE(review): get_inlier_indices reads the visibility column, so
        # this presumably requires add_visibility to be set - confirm
        if filter_outlier:
            indices = self.get_inlier_indices(p_2d)
            p_2d = [p_2d[idx] for idx in indices]
            p_3d = [p_3d[idx] for idx in indices]
            # p_2d, p_3d = self.filter_outlier(p_2d, p_3d)
        # NOTE(review): bboxes/rotations hold one entry per annotation while
        # indices refers to poses (annotation plus augmented copies); they
        # only line up when augment is off - confirm with callers
        if filter_outlier and add_raw_bbox:
            bboxes = [bboxes[idx] for idx in indices]
        if filter_outlier and add_rotation:
            rotations = [rotations[idx] for idx in indices]
        # NOTE(review): ids and pose_vecs are not filtered by indices, so
        # they can be longer than list_2d/list_3d - confirm this is intended
        list_2d, list_3d = self.get_representation(p_2d, p_3d, in_rep, out_rep)
    else:
        # no 3D instance was produced (no annotation kept, or bbox_only)
        list_2d, list_3d, ids, pose_vecs = [], [], [], []
    ret = list_2d, list_3d, ids, pose_vecs
    if add_raw_bbox:
        ret = ret + (bboxes, )
    if add_rotation:
        ret = ret + (rotations, )
    return ret
def generate_pairs(self):
    """
    Prepare data (e.g., input-output pairs and metadata) that will be used
    depending on the type of experiment.

    The work is selected by self.exp_type and always ends with
    self.total_data holding the number of samples:
      '2dto3d': 2D screen coordinates and 3D cuboids, stored as float32 in
                self.input and self.output
      'instanceto2d', 'baselinealpha', 'baselinetheta': 2D pose
                annotations in self.annot_2dpose
      'detection2d': detection records
      'inference': annotations of every image in self.annot_dict
      'finetune': like 'inference', with ground truth added and outliers
                removed
    Raises
        NotImplementedError for an unknown experiment type
    """
    exp_type = self.exp_type
    if exp_type == '2dto3d':
        # generate 2D screen coordinates and 3D cuboid
        self._generate_2d_3d_paris()
        # pass the scalar type itself; np.float32() builds a scalar
        # instance, which is not a documented dtype specifier
        self.input = self.input.astype(np.float32)
        self.output = self.output.astype(np.float32)
        self.total_data = len(self.input)
    elif exp_type in ['instanceto2d', 'baselinealpha', 'baselinetheta']:
        self.annot_2dpose = self._prepare_2d_pose_annot()
        self.total_data = len(self.annot_2dpose['paths'])
    elif exp_type in ['detection2d']:
        self._prepare_detection_records()
        self.total_data = len(self.detection_records)
    elif exp_type == 'inference':
        self.gather_annotations()
        self.total_data = len(self.annot_dict)
        self.annoted_img_paths = list(self.annot_dict.keys())
    elif exp_type == 'finetune':
        self.gather_annotations(use_raw_bbox=False,
                                add_gt=True,
                                filter_outlier=True
                                )
        self.total_data = len(self.annot_dict)
        self.annoted_img_paths = list(self.annot_dict.keys())
    else:
        raise NotImplementedError('Unknown experiment type.')
    return
def prepare_ft_dict(self, idx):
    """
    Prepare data for fine-tuning.

    idx: index into self.annoted_img_paths
    Returns
        dictionary with the image path ('path'), the cropped instance
        tensors and target heatmaps ('images_fs', 'heatmaps_fs'), the 3D
        key-points ('kpts_3d'), the crop parameters ('crop_center',
        'crop_scale'), the key-points in crop coordinates ('kpts_local')
        and one inverse affine matrix per crop ('af_mats'). On the training
        split with self-supervision enabled, 'images_ss' and 'meta_ss' are
        added.
    """
    img_name = self.annoted_img_paths[idx]
    img_annot = self.annot_dict[img_name]
    img_path = pjoin(self._data_config['image_dir'], img_name)
    kpts = img_annot['kpts']
    # the boxes are handed over through the shared parameter dictionary,
    # so self.hm_para['boxes'] is overwritten on every call
    parameters = self.hm_para
    parameters['boxes'] = img_annot['bbox_2d']
    # fs: fully-supervised ss: self-supervised
    images_fs, heatmaps_fs, weights_fs, meta_fs = lip.get_tensor_from_img(img_path,
                                                                          parameters,
                                                                          joints=kpts,
                                                                          pth_trans=self.pth_trans,
                                                                          rf=parameters['rf'],
                                                                          sf=parameters['sf'],
                                                                          generate_hm=True)
    ret = {'path': img_path,
           'images_fs': images_fs,
           'heatmaps_fs': heatmaps_fs,
           'kpts_3d': img_annot['kpts_3d'],
           'crop_center': meta_fs['center'],
           'crop_scale': meta_fs['scale'],
           'kpts_local': meta_fs['transformed_joints'],
           }
    # prepare the affine transformation matrices that map local coordinates
    # back to global screen coordinates; the loop variable no longer
    # shadows the idx argument
    ret['af_mats'] = []
    for crop_id in range(len(ret['crop_center'])):
        trans_inv = get_affine_transform(ret['crop_center'][crop_id],
                                         ret['crop_scale'][crop_id],
                                         0.,
                                         self.hm_para['input_size'],
                                         inv=1)
        ret['af_mats'].append(trans_inv)
    # use random unlabeled images for data augmentation; the attribute is
    # looked up defensively because __getitem__ also treats use_ss as
    # optional
    if self.split == 'train' and getattr(self, 'use_ss', False):
        images_ss, heatmaps_ss, weights_ss, meta_ss = self.extract_ss_sample(len(images_fs))
        ret['images_ss'] = images_ss
        ret['meta_ss'] = meta_ss
    return ret
meta_fs['angles_gt'] = rots[:, 0] elif self.exp_type == 'baselinetheta': targets = [np.array([[np.cos(rots[idx][1]), np.sin(rots[idx][1])]]) for idx in range(len(rots))] meta_fs['angles_gt'] = rots[:, 1] targets = torch.from_numpy(np.concatenate(targets).astype(np.float32)) return images_fs, targets, weights_fs, meta_fs elif self.exp_type == 'instanceto2d': # the input images and target heatmaps are produced online img_path = self.annot_2dpose['paths'][idx] kpts = self.annot_2dpose['kpts'][idx] # the croping bounding box in the original image # global_box = self.annot_2dpose['global_box'][idx] if kpts.shape[2] == 2: kpts = np.concatenate([kpts, np.ones((kpts.shape[0], kpts.shape[1], 1))], axis=2) parameters = self.hm_para parameters['boxes'] = self.annot_2dpose['boxes'][idx] # fs: fully-supervised ss: self-supervised images_fs, heatmaps_fs, weights_fs, meta_fs = \ lip.get_tensor_from_img(img_path, parameters, joints=kpts, pth_trans=self.pth_trans, rf=parameters['rf'], sf=parameters['sf'], generate_hm=True ) # use random unlabeled images for data augmentation if self.split == 'train' and hasattr(self, 'use_ss') and self.use_ss: images_ss, heatmaps_ss, weights_ss, meta_ss = self.extract_ss_sample(len(images_fs)) images = [images_fs, images_ss] targets = heatmaps_fs weights = weights_fs meta = meta_fs else: images = images_fs targets = heatmaps_fs weights = weights_fs meta = meta_fs return images, targets, weights, meta elif self.exp_type == 'detection2d': record = copy.deepcopy(self.detection_records[idx]) path = record['path'] image = lip.imread_rgb(path) target = record['target'] if hasattr(self, 'pth_trans'): # pytorch transformation if provided image = self.pth_trans(image) return image, target elif self.exp_type == 'finetune': # prepare images, 2D and 3D annotations as a dictionary for finetuning ret = self.prepare_ft_dict(idx) return ret else: raise NotImplementedError def prepare_data(cfgs, logger): """ Prepare training and validation dataset objects. 
def length_limit(instances, targets, target_weights, meta):
    """
    Cap the number of instances in a collated batch at MAX_INS_CNT.

    Three situations are handled when the batch is too large:
      normal training (as many instances as targets): a random subset of
        MAX_INS_CNT instances is kept together with its targets, weights
        and per-instance meta entries
      mixed training with too many fully-supervised instances
        (meta['fs_instance_cnt'] > MAX_INS_CNT): a random subset of the
        fully-supervised instances is kept; 'fs_instance_cnt' is not copied
        to the returned meta
      mixed training with too many self-supervised instances: the first
        MAX_INS_CNT instances are kept, everything else is unchanged
    Batches that are small enough are returned untouched.

    target_weights may be a placeholder that does not hold one row per
    target (my_collate_fn passes a single-element tensor when no weights
    exist); such a placeholder is returned as is instead of being indexed.

    Returns
        (instances, targets, target_weights, meta) after the limit
    """
    total = len(instances)
    unchanged = (instances, targets, target_weights, meta)
    if total <= MAX_INS_CNT or total < len(targets):
        return unchanged
    if total == len(targets):
        # normal training
        pool_size, skipped = total, ('path',)
    elif meta['fs_instance_cnt'] > MAX_INS_CNT:
        # mixed training: fully-supervised instances are too many
        pool_size, skipped = meta['fs_instance_cnt'], ('path', 'fs_instance_cnt')
    else:
        # mixed training: self-supervised instances are too many
        return instances[:MAX_INS_CNT], targets, target_weights, meta
    chosen = np.random.choice(pool_size, MAX_INS_CNT, replace=False)
    # only index the weights when there is one row per target; indexing the
    # one-element placeholder would run out of range
    if len(target_weights) == len(targets):
        weights = target_weights[chosen]
    else:
        weights = target_weights
    limited_meta = {'path': meta['path']}
    for key in meta:
        if key not in skipped:
            limited_meta[key] = meta[key][chosen]
    return instances[chosen], targets[chosen], weights, limited_meta
torch.cat(ss_instances) instances = torch.cat([fs_instances, ss_instances]) targets = torch.cat(targets, dim=0) # target_weights = torch.cat(target_weights, dim=0) meta = collate_dict(meta) meta['fs_instance_cnt'] = len(fs_instances) else: instances = torch.cat(instances, dim=0) targets = torch.cat(targets, dim=0) # target_weights = torch.cat(target_weights, dim=0) meta = collate_dict(meta) if target_weights[0] is not None: target_weights = torch.cat(target_weights, dim=0) else: #dummy weight target_weights = torch.ones(1) return length_limit(instances, targets, target_weights, meta) ================================================ FILE: libs/dataset/__init__.py ================================================ #import libs.dataset.ApolloScape import libs.dataset.KITTI ================================================ FILE: libs/dataset/basic/__init__.py ================================================ #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ Empty file. """ ================================================ FILE: libs/dataset/basic/basic_classes.py ================================================ """ Basic classes for customized dataset classes to inherit. Author: Shichao Li Contact: nicholas.li@connect.ust.hk """ import torch.utils.data import libs.dataset.normalization.operations as nop class SupervisedDataset(torch.utils.data.Dataset): def __init__(self, cfgs, split, logger=None): self.cfgs = cfgs self.split = split self.logger = logger self.root = cfgs['dataset']['root'] return def generate_pairs(self, synthetic=True): # sub-classes need to override this method to specify the inputs and # outputs self.input = None self.output = None self.total_data = 0 return def normalize(self, statistics=None): """ Normalize the (input, output) pairs with optional statistics. 
def normalize_1d(data, mean, std, individual=False):
    """
    Normalizes 1D data with mean and standard deviation.

    data: array of shape [num_sample, vector_length]; for individual
          normalization each row is read as interleaved (x, y) pairs
    mean: np vector with the mean of the data (unused when individual)
    std: np vector with the standard deviation of the data (unused when
         individual)
    individual: whether to perform normalization independently for each
                input
    Returns
        data_out: normalized data; the input array is left untouched
    """
    if not individual:
        return (data - mean)/std
    # this representation has the implicit assumption that the representation
    # is translational and scaling invariant
    num_data = len(data)
    points = np.asarray(data).reshape(num_data, -1, 2)
    # per-sample centre of the x and y coordinates, shape [num_data, 1, 2]
    center = points.mean(axis=1, keepdims=True)
    # per-sample scale: average of the x and y standard deviations
    scale = 0.5 * points.std(axis=1).sum(axis=1)
    # build a new array instead of writing through the reshaped view, which
    # used to overwrite the caller's data
    normalized = (points - center) / scale.reshape(num_data, 1, 1)
    return normalized.reshape(num_data, -1)
""" root_output_dir = cfgs['dirs']['output'] dataset_name = cfgs['dataset']['name'] cfg_name = cfgs['name'] final_output_dir = [root_output_dir, dataset_name] final_output_dir = os.path.join(*final_output_dir) time_str = time.strftime('%Y-%m-%d %H:%M') log_file = '{}_{}.log'.format(cfg_name, time_str) final_log_file = os.path.join(final_output_dir, log_file) return final_output_dir, final_log_file def get_logger(cfgs, head = '%(asctime)-15s %(message)s'): """ Prepare a logger object. """ final_output_dir, final_log_file = get_dirs(cfgs) utils.make_dir(final_log_file) logging.basicConfig(filename=str(final_log_file), format=head) logger = logging.getLogger() logger.setLevel(logging.INFO) if len(logger.handlers) == 1: console = logging.StreamHandler() logging.getLogger('').addHandler(console) return logger, final_output_dir ================================================ FILE: libs/loss/__init__.py ================================================ #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ Empty file. """ ================================================ FILE: libs/loss/function.py ================================================ """ Loss functions. 
class JointsCompositeLoss(nn.Module):
    """
    Loss function for 2d screen coordinate regression which consists of
    multiple terms.
    """
    def __init__(self,
                 spec_list,
                 img_size,
                 hm_size,
                 loss_weights=(1, 1, 1),
                 target_cr=None,
                 cr_loss_thres=0.15,
                 use_target_weight=False
                 ):
        """
        comp_dict specifies the optional terms used in the loss computation
        and is built from spec_list and loss_weights by get_comp_dict.
        Each component is stored as (loss_function, weight): spec_list
        specifies the loss type of each component (e.g. L1 or L2) while
        loss_weights gives the weight of each component.

        hm: a supervised loss defined with a heatmap target
        coor: a supervised loss defined with 2D coordinates
        cr: a self-supervised loss defined with prior cross-ratio

        img_size: used to normalize ground truth coordinates
        hm_size: used to normalize coordinates decoded from heatmaps
        target_cr: prior cross-ratio used by the cr term
        cr_loss_thres: threshold forwarded to get_cr_mask
        use_target_weight: stored but not used by this class
        """
        super(JointsCompositeLoss, self).__init__()
        self.comp_dict = get_comp_dict(spec_list, loss_weights)
        self.img_size = img_size
        self.hm_size = hm_size
        self.target_cr = target_cr
        self.use_target_weight = use_target_weight
        # the cross-ratio term stays disabled until the user sets this flag
        self.apply_cr_loss = False
        self.cr_loss_thres = cr_loss_thres

    def calc_hm_loss(self, output, target):
        """
        Heatmap loss which corresponds to L_{hm} in the paper.

        output: predicted heatmaps of shape [N, K, H, W]
        target: ground truth heatmaps of shape [N, K, H, W]
        """
        batch_size = output.size(0)
        num_parts = output.size(1)
        criterion = self.comp_dict['hm'][0]
        heatmaps_pred = output.reshape((batch_size, num_parts, -1))
        heatmaps_gt = target.reshape((batch_size, num_parts, -1))
        loss = 0
        for idx in range(num_parts):
            # index the part axis explicitly so the batch axis is kept even
            # when the batch holds a single sample
            loss += 0.5 * criterion(heatmaps_pred[:, idx], heatmaps_gt[:, idx])
        return loss / num_parts

    def calc_cross_ratio_loss(self, pred_coor, target_cr, mask):
        """
        Cross-ratio loss which corresponds to L_{cr} in the paper.

        pred_coor: predicted local coordinates
        target_cr: ground truth cross ratio
        mask: tensor produced by get_cr_mask, zero entries are skipped
        Returns 0 if every entry of the mask is zero.
        """
        assert hasattr(self, 'cr_indices')
        # this indices is assumed to be initialized by the user
        loss = 0
        mask = mask.to(pred_coor.device)
        if mask.sum() == 0:
            return loss
        criterion = self.comp_dict['cr'][0]
        for sample_idx in range(len(pred_coor)):
            for line_idx in range(len(self.cr_indices)):
                if mask[sample_idx][line_idx] == 0:
                    continue
                # predicted cross-ratio square
                pred_cr_sqr = appro_cr(pred_coor[sample_idx][self.cr_indices[line_idx]])
                # normalize the predicted cross-ratio square
                pred_cr_sqr = pred_cr_sqr / target_cr**2
                line_loss = criterion(pred_cr_sqr,
                                      torch.ones(1).to(pred_cr_sqr.device))
                loss += line_loss * mask[sample_idx][line_idx][0]
        return loss/mask.sum()

    def get_cr_mask(self, coordinates, threshold = 0.15):
        """
        Mask some edges out when computing the cross-ratio loss.
        Ignore the fore-shortened edges since they will produce large and
        unstable gradient.

        coordinates: numpy array indexed as [sample][point indices]
        threshold: an edge is kept only if its smallest non-zero pairwise
                   point distance exceeds this value
        Returns a tensor of shape [num_samples, len(self.cr_indices), 1].
        """
        assert hasattr(self, 'cr_indices')
        mask = torch.zeros(coordinates.shape[0], len(self.cr_indices), 1)
        for sample_idx in range(len(coordinates)):
            for line_idx in range(len(self.cr_indices)):
                pts = coordinates[sample_idx][self.cr_indices[line_idx]]
                dm = distance_matrix(pts, pts)
                nonzero_dists = dm[np.nonzero(dm)]
                if nonzero_dists.size == 0:
                    # all points coincide: the edge is fully degenerate,
                    # leave it masked out instead of failing on an empty min
                    continue
                if nonzero_dists.min() > threshold:
                    mask[sample_idx][line_idx] = 1.0
        return mask

    def calc_colinear_loss(self):
        # DEPRECATED
        return 0.

    def calc_coor_loss(self, coordinates_pred, coordinates_gt):
        """
        Coordinate loss which corresponds to L_{2d} in the paper.

        coordinates_pred: [N, K, 2]
        coordinates_gt: [N, K, 2]
        The ground truth is normalized by img_size on a copy, the tensor
        passed by the caller is left untouched.
        """
        coordinates_gt = coordinates_gt.clone()
        coordinates_gt[:, :, 0] /= self.img_size[0]
        coordinates_gt[:, :, 1] /= self.img_size[1]
        loss = self.comp_dict['coor'][0](coordinates_pred, coordinates_gt)
        return loss

    def _coordinates_from_heatmaps(self, heatmaps):
        # decode heatmaps with soft-argmax and normalize by the heatmap size
        coordinates, _ = soft_arg_max(heatmaps)
        coordinates[:, :, 0] /= self.hm_size[1]
        coordinates[:, :, 1] /= self.hm_size[0]
        return coordinates

    def forward(self, output, target, target_weight=None, meta=None):
        """
        Loss evaluation.
        Output is in the format of (heatmaps, coordinates) where coordinates
        is optional. target refers to the ground truth heatmaps.
        meta must provide 'transformed_joints' when the coor term is used.
        """
        if isinstance(output, tuple):
            heatmaps_pred, coordinates_pred = output
        else:
            heatmaps_pred, coordinates_pred = output, None
        total_loss = 0
        if 'hm' in self.comp_dict:
            # some heatmaps may be produced by unlabeled data
            # NOTE(review): the truncated heatmaps are also the ones decoded
            # below when no coordinates are given - confirm this is intended
            if len(heatmaps_pred) != len(target):
                heatmaps_pred = heatmaps_pred[:len(target)]
            total_loss += self.calc_hm_loss(heatmaps_pred, target) * self.comp_dict['hm'][1]
        if 'coor' in self.comp_dict:
            coordinates_gt = meta['transformed_joints'][:, :, :2].astype(np.float32)
            # follow the device of the prediction instead of assuming CUDA
            coordinates_gt = torch.from_numpy(coordinates_gt).to(heatmaps_pred.device)
            if coordinates_pred is None:
                coordinates_pred = self._coordinates_from_heatmaps(heatmaps_pred)
            if len(coordinates_pred) != len(coordinates_gt):
                coordinates_pred_fs = coordinates_pred[:len(coordinates_gt)]
            else:
                coordinates_pred_fs = coordinates_pred
            total_loss += self.calc_coor_loss(coordinates_pred_fs, coordinates_gt) * self.comp_dict['coor'][1]
        if 'cr' in self.comp_dict and self.comp_dict['cr'][1] != "None" and self.apply_cr_loss:
            if coordinates_pred is None:
                # the coor term did not run, decode the coordinates here
                coordinates_pred = self._coordinates_from_heatmaps(heatmaps_pred)
            cr_loss_mask = self.get_cr_mask(coordinates_pred.detach().cpu().numpy(),
                                            self.cr_loss_thres)
            total_loss += self.calc_cross_ratio_loss(coordinates_pred,
                                                     self.target_cr,
                                                     cr_loss_mask) * self.comp_dict['cr'][1]
        return total_loss
""" def __init__(self, use_target_weight=False): super(SmoothL1Loss1D, self).__init__() self.criterion = nn.SmoothL1Loss(reduction='mean') self.use_target_weight = use_target_weight def forward(self, output, target, target_weight=None, meta=None): loss = self.criterion(output, target) return loss class DecoupledSL1Loss(nn.Module): # DEPRECATED def __init__(self, use_target_weight=None): super(DecoupledSL1Loss, self).__init__() self.criterion = F.smooth_l1_loss def forward(self, output, target, target_weight=None): # balance the loss for translation and rotation regression loss_center = self.criterion(output[:, :3], target[:, :3], reduction='mean') loss_else = self.criterion(output[:, 3:], target[:, 3:], reduction='mean') return loss_center + loss_else class JointsOHKMMSELoss(nn.Module): # DEPRECATED def __init__(self, use_target_weight, topk=8): super(JointsOHKMMSELoss, self).__init__() self.criterion = nn.MSELoss(reduction='none') self.use_target_weight = use_target_weight self.topk = topk def ohkm(self, loss): ohkm_loss = 0. 
class WingLoss(nn.Module):
    """
    Wing-style loss on coordinates decoded from heatmaps with soft-argmax.

    Absolute differences below width are penalized logarithmically,
    larger ones linearly with an offset C.
    """
    # DEPRECATED
    def __init__(self, use_target_weight, width=5, curvature=0.5, image_size=(384, 288)):
        """
        use_target_weight: accepted for interface compatibility, not used
        width: threshold between the logarithmic and the linear part
        curvature: curvature parameter of the logarithmic part
        image_size: used to normalize coordinates; index 1 divides the first
                    coordinate and index 0 divides the second one
        """
        super(WingLoss, self).__init__()
        self.width = width
        self.curvature = curvature
        self.C = self.width - self.width * np.log(1 + self.width / self.curvature)
        self.image_size = image_size

    def forward(self, output, target, target_weight=None, meta=None):
        """
        output: predicted heatmaps decoded by soft_arg_max
        target: ground truth coordinates indexed as [:, :, 0] and [:, :, 1];
                it is normalized on a copy and never modified
        target_weight, meta: unused, accepted so the call signature matches
                             the other loss modules
        """
        prediction, _ = soft_arg_max(output)
        # normalize the coordinates to 0-1
        prediction[:, :, 0] /= self.image_size[1]
        prediction[:, :, 1] /= self.image_size[0]
        # work on a copy: rescaling the caller's tensor in place would shrink
        # the same ground truth again on every call
        target = target.clone()
        target[:, :, 0] /= self.image_size[1]
        target[:, :, 1] /= self.image_size[0]
        diff_abs = (target - prediction).abs()
        log_part = self.width * torch.log(1 + diff_abs / self.curvature)
        linear_part = diff_abs - float(self.C)
        loss = torch.where(diff_abs < self.width, log_part, linear_part)
        return loss.mean()
def get_distance_src(output,
                     meta_data,
                     cfgs=None,
                     image_size = (256.0, 256.0),
                     arg_max='hard'
                     ):
    r"""
    From predicted heatmaps, obtain local coordinates (\phi_l in the paper)
    and transform them back to the source images based on metadata.
    Error is then evaluated on the source image for the screen coordinates
    (\phi_g in the paper).

    output: a tuple whose second element holds predicted coordinates, or
            heatmaps given as a numpy array or a torch tensor
    meta_data: dictionary providing 'center', 'scale', 'original_joints' and
               optionally 'rotation'
    cfgs: if given, cfgs['heatmapModel']['input_size'] overrides image_size
    image_size: (width, height) of the network input
    arg_max: 'soft' or 'hard', how heatmaps are decoded
    Returns
    avg_acc: mean pixel distance over the evaluated joints (0.0 if none)
    cnt: number of evaluated joints
    others: dictionary with 'src_coord', 'joints_pred', 'max_vals',
            'correct_cnt' and 'PCK_batch'
    """
    # the error is reported as distance in terms of pixels in the source image
    is_tuple = isinstance(output, tuple)
    if is_tuple:
        # copy: the numpy view of a CPU tensor shares its memory, and the
        # in-place rescaling below must not alter the caller's tensor
        pred, max_vals = output[1].data.cpu().numpy().copy(), None
    elif isinstance(output, np.ndarray) and arg_max == 'soft':
        pred, max_vals = lip.soft_arg_max_np(output)
    elif isinstance(output, torch.Tensor) and arg_max == 'soft':
        pred, max_vals = lip.soft_arg_max(output)
    elif isinstance(output, np.ndarray) or (isinstance(output, torch.Tensor)
                                            and arg_max == 'hard'):
        # NOTE(review): as written, a numpy array takes this branch for any
        # arg_max other than 'soft', a tensor only for 'hard' - confirm
        if not isinstance(output, np.ndarray):
            output = output.data.cpu().numpy()
        pred, max_vals = lip.get_max_preds(output)
    else:
        raise NotImplementedError
    image_size = image_size if cfgs is None else cfgs['heatmapModel']['input_size']
    width, height = image_size
    # multiply by down-sample ratio
    if not isinstance(pred, np.ndarray):
        pred = pred.data.cpu().numpy()
    if (max_vals is not None) and (not isinstance(max_vals, np.ndarray)):
        max_vals = max_vals.data.cpu().numpy()
    # the coordinates need to be rescaled for different cases
    if is_tuple:
        pred *= np.array(image_size).reshape(1, 1, 2)
    else:
        pred *= image_size[0] / output.shape[3]
    # inverse transform and compare pixel distance
    centers, scales = meta_data['center'], meta_data['scale']
    # some predictions are generated for unlabeled data
    if len(pred) != len(centers):
        pred_used = pred[:len(centers)]
    else:
        pred_used = pred
    if 'rotation' in meta_data:
        rots = meta_data['rotation']
    else:
        rots = [0. for _ in range(len(centers))]
    joints_original_batch = meta_data['original_joints']
    distance_list = []
    correct_cnt_sum = np.zeros((len(PCK_THRES)))
    all_src_coordinates = []
    for sample_idx in range(len(pred_used)):
        trans_inv = lip.get_affine_transform(centers[sample_idx],
                                             scales[sample_idx],
                                             rots[sample_idx],
                                             (height, width),
                                             inv=1
                                             )
        joints_original = joints_original_batch[sample_idx]
        pred_src_coordinates = lip.affine_transform_modified(pred_used[sample_idx],
                                                             trans_inv
                                                             )
        all_src_coordinates.append(pred_src_coordinates.reshape(1, len(pred_src_coordinates), 2))
        distance_list += get_distance(joints_original, pred_src_coordinates)
        correct_cnt_sum += get_PCK(pred_src_coordinates, joints_original)
    cnt = len(distance_list)
    # no evaluated joint (e.g. nothing visible): report zero, do not divide
    avg_acc = sum(distance_list) / cnt if cnt > 0 else 0.0
    others = {
        'src_coord': np.concatenate(all_src_coordinates, axis=0), # screen coordinates
        'joints_pred': pred, # predicted local coordinates
        'max_vals': max_vals,
        'correct_cnt': correct_cnt_sum,
        'PCK_batch': correct_cnt_sum / max(cnt, 1)
        }
    return avg_acc, cnt, others
""" def __init__(self, cfgs, num_joints=None): self.name = 'Joint distance in the source image plane' if num_joints is not None: self.num_joints = num_joints else: self.num_joints = cfgs['heatmapModel']['num_joints'] self.image_size = cfgs['heatmapModel']['input_size'] if 'arg_max' in cfgs['testing_settings']: self.arg_max = cfgs['testing_settings']['arg_max'] else: self.arg_max = None self.count = 0 self.mean = 0. self.PCK_counts = np.zeros(len(PCK_THRES)) return def update(self, prediction, meta_data, ground_truth=None, logger=None): """ Update statistics for a batch. The prediction and transformation parameters in meta_data are used. """ avg_acc, cnt, others = get_distance_src(prediction, meta_data, arg_max=self.arg_max, image_size=self.image_size ) self.mean = (self.mean * self.count + cnt * avg_acc) / (self.count + cnt) self.count += cnt self.PCK_counts += others['correct_cnt'] return def report(self, logger): """ Report final evaluation results. """ logger.info("Ealuaton Results:") msg = 'Error type: {error_type:s}\t' \ 'MPJPE: {MPJPE}\t'.format(error_type = self.name, MPJPE = self.mean ) logger.info(msg) for idx, value in enumerate(self.PCK_counts): PCK = value / self.count logger.info('PCK at threshold {:.2f}: {:.3f}'.format(PCK_THRES[idx], PCK)) return def update_statistics(self, update, num_data, name_str): """ Update error statistics for a data batch. 
""" old_count = getattr(self, 'count'+name_str) old_mean = getattr(self, 'mean'+name_str) old_max = getattr(self, 'max'+name_str) old_min = getattr(self, 'min'+name_str) new_mean = (old_count * old_mean + np.sum(update, axis=0)) / (old_count + num_data) new_count = old_count + num_data new_max = np.maximum(old_max, update.max(axis=0)) new_min = np.minimum(old_min, update.min(axis=0)) setattr(self, 'mean'+name_str, new_mean) setattr(self, 'count'+name_str, new_count) setattr(self, 'max'+name_str, new_max) setattr(self, 'min'+name_str, new_min) return def update_rotation_error(self, prediction, ground_truth, meta_data=None, logger=None, name_str='', style='euler' ): """ Get rotation error between two 3D point clouds. """ num_data = len(prediction) prediction = prediction.reshape(num_data, -1, 3) ground_truth = ground_truth.reshape(num_data, -1, 3) if style == 'euler': results = -np.ones((num_data, 3)) for data_idx in range(num_data): R, T = ltr.compute_rigid_transform(prediction[data_idx].T, ground_truth[data_idx].T ) if style == 'euler': results[data_idx] = np.abs(Rotation.from_matrix(R).as_euler('xyz', degrees=True ) ) else: raise NotImplementedError update_statistics(self, results, num_data, name_str) return def update_joints_3d_error(self, prediction, ground_truth, meta_data=None, logger=None, name_str='', style='direct' ): """ Get distance error between prediction and ground truth. 
""" ground_truth = ground_truth.reshape(len(ground_truth), -1, 3) prediction = prediction.reshape(len(prediction), -1, 3) num_joints = prediction.shape[1] if style == 'procrustes': # Apply procrustes alignment if asked to do so for j in range(len(prediction)): gt = ground_truth[j] out = prediction[j] _, Z, T, b, c = compute_similarity_transform(gt, out, compute_optimal_scale=True) out = (b * out.dot(T)) + c prediction[j] = np.reshape(out, [num_joints, 3]) sqerr = (ground_truth - prediction)**2 distance = np.sqrt(np.sum(sqerr, axis=2)) num_data = len(prediction) update_statistics(self, distance, num_data, name_str) # provide detailed L1 errors if there is only one joint if num_joints == 1: error_xyz = np.abs(ground_truth - prediction) update_statistics(self, error_xyz, num_data, name_str + '_xyz') return class RotationError3D(): """ Helper class for recording rotation estimation error. """ def __init__(self, cfgs): self.name = 'Rotation error' self.style = cfgs['metrics']['R3D']['style'] self.count = 0 if self.style == 'euler': self.mean = np.zeros((3)) self.max = -np.ones((3)) self.min = np.ones((3))*1e16 return def update(self, prediction, ground_truth, meta_data=None, logger=None): """ get rotation error between two point clouds """ update_rotation_error(self, prediction, ground_truth, meta_data=meta_data, logger=logger, style=self.style ) return def report(self, logger): msg = 'Error type: {error_type:s}\t' \ 'Mean error: {mean_error}\t' \ 'Max error: {max_error}\t' \ 'Min error: {min_error}\t'.format( error_type = self.name, mean_error= self.mean, max_error= self.max, min_error= self.min ) logger.info(msg) return class JointDistance3D(): """ Helper class for recording joint distance error. 
""" def __init__(self, cfgs): self.name = 'Joint distance' self.style = cfgs['metrics']['JD3D']['style'] self.num_joints = int(cfgs['FCModel']['output_size']/3) self.count = 0 if self.style in ['direct', 'procrustes']: self.mean = np.zeros((self.num_joints)) self.max = -np.ones((self.num_joints)) self.min = np.ones((self.num_joints))*1e16 else: raise NotImplementedError return def update(self, prediction, ground_truth, meta_data=None, logger=None): """ get Euclidean distance between two point clouds """ update_joints_3d_error(self, prediction, ground_truth, meta_data=meta_data, logger=logger, name_str='', style=self.style ) return def report(self, logger): MPJPE = self.mean.sum() / self.num_joints msg = 'Error type: {error_type:s}\t' \ 'MPJPE: {MPJPE}\t' \ 'Mean error for each joint: {mean_error}\t' \ 'Max error for each joint: {max_error}\t' \ 'Min error for each joint: {min_error}\t'.format( error_type = self.name, MPJPE = MPJPE, mean_error= self.mean, max_error= self.max, min_error= self.min ) logger.info(msg) return class RError3D(): def __init__(self, cfgs, num_joints): """ Relative shape error The point cloud should have a format [shape_relative_to_root] """ self.name = 'RError3D' self.T_style = cfgs['metrics']['R3D']['T_style'] self.R_style = cfgs['metrics']['R3D']['R_style'] if cfgs['dataset']['3d_kpt_sample_style'] == 'bbox9': self.num_joints = num_joints - 1 # discount the root joint else: raise NotImplementedError self.count_rT = self.count_R = 0 # translation error of the shape relative to the root self.mean_rT = np.zeros((self.num_joints)) self.max_rT = -np.ones((self.num_joints)) self.min_rT = np.ones((self.num_joints))*1e16 # relative rotation between the ground truth shape and predicted shape self.mean_R = np.zeros((3)) self.max_R = -np.ones((3)) self.min_R = np.ones((3))*1e16 return def update(self, prediction, ground_truth, meta_data=None, logger=None): update_joints_3d_error(self, prediction=prediction, ground_truth=ground_truth, 
meta_data=meta_data, logger=logger, name_str='_rT', style=self.T_style ) update_rotation_error(self, prediction=prediction, ground_truth=ground_truth, meta_data=meta_data, logger=logger, name_str='_R', style=self.R_style ) return def report(self, logger): MPJPE = self.mean_rT.sum() / self.num_joints msg = 'Error type: {error_type:s}\t' \ 'MPJPE of the shape relative to the root:\t' \ 'MPJPE: {MPJPE}\t' \ 'Rotation error of the shape relative to the root:\t' \ 'Mean error: {mean_R}\t' \ 'Max error: {max_R}\t' \ 'Min error: {min_R}\t'.format( error_type = self.name, MPJPE = MPJPE, mean_R = self.mean_R, max_R = self.max_R, min_R = self.min_R ) logger.info(msg) return class RTError3D(): def __init__(self, cfgs, num_joints): """ Rotation and translation error combined. The point cloud should have a format [root, shape_relative_to_root] """ self.name = 'RTError3D' self.T_style = cfgs['metrics']['RTError3D']['T_style'] self.R_style = cfgs['metrics']['RTError3D']['R_style'] if cfgs['dataset']['3d_kpt_sample_style'] == 'bbox9': self.num_joints = num_joints - 1 # discount the root joint else: raise NotImplementedError self.count_T = self.count_T_xyz = self.count_rT = self.count_R = 0 if self.T_style in ['direct', 'procrustes']: # translation error of the root vector self.mean_T = np.zeros((1)) # L1 error for each component self.mean_T_xyz = np.zeros((3)) self.max_T = -np.ones((1)) self.max_T_xyz = -np.ones((3)) self.min_T = np.ones((1))*1e16 self.min_T_xyz = np.ones((3))*1e16 # translation error of the shape relative to the root self.mean_rT = np.zeros((self.num_joints)) self.max_rT = -np.ones((self.num_joints)) self.min_rT = np.ones((self.num_joints))*1e16 else: raise NotImplementedError # relative rotation between the ground truth shape and predicted shape self.mean_R = np.zeros((3)) self.max_R = -np.ones((3)) self.min_R = np.ones((3))*1e16 return def update(self, prediction, ground_truth, meta_data=None, logger=None): update_joints_3d_error(self, prediction=prediction[:, 
class Evaluator():
    """
    Helper class for recording a list of pre-defined metrics.

    Every metric object is expected to provide update(...) and
    report(logger); this class only forwards the calls to each of them.
    """
    def __init__(self, metrics, cfgs=None, num_joints=9):
        """
        metrics is a list of strings specifying what metrics to use

        cfgs: configuration passed to every metric constructor
        num_joints: passed to every metric constructor as a keyword argument
        """
        self.metrics = []
        for metric in metrics:
            # Each string is evaluated as the name of a callable visible in
            # this module and called with cfgs and num_joints as keywords.
            # NOTE(review): eval executes arbitrary expressions, so the
            # metric names must come from a trusted configuration. The
            # callable must also accept a num_joints keyword.
            self.metrics.append(eval(metric + '(cfgs=cfgs, num_joints=num_joints)'))
        return

    def update(self,
               prediction,
               ground_truth=None,
               meta_data=None,
               logger=None
               ):
        """
        update evaluation with a new batch of prediction and ground truth

        All arguments are forwarded unchanged to the update method of every
        registered metric, in registration order.
        """
        for metric in self.metrics:
            metric.update(prediction,
                          ground_truth=ground_truth,
                          meta_data=meta_data,
                          logger=logger
                          )
        return

    def report(self, logger):
        """
        Ask every registered metric to report its results to logger.
        """
        for metric in self.metrics:
            metric.report(logger)
        return
def get_fc_model(stage_id,
                 cfgs,
                 input_size,
                 output_size,
                 architecture_type = 'FCModel'
                 ):
    """
    Build a FCModel from the configuration section cfgs[architecture_type].

    stage_id, input_size, output_size: forwarded to FCModel unchanged
    cfgs: configuration dictionary; the selected section must provide
          'refine_3d', 'norm_twoD', 'num_blocks', 'num_neurons', 'dropout'
          and 'leaky'
    architecture_type: name of the configuration section to read
    """
    arch_cfg = cfgs[architecture_type]
    # (FCModel keyword, configuration key); only the dropout entry is
    # stored under a key that differs from the keyword name
    option_keys = (('refine_3d', 'refine_3d'),
                   ('norm_twoD', 'norm_twoD'),
                   ('num_blocks', 'num_blocks'),
                   ('num_neurons', 'num_neurons'),
                   ('p_dropout', 'dropout'),
                   ('leaky', 'leaky'))
    options = {keyword: arch_cfg[key] for keyword, key in option_keys}
    return FCModel(stage_id=stage_id,
                   input_size=input_size,
                   output_size=output_size,
                   **options
                   )
def __init__(self, cfgs, pre_trained=False):
    """
    Initialization method of Ego-Net.

    Builds the heatmap/coordinate regression sub-model (self.HC) and the
    lifting sub-model (self.L), and optionally loads pre-trained weights.

    cfgs: nested configuration dictionary. The keys read here are
        cfgs['heatmapModel']['name'], cfgs['heatmapModel']['input_size'],
        the optional cfgs['heatmapModel']['add_xy'],
        cfgs['FCModel']['input_size'], cfgs['FCModel']['output_size'] and,
        when pre_trained is True, cfgs['dirs']['ckpt'].
    pre_trained: if True, load 'HC.pth', 'L.pth' and 'LS.npy' from the
        checkpoint directory cfgs['dirs']['ckpt'].
    """
    super(EgoNet, self).__init__()
    # initialize a fully-convolutional heatmap regression model
    # this model corresponds to H and C in Equation (2)
    hm_model_settings = cfgs['heatmapModel']
    hm_model_name = hm_model_settings['name']
    # this implementation uses a HR-Net backbone, yet you can use other
    # backbones as well
    # Resolve models.heatmapModel.<name>.get_pose_net by attribute lookup
    # instead of eval(), so a configuration string is never executed as code.
    backbone = models.heatmapModel
    for attr_name in hm_model_name.split('.'):
        backbone = getattr(backbone, attr_name)
    self.HC = backbone.get_pose_net(cfgs, is_train=False)
    self.resolution = hm_model_settings['input_size']
    # optional channel augmentation
    if 'add_xy' in hm_model_settings:
        self.xy_dict = {'flag': hm_model_settings['add_xy']}
    else:
        self.xy_dict = None
    # initialize a lifting model
    # this corresponds to L in Equation (2)
    self.L = FCmodel.get_fc_model(stage_id=1,
                                  cfgs=cfgs,
                                  input_size=cfgs['FCModel']['input_size'],
                                  output_size=cfgs['FCModel']['output_size']
                                  )
    if pre_trained:
        # load pre-trained checkpoints
        ckpt_dir = cfgs['dirs']['ckpt']
        HC_path = pjoin(ckpt_dir, 'HC.pth')
        L_path = pjoin(ckpt_dir, 'L.pth')
        LS_path = pjoin(ckpt_dir, 'LS.npy')
        # map_location='cpu' lets checkpoints saved from a GPU be read on a
        # CPU-only machine; load_state_dict copies the values into the
        # existing parameters, so the model's own device is unaffected.
        self.HC.load_state_dict(torch.load(HC_path, map_location='cpu'))
        # the statistics used by the lifter for normalizing inputs
        # NOTE: allow_pickle=True unpickles arbitrary Python objects; only
        # point cfgs['dirs']['ckpt'] at trusted files.
        self.LS = np.load(LS_path, allow_pickle=True).item()
        self.L.load_state_dict(torch.load(L_path, map_location='cpu'))
def crop_instances(self,
                   annot_dict,
                   resolution,
                   pth_trans=None,
                   rgb=True,
                   xy_dict=None
                   ):
    """
    Crop input instances given an annotation dictionary.

    annot_dict: dictionary with the keys 'path' (list of image paths) and
        'boxes' (one collection of bounding boxes per image). The optional
        keys 'labels' and 'scores' hold one sequence per image, indexed in
        the same order as the boxes; when absent, -1 is recorded instead.
    resolution: (width, height) of each crop.
    pth_trans: optional transform forwarded to crop_single_instance.
        NOTE(review): each crop is passed to torch.unsqueeze below, so this
        presumably has to turn the crop into a torch tensor -- confirm.
    rgb: forwarded to load_cv2; if True the image is converted from BGR to RGB.
    xy_dict: forwarded to crop_single_instance.

    Returns a tuple (instances, records): all crops concatenated along
    dimension 0, and a list holding one attribute dictionary per crop.
    torch.cat is called on the collected crops, so at least one bounding
    box is expected across the whole batch.
    """
    all_instances = []
    # each record stores attributes of one instance
    all_records = []
    target_ar = resolution[1] / resolution[0]
    for img_idx, path in enumerate(annot_dict['path']):
        boxes = annot_dict['boxes'][img_idx]
        if len(boxes) == 0:
            # nothing to crop: skip before paying for decoding the image
            continue
        # honor the caller's colour-order request (previously ignored)
        data_numpy = self.load_cv2(path, rgb=rgb)
        if 'labels' in annot_dict:
            labels = annot_dict['labels'][img_idx]
        else:
            labels = -np.ones((len(boxes)), dtype=np.int64)
        if 'scores' in annot_dict:
            scores = annot_dict['scores'][img_idx]
        else:
            scores = -np.ones((len(boxes)))
        # a dedicated index for boxes; the image index is not clobbered
        for box_idx, bbox in enumerate(boxes):
            # crop an instance with required aspect ratio
            instance = self.crop_single_instance(data_numpy,
                                                 bbox,
                                                 resolution,
                                                 pth_trans=pth_trans,
                                                 xy_dict=xy_dict
                                                 )
            bbox = to_npy(bbox)
            # recompute the resized box to record its center and scale
            ret = modify_bbox(bbox, target_ar)
            c, s, r = ret['c'], ret['s'], 0.
            all_instances.append(torch.unsqueeze(instance, dim=0))
            all_records.append({
                'path': path,
                'center': c,
                'scale': s,
                'bbox': bbox,
                'bbox_resize': ret['bbox'],
                'rotation': r,
                'label': labels[box_idx],
                'score': scores[box_idx]
                }
                )
    return torch.cat(all_instances, dim=0), all_records
def get_observation_angle_trans(self, euler_angles, translations):
    """
    Convert orientation in the camera coordinate system into the local
    coordinate system, utilizing the known object location (translation).

    For every instance, column 1 of euler_angles is read as the orientation
    and elements 0 and 2 of the matching translation as x and z. The result
    is wrapped so that it is never above pi nor below -pi. A new array is
    returned; euler_angles is not modified.
    """
    full_turn = math.pi * 2
    alphas = euler_angles[:, 1].copy()
    for idx, instance_angles in enumerate(euler_angles):
        # orientation in the camera coordinate system
        ry3d = instance_angles[1]
        location = translations[idx]
        ray_angle = math.atan2(-location[2], location[0])
        alpha = ry3d - ray_angle - 0.5 * math.pi
        # bring the angle back into range one full turn at a time
        while alpha > math.pi:
            alpha -= full_turn
        while alpha < (-math.pi):
            alpha += full_turn
        alphas[idx] = alpha
    return alphas
def kpts_to_euler(self, template, prediction):
    """
    Convert the predicted cuboid representation to euler angles.

    The rigid transform between a reference cuboid (template) and the
    prediction is estimated with ltr.compute_rigid_transform; both inputs
    are expected with shape [3, N_points]. Returns the euler angles
    together with the second output of compute_rigid_transform.
    """
    rotation_matrix, translation = ltr.compute_rigid_transform(template,
                                                               prediction)
    # decomposed in the order of yaw, pitch and roll ('yxz')
    yaw_first = Rotation.from_matrix(rotation_matrix).as_euler('yxz',
                                                               degrees=False)
    # re-order in the order of x, y and z
    return yaw_first[[1, 0, 2]], translation
def gather_lifting_results(self,
                           record,
                           data,
                           prediction,
                           target=None,
                           pose_vecs_gt=None,
                           intrinsics=None,
                           refine=False,
                           visualize=False,
                           template=None,
                           dist_coeffs=np.zeros((4,1)),
                           color='r',
                           get_str=False,
                           alpha_mode='trans'
                           ):
    """
    Convert network outputs to pose angles and store them in `record`.

    The record gains 'euler_angles' and 'translation' (from
    record['kpts_3d_pred']) and 'alphas'. alpha_mode selects how the
    observation angles are computed: 'trans' uses the translation, 'proj'
    uses record['kpts_2d_pred'] and record['K']; any other value raises
    NotImplementedError. When get_str is set, record['pred_str'] is filled
    in as well; when visualize is set, the record is passed through
    vego.plot_3d_objects. The (possibly replaced) record is returned.
    """
    # compute the roll, pitch and yaw angle of the predicted bounding box
    angles, translation = self.get_6d_rep(record['kpts_3d_pred'])
    record['euler_angles'] = angles
    record['translation'] = translation

    if alpha_mode not in ('trans', 'proj'):
        raise NotImplementedError
    if alpha_mode == 'trans':
        alphas = self.get_observation_angle_trans(record['euler_angles'],
                                                  record['translation'])
    else:
        alphas = self.get_observation_angle_proj(record['euler_angles'],
                                                 record['kpts_2d_pred'],
                                                 record['K'])
    record['alphas'] = alphas

    # prepare the prediction strings for submission
    if get_str:
        record['pred_str'] = get_pred_str(record)
    if not visualize:
        return record
    return vego.plot_3d_objects(prediction,
                                target,
                                pose_vecs_gt,
                                record,
                                color
                                )
def post_process(self,
                 records,
                 visualize=False,
                 color_dict={'bbox_2d':'r',
                             'kpts':['ro', 'b'],
                             },
                 save_dict={'flag':False,
                            'save_dir':None
                            },
                 alpha_mode='trans'
                 ):
    """
    Post-process the predictions of every image, and save and visualize
    them optionally.

    records: dictionary mapping an image path to the record of that image;
        each value is replaced by the result of plot_one_image.
    visualize: forwarded to plot_one_image.
    color_dict: plotting colors forwarded to plot_one_image. A missing
        'bbox_3d' entry falls back to 'r', the default plot_one_image uses.
    save_dict: saving options forwarded to plot_one_image.
    alpha_mode: forwarded to plot_one_image.

    Returns the updated `records` dictionary.
    """
    # plot_one_image reads color_dict['bbox_3d'] unconditionally, while the
    # default above does not define it. Fill the gap on a copy so that
    # neither the shared default nor the caller's dictionary is mutated.
    color_dict = {'bbox_3d': 'r', **color_dict}
    for img_path in records:
        print("Processing {:s}".format(img_path))
        records[img_path] = self.plot_one_image(img_path,
                                                records[img_path],
                                                visualize=visualize,
                                                color_dict=color_dict,
                                                save_dict=save_dict,
                                                alpha_mode=alpha_mode
                                                )
    return records
def lift_2d_to_3d(self, records, cuda=True):
    """
    Forward-pass of the lifter sub-model.

    For every image in `records`, the 2D key-point predictions stored under
    'kpts_2d_pred' are concatenated, normalized with the input statistics in
    self.LS, passed through self.L, un-normalized with the output statistics
    and stored under 'kpts_3d_pred' reshaped to (num_instances, -1, 3).

    records: dictionary mapping an image path to the record of that image.
    cuda: if True, the network input is moved to the GPU first.

    Returns the same `records` dictionary, updated in place.
    """
    for path in records:
        data = np.concatenate(records[path]['kpts_2d_pred'], axis=0)
        data = nop.normalize_1d(data, self.LS['mean_in'], self.LS['std_in'])
        data = torch.from_numpy(data.astype(np.float32))
        if cuda:
            data = data.cuda()
        # The output is converted to numpy right away, so no gradient is
        # ever needed: skip building the autograd graph.
        with torch.no_grad():
            prediction = self.L(data)
        prediction = nop.unnormalize_1d(prediction.detach().cpu().numpy(),
                                        self.LS['mean_out'],
                                        self.LS['std_out']
                                        )
        records[path]['kpts_3d_pred'] = prediction.reshape(len(prediction), -1, 3)
    return records
class BasicLinearModule(nn.Module):
    """
    A single fully-connected layer applied to the flattened input.

    `mid_channels` is accepted but not used by the current implementation.
    """

    def __init__(self, in_channels, out_channels, mid_channels=512):
        super(BasicLinearModule, self).__init__()
        self.l1 = nn.Linear(in_channels, out_channels)

    def forward(self, x):
        # flatten everything but the leading (batch) dimension
        flattened = x.view(len(x), -1)
        return self.l1(flattened)
HighResolutionModule(nn.Module): def __init__(self, num_branches, blocks, num_blocks, num_inchannels, num_channels, fuse_method, multi_scale_output=True): super(HighResolutionModule, self).__init__() self._check_branches( num_branches, blocks, num_blocks, num_inchannels, num_channels) self.num_inchannels = num_inchannels self.fuse_method = fuse_method self.num_branches = num_branches self.multi_scale_output = multi_scale_output self.branches = self._make_branches( num_branches, blocks, num_blocks, num_channels) self.fuse_layers = self._make_fuse_layers() self.relu = nn.ReLU(True) def _check_branches(self, num_branches, blocks, num_blocks, num_inchannels, num_channels): if num_branches != len(num_blocks): error_msg = 'NUM_BRANCHES({}) <> NUM_BLOCKS({})'.format( num_branches, len(num_blocks)) logger.error(error_msg) raise ValueError(error_msg) if num_branches != len(num_channels): error_msg = 'NUM_BRANCHES({}) <> NUM_CHANNELS({})'.format( num_branches, len(num_channels)) logger.error(error_msg) raise ValueError(error_msg) if num_branches != len(num_inchannels): error_msg = 'NUM_BRANCHES({}) <> NUM_INCHANNELS({})'.format( num_branches, len(num_inchannels)) logger.error(error_msg) raise ValueError(error_msg) def _make_one_branch(self, branch_index, block, num_blocks, num_channels, stride=1): downsample = None if stride != 1 or \ self.num_inchannels[branch_index] != num_channels[branch_index] * block.expansion: downsample = nn.Sequential( nn.Conv2d( self.num_inchannels[branch_index], num_channels[branch_index] * block.expansion, kernel_size=1, stride=stride, bias=False ), nn.BatchNorm2d( num_channels[branch_index] * block.expansion, momentum=BN_MOMENTUM ), ) layers = [] layers.append( block( self.num_inchannels[branch_index], num_channels[branch_index], stride, downsample ) ) self.num_inchannels[branch_index] = \ num_channels[branch_index] * block.expansion for i in range(1, num_blocks[branch_index]): layers.append( block( self.num_inchannels[branch_index], 
def forward(self, x):
    """
    Run every branch on its own input and fuse the branch outputs.

    x: list with one input per branch. With more than one branch the list
        is updated in place with the branch outputs.

    Returns a list: the single branch output when there is one branch,
    otherwise one fused output per entry of self.fuse_layers.
    """
    if self.num_branches == 1:
        return [self.branches[0](x[0])]

    for branch_idx in range(self.num_branches):
        x[branch_idx] = self.branches[branch_idx](x[branch_idx])

    fused_outputs = []
    for out_idx, fuse_row in enumerate(self.fuse_layers):
        # branch 0 feeds output 0 directly, any other output through a layer
        if out_idx == 0:
            fused = x[0]
        else:
            fused = fuse_row[0](x[0])
        for in_idx in range(1, self.num_branches):
            if in_idx == out_idx:
                contribution = x[in_idx]
            else:
                contribution = fuse_row[in_idx](x[in_idx])
            # out-of-place sum: the branch outputs themselves stay untouched
            fused = fused + contribution
        fused_outputs.append(self.relu(fused))
    return fused_outputs
self.inplanes = 64 self.num_joints = cfgs['heatmapModel']['num_joints'] extra = cfgs['heatmapModel']['extra'] super(PoseHighResolutionNet, self).__init__() # stem net self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=2, padding=1, bias=False) self.bn1 = nn.BatchNorm2d(64, momentum=BN_MOMENTUM) self.conv2 = nn.Conv2d(64, 64, kernel_size=3, stride=2, padding=1, bias=False) self.bn2 = nn.BatchNorm2d(64, momentum=BN_MOMENTUM) self.relu = nn.ReLU(inplace=True) self.layer1 = self._make_layer(Bottleneck, 64, 4) self.stage2_cfg = cfgs['heatmapModel']['extra']['stage2'] num_channels = self.stage2_cfg['num_channels'] block = blocks_dict[self.stage2_cfg['block']] num_channels = [ num_channels[i] * block.expansion for i in range(len(num_channels)) ] self.transition1 = self._make_transition_layer([256], num_channels) self.stage2, pre_stage_channels = self._make_stage( self.stage2_cfg, num_channels) self.stage3_cfg = cfgs['heatmapModel']['extra']['stage3'] num_channels = self.stage3_cfg['num_channels'] block = blocks_dict[self.stage3_cfg['block']] num_channels = [ num_channels[i] * block.expansion for i in range(len(num_channels)) ] self.transition2 = self._make_transition_layer( pre_stage_channels, num_channels) self.stage3, pre_stage_channels = self._make_stage( self.stage3_cfg, num_channels) self.stage4_cfg = cfgs['heatmapModel']['extra']['stage4'] num_channels = self.stage4_cfg['num_channels'] block = blocks_dict[self.stage4_cfg['block']] num_channels = [ num_channels[i] * block.expansion for i in range(len(num_channels)) ] self.transition3 = self._make_transition_layer( pre_stage_channels, num_channels) self.stage4, pre_stage_channels = self._make_stage( self.stage4_cfg, num_channels, multi_scale_output=False) self.pretrained_layers = cfgs['heatmapModel']['extra']['pretrained_layers'] # network head self.head_type = cfgs['heatmapModel']['head_type'] self.pixel_shuffle = cfgs['heatmapModel']['pixel_shuffle'] if self.head_type == 'heatmap': self.final_layer = nn.Conv2d( 
in_channels=pre_stage_channels[0], out_channels=self.num_joints, kernel_size=extra['final_conv_kernel'], stride=1, padding=1 if extra['final_conv_kernel'] == 3 else 0 ) if cfgs['heatmapModel']['pixel_shuffle']: # Add a pixel shuffle upsampling layer to control the heatmap size self.upsamp_fact = int(cfgs['heatmapModel']['heatmap_size'][0]\ /cfgs['heatmapModel']['input_size'][0]*4) self.upsample_layer = nn.Sequential( nn.Conv2d(self.num_joints, self.num_joints*self.upsamp_fact**2, kernel_size=1), nn.BatchNorm2d(self.num_joints*self.upsamp_fact**2), nn.ReLU(inplace=True), nn.PixelShuffle(self.upsamp_fact) ) elif self.head_type == 'angleregression': num_chan = 256 self.head = nn.Sequential( nn.Conv2d( in_channels=pre_stage_channels[0], out_channels=num_chan, kernel_size=1, stride=1, padding=0 ), # produce 8*8*num_joints tensor BasicBlock(num_chan, num_chan, stride=2, downsample=basicdownsample(num_chan, num_chan) ), BasicBlock(num_chan, num_chan, stride=2, downsample=basicdownsample(num_chan, num_chan) ), BasicBlock(num_chan, num_chan, stride=2, downsample=basicdownsample(num_chan, num_chan) ), BasicBlock(num_chan, num_chan, stride=2, downsample=basicdownsample(num_chan, num_chan) ), nn.AvgPool2d(kernel_size=4), ) self.final_fc = nn.Sequential( nn.Linear(256, 256), nn.BatchNorm1d(256), nn.ReLU(inplace=True), nn.Linear(256, 2) ) elif self.head_type == 'coordinates': num_chan = self.num_joints map_width, map_height = cfgs['heatmapModel']['heatmap_size'] ks = (int(map_height / 16), int(map_width / 16)) self.head1 = nn.Sequential( nn.Conv2d( in_channels=pre_stage_channels[0], out_channels=self.num_joints, kernel_size=1, stride=1, padding=0 ), ) self.head2 = nn.Sequential( BasicBlock(num_chan+2, num_chan*2, stride=2, downsample=basicdownsample(num_chan+2, num_chan*2) ), BasicBlock(num_chan*2, num_chan*2, stride=2, downsample=basicdownsample(num_chan*2, num_chan*2) ), BasicBlock(num_chan*2, num_chan*2, stride=2, downsample=basicdownsample(num_chan*2, num_chan*2) ), 
BasicBlock(num_chan*2, num_chan*2, stride=2, downsample=basicdownsample(num_chan*2, num_chan*2) ), nn.Conv2d(num_chan*2, num_chan*2, kernel_size=ks), nn.Sigmoid() ) # coordinate convolution makes arg-max easier x_map = np.tile(np.linspace(0, 1, map_width), (map_height, 1)) x_map = x_map.reshape(1, 1, map_height, map_width) y_map = np.linspace(0, 1, map_height).reshape(map_height, 1) y_map = np.tile(y_map, (1, map_width)) y_map = y_map.reshape(1, 1, map_height, map_width) self.coor_maps = np.concatenate([x_map, y_map], axis=1).astype(np.float32) self.coor_maps = torch.from_numpy(self.coor_maps) else: raise NotImplementedError def _make_transition_layer( self, num_channels_pre_layer, num_channels_cur_layer): num_branches_cur = len(num_channels_cur_layer) num_branches_pre = len(num_channels_pre_layer) transition_layers = [] for i in range(num_branches_cur): if i < num_branches_pre: if num_channels_cur_layer[i] != num_channels_pre_layer[i]: transition_layers.append( nn.Sequential( nn.Conv2d( num_channels_pre_layer[i], num_channels_cur_layer[i], 3, 1, 1, bias=False ), nn.BatchNorm2d(num_channels_cur_layer[i]), nn.ReLU(inplace=True) ) ) else: transition_layers.append(None) else: conv3x3s = [] for j in range(i+1-num_branches_pre): inchannels = num_channels_pre_layer[-1] outchannels = num_channels_cur_layer[i] \ if j == i-num_branches_pre else inchannels conv3x3s.append( nn.Sequential( nn.Conv2d( inchannels, outchannels, 3, 2, 1, bias=False ), nn.BatchNorm2d(outchannels), nn.ReLU(inplace=True) ) ) transition_layers.append(nn.Sequential(*conv3x3s)) return nn.ModuleList(transition_layers) def _make_layer(self, block, planes, blocks, stride=1): downsample = None if stride != 1 or self.inplanes != planes * block.expansion: downsample = nn.Sequential( nn.Conv2d( self.inplanes, planes * block.expansion, kernel_size=1, stride=stride, bias=False ), nn.BatchNorm2d(planes * block.expansion, momentum=BN_MOMENTUM), ) layers = [] layers.append(block(self.inplanes, planes, stride, 
def _make_stage(self, layer_config, num_inchannels,
                multi_scale_output=True):
    """
    Stack layer_config['num_modules'] HighResolutionModule instances.

    layer_config: stage settings; the keys 'num_modules', 'num_branches',
        'num_blocks', 'num_channels', 'block' and 'fuse_method' are read.
    num_inchannels: input channels handed to the first module; every later
        module receives what the previous one reports through
        get_num_inchannels().
    multi_scale_output: when false, only the last module is created with
        multi-scale output disabled.

    Returns (nn.Sequential of the modules, channels reported by the last one).
    """
    num_modules = layer_config['num_modules']
    num_branches = layer_config['num_branches']
    num_blocks = layer_config['num_blocks']
    num_channels = layer_config['num_channels']
    block = blocks_dict[layer_config['block']]
    fuse_method = layer_config['fuse_method']

    stage_modules = []
    last_idx = num_modules - 1
    for module_idx in range(num_modules):
        # multi_scale_output is only used by the last module
        single_scale = (not multi_scale_output) and module_idx == last_idx
        hr_module = HighResolutionModule(
            num_branches,
            block,
            num_blocks,
            num_inchannels,
            num_channels,
            fuse_method,
            not single_scale
        )
        stage_modules.append(hr_module)
        num_inchannels = hr_module.get_num_inchannels()
    return nn.Sequential(*stage_modules), num_inchannels
def modify_input_channel(self, num_channels):
    """
    Replace the first convolution (self.conv1) by one that accepts
    `num_channels` input channels. Nothing happens when num_channels is 3.

    The weights of the old layer are copied into the first three input
    channels of the new layer; the remaining channels keep the values the
    new layer was created with.
    """
    if num_channels != 3:
        widened = nn.Conv2d(num_channels,
                            64,
                            kernel_size=3,
                            stride=2,
                            padding=1,
                            bias=False
                            )
        # copy the old weights
        with torch.no_grad():
            previous_weight = self.conv1.weight.clone()
            widened.weight[:, :3, :, :] = previous_weight
        # drop the old layer before registering its replacement
        del self.conv1
        self.conv1 = widened
    return
def is_freezed(name, freeze_names):
    """Return True when `name` starts with any prefix in `freeze_names`."""
    return any(name.startswith(prefix) for prefix in freeze_names)
class PoseResNet(nn.Module):
    """
    ResNet backbone followed by transposed-convolution layers and a final
    convolution that outputs ``cfg.MODEL.NUM_JOINTS`` maps.

    Args:
        block: residual block class exposing an ``expansion`` attribute.
        layers: number of blocks in each of the four residual stages.
        cfg: configuration object; ``cfg.MODEL.EXTRA`` supplies the
            deconvolution settings and the final convolution kernel size.
    """

    def __init__(self, block, layers, cfg, **kwargs):
        super(PoseResNet, self).__init__()
        extra = cfg.MODEL.EXTRA
        # running number of channels, updated while layers are stacked
        self.inplanes = 64
        self.deconv_with_bias = extra.DECONV_WITH_BIAS

        self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3,
                               bias=False)
        self.bn1 = nn.BatchNorm2d(64, momentum=BN_MOMENTUM)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        self.layer1 = self._make_layer(block, 64, layers[0])
        self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
        self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
        self.layer4 = self._make_layer(block, 512, layers[3], stride=2)

        # used for deconv layers
        self.deconv_layers = self._make_deconv_layer(
            extra.NUM_DECONV_LAYERS,
            extra.NUM_DECONV_FILTERS,
            extra.NUM_DECONV_KERNELS,
        )

        self.final_layer = nn.Conv2d(
            in_channels=extra.NUM_DECONV_FILTERS[-1],
            out_channels=cfg.MODEL.NUM_JOINTS,
            kernel_size=extra.FINAL_CONV_KERNEL,
            stride=1,
            padding=1 if extra.FINAL_CONV_KERNEL == 3 else 0
        )

    def _make_layer(self, block, planes, blocks, stride=1):
        """Stack ``blocks`` residual blocks; only the first one may stride."""
        downsample = None
        if stride != 1 or self.inplanes != planes * block.expansion:
            # project the shortcut when the shape of the residual changes
            downsample = nn.Sequential(
                nn.Conv2d(self.inplanes, planes * block.expansion,
                          kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(planes * block.expansion,
                               momentum=BN_MOMENTUM),
            )

        layers = [block(self.inplanes, planes, stride, downsample)]
        self.inplanes = planes * block.expansion
        for _ in range(1, blocks):
            layers.append(block(self.inplanes, planes))

        return nn.Sequential(*layers)

    def _get_deconv_cfg(self, deconv_kernel, index):
        """
        Return ``(kernel, padding, output_padding)`` for a deconv kernel size.

        Raises:
            ValueError: if ``deconv_kernel`` is not 2, 3 or 4.
        """
        if deconv_kernel == 4:
            padding = 1
            output_padding = 0
        elif deconv_kernel == 3:
            padding = 1
            output_padding = 1
        elif deconv_kernel == 2:
            padding = 0
            output_padding = 0
        else:
            raise ValueError(
                'Unsupported deconv kernel size {} for layer {}; '
                'expected 2, 3 or 4.'.format(deconv_kernel, index)
            )

        return deconv_kernel, padding, output_padding

    def _make_deconv_layer(self, num_layers, num_filters, num_kernels):
        """Build ``num_layers`` stride-2 ConvTranspose2d + BN + ReLU stages."""
        assert num_layers == len(num_filters), \
            'ERROR: num_deconv_layers is different len(num_deconv_filters)'
        assert num_layers == len(num_kernels), \
            'ERROR: num_deconv_layers is different len(num_deconv_kernels)'

        layers = []
        for i in range(num_layers):
            kernel, padding, output_padding = \
                self._get_deconv_cfg(num_kernels[i], i)

            planes = num_filters[i]
            layers.append(
                nn.ConvTranspose2d(
                    in_channels=self.inplanes,
                    out_channels=planes,
                    kernel_size=kernel,
                    stride=2,
                    padding=padding,
                    output_padding=output_padding,
                    bias=self.deconv_with_bias))
            layers.append(nn.BatchNorm2d(planes, momentum=BN_MOMENTUM))
            layers.append(nn.ReLU(inplace=True))
            self.inplanes = planes

        return nn.Sequential(*layers)

    def forward(self, x):
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool(x)

        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)

        x = self.deconv_layers(x)
        x = self.final_layer(x)

        return x

    def init_weights(self, pretrained=''):
        """
        Initialize the network weights.

        If ``pretrained`` is an existing file, the deconvolution layers and
        the final layer are initialized from a normal distribution and the
        checkpoint is then loaded with ``strict=False``. Otherwise every
        convolution, batch-norm and transposed convolution in the network is
        initialized from scratch.
        """
        if os.path.isfile(pretrained):
            logger.info('=> init deconv weights from normal distribution')
            for name, m in self.deconv_layers.named_modules():
                if isinstance(m, nn.ConvTranspose2d):
                    logger.info('=> init {}.weight as normal(0, 0.001)'.format(name))
                    logger.info('=> init {}.bias as 0'.format(name))
                    nn.init.normal_(m.weight, std=0.001)
                    if self.deconv_with_bias:
                        nn.init.constant_(m.bias, 0)
                elif isinstance(m, nn.BatchNorm2d):
                    logger.info('=> init {}.weight as 1'.format(name))
                    logger.info('=> init {}.bias as 0'.format(name))
                    nn.init.constant_(m.weight, 1)
                    nn.init.constant_(m.bias, 0)
            logger.info('=> init final conv weights from normal distribution')
            for sub_name, m in self.final_layer.named_modules():
                if isinstance(m, nn.Conv2d):
                    # report the layer actually being initialized rather than
                    # a name left over from the deconv loop above
                    name = 'final_layer' + ('.' + sub_name if sub_name else '')
                    logger.info('=> init {}.weight as normal(0, 0.001)'.format(name))
                    logger.info('=> init {}.bias as 0'.format(name))
                    nn.init.normal_(m.weight, std=0.001)
                    if m.bias is not None:
                        nn.init.constant_(m.bias, 0)

            # load on CPU so GPU-saved checkpoints work on CPU-only hosts;
            # load_state_dict copies values into the existing parameters
            pretrained_state_dict = torch.load(pretrained, map_location='cpu')
            logger.info('=> loading pretrained model {}'.format(pretrained))
            self.load_state_dict(pretrained_state_dict, strict=False)
        else:
            logger.info('=> init weights from normal distribution')
            for m in self.modules():
                if isinstance(m, nn.Conv2d):
                    nn.init.normal_(m.weight, std=0.001)
                elif isinstance(m, nn.BatchNorm2d):
                    nn.init.constant_(m.weight, 1)
                    nn.init.constant_(m.bias, 0)
                elif isinstance(m, nn.ConvTranspose2d):
                    nn.init.normal_(m.weight, std=0.001)
                    if self.deconv_with_bias:
                        nn.init.constant_(m.bias, 0)
def prepare_optim(model, cfgs):
    """
    Get optimizer and scheduler objects from model parameters.

    Only parameters with ``requires_grad`` set are optimized. The settings
    are read from ``cfgs['optimizer']``: ``optim_type`` ('adam' or 'sgd'),
    ``lr``, ``weight_decay``, ``milestones`` and ``gamma``. ``momentum`` is
    only required when ``optim_type`` is 'sgd'.

    Returns:
        (optimizer, scheduler) where the scheduler is a ``MultiStepLR``.

    Raises:
        NotImplementedError: for any other ``optim_type``.
    """
    optim_cfgs = cfgs['optimizer']
    params = [p for p in model.parameters() if p.requires_grad]
    lr = optim_cfgs['lr']
    weight_decay = optim_cfgs['weight_decay']
    milestones = optim_cfgs['milestones']
    gamma = optim_cfgs['gamma']
    optim_type = optim_cfgs['optim_type']
    if optim_type == 'adam':
        optimizer = torch.optim.Adam(params,
                                     lr=lr,
                                     weight_decay=weight_decay
                                     )
    elif optim_type == 'sgd':
        # momentum is looked up lazily so Adam configs need not define it
        optimizer = torch.optim.SGD(params,
                                    lr=lr,
                                    momentum=optim_cfgs['momentum'],
                                    weight_decay=weight_decay
                                    )
    else:
        raise NotImplementedError(
            'Unsupported optimizer type: {}'.format(optim_type)
        )
    scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer,
                                                     milestones=milestones,
                                                     gamma=gamma
                                                     )
    return optimizer, scheduler
def get_distance(gt, pred):
    """
    Euclidean distance between ground-truth and predicted 2D joints.

    gt: [n_joints, 2 or 3]; with three columns, only the rows whose third
        column is non-zero are evaluated.
    pred: [n_joints, 2]

    Returns a list with one distance per evaluated joint.
    """
    if gt.shape[1] == 2:
        sqerr = ((gt - pred)**2).sum(axis=1)
        dist_list = list(np.sqrt(sqerr))
    elif gt.shape[1] == 3:
        sqerr = ((gt[:, :2] - pred)**2).sum(axis=1)
        indices = np.nonzero(gt[:, 2])[0]
        dist_list = list(np.sqrt(sqerr[indices]))
    else:
        raise ValueError('Array shape not supported.')
    return dist_list


def accuracy_pixel(output,
                   meta_data,
                   cfgs=None,
                   image_size = (256.0, 256.0),
                   arg_max='hard'
                   ):
    """
    pixel-wise distance computed from predicted heatmaps

    The heatmap maxima are scaled to the network input size, mapped back to
    the original image with the inverse affine transform stored in
    meta_data, and compared with meta_data['original_joints'].

    Returns (avg_acc, cnt, others): the mean distance over the evaluated
    joints, the number of evaluated joints, and a dictionary holding the
    coordinates in the original image ('src_coord'), the scaled predictions
    ('joints_pred') and the heatmap maxima ('max_vals'). avg_acc is 0.0 when
    no joint was evaluated.
    """
    # report distance in terms of pixel in the original image
    if arg_max == 'soft':
        if isinstance(output, np.ndarray):
            pred, max_vals = lip.get_max_preds_soft(output)
        else:
            pred, max_vals = lip.get_max_preds_soft_pt(output)
    elif arg_max == 'hard':
        if not isinstance(output, np.ndarray):
            output = output.data.cpu().numpy()
        pred, max_vals = lip.get_max_preds(output)
    else:
        raise NotImplementedError
    image_size = image_size if cfgs is None else cfgs['heatmapModel']['input_size']
    # multiply by down-sample ratio
    if not isinstance(pred, np.ndarray):
        pred = pred.data.cpu().numpy()
        max_vals = max_vals.data.cpu().numpy()
    # NOTE(review): a single ratio, image_size[0] / output.shape[3], is
    # applied to both coordinates - confirm the intended axis convention
    pred *= image_size[0]/output.shape[3]
    # inverse transform and compare pixel distance
    centers = meta_data['center'].data.cpu().numpy()
    scales = meta_data['scale'].data.cpu().numpy()
    rots = meta_data['rotation'].data.cpu().numpy()
    joints_original_batch = meta_data['original_joints'].data.cpu().numpy()
    distance_list = []
    all_src_coordinates = []
    for sample_idx in range(len(pred)):
        trans_inv = lip.get_affine_transform(centers[sample_idx],
                                             scales[sample_idx],
                                             rots[sample_idx],
                                             image_size,
                                             inv=1)
        joints_original = joints_original_batch[sample_idx]
        pred_src_coordinates = lip.affine_transform_modified(pred[sample_idx],
                                                             trans_inv)
        all_src_coordinates.append(
            pred_src_coordinates.reshape(1, len(pred_src_coordinates), 2)
        )
        distance_list += get_distance(joints_original, pred_src_coordinates)
    cnt = len(distance_list)
    # no visible joint in the whole batch: avoid dividing by zero
    avg_acc = sum(distance_list)/cnt if cnt > 0 else 0.0
    others = {
        'src_coord': np.concatenate(all_src_coordinates, axis=0),
        'joints_pred': pred,
        'max_vals': max_vals
    }
    return avg_acc, cnt, others
def train(train_dataset,
          model,
          loss_func,
          optim,
          sche,
          cfgs,
          logger,
          metric_func=None,
          stats=None,
          valid_dataset=None,
          collate_fn=None,
          save_debug=False
          ):
    """
    Train a model with optional validation during training.

    Settings are read from cfgs['training_settings']. Validation runs every
    'eval_every' batches once the epoch exceeds 'eval_start_epoch', but only
    when 'eval_during' is set and a validation dataset is given. A snapshot
    of the model weights is saved after every epoch listed in
    'snapshot_epochs'. The stats argument is accepted but not used here.

    Returns a dictionary with the trained model ('model') and the recorded
    loss curve ('batch_idx', 'loss').
    """
    # training configurations
    train_cfgs = cfgs['training_settings']
    total_epochs = train_cfgs['total_epochs']
    batch_size = train_cfgs['batch_size']
    report_every = train_cfgs['report_every']
    eval_during = train_cfgs['eval_during']
    eval_start_epoch = train_cfgs.get('eval_start_epoch', 0)
    # evaluate during training; without a validation set there is nothing to
    # evaluate, so the evaluator and its schedule are never needed
    do_eval = bool(eval_during) and valid_dataset is not None
    if do_eval:
        eval_every = train_cfgs['eval_every']
        evaluator = Evaluator(train_cfgs['eval_metrics'],
                              cfgs,
                              train_dataset.num_joints
                              )
    plot_loss = train_cfgs['plot_loss']
    cuda = cfgs['use_gpu'] and torch.cuda.is_available()
    # optional list storing loss curve
    x_buffer = []
    y_buffer = []
    # online plotting
    if plot_loss:
        ax, lines, x_buffer, y_buffer = initialize_plot()
    # training
    for epoch in range(1, total_epochs + 1):
        # Apply cross-ratio loss after certain epochs
        if epoch > 1:
            loss_func.apply_cr_loss = True
        # initialize training record
        batch_time = AverageMeter()
        data_time = AverageMeter()
        losses = AverageMeter()
        acc = AverageMeter()
        model.train()
        # modify the learning rate according to the scheduler
        # NOTE(review): stepping the scheduler before the optimizer shifts the
        # milestones by one epoch on recent PyTorch releases; kept as is so
        # that existing training schedules are not altered
        sche.step()
        # data loader
        train_loader = get_loader(train_dataset, cfgs, 'training', collate_fn)
        total_batches = len(train_loader)
        total_sample = len(train_dataset)
        end = time.time()
        for batch_idx, (data, target, weights, meta) in enumerate(train_loader):
            if cuda:
                data, target, weights = data.cuda(), target.cuda(), weights.cuda()
            # measure data loading time
            data_time.update(time.time() - end)
            # erase all computed gradient
            optim.zero_grad()
            # forward pass to get prediction
            prediction = model(data)
            # compute loss
            loss = loss_func(prediction, target, weights, meta)
            # compute gradient in the computational graph
            loss.backward()
            # update parameters in the model
            optim.step()
            losses.update(loss.item(), data.size(0))
            # compute other optional metrics besides the loss value
            if metric_func is not None:
                avg_acc, cnt, others = metric_func(prediction, meta, cfgs)
                acc.update(avg_acc, n=cnt, others=others)
                if batch_idx % report_every == 0:
                    acc.print_content()
            else:
                others = None
            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()
            # logging
            if batch_idx % report_every == 0:
                logger_print(epoch,
                             batch_idx,
                             batch_size,
                             total_sample,
                             batch_time,
                             data.size()[0],
                             data_time,
                             losses,
                             acc,
                             logger
                             )
                # optional: save intermediate results for debugging
                if save_debug:
                    save_debug_images(epoch,
                                      batch_idx,
                                      cfgs,
                                      data,
                                      meta,
                                      target,
                                      others,
                                      prediction,
                                      'train'
                                      )
            # update loss curve
            x_buffer.append(total_batches * (epoch - 1) + batch_idx)
            y_buffer.append(loss.item())
            if plot_loss:
                update_curve(ax, lines[0], x_buffer, y_buffer)
            del data, target, weights, meta
            # evaluate model if specified
            if do_eval and epoch > eval_start_epoch and \
                batch_idx and batch_idx % eval_every == 0:
                evaluate(valid_dataset,
                         model,
                         loss_func,
                         cfgs,
                         logger,
                         evaluator,
                         collate_fn=collate_fn,
                         epoch=epoch
                         )
                # back to training mode
                model.train()
        # save a snapshot
        if epoch in train_cfgs.get('snapshot_epochs', []):
            output_dir, _ = get_dirs(cfgs)
            prefix = cfgs['exp_type']
            model_state_file = os.path.join(output_dir,
                                            prefix + '_{:d}.pth'.format(epoch))
            logger.info('=> Snapshot model to {}'.format(model_state_file))
            # unwrap a (Distributed)DataParallel container when there is one;
            # a plain module is saved directly
            torch.save(getattr(model, 'module', model).state_dict(),
                       model_state_file)
    logger.info('Training finished.')
    return {'model':model, 'batch_idx':x_buffer, 'loss':y_buffer}
def logger_print(epoch,
                 batch_idx,
                 batch_size,
                 total_sample,
                 batch_time,
                 length,
                 data_time,
                 losses,
                 acc,
                 logger
                 ):
    """
    Print training logs.

    batch_time, data_time, losses and acc must expose `val` and `avg`
    attributes. length is the number of samples in the current batch and is
    used to report the processing speed. The accuracy part is appended only
    when both acc.val and acc.avg are non-zero.
    """
    # guard against a zero elapsed time reported by a coarse clock
    speed = length / batch_time.val if batch_time.val > 0 else float('inf')
    msg = 'Training Epoch: [{0}][{1}/{2}]\t' \
          'Time {batch_time.val:.3f}s ({batch_time.avg:.3f}s)\t' \
          'Speed {speed:.1f} samples/s\t' \
          'Data {data_time.val:.3f}s ({data_time.avg:.3f}s)\t' \
          'Loss {loss.val:.5f} ({loss.avg:.5f})'.format(
          epoch, batch_idx * batch_size, total_sample, batch_time=batch_time,
          speed=speed,
          data_time=data_time, loss=losses
          )
    if acc.val != 0 and acc.avg != 0:
        # acc is a pre-defined metric with positive value
        # the leading tab keeps it apart from the loss field
        msg += '\tAccuracy {acc.val:.3f} ({acc.avg:.3f})'.format(acc=acc)
    logger.info(msg)
    return
def evaluate(eval_dataset,
             model,
             loss_func,
             cfgs,
             logger,
             evaluator,
             save=False,
             save_path=None,
             collate_fn=None,
             epoch=None,
             sample_num=20
             ):
    """
    Method for evaluating a model.

    Runs the model over eval_dataset without tracking gradients, feeds every
    batch to the evaluator and reports the accumulated metrics through
    logger. When cfgs['testing_settings']['unnormalize'] is set, predictions
    and targets are un-normalized with the dataset statistics first. When
    save is set, predictions and targets are stored at save_path.
    """
    test_cfgs = cfgs['testing_settings']
    # unnormalize the prediction if needed
    unnormalize = test_cfgs['unnormalize']
    stats = eval_dataset.statistics if unnormalize else None
    # visualize after certain epoch
    if cfgs['exp_type'] == '2dto3d' and 'vis_epoch' in test_cfgs:
        vis_epoch = test_cfgs['vis_epoch']
    else:
        vis_epoch = -1
    all_dists = []
    model.eval()
    # optional: enable dropout in testing to produce loss similar to the training loss
    if test_cfgs['apply_dropout']:
        def apply_dropout(m):
            if isinstance(m, torch.nn.Dropout):
                m.train()
        model.apply(apply_dropout)
    intrinsics = getattr(eval_dataset, 'intrinsic', None)
    refine = intrinsics is not None
    eval_loader = get_loader(eval_dataset, cfgs, 'testing', collate_fn)
    cuda = cfgs['use_gpu'] and torch.cuda.is_available()
    losses = AverageMeter()
    # optional: save intermediate results
    if save:
        pred_list = []
        gt_list = []
    has_plot = False # only plot once
    for batch_idx, (data, target, weights, meta) in enumerate(eval_loader):
        if cuda:
            data, target, weights = data.cuda(), target.cuda(), weights.cuda()
        # forward pass to get prediction; gradients are never used here
        with torch.no_grad():
            prediction = model(data)
        # optional: save intermediate results for debugging
        if test_cfgs.get('save_debug', False) and \
            cfgs.get('exp_type') == 'instanceto2d':
            image_size = cfgs['heatmapModel']['input_size']
            # build a scaled copy: on CPU the numpy view shares memory with
            # the prediction, so an in-place product would corrupt the loss
            joints_pred = prediction[1].data.cpu().numpy() * \
                np.array(image_size).reshape(1, 1, 2)
            save_debug_images(0,
                              batch_idx,
                              cfgs,
                              data,
                              meta,
                              target,
                              {'joints_pred': joints_pred},
                              prediction,
                              'validation'
                              )
            logger.info('Saved batch {:d}'.format(batch_idx))
        with torch.no_grad():
            loss = loss_func(prediction, target, weights, meta)
        losses.update(loss.item(), data.size(0))
        if unnormalize:
            # compute distance of body joints in un-normalized format
            target = eval_dataset.unnormalize(target.data.cpu().numpy(),
                                              stats['mean_out'],
                                              stats['std_out']
                                              )
            prediction = eval_dataset.unnormalize(prediction.data.cpu().numpy(),
                                                  stats['mean_out'],
                                                  stats['std_out']
                                                  )
        evaluator.update(prediction,
                         ground_truth=target,
                         meta_data=meta
                         )
        ## plot 3D bounding boxes for visualization
        # requires dataset statistics and a known epoch
        if not has_plot and stats is not None and vis_epoch > 0 and \
            epoch is not None and epoch > vis_epoch:
            data_unnorm = eval_dataset.unnormalize(data.data.cpu().numpy(),
                                                   stats['mean_in'],
                                                   stats['std_in']
                                                   )
            visualize_lifting_results(data_unnorm,
                                      prediction,
                                      target,
                                      sample_num=sample_num,
                                      intrinsics=intrinsics,
                                      refine=refine,
                                      meta_data=meta
                                      )
            has_plot = True
        if save:
            pred_list.append(prediction)
            gt_list.append(target)
    if save:
        # note the residual update is saved if a cascade is used
        record = {'pred':np.concatenate(pred_list, axis=0),
                  'error':all_dists,
                  'gt':np.concatenate(gt_list, axis=0)
                  }
        np.save(save_path, np.array(record))
    evaluator.report(logger)
    return
def draw_circles(ndarr,
                 xmaps,
                 ymaps,
                 nmaps,
                 batch_joints,
                 batch_joints_vis,
                 width,
                 height,
                 padding,
                 color=(255, 0, 0),
                 add_idx=True
                 ):
    """
    Draw the joints of every image onto an image grid.

    The grid has xmaps columns and ymaps rows of cells, filled row by row;
    each cell is width x height pixels and joint coordinates are shifted by
    padding inside their cell. At most nmaps cells are drawn, and never more
    than there are entries in batch_joints. batch_joints_vis is accepted but
    not used. With add_idx, the 1-based joint index is written next to each
    circle.

    Returns the canvas ndarr, which is drawn on in place.
    """
    # never index past the grid or past the joints that are available
    num_cells = min(nmaps, xmaps * ymaps, len(batch_joints))
    for k in range(num_cells):
        y, x = divmod(k, xmaps)
        for idx, joint in enumerate(batch_joints[k]):
            xpos = x * width + padding + joint[0]
            ypos = y * height + padding + joint[1]
            cv2.circle(ndarr, (int(xpos), int(ypos)), 2, color, 2)
            if add_idx:
                cv2.putText(ndarr,
                            str(idx+1),
                            (int(xpos), int(ypos)),
                            cv2.FONT_HERSHEY_SIMPLEX,
                            1,
                            color,
                            1
                            )
    return ndarr
def save_debug_images(epoch,
                      batch_index,
                      cfgs,
                      input,
                      meta,
                      target,
                      others,
                      output,
                      split
                      ):
    """
    Save debugging images during training HC.pth.

    Nothing is written unless cfgs['training_settings']['debug']['save'] is
    set. Files are prefixed with
    <output dir>/intermediate_results/<split>/<epoch>_<batch_index>.
    The key-point overlay is skipped when no predicted joints are available
    (others is None or has no 'joints_pred' entry).
    """
    debug_cfgs = cfgs['training_settings']['debug']
    if not debug_cfgs['save']:
        return
    prefix = join(cfgs['dirs']['output'],
                  "intermediate_results",
                  split,
                  '{}_{}'.format(epoch, batch_index)
                  )
    make_dir(prefix)
    # others is None when no metric function produced joint predictions
    joints_pred = None if others is None else others.get('joints_pred')
    if debug_cfgs['save_images_kpts'] and joints_pred is not None:
        record_dict = {'pred':(joints_pred, meta['joints_vis']),
                       'gt':(meta['transformed_joints'], meta['joints_vis'])}
        save_batch_image_with_joints(
            input[:,:3,:,:], record_dict, '{}_keypoints.jpg'.format(prefix)
        )
    if debug_cfgs['save_hms_gt']:
        save_batch_heatmaps(
            input[:,:3,:,:], target, '{}_hm_gt.jpg'.format(prefix)
        )
    if debug_cfgs['save_hms_pred']:
        # a tuple output carries the heatmaps as its first element
        output = output[0] if isinstance(output, tuple) else output
        save_batch_heatmaps(
            input[:,:3,:,:], output, '{}_hm_pred.jpg'.format(prefix)
        )
    return
enumerate(record['kpts_2d_gt']): kpts_gt = kpts_gt.reshape(-1, 3) vp.plot_3d_bbox(ax, kpts_gt[1:, :2], color='g', linestyle='-.') if 'arrow' in record: for idx in range(len(record['arrow'])): start = record['arrow'][idx][:,0] end = record['arrow'][idx][:,1] x, y = start dx, dy = end - start ax.arrow(x, y, dx, dy, color='r', lw=4, head_width=5, alpha=0.5) # save intermediate results # plt.gca().set_axis_off() # plt.subplots_adjust(top = 1, bottom = 0, right = 1, left = 0, # hspace = 0, wspace = 0) # plt.margins(0,0) # plt.gca().xaxis.set_major_locator(plt.NullLocator()) # plt.gca().yaxis.set_major_locator(plt.NullLocator()) # img_name = img_path.split('/')[-1] # save_dir = './qualitative_results/' # plt.savefig(save_dir + img_name, dpi=100, bbox_inches = 'tight', pad_inches = 0) return record def plot_3d_objects(prediction, target, pose_vecs_gt, record, color): if target is not None: p3d_gt = target.reshape(len(target), -1, 3) else: p3d_gt = None p3d_pred = prediction.reshape(len(prediction), -1, 3) if "kpts_3d_before" in record: # use predicted translation for visualization p3d_pred = np.concatenate([record['kpts_3d_before'][:, [0], :], p3d_pred], axis=1) elif p3d_gt is not None and p3d_gt.shape[1] == p3d_pred.shape[1] + 1: # use ground truth translation for visualization assert len(p3d_pred) == len(p3d_gt) p3d_pred = np.concatenate([p3d_gt[:, [0], :], p3d_pred], axis=1) else: raise NotImplementedError if 'plots' in record and 'ax3d' in record['plots']: # update drawing ax = record['plots']['ax3d'] ax = vp.plot_scene_3dbox(p3d_pred, p3d_gt, ax=ax, color=color) elif 'plots' in record: # plotting a set of 3D boxes ax = vp.plot_scene_3dbox(p3d_pred, p3d_gt, color=color) ax.set_title("GT: black w/o Ego-Net: magenta w/ Ego-Net: red/yellow") vp.draw_pose_vecs(ax, pose_vecs_gt) record['plots']['ax3d'] = ax else: raise NotImplementedError # draw pose angle predictions translation = p3d_pred[:, 0, :] pose_vecs_pred = np.concatenate([translation, record['euler_angles']], 
def check_points(points, dimension):
    """
    Assertion function for input dimension.

    Accepts any array-like. A flat input whose length is a multiple of
    dimension is reshaped to [N_points, dimension]; a 2D input must already
    have dimension columns. Any other rank raises ValueError.

    Returns the points as an array of shape [N_points, dimension].
    """
    points = np.asarray(points)
    if points.ndim == 1:
        assert points.shape[0] % dimension == 0, \
            'Length of a flat point array must be a multiple of the dimension.'
        points = points.reshape(-1, dimension)
    elif points.ndim == 2:
        assert points.shape[1] == dimension, \
            'The second axis must match the dimension.'
    else:
        raise ValueError(
            'Expected a 1D or 2D array of points, got {:d} dimensions.'.format(
                points.ndim)
        )
    return points
def plot_3d_points(ax,
                   points,
                   indices=None,
                   center=None,
                   radius=None,
                   add_labels=True,
                   display_ticks=True,
                   remove_planes=(),
                   marker='o',
                   color='k',
                   size=50,
                   alpha=1,
                   set_limits=False
                   ):
    """
    Scatter plot of 3D points.

    ax: a Matplotlib 3D axes object to draw on
    points: array of shape [3*N_points] or [N_points, 3]
    indices: optional row selection applied to the points before plotting
    center, radius: forwarded to set_3d_axe_limits when set_limits is True
    add_labels: label the three axes with "x", "y" and "z"
    display_ticks: if False, remove the ticks and tick labels of all axes
    remove_planes: collection containing any of 'x', 'y', 'z'; the pane of
        each listed axis is painted white
    marker, color, size, alpha: forwarded to ax.scatter
    set_limits: whether to reset the axis limits around the plotted points
    """
    points = check_points(points, dimension=3)
    points = points[indices, :] if indices is not None else points
    ax.scatter(points[:, 0], points[:, 1], points[:, 2],
               marker=marker, c=color, s=size, alpha=alpha)
    if set_limits:
        set_3d_axe_limits(ax, points, center, radius)
    if add_labels:
        ax.set_xlabel("x")
        ax.set_ylabel("y")
        ax.set_zlabel("z")
    # remove tick labels or planes
    if not display_ticks:
        ax.set_xticks([])
        ax.set_yticks([])
        ax.set_zticks([])
        ax.get_xaxis().set_ticklabels([])
        ax.get_yaxis().set_ticklabels([])
        ax.set_zticklabels([])
    white = (1.0, 1.0, 1.0, 1.0)
    # Each requested plane is recoloured on its OWN axis (the previous code
    # always painted the x pane). ax.xaxis/yaxis/zaxis are used instead of
    # the w_* aliases, which newer Matplotlib releases no longer provide.
    for plane, axis in (('x', ax.xaxis), ('y', ax.yaxis), ('z', ax.zaxis)):
        if plane in remove_planes:
            axis.set_pane_color(white)
    plt.show()
    return
def plot_3d_coordinate_system(ax,
                              origin,
                              system,
                              length=300,
                              colors=['r', 'g', 'b']
                              ):
    """
    Draw three axis segments of a coordinate system starting at an origin.

    origin: array with 3 elements, the common start point of the segments
    system: [v1, v2, v3], added (scaled by length) to the origin as columns
    length: scale applied to the axis vectors
    colors: one Matplotlib color per axis
    """
    root = origin.reshape(3, 1)
    # column k of tips is the end point of the k-th axis
    tips = np.repeat(root, 3, axis=1) + system*length
    # rows: [origin, tip_1, tip_2, tip_3]
    vertices = np.hstack([root, tips]).T
    for axis_id in range(3):
        edge = plot_3d_coordinate_system.connections[axis_id].reshape(1, 2)
        plot_lines(ax, vertices, edge, dimension=3, c=colors[axis_id])
    return
def plot_comparison_relative(points_pred, points_gt):
    """
    DEPRECATED. Plot predicted and ground truth shapes relative to the root
    point, one 3D subplot per sample, arranged in a grid with three rows.

    points_pred, points_gt: sequences of equal length; element i holds the
    3D points of sample i in a layout accepted by get_bbox_3d.
    """
    plt.figure()
    num_samples = len(points_pred)
    num_row = 3
    # Round up so that every sample gets a grid cell; plain truncation gave
    # a zero-column grid for fewer than three samples and too few cells when
    # the count was not a multiple of three.
    num_col = max(1, -(-num_samples // num_row))
    for i in range(num_samples):
        ax = plt.subplot(num_row, num_col, i+1, projection='3d')
        pred = points_pred[i]
        gt = points_gt[i]
        plot_3d_points(ax, pred, color='r')
        plot_3d_points(ax, gt, color='k')
        # TODO check here
        pred_bbox = get_bbox_3d(pred)
        gt_bbox = get_bbox_3d(gt)
        plot_lines(ax, pred_bbox, plot_3d_bbox.connections, dimension=3, c='r')
        plot_lines(ax, gt_bbox, plot_3d_bbox.connections, dimension=3, c='k')
        set_3d_axe_limits(ax,
                          np.vstack([pred_bbox.reshape(-1, 3),
                                     gt_bbox.reshape(-1, 3)]
                                    ),
                          center=np.zeros((1, 3)),
                          radius=5.
                          )
        ax.set_xlabel("x")
        ax.set_ylabel("y")
        ax.set_zlabel("z")
        ax.view_init(0., 90.)
    return
def get_area(points, indices, preserve_points=False):
    """
    DEPRECATED. Compute signed triangle areas from point triplets.

    points: [N, 2] point coordinates
    indices: [M, 3] integer array; each row selects the three vertices of
        one triangle
    preserve_points: if True, the flattened points are prepended to the
        areas in the returned feature

    Returns an array of shape [1, M] (or [1, 2*N + M] when preserve_points
    is True for [N, 2] input).
    """
    vec1 = points[indices[:, 1], :] - points[indices[:, 0], :]
    vec2 = points[indices[:, 2], :] - points[indices[:, 0], :]
    if vec1.shape[-1] == 2:
        # Explicit z-component of the cross product: np.cross on 2-element
        # vectors is deprecated in recent NumPy releases.
        cross = vec1[:, 0]*vec2[:, 1] - vec1[:, 1]*vec2[:, 0]
    else:
        cross = np.cross(vec1, vec2)
    area = (cross*0.5).reshape(1, -1)
    if preserve_points:
        feature = np.hstack([points.reshape(1, -1), area])
    else:
        feature = area
    return feature
def get_bbox_3d(points, add_center=False, interp_style=""):
    """
    Get a 3D bounding box from coordinate limits in object coordinate system.

    points: 2D array of shape [3, N] or [N, 3] (a leading dimension of 3
        takes precedence when both apply)
    add_center: if True, prepend the origin (0, 0, 0) as the first row
    interp_style: a string of the form 'bbox9interp<K>' additionally appends
        K interpolated points on each of the 12 cuboid edges

    Returns an array whose rows are the box points: 8 corners, preceded by
    the center if requested and followed by the interpolated points if any.
    Raises ValueError if neither dimension of points has size 3.
    """
    assert len(points.shape) == 2
    if points.shape[0] == 3:
        axis = 1
    elif points.shape[1] == 3:
        axis = 0
    else:
        # previously fell through and failed later with an unbound local
        raise ValueError(
            "points must have shape [3, N] or [N, 3], got {}".format(points.shape)
        )
    limit_min = points.min(axis=axis)
    limit_max = points.max(axis=axis)
    xmax, xmin = limit_max[0], limit_min[0]
    ymax, ymin = limit_max[1], limit_min[1]
    zmax, zmin = limit_max[2], limit_min[2]
    bbox = np.array([[xmax, ymin, zmax],
                     [xmax, ymax, zmax],
                     [xmax, ymin, zmin],
                     [xmax, ymax, zmin],
                     [xmin, ymin, zmax],
                     [xmin, ymax, zmax],
                     [xmin, ymin, zmin],
                     [xmin, ymax, zmin]])
    if add_center:
        bbox = np.vstack([np.array([[0., 0., 0.]]), bbox])
    if interp_style.startswith('bbox9interp'):
        interp_num = int(interp_style[11:])
        # corner indices for each edge, numbered with the center at row 0
        indices = np.array([[1, 2],
                            [3, 4],
                            [1, 3],
                            [2, 4],
                            [5, 6],
                            [7, 8],
                            [5, 7],
                            [6, 8],
                            [1, 5],
                            [3, 7],
                            [2, 6],
                            [4, 8]])
        if not add_center:
            # without the center row the corners start at row 0, so the
            # edge table has to be shifted to stay within bounds
            indices = indices - 1
        new_points = get_interpolated_points(bbox, indices, interp_num)
        bbox = np.vstack([bbox, new_points])
    return bbox
def get_visibility(box3d, triangles):
    """
    Get visibility for each vertex of a 3D bounding box given all the
    triangles in a scene.

    box3d: [8, 3] The vertex coordinates in the camera coordinate system.
    triangles: [N, 3, 3]

    A vertex is marked invisible when ray_intersect_triangle reports a
    proper intersection (return value 1) between the ray from the vertex
    towards the origin and any triangle. Returns a boolean array with one
    entry per vertex.
    """
    # builtin bool: the np.bool alias no longer exists in current NumPy
    visibility = np.ones(len(box3d), dtype=bool)
    p1 = np.zeros(3)
    for idx, p0 in enumerate(box3d):
        # stop scanning as soon as one occluding triangle is found
        if any(ray_intersect_triangle(p0, p1, triangle) == 1
               for triangle in triangles):
            visibility[idx] = False
    return visibility
def filter_conf(record, thres=0.0):
    """
    Keep only the detections of one image whose score reaches a threshold.

    record: dictionary with the keys 'raw_txt_format' (list of per-instance
        dictionaries holding a 'score'), 'bbox_2d', 'kpts_3d' and 'K'
    thres: minimum score (inclusive) for an instance to be kept

    Returns (False, record) with the untouched input when nothing passes,
    otherwise (True, filtered_record).
    """
    annotations = record['raw_txt_format']
    kept = [idx for idx, annot in enumerate(annotations)
            if annot['score'] >= thres]
    if not kept:
        return False, record
    kept_annots = [annotations[idx] for idx in kept]
    return True, {
        'bbox_2d': record['bbox_2d'][kept],
        'kpts_3d': record['kpts_3d'][kept],
        'raw_txt_format': kept_annots,
        'scores': [annot['score'] for annot in kept_annots],
        'K': record['K']
    }
def make_output_dir(cfgs, name):
    """
    Create (if needed) and return the directory <output>/<name>/data.

    cfgs: configuration dictionary providing cfgs['dirs']['output']
    name: sub-directory name for this set of results
    """
    save_dir = os.path.join(cfgs['dirs']['output'], name, 'data')
    # exist_ok avoids the check-then-create race of a separate exists() test
    os.makedirs(save_dir, exist_ok=True)
    return save_dir
def generate_empty_file(output_dir, label_dir):
    """
    Generate empty files for images without any predictions.

    output_dir: directory that contains the 'data' folder with predictions
    label_dir: reference directory; every '*.txt' file name found here must
        end up with a same-named file in <output_dir>/data

    Existing prediction files are left untouched.
    """
    data_dir = os.path.join(output_dir, 'data')
    # a set makes the membership test below constant-time
    detected = set(os.listdir(data_dir))
    for file_name in os.listdir(label_dir):
        if not file_name.endswith(".txt"):
            continue
        if file_name not in detected:
            # opening in write mode and closing creates the empty file
            with open(os.path.join(data_dir, file_name), 'w'):
                pass
    return
def prepare_models(cfgs, is_cuda=True):
    """
    Initialize and load Ego-Net given a configuration file.

    cfgs: configuration dictionary; reads cfgs['heatmapModel']['name'],
        cfgs['FCModel']['input_size'/'output_size'] and the checkpoint and
        statistics paths under cfgs['dirs']
    is_cuda: move both networks to the GPU when True

    Returns a dictionary with the keys 'heatmap_regression' and 'lifting'
    (both networks in eval mode) and 'FC_stats' (the loaded statistics).
    """
    hm_model_settings = cfgs['heatmapModel']
    hm_model_name = hm_model_settings['name']
    # Resolve models.heatmapModel.<name> by attribute lookup instead of
    # eval(), so a configuration value can never execute arbitrary code.
    hm_module = models.heatmapModel
    for attribute in hm_model_name.split('.'):
        hm_module = getattr(hm_module, attribute)
    hm_model = hm_module.get_pose_net(cfgs, is_train=False)
    lifter = FCmodel.get_fc_model(stage_id=1,
                                  cfgs=cfgs,
                                  input_size=cfgs['FCModel']['input_size'],
                                  output_size=cfgs['FCModel']['output_size']
                                  )
    # When running on CPU, remap the stored tensors so that checkpoints
    # saved from a GPU can still be loaded; GPU loading is unchanged.
    map_location = None if is_cuda else 'cpu'
    hm_model.load_state_dict(
        torch.load(cfgs['dirs']['load_hm_model'], map_location=map_location)
    )
    # NOTE: allow_pickle=True unpickles arbitrary objects; only load
    # statistics files from a trusted source.
    stats = np.load(cfgs['dirs']['load_stats'], allow_pickle=True).item()
    lifter.load_state_dict(
        torch.load(cfgs['dirs']['load_lifter'], map_location=map_location)
    )
    if is_cuda:
        hm_model = hm_model.cuda()
        lifter = lifter.cuda()
    model_dict = {'heatmap_regression': hm_model.eval(),
                  'lifting': lifter.eval(),
                  'FC_stats': stats
                  }
    return model_dict
def get_keypoints(instances,
                  records,
                  model,
                  image_size=(256,256),
                  arg_max='hard',
                  is_cuda=True
                  ):
    """
    Forward pass to obtain the screen coordinates.

    instances: batch of cropped instances fed to the model
    records: list with one dictionary per instance; the keys 'center',
        'scale', 'rotation', 'path', 'bbox_resize', 'label' and 'score' are
        read here and a 'kpts' entry is added to each record in place
    model: callable returning either a tuple (its second element is used as
        the prediction) or heatmaps decoded with get_max_preds
    image_size: size of the cropped instances
    arg_max: only 'hard' is implemented for non-tuple model outputs
    is_cuda: move the instances to the GPU before the forward pass

    Returns a dictionary keyed by image path; each value collects the
    per-instance entries of that image as lists.
    """
    if is_cuda:
        instances = instances.cuda()
    output = model(instances)
    if type(output) is tuple:
        # the model already produced coordinates; no confidence available
        pred, max_vals = output[1].data.cpu().numpy(), None
    elif arg_max == 'hard':
        if not isinstance(output, np.ndarray):
            output = output.data.cpu().numpy()
        pred, max_vals = get_max_preds(output)
    else:
        raise NotImplementedError
    # rescale the predictions to the resolution of the cropped instance
    if type(output) is tuple:
        # presumably the coordinates are normalized to [0, 1] -- TODO confirm
        pred *= image_size[0]
    else:
        # assumes output is laid out as (N, C, H, W) so that shape[3] is the
        # heatmap width -- TODO confirm
        pred *= image_size[0]/output.shape[3]
    centers = [records[i]['center'] for i in range(len(records))]
    scales = [records[i]['scale'] for i in range(len(records))]
    rots = [records[i]['rotation'] for i in range(len(records))]
    for sample_idx in range(len(pred)):
        # inverse crop transformation (inv=1) applied to the predictions
        trans_inv = get_affine_transform(centers[sample_idx],
                                         scales[sample_idx],
                                         rots[sample_idx],
                                         image_size,
                                         inv=1)
        pred_src_coordinates = affine_transform_modified(pred[sample_idx],
                                                         trans_inv)
        # NOTE(review): this local is only referenced by the disabled line
        # below
        record = records[sample_idx]
        # pred_src_coordinates += np.array([[record['bbox'][0], record['bbox'][1]]])
        records[sample_idx]['kpts'] = pred_src_coordinates
    # assemble a dictionary where each key corresponds to one image
    ret = {}
    for record in records:
        path = record['path']
        if path not in ret:
            ret[path] = {'center':[],
                         'scale':[],
                         'rotation':[],
                         'bbox_resize':[], # resized bounding box
                         'kpts_2d_pred':[],
                         'label':[],
                         'score':[]
                         }
        # key-points are flattened to one row per instance
        ret[path]['kpts_2d_pred'].append(record['kpts'].reshape(1, -1))
        ret[path]['center'].append(record['center'])
        ret[path]['scale'].append(record['scale'])
        ret[path]['bbox_resize'].append(record['bbox_resize'])
        ret[path]['label'].append(record['label'])
        ret[path]['score'].append(record['score'])
        ret[path]['rotation'].append(record['rotation'])
    return ret
def get_template(prediction, interp_coef=[0.332, 0.667]):
    """
    Construct a template 3D cuboid used for computing rigid transformation.

    prediction: array of predicted 3D points for one instance; indexed with
        the parent/child index lists stored in interp_dict['bbox12']
    interp_coef: interpolation coefficients used to add points along each
        edge when the prediction has 33 points

    Returns an array with three rows (x, y, z) and one column per template
    point.
    """
    parents = prediction[interp_dict['bbox12'][0]]
    children = prediction[interp_dict['bbox12'][1]]
    # Euclidean length of every parent-child segment
    lines = parents - children
    lines = np.sqrt(np.sum(lines**2, axis=1))
    h = np.sum(lines[:4])/4 # averaged over the four parallel line segments
    l = np.sum(lines[4:8])/4
    w = np.sum(lines[8:])/4
    # first entry is the box center, followed by the eight corners
    x_corners = [0.5*l, l, l, l, l, 0, 0, 0, 0]
    y_corners = [0.5*h, 0, h, 0, h, 0, h, 0, h]
    z_corners = [0.5*w, w, w, 0, 0, w, w, 0, 0]
    # NOTE(review): the three names above are plain Python lists; the "+="
    # with a NumPy scalar appears to rely on NumPy's reflected addition to
    # produce an element-wise shifted ndarray rather than a list
    # concatenation -- confirm before refactoring.
    x_corners += - np.float32(l) / 2
    y_corners += - np.float32(h)
    #y_corners += - np.float32(h/2)
    z_corners += - np.float32(w) / 2
    corners_3d = np.array([x_corners, y_corners, z_corners])
    if len(prediction) == 33:
        # add interpolated points on each segment of the template
        pidx, cidx = interp_dict['bbox12']
        parents, children = corners_3d[:, pidx], corners_3d[:, cidx]
        lines = children - parents
        new_joints = [(parents + interp_coef[i]*lines) for i in range(len(interp_coef))]
        corners_3d = np.hstack([corners_3d, np.hstack(new_joints)])
    return corners_3d
def get_observation_angle_proj(euler_angles, kpts, K):
    """
    Convert orientation in camera coordinate into local coordinate system
    utilizing the projection of object on the image plane.

    euler_angles: [N, 3] array; column 1 holds the orientation in the
        camera coordinate system
    kpts: per-instance 2D arrays; element [0, 0] is the horizontal screen
        coordinate used for the viewing ray
    K: camera intrinsics; K[0, 0] and K[0, 2] are read

    Returns a copy of column 1 of euler_angles with each angle converted
    and wrapped into [-pi, pi].
    """
    focal, principal_x = K[0, 0], K[0, 2]
    screen_x = [instance[0, 0] for instance in kpts]
    alphas = euler_angles[:, 1].copy()
    for idx, angles in enumerate(euler_angles):
        # direction of the ray through the key-point on the image plane
        ray_angle = math.atan2(-focal, screen_x[idx] - principal_x)
        alpha = angles[1] - ray_angle - 0.5 * math.pi
        # wrap into [-pi, pi]
        while alpha > math.pi:
            alpha -= math.pi * 2
        while alpha < (-math.pi):
            alpha += math.pi * 2
        alphas[idx] = alpha
    return alphas
def get_pred_str(record):
    """
    Produce KITTI style prediction string for a record dictionary.

    record: dictionary with 'raw_txt_format' (list of per-instance
        annotation dictionaries), 'euler_angles' ([N, 3] array) and
        'alphas' (one value per instance)

    Returns one line per instance, separated by newlines and without a
    trailing newline. The input annotations are not modified.
    """
    angles = record['euler_angles']
    num_instances = len(angles)
    # replace the rotation prediction generated by the previous stage,
    # working on a copy of the raw annotations
    refreshed = deepcopy(record['raw_txt_format'])
    for instance_id in range(num_instances):
        annotation = refreshed[instance_id]
        annotation['rot_y'] = angles[instance_id, 1]
        annotation['alpha'] = record['alphas'][instance_id]
    # format one submission line per instance
    return '\n'.join(get_instance_str(refreshed[instance_id])
                     for instance_id in range(num_instances))
def add_orientation_arrow(record):
    """
    Generate an arrow for each predicted orientation for visualization.

    record: dictionary with 'kpts_3d_pred' and 'kpts_3d_gt' (per-instance
        3D key-points) and the camera intrinsics 'K'

    Returns an array of shape [N, 2, 2]; for instance i, row 0 holds the
    horizontal and row 1 the vertical screen coordinate, and the two
    columns are the start and the end of the arrow.
    """
    pred_kpts = record['kpts_3d_pred']
    gt_kpts = record['kpts_3d_gt']
    K = record['K']
    arrow_2d = np.zeros((len(pred_kpts), 2, 2))
    for idx, pred in enumerate(pred_kpts):
        # the arrow starts at the ground truth root and follows the
        # predicted direction between key-points 5 and 1
        tail = gt_kpts[idx][0]
        head = tail + (pred[1] - pred[5])
        projected = K @ np.hstack([tail.reshape(3, 1), head.reshape(3, 1)])
        # perspective division
        arrow_2d[idx] = projected[:2, :] / projected[2, :]
        # fix the arrow length if not fore-shortened
        offset = arrow_2d[idx, :, 1] - arrow_2d[idx, :, 0]
        norm = np.linalg.norm(offset)
        if norm > 50:
            arrow_2d[idx, :, 1] = arrow_2d[idx, :, 0] + offset/norm * 60
    return arrow_2d
def to_npy(tensor):
    """
    Return the input as a numpy array.

    Numpy arrays are passed through unchanged (same object); anything else
    is treated as a PyTorch tensor and copied to host memory.
    """
    if not isinstance(tensor, np.ndarray):
        return tensor.data.cpu().numpy()
    return tensor
pred [9, 3] gts[N, 9, 3] """ pred_center = pred[0, :].reshape(1,3) distance = np.sqrt(np.sum((gts[:, 0, :] - pred_center)**2, axis=1)) minimum_idx = np.where(distance == distance.min())[0][0] if distance[minimum_idx] > threshold: return False, None else: # First align the box with gt size with the predicted box, then refine tempt_box_pred = pred.copy() tempt_box_pred[1:, :] += tempt_box_pred[0, :].reshape(1, 3) tempt_box_gt = gts[minimum_idx].copy() tempt_box_gt[1:, :] += tempt_box_gt[0, :].reshape(1, 3) pseudo_box = ltr.procrustes_transform(tempt_box_gt.T, tempt_box_pred.T) refined_prediction = ltr.pnp_refine(pseudo_box.T, observation, intrinsics, dist_coeffs) if ax is not None: vp.plot_lines(ax, pseudo_box[:, 1:].T, vp.plot_3d_bbox.connections, dimension=3, c='y', linestyle='-.') vp.plot_lines(ax, refined_prediction[:, 1:].T, vp.plot_3d_bbox.connections, dimension=3, c='b', linestyle='-.') return True, refined_prediction def refine_with_predicted_bbox(pred, observation, intrinsics, dist_coeffs, gts=None, threshold=5., ax=None ): """ Refine with predicted 3D cuboid (disabled by default). """ tempt_box_pred = pred.copy() tempt_box_pred[1:, :] += tempt_box_pred[0, :].reshape(1, 3) # use the predicted 3D bounding box size for refinement refined_prediction = ltr.pnp_refine(tempt_box_pred, observation, intrinsics, dist_coeffs) # discard the results if the refined solution is to far away from the initial position distance = refined_prediction[:, 0] - tempt_box_pred[0, :] distance = np.sqrt(np.sum(distance**2)) if distance > threshold: return False, None else: # plotting if ax is not None: vp.plot_lines(ax, refined_prediction[:, 1:].T, vp.plot_3d_bbox.connections, dimension=3, c='g') return True, refined_prediction def draw_pose_vecs(ax, pose_vecs=None, color='black'): """ Add pose vectors to a 3D matplotlib axe. 
""" if pose_vecs is None: return for pose_vec in pose_vecs: x, y, z, pitch, yaw, roll = pose_vec string = "({:.2f}, {:.2f}, {:.2f})".format(pitch, yaw, roll) # add some random noise to the text location so that they do not overlap nl = 0.02 # noise level ax.text(x*(1+np.random.randn()*nl), y*(1+np.random.randn()*nl), z*(1+np.random.randn()*nl), string, color=color ) def refine_solution(est_3d, est_2d, K, dist_coeffs, refine_func, output_arr, output_flags, gts=None, ax=None ): """ Refine 3D prediction by minimizing re-projection error. est: estimates [N, 9, 3] K: intrinsics """ for idx in range(len(est_3d)): success, refined_prediction = refine_func(est_3d[idx], est_2d[idx], K, dist_coeffs, gts=gts, ax=ax) if success: # update the refined solution output_arr[idx] = refined_prediction.T output_flags[idx] = True # # convert to the center-relative shape representation # p3d_pred_refined[idx][1:, :] -= p3d_pred_refined[idx][[0]] return def gather_lifting_results(record, data, prediction, target=None, pose_vecs=None, intrinsics=None, refine=False, visualize=False, template=None, dist_coeffs=np.zeros((4,1)), color='r', get_str=False, alpha_mode='trans' ): """ Lift Screen coordinates to 3D representation and a optimization-based refinement is optional. 
""" if target is not None: p3d_gt = target.reshape(len(target), -1, 3) else: p3d_gt = None p3d_pred = prediction.reshape(len(prediction), -1, 3) # only for visualizing the prediciton of shape using gt bboxes if "kpts_3d_SMOKE" in record: p3d_pred = np.concatenate([record['kpts_3d_SMOKE'][:, [0], :], p3d_pred], axis=1) elif p3d_gt is not None and p3d_gt.shape[1] == p3d_pred.shape[1] + 1: if len(p3d_pred) != len(p3d_gt): print('debug') assert len(p3d_pred) == len(p3d_gt) p3d_pred = np.concatenate([p3d_gt[:, [0], :], p3d_pred], axis=1) else: raise NotImplementedError # this object will be updated if one prediction is refined p3d_pred_refined = p3d_pred.copy() refine_flags = [False for i in range(len(p3d_pred_refined))] # similar object but using a different refinement strategy p3d_pred_refined2 = p3d_pred.copy() refine_flags2 = [False for i in range(len(p3d_pred_refined2))] # input 2D keypoints data = data.reshape(len(data), -1, 2) if visualize: if 'plots' in record and 'ax3d' in record['plots']: ax = record['plots']['ax3d'] ax = vp.plot_scene_3dbox(p3d_pred, p3d_gt, ax=ax, color=color) elif 'plots' in record: # plotting the 3D scene ax = vp.plot_scene_3dbox(p3d_pred, p3d_gt, color=color) draw_pose_vecs(ax, pose_vecs) record['plots']['ax3d'] = ax else: raise ValueError else: ax = None if refine: assert intrinsics is not None # refine 3D point prediction by minimizing re-projection errors refine_solution(p3d_pred, data, intrinsics, dist_coeffs, refine_with_predicted_bbox, p3d_pred_refined, refine_flags, ax=ax ) if target is not None: # refine with ground truth bounding box size for debugging purpose refine_solution(p3d_pred, data, intrinsics, dist_coeffs, refine_with_perfect_size, p3d_pred_refined2, refine_flags2, gts=p3d_gt, ax=ax ) record['kpts_3d_refined'] = p3d_pred_refined # prepare the prediction string for submission # compute the roll, pitch and yaw angle of the predicted bounding box record['euler_angles'], record['translation'] = \ 
get_6d_rep(record['kpts_3d_refined'], ax, color=color) # the predicted pose vectors are also drawn here if alpha_mode == 'trans': record['alphas'] = get_observation_angle_trans(record['euler_angles'], record['translation']) elif alpha_mode == 'proj': record['alphas'] = get_observation_angle_proj(record['euler_angles'], record['kpts_2d_pred'], record['K']) else: raise NotImplementedError if get_str: record['pred_str'] = get_pred_str(record) return record def save_txt_file(img_path, prediction, params): """ Save a txt file for predictions of an image. """ if not params['flag']: return file_name = img_path.split('/')[-1][:-3] + 'txt' save_path = os.path.join(params['save_dir'], file_name) with open(save_path, 'w') as f: f.write(prediction['pred_str']) return def refine_one_image(img_path, record, add_3d_bbox=True, camera=None, template=None, visualize=False, color_dict={'bbox_2d':'r', 'bbox_3d':'r', 'kpts':['rx', 'b'] }, save_dict={'flag':False, 'save_dir':None }, alpha_mode='trans' ): """ Refine the predictions from a single image. 
""" # plot 2D predictions if visualize: if 'plots' in record: fig = record['plots']['fig2d'] ax = record['plots']['ax2d'] else: fig = plt.figure(figsize=(11.3, 9)) ax = plt.subplot(111) record['plots'] = {} record['plots']['fig2d'] = fig record['plots']['ax2d'] = ax image = cv2.imread(img_path, cv2.IMREAD_UNCHANGED)[:, :, ::-1] height, width, _ = image.shape ax.imshow(image) ax.set_xlim([0, width]) ax.set_ylim([0, height]) ax.invert_yaxis() num_instances = len(record['kpts_2d_pred']) for idx in range(num_instances): kpts = record['kpts_2d_pred'][idx].reshape(-1, 2) # kpts_3d = record['kpts_3d'][idx] bbox = record['bbox_resize'][idx] label = record['label'][idx] score = record['score'][idx] vp.plot_2d_bbox(ax, bbox, color_dict['bbox_2d'], score, label) # predicted key-points ax.plot(kpts[:, 0], kpts[:, 1], color_dict['kpts'][0]) # if add_3d_bbox: # vp.plot_3d_bbox(ax, kpts[1:,], color_dict['kpts'][1]) # bbox_3d_projected = project_3d_to_2d(kpts_3d) # vp.plot_3d_bbox(ax, bbox_3d_projected[:2, :].T) # plot ground truth if 'kpts_2d_gt' in record: for idx, kpts_gt in enumerate(record['kpts_2d_gt']): kpts_gt = kpts_gt.reshape(-1, 3) # ax.plot(kpts_gt[:, 0], kpts_gt[:, 1], 'gx') vp.plot_3d_bbox(ax, kpts_gt[1:, :2], color='g', linestyle='-.') if 'arrow' in record: for idx in range(len(record['arrow'])): start = record['arrow'][idx][:,0] end = record['arrow'][idx][:,1] x, y = start dx, dy = end - start ax.arrow(x, y, dx, dy, color='r', lw=4, head_width=5, alpha=0.5) # save intermediate results plt.gca().set_axis_off() plt.subplots_adjust(top = 1, bottom = 0, right = 1, left = 0, hspace = 0, wspace = 0) plt.margins(0,0) plt.gca().xaxis.set_major_locator(plt.NullLocator()) plt.gca().yaxis.set_major_locator(plt.NullLocator()) img_name = img_path.split('/')[-1] save_dir = './debug/qualitative_results/' plt.savefig(save_dir + img_name, dpi=100, bbox_inches = 'tight', pad_inches = 0) # plot 3d bounding boxes all_kpts_2d = np.concatenate(record['kpts_2d_pred']) all_kpts_3d_pred = 
record['kpts_3d_pred'].reshape(len(record['kpts_3d_pred']), -1) if 'kpts_3d_gt' in record: all_kpts_3d_gt = record['kpts_3d_gt'] all_pose_vecs_gt = record['pose_vecs_gt'] else: all_kpts_3d_gt = None all_pose_vecs_gt = None refine_args = {'visualize':visualize, 'get_str':save_dict['flag']} if camera is not None: refine_args['intrinsics'] = camera refine_args['refine'] = True refine_args['template'] = template # refine and gather the prediction strings record = gather_lifting_results(record, all_kpts_2d, all_kpts_3d_pred, all_kpts_3d_gt, all_pose_vecs_gt, color=color_dict['bbox_3d'], alpha_mode=alpha_mode, **refine_args ) # plot 3D bounding box generated by SMOKE if 'kpts_3d_SMOKE' in record: kpts_3d_SMOKE = record['kpts_3d_SMOKE'] if 'plots' in record: # update drawings ax = record['plots']['ax3d'] vp.plot_scene_3dbox(kpts_3d_SMOKE, ax=ax, color='m') pose_vecs = np.zeros((len(kpts_3d_SMOKE), 6)) for idx in range(len(pose_vecs)): pose_vecs[idx][0:3] = record['raw_txt_format'][idx]['locations'] pose_vecs[idx][4] = record['raw_txt_format'][idx]['rot_y'] # plot pose vectors draw_pose_vecs(ax, pose_vecs, color='m') # save KITTI-style prediction file in .txt format save_txt_file(img_path, record, save_dict) return record def post_process(records, camera=None, template=None, visualize=False, color_dict={'bbox_2d':'r', 'kpts':['ro', 'b'], }, save_dict={'flag':False, 'save_dir':None }, alpha_mode='trans' ): for img_path in records.keys(): print(img_path) records[img_path] = refine_one_image(img_path, records[img_path], camera=camera, template=template, visualize=visualize, color_dict=color_dict, save_dict=save_dict, alpha_mode=alpha_mode ) return records def merge(dict_a, dict_b): for key in dict_b.keys(): dict_a[key] = dict_b[key] return def collate_dict(dict_list): ret = {} for key in dict_list[0]: ret[key] = [d[key] for d in dict_list] return ret def my_collate_fn(batch): # the collate function for 2d pose training imgs, meta = list(zip(*batch)) meta = collate_dict(meta) 
return imgs, meta def filter_conf(record, thres=0.0): """ Filter the proposals with a confidence threshold. """ annots = record['raw_txt_format'] indices = [i for i in range(len(annots)) if annots[i]['score'] >= thres] if len(indices) == 0: return False, record filterd_record = { 'bbox_2d': record['bbox_2d'][indices], 'kpts_3d': record['kpts_3d'][indices], 'raw_txt_format': [annots[i] for i in indices], 'scores': [annots[i]['score'] for i in indices], 'K':record['K'] } return True, filterd_record def gather_dict(request, references, filter_c=True): """ Gather a dict from reference as requsted. """ assert 'path' in request ret = {'path':[], 'boxes':[], 'kpts_3d_SMOKE':[], 'raw_txt_format':[], 'scores':[], 'K':[]} for img_path in request['path']: img_name = img_path.split('/')[-1] if img_name not in references: print('Warning: ' + img_name + ' not included in detected images!') continue ref = references[img_name] if filter_c: success, ref = filter_conf(ref) if filter_c and not success: continue ret['path'].append(img_path) # ret['boxes'].append(ref['bbox_2d']) # temporary hack: enlarge the bounding box from the stage 1 model bbox = ref['bbox_2d'] for instance_id in range(len(bbox)): bbox[instance_id] = np.array(modify_bbox(bbox[instance_id], target_ar=1, enlarge=1.2)['bbox']) # temporary hack 2: use the gt bounding box for analysis ret['boxes'].append(bbox) # 3D bounding box produced by SMOKE ret['kpts_3d_SMOKE'].append(ref['kpts_3d']) ret['raw_txt_format'].append(ref['raw_txt_format']) ret['scores'].append(ref['scores']) ret['K'].append(ref['K']) #ret['kpts_3d_gt'] = request['kpts_3d_gt'] if 'pose_vecs_gt' in request: ret['pose_vecs_gt'] = request['pose_vecs_gt'] return ret @torch.no_grad() def inference(testset, model_settings, results, cfgs): """ The main inference function. 
""" # visualize to plot the 2D detection and 3D scene reconstruction data_loader = get_loader(testset, cfgs, 'testing', collate_fn=my_collate_fn) hm_regressor = model_settings['heatmap_regression'] lifter = model_settings['lifting'] # statistics for the FC model stats = model_settings['FC_stats'] #template = testset.instance_stats['ref_box3d'] template = None pth_trans = testset.pth_trans if 'add_xy' in cfgs['heatmapModel']: xy_dict = {'flag':cfgs['heatmapModel']['add_xy']} else: xy_dict = None all_records = {} camera = None flags = results['flags'] visualize = cfgs['visualize'] batch_to_show = cfgs['batch_to_show'] for batch_idx, (images, meta) in enumerate(data_loader): if flags['gt']: save_dir = os.path.join(cfgs['dirs']['output'], 'gt_box_test', 'data') if not os.path.exists(save_dir): os.makedirs(save_dir) # ground truth bounding box for comparison record = process_batch(images, hm_regressor, lifter, stats, template, annot_dict=meta, pth_trans=pth_trans, threshold=None, xy_dict=xy_dict ) record = post_process(record, camera, template, visualize=visualize, color_dict={'bbox_2d':'y', 'bbox_3d':'y', 'kpts':['yx', 'y'], }, save_dict={ 'flag':True, 'save_dir':save_dir } ) merge(all_records, record) if flags['pred']: # use detected bounding box from an anchor-free model annot_dict = gather_dict(meta, results['pred']) if len(annot_dict['path']) == 0: continue record2 = process_batch(images, hm_regressor, lifter, stats, template, annot_dict, pth_trans=pth_trans, threshold=None, xy_dict=xy_dict ) for key in record2: if 'record' in locals() and 'plots' in record[key]: record2[key]['plots'] = record[key]['plots'] save_dir = os.path.join(cfgs['dirs']['output'], 'submission', 'data') if not os.path.exists(save_dir): os.makedirs(save_dir) record2 = post_process(record2, camera, template, visualize=visualize, color_dict={'bbox_2d':'r', 'bbox_3d':'r', 'kpts':['rx', 'r'], }, save_dict={'flag':True, 'save_dir':save_dir }, alpha_mode=cfgs['testing_settings']['alpha_mode'] ) del 
images, record2, meta if batch_idx >= batch_to_show - 1: break # produce a csv file # csv_output_path = cfgs['dirs']['csv_output'] # save_csv(all_records, csv_output_path) return def generate_empty_file(output_dir, label_dir): """ Generate empty files for images without any predictions. """ all_files = os.listdir(label_dir) detected = os.listdir(os.path.join(output_dir, 'data')) for file_name in all_files: if file_name[:-4] + ".txt" not in detected: file = open(os.path.join(output_dir, 'data', file_name[:-4] + '.txt'), 'w') file.close() return def main(): # experiment configurations cfgs = parse.parse_args() # logging logger, final_output_dir = liblogger.get_logger(cfgs) shutil.copyfile(cfgs['config_path'], os.path.join(final_output_dir, 'saved_config.yml')) # Set GPU if cfgs['use_gpu'] and torch.cuda.is_available(): GPUs = cfgs['gpu_id'] else: logger.info("GPU acceleration is disabled.") # cudnn related setting torch.backends.cudnn.benchmark = cfgs['cudnn']['benchmark'] torch.backends.cudnn.deterministic = cfgs['cudnn']['deterministic'] torch.backends.cudnn.enabled = cfgs['cudnn']['enabled'] data_cfgs = cfgs['dataset'] # which split to show split = 'valid' dataset_inf = eval('dataset.' 
+ data_cfgs['name'] + '.car_instance').get_dataset(cfgs, logger, split) # set to inference mode but does not read image dataset_inf.inference([True, False]) # some temporary testing # test_angle_conversion(dataset_inf, dataset_inf.instance_stats['ref_box3d']) # read annotations input_file_path = cfgs['dirs']['load_prediction_file'] # the record for 2D and 3D predictions # key->value: name of the approach->dictionary storing the predictions results = {} confidence_thres = cfgs['conf_thres'] # flags: use predicted bounding boxes as well as the ground truth boxes # for comparison results['flags'] = {} results['flags']['pred'] = cfgs['use_pred_box'] if results['flags']['pred']: results['pred'] = dataset_inf.read_predictions(input_file_path) results['flags']['gt'] = cfgs['use_gt_box'] # load checkpoints model_dict = prepare_models(cfgs) # inference and update prediction inference(dataset_inf, model_dict, results, cfgs) # then you can run kitti-eval for evaluation evaluator = cfgs['dirs']['kitti_evaluator'] label_dir = cfgs['dirs']['kitti_label'] output_dir = os.path.join(cfgs['dirs']['output'], 'submission') # if no detections are produced, generate an empty file #generate_empty_file(output_dir, label_dir) command = "{} {} {}".format(evaluator, label_dir, output_dir) # e.g. 
# ~/Documents/Github/SMOKE/smoke/data/datasets/evaluation/kitti/kitti_eval/evaluate_object_3d_offline /home/nicholas/Documents/Github/SMOKE/datasets/kitti/training/label_2 /media/nicholas/Database/experiments/3DLearning/0826 # /media/nicholas/Database/Github/M3D-RPN/data/kitti_split1/devkit/cpp/evaluate_object /home/nicholas/Documents/Github/SMOKE/datasets/kitti/training/label_2 /media/nicholas/Database/Github/M3D-RPN/output/tmp_results return if __name__ == "__main__": main() ================================================ FILE: tools/kitti-eval/README.md ================================================ # kitti_eval `evaluate_object_3d_offline.cpp`evaluates your KITTI detection locally on your own computer using your validation data selected from KITTI training dataset, with the following metrics: - overlap on image (AP) - oriented overlap on image (AOS) - overlap on ground-plane (AP) - overlap in 3D (AP) Compile `evaluate_object_3d_offline.cpp` (or `evaluate_object_3d_offline_r40.cpp` for the updated metric) with dependency of Boost and Linux `dirent.h` (You should already have it under most Linux). Run the evalutaion by: ./evaluate_object_3d_offline groundtruth_dir result_dir Note that you don't have to detect over all KITTI training data. The evaluator only evaluates samples whose result files exist. ### Updates - June, 2017: * Fixed the bug of detection box filtering based on min height according to KITTI's note on 25.04.2017. 
================================================ FILE: tools/kitti-eval/evaluate_object_3d.cpp ================================================ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "mail.h" BOOST_GEOMETRY_REGISTER_C_ARRAY_CS(cs::cartesian) typedef boost::geometry::model::polygon > Polygon; using namespace std; /*======================================================================= STATIC EVALUATION PARAMETERS =======================================================================*/ // holds the number of test images on the server const int32_t N_TESTIMAGES = 7518; // easy, moderate and hard evaluation level enum DIFFICULTY{EASY=0, MODERATE=1, HARD=2}; // evaluation metrics: image, ground or 3D enum METRIC{IMAGE=0, GROUND=1, BOX3D=2}; // evaluation parameter const int32_t MIN_HEIGHT[3] = {40, 25, 25}; // minimum height for evaluated groundtruth/detections const int32_t MAX_OCCLUSION[3] = {0, 1, 2}; // maximum occlusion level of the groundtruth used for evaluation const double MAX_TRUNCATION[3] = {0.15, 0.3, 0.5}; // maximum truncation level of the groundtruth used for evaluation // evaluated object classes enum CLASSES{CAR=0, PEDESTRIAN=1, CYCLIST=2}; const int NUM_CLASS = 3; // parameters varying per class vector CLASS_NAMES; // the minimum overlap required for 2D evaluation on the image/ground plane and 3D evaluation const double MIN_OVERLAP[3][3] = {{0.7, 0.5, 0.5}, {0.5, 0.25, 0.25}, {0.5, 0.25, 0.25}}; // no. 
of recall steps that should be evaluated (discretized) const double N_SAMPLE_PTS = 41; // initialize class names void initGlobals () { CLASS_NAMES.push_back("car"); CLASS_NAMES.push_back("pedestrian"); CLASS_NAMES.push_back("cyclist"); } /*======================================================================= DATA TYPES FOR EVALUATION =======================================================================*/ // holding data needed for precision-recall and precision-aos struct tPrData { vector v; // detection score for computing score thresholds double similarity; // orientation similarity int32_t tp; // true positives int32_t fp; // false positives int32_t fn; // false negatives tPrData () : similarity(0), tp(0), fp(0), fn(0) {} }; // holding bounding boxes for ground truth and detections struct tBox { string type; // object type as car, pedestrian or cyclist,... double x1; // left corner double y1; // top corner double x2; // right corner double y2; // bottom corner double alpha; // image orientation tBox (string type, double x1,double y1,double x2,double y2,double alpha) : type(type),x1(x1),y1(y1),x2(x2),y2(y2),alpha(alpha) {} }; // holding ground truth data struct tGroundtruth { tBox box; // object type, box, orientation double truncation; // truncation 0..1 int32_t occlusion; // occlusion 0,1,2 (non, partly, fully) double ry; double t1, t2, t3; double h, w, l; tGroundtruth () : box(tBox("invalild",-1,-1,-1,-1,-10)),truncation(-1),occlusion(-1) {} tGroundtruth (tBox box,double truncation,int32_t occlusion) : box(box),truncation(truncation),occlusion(occlusion) {} tGroundtruth (string type,double x1,double y1,double x2,double y2,double alpha,double truncation,int32_t occlusion) : box(tBox(type,x1,y1,x2,y2,alpha)),truncation(truncation),occlusion(occlusion) {} }; // holding detection data struct tDetection { tBox box; // object type, box, orientation double thresh; // detection score double ry; double t1, t2, t3; double h, w, l; tDetection (): 
box(tBox("invalid",-1,-1,-1,-1,-10)),thresh(-1000) {} tDetection (tBox box,double thresh) : box(box),thresh(thresh) {} tDetection (string type,double x1,double y1,double x2,double y2,double alpha,double thresh) : box(tBox(type,x1,y1,x2,y2,alpha)),thresh(thresh) {} }; /*======================================================================= FUNCTIONS TO LOAD DETECTION AND GROUND TRUTH DATA ONCE, SAVE RESULTS =======================================================================*/ vector indices; vector loadDetections(string file_name, bool &compute_aos, vector &eval_image, vector &eval_ground, vector &eval_3d, bool &success) { // holds all detections (ignored detections are indicated by an index vector vector detections; FILE *fp = fopen(file_name.c_str(),"r"); if (!fp) { success = false; return detections; } while (!feof(fp)) { tDetection d; double trash; char str[255]; if (fscanf(fp, "%s %lf %lf %lf %lf %lf %lf %lf %lf %lf %lf %lf %lf %lf %lf %lf", str, &trash, &trash, &d.box.alpha, &d.box.x1, &d.box.y1, &d.box.x2, &d.box.y2, &d.h, &d.w, &d.l, &d.t1, &d.t2, &d.t3, &d.ry, &d.thresh)==16) { // d.thresh = 1; d.box.type = str; detections.push_back(d); // orientation=-10 is invalid, AOS is not evaluated if at least one orientation is invalid if(d.box.alpha == -10) compute_aos = false; // a class is only evaluated if it is detected at least once for (int c = 0; c < NUM_CLASS; c++) { if (!strcasecmp(d.box.type.c_str(), CLASS_NAMES[c].c_str())) { if (!eval_image[c] && d.box.x1 >= 0) eval_image[c] = true; if (!eval_ground[c] && d.t1 != -1000) eval_ground[c] = true; if (!eval_3d[c] && d.t2 != -1000) eval_3d[c] = true; break; } } } } fclose(fp); success = true; return detections; } vector loadGroundtruth(string file_name,bool &success) { // holds all ground truth (ignored ground truth is indicated by an index vector vector groundtruth; FILE *fp = fopen(file_name.c_str(),"r"); if (!fp) { success = false; return groundtruth; } while (!feof(fp)) { tGroundtruth g; char 
str[255]; if (fscanf(fp, "%s %lf %d %lf %lf %lf %lf %lf %lf %lf %lf %lf %lf %lf %lf", str, &g.truncation, &g.occlusion, &g.box.alpha, &g.box.x1, &g.box.y1, &g.box.x2, &g.box.y2, &g.h, &g.w, &g.l, &g.t1, &g.t2, &g.t3, &g.ry )==15) { g.box.type = str; groundtruth.push_back(g); } } fclose(fp); success = true; return groundtruth; } void saveStats (const vector &precision, const vector &aos, FILE *fp_det, FILE *fp_ori) { // save precision to file if(precision.empty()) return; for (int32_t i=0; i Polygon toPolygon(const T& g) { using namespace boost::numeric::ublas; using namespace boost::geometry; matrix mref(2, 2); mref(0, 0) = cos(g.ry); mref(0, 1) = sin(g.ry); mref(1, 0) = -sin(g.ry); mref(1, 1) = cos(g.ry); static int count = 0; matrix corners(2, 4); double data[] = {g.l / 2, g.l / 2, -g.l / 2, -g.l / 2, g.w / 2, -g.w / 2, -g.w / 2, g.w / 2}; std::copy(data, data + 8, corners.data().begin()); matrix gc = prod(mref, corners); for (int i = 0; i < 4; ++i) { gc(0, i) += g.t1; gc(1, i) += g.t3; } double points[][2] = {{gc(0, 0), gc(1, 0)},{gc(0, 1), gc(1, 1)},{gc(0, 2), gc(1, 2)},{gc(0, 3), gc(1, 3)},{gc(0, 0), gc(1, 0)}}; Polygon poly; append(poly, points); return poly; } // measure overlap between bird's eye view bounding boxes, parametrized by (ry, l, w, tx, tz) inline double groundBoxOverlap(tDetection d, tGroundtruth g, int32_t criterion = -1) { using namespace boost::geometry; Polygon gp = toPolygon(g); Polygon dp = toPolygon(d); std::vector in, un; intersection(gp, dp, in); union_(gp, dp, un); double inter_area = in.empty() ? 
0 : area(in.front()); double union_area = area(un.front()); double o; if(criterion==-1) // union o = inter_area / union_area; else if(criterion==0) // bbox_a o = inter_area / area(dp); else if(criterion==1) // bbox_b o = inter_area / area(gp); return o; } // measure overlap between 3D bounding boxes, parametrized by (ry, h, w, l, tx, ty, tz) inline double box3DOverlap(tDetection d, tGroundtruth g, int32_t criterion = -1) { using namespace boost::geometry; Polygon gp = toPolygon(g); Polygon dp = toPolygon(d); std::vector in, un; intersection(gp, dp, in); union_(gp, dp, un); double ymax = min(d.t2, g.t2); double ymin = max(d.t2 - d.h, g.t2 - g.h); double inter_area = in.empty() ? 0 : area(in.front()); double inter_vol = inter_area * max(0.0, ymax - ymin); double det_vol = d.h * d.l * d.w; double gt_vol = g.h * g.l * g.w; double o; if(criterion==-1) // union o = inter_vol / (det_vol + gt_vol - inter_vol); else if(criterion==0) // bbox_a o = inter_vol / det_vol; else if(criterion==1) // bbox_b o = inter_vol / gt_vol; return o; } vector getThresholds(vector &v, double n_groundtruth){ // holds scores needed to compute N_SAMPLE_PTS recall values vector t; // sort scores in descending order // (highest score is assumed to give best/most confident detections) sort(v.begin(), v.end(), greater()); // get scores for linearly spaced recall double current_recall = 0; for(int32_t i=0; i >, const vector &det, vector &ignored_gt, vector &dc, vector &ignored_det, int32_t &n_gt, DIFFICULTY difficulty){ // extract ground truth bounding boxes for current evaluation class for(int32_t i=0;iMAX_OCCLUSION[difficulty] || gt[i].truncation>MAX_TRUNCATION[difficulty] || height >, const vector &det, const vector &dc, const vector &ignored_gt, const vector &ignored_det, bool compute_fp, double (*boxoverlap)(tDetection, tGroundtruth, int32_t), METRIC metric, bool compute_aos=false, double thresh=0, bool debug=false){ tPrData stat = tPrData(); const double NO_DETECTION = -10000000; vector delta; 
// holds angular difference for TPs (needed for AOS evaluation) vector assigned_detection; // holds wether a detection was assigned to a valid or ignored ground truth assigned_detection.assign(det.size(), false); vector ignored_threshold; ignored_threshold.assign(det.size(), false); // holds detections with a threshold lower than thresh if FP are computed // detections with a low score are ignored for computing precision (needs FP) if(compute_fp) for(int32_t i=0; i 0.5) (logical len(det)) =======================================================================*/ int32_t det_idx = -1; double valid_detection = NO_DETECTION; double max_overlap = 0; // search for a possible detection bool assigned_ignored_det = false; for(int32_t j=0; jMIN_OVERLAP[metric][current_class] && det[j].thresh>valid_detection){ det_idx = j; valid_detection = det[j].thresh; } // for computing pr curve values, the candidate with the greatest overlap is considered // if the greatest overlap is an ignored detection (min_height), the overlapping detection is used else if(compute_fp && overlap>MIN_OVERLAP[metric][current_class] && (overlap>max_overlap || assigned_ignored_det) && ignored_det[j]==0){ max_overlap = overlap; det_idx = j; valid_detection = 1; assigned_ignored_det = false; } else if(compute_fp && overlap>MIN_OVERLAP[metric][current_class] && valid_detection==NO_DETECTION && ignored_det[j]==1){ det_idx = j; valid_detection = 1; assigned_ignored_det = true; } } /*======================================================================= compute TP, FP and FN =======================================================================*/ // nothing was assigned to this valid ground truth if(valid_detection==NO_DETECTION && ignored_gt[i]==0) { stat.fn++; } // only evaluate valid ground truth <=> detection assignments (considering difficulty level) else if(valid_detection!=NO_DETECTION && (ignored_gt[i]==1 || ignored_det[det_idx]==1)) assigned_detection[det_idx] = true; // found a valid true positive 
else if(valid_detection!=NO_DETECTION){ // write highest score to threshold vector stat.tp++; stat.v.push_back(det[det_idx].thresh); // compute angular difference of detection and ground truth if valid detection orientation was provided if(compute_aos) delta.push_back(gt[i].box.alpha - det[det_idx].box.alpha); // clean up assigned_detection[det_idx] = true; } } // if FP are requested, consider stuff area if(compute_fp){ // count fp for(int32_t i=0; iMIN_OVERLAP[metric][current_class]){ assigned_detection[j] = true; nstuff++; } } } // FP = no. of all not to ground truth assigned detections - detections assigned to stuff areas stat.fp -= nstuff; // if all orientation values are valid, the AOS is computed if(compute_aos){ vector tmp; // FP have a similarity of 0, for all TP compute AOS tmp.assign(stat.fp, 0); for(int32_t i=0; i0 || stat.fp>0) stat.similarity = accumulate(tmp.begin(), tmp.end(), 0.0); // there was neither a FP nor a TP, so the similarity is ignored in the evaluation else stat.similarity = -1; } } return stat; } /*======================================================================= EVALUATE CLASS-WISE =======================================================================*/ bool eval_class (FILE *fp_det, FILE *fp_ori, CLASSES current_class, const vector< vector > &groundtruth, const vector< vector > &detections, bool compute_aos, double (*boxoverlap)(tDetection, tGroundtruth, int32_t), vector &precision, vector &aos, DIFFICULTY difficulty, METRIC metric) { assert(groundtruth.size() == detections.size()); // init int32_t n_gt=0; // total no. 
of gt (denominator of recall) vector v, thresholds; // detection scores, evaluated for recall discretization vector< vector > ignored_gt, ignored_det; // index of ignored gt detection for current class/difficulty vector< vector > dontcare; // index of dontcare areas, included in ground truth // for all test images do for (int32_t i=0; i i_gt, i_det; vector dc; // only evaluate objects of current class and ignore occluded, truncated objects cleanData(current_class, groundtruth[i], detections[i], i_gt, dc, i_det, n_gt, difficulty); ignored_gt.push_back(i_gt); ignored_det.push_back(i_det); dontcare.push_back(dc); // compute statistics to get recall values tPrData pr_tmp = tPrData(); pr_tmp = computeStatistics(current_class, groundtruth[i], detections[i], dc, i_gt, i_det, false, boxoverlap, metric); // add detection scores to vector over all images for(int32_t j=0; j pr; pr.assign(thresholds.size(),tPrData()); for (int32_t i=0; i recall; precision.assign(N_SAMPLE_PTS, 0); if(compute_aos) aos.assign(N_SAMPLE_PTS, 0); double r=0; for (int32_t i=0; i vals[],bool is_aos){ char command[1024]; // save plot data to file FILE *fp = fopen((dir_name + "/" + file_name + ".txt").c_str(),"w"); printf("save %s\n", (dir_name + "/" + file_name + ".txt").c_str()); for (int32_t i=0; i<(int)N_SAMPLE_PTS; i++) fprintf(fp,"%f %f %f %f\n",(double)i/(N_SAMPLE_PTS-1.0),vals[0][i],vals[1][i],vals[2][i]); fclose(fp); // create png + eps for (int32_t j=0; j<2; j++) { // open file FILE *fp = fopen((dir_name + "/" + file_name + ".gp").c_str(),"w"); // save gnuplot instructions if (j==0) { fprintf(fp,"set term png size 450,315 font \"Helvetica\" 11\n"); fprintf(fp,"set output \"%s.png\"\n",file_name.c_str()); } else { fprintf(fp,"set term postscript eps enhanced color font \"Helvetica\" 20\n"); fprintf(fp,"set output \"%s.eps\"\n",file_name.c_str()); } // set labels and ranges fprintf(fp,"set size ratio 0.7\n"); fprintf(fp,"set xrange [0:1]\n"); fprintf(fp,"set yrange [0:1]\n"); fprintf(fp,"set 
xlabel \"Recall\"\n"); if (!is_aos) fprintf(fp,"set ylabel \"Precision\"\n"); else fprintf(fp,"set ylabel \"Orientation Similarity\"\n"); obj_type[0] = toupper(obj_type[0]); fprintf(fp,"set title \"%s\"\n",obj_type.c_str()); // line width int32_t lw = 5; if (j==0) lw = 3; // plot error curve fprintf(fp,"plot "); fprintf(fp,"\"%s.txt\" using 1:2 title 'Easy' with lines ls 1 lw %d,",file_name.c_str(),lw); fprintf(fp,"\"%s.txt\" using 1:3 title 'Moderate' with lines ls 2 lw %d,",file_name.c_str(),lw); fprintf(fp,"\"%s.txt\" using 1:4 title 'Hard' with lines ls 3 lw %d",file_name.c_str(),lw); // close file fclose(fp); // run gnuplot => create png + eps sprintf(command,"cd %s; gnuplot %s",dir_name.c_str(),(file_name + ".gp").c_str()); system(command); } // create pdf and crop sprintf(command,"cd %s; ps2pdf %s.eps %s_large.pdf",dir_name.c_str(),file_name.c_str(),file_name.c_str()); system(command); sprintf(command,"cd %s; pdfcrop %s_large.pdf %s.pdf",dir_name.c_str(),file_name.c_str(),file_name.c_str()); system(command); sprintf(command,"cd %s; rm %s_large.pdf",dir_name.c_str(),file_name.c_str()); system(command); } bool eval(string result_sha,Mail* mail){ // set some global parameters initGlobals(); // ground truth and result directories string gt_dir = "data/object/label_2"; string result_dir = "results/" + result_sha; string plot_dir = result_dir + "/plot"; // create output directories system(("mkdir " + plot_dir).c_str()); // hold detections and ground truth in memory vector< vector > groundtruth; vector< vector > detections; // holds wether orientation similarity shall be computed (might be set to false while loading detections) // and which labels where provided by this submission bool compute_aos=true; vector eval_image(NUM_CLASS, false); vector eval_ground(NUM_CLASS, false); vector eval_3d(NUM_CLASS, false); // for all images read groundtruth and detections mail->msg("Loading detections..."); for (int32_t i=0; i gt = loadGroundtruth(gt_dir + "/" + 
file_name,gt_success); vector det = loadDetections(result_dir + "/data/" + file_name, compute_aos, eval_image, eval_ground, eval_3d, det_success); groundtruth.push_back(gt); detections.push_back(det); // check for errors if (!gt_success) { mail->msg("ERROR: Couldn't read: %s of ground truth. Please write me an email!", file_name); return false; } if (!det_success) { mail->msg("ERROR: Couldn't read: %s", file_name); return false; } } mail->msg(" done."); // holds pointers for result files FILE *fp_det=0, *fp_ori=0; // eval image 2D bounding boxes for (int c = 0; c < NUM_CLASS; c++) { CLASSES cls = (CLASSES)c; if (eval_image[c]) { fp_det = fopen((result_dir + "/stats_" + CLASS_NAMES[c] + "_detection.txt").c_str(), "w"); if(compute_aos) fp_ori = fopen((result_dir + "/stats_" + CLASS_NAMES[c] + "_orientation.txt").c_str(),"w"); vector precision[3], aos[3]; if( !eval_class(fp_det, fp_ori, cls, groundtruth, detections, compute_aos, imageBoxOverlap, precision[0], aos[0], EASY, IMAGE) || !eval_class(fp_det, fp_ori, cls, groundtruth, detections, compute_aos, imageBoxOverlap, precision[1], aos[1], MODERATE, IMAGE) || !eval_class(fp_det, fp_ori, cls, groundtruth, detections, compute_aos, imageBoxOverlap, precision[2], aos[2], HARD, IMAGE)) { mail->msg("%s evaluation failed.", CLASS_NAMES[c].c_str()); return false; } fclose(fp_det); saveAndPlotPlots(plot_dir, CLASS_NAMES[c] + "_detection", CLASS_NAMES[c], precision, 0); if(compute_aos){ saveAndPlotPlots(plot_dir, CLASS_NAMES[c] + "_orientation", CLASS_NAMES[c], aos, 1); fclose(fp_ori); } } } // don't evaluate AOS for birdview boxes and 3D boxes compute_aos = false; // eval bird's eye view bounding boxes for (int c = 0; c < NUM_CLASS; c++) { CLASSES cls = (CLASSES)c; if (eval_ground[c]) { fp_det = fopen((result_dir + "/stats_" + CLASS_NAMES[c] + "_detection_ground.txt").c_str(), "w"); vector precision[3], aos[3]; if( !eval_class(fp_det, fp_ori, cls, groundtruth, detections, compute_aos, groundBoxOverlap, precision[0], aos[0], 
EASY, GROUND) || !eval_class(fp_det, fp_ori, cls, groundtruth, detections, compute_aos, groundBoxOverlap, precision[1], aos[1], MODERATE, GROUND) || !eval_class(fp_det, fp_ori, cls, groundtruth, detections, compute_aos, groundBoxOverlap, precision[2], aos[2], HARD, GROUND)) { mail->msg("%s evaluation failed.", CLASS_NAMES[c].c_str()); return false; } fclose(fp_det); saveAndPlotPlots(plot_dir, CLASS_NAMES[c] + "_detection_ground", CLASS_NAMES[c], precision, 0); } } // eval 3D bounding boxes for (int c = 0; c < NUM_CLASS; c++) { CLASSES cls = (CLASSES)c; if (eval_3d[c]) { fp_det = fopen((result_dir + "/stats_" + CLASS_NAMES[c] + "_detection_3d.txt").c_str(), "w"); vector precision[3], aos[3]; if( !eval_class(fp_det, fp_ori, cls, groundtruth, detections, compute_aos, box3DOverlap, precision[0], aos[0], EASY, BOX3D) || !eval_class(fp_det, fp_ori, cls, groundtruth, detections, compute_aos, box3DOverlap, precision[1], aos[1], MODERATE, BOX3D) || !eval_class(fp_det, fp_ori, cls, groundtruth, detections, compute_aos, box3DOverlap, precision[2], aos[2], HARD, BOX3D)) { mail->msg("%s evaluation failed.", CLASS_NAMES[c].c_str()); return false; } fclose(fp_det); saveAndPlotPlots(plot_dir, CLASS_NAMES[c] + "_detection_3d", CLASS_NAMES[c], precision, 0); } } // success return true; } int32_t main (int32_t argc,char *argv[]) { // we need 2 or 4 arguments! 
if (argc!=2 && argc!=4) { cout << "Usage: ./eval_detection result_sha [user_sha email]" << endl; return 1; } // read arguments string result_sha = argv[1]; // init notification mail Mail *mail; if (argc==4) mail = new Mail(argv[3]); else mail = new Mail(); mail->msg("Thank you for participating in our evaluation!"); // run evaluation if (eval(result_sha,mail)) { mail->msg("Your evaluation results are available at:"); mail->msg("http://www.cvlibs.net/datasets/kitti/user_submit_check_login.php?benchmark=object&user=%s&result=%s",argv[2], result_sha.c_str()); } else { system(("rm -r results/" + result_sha).c_str()); mail->msg("An error occured while processing your results."); mail->msg("Please make sure that the data in your zip archive has the right format!"); } // send mail and exit delete mail; return 0; } ================================================ FILE: tools/kitti-eval/evaluate_object_3d_offline.cpp ================================================ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "mail.h" BOOST_GEOMETRY_REGISTER_C_ARRAY_CS(cs::cartesian) typedef boost::geometry::model::polygon > Polygon; using namespace std; /*======================================================================= STATIC EVALUATION PARAMETERS =======================================================================*/ // holds the number of test images on the server const int32_t N_TESTIMAGES = 7518; // easy, moderate and hard evaluation level enum DIFFICULTY{EASY=0, MODERATE=1, HARD=2}; // evaluation metrics: image, ground or 3D enum METRIC{IMAGE=0, GROUND=1, BOX3D=2}; // evaluation parameter const int32_t MIN_HEIGHT[3] = {40, 25, 25}; // minimum height for evaluated groundtruth/detections const int32_t MAX_OCCLUSION[3] = {0, 1, 2}; // maximum occlusion level of the groundtruth used for evaluation const double MAX_TRUNCATION[3] = {0.15, 0.3, 0.5}; // maximum truncation level of the 
groundtruth used for evaluation // evaluated object classes enum CLASSES{CAR=0, PEDESTRIAN=1, CYCLIST=2}; const int NUM_CLASS = 3; // parameters varying per class vector CLASS_NAMES; // the minimum overlap required for 2D evaluation on the image/ground plane and 3D evaluation // const double MIN_OVERLAP[3][3] = {{0.7, 0.5, 0.5}, {0.5, 0.25, 0.25}, {0.5, 0.25, 0.25}}; const double MIN_OVERLAP[3][3] = {{0.7, 0.5, 0.5}, {0.7, 0.5, 0.5}, {0.7, 0.5, 0.5}}; // no. of recall steps that should be evaluated (discretized) const double N_SAMPLE_PTS = 41; // initialize class names void initGlobals () { CLASS_NAMES.push_back("car"); CLASS_NAMES.push_back("pedestrian"); CLASS_NAMES.push_back("cyclist"); } /*======================================================================= DATA TYPES FOR EVALUATION =======================================================================*/ // holding data needed for precision-recall and precision-aos struct tPrData { vector v; // detection score for computing score thresholds double similarity; // orientation similarity int32_t tp; // true positives int32_t fp; // false positives int32_t fn; // false negatives tPrData () : similarity(0), tp(0), fp(0), fn(0) {} }; // holding bounding boxes for ground truth and detections struct tBox { string type; // object type as car, pedestrian or cyclist,... 
double x1; // left corner double y1; // top corner double x2; // right corner double y2; // bottom corner double alpha; // image orientation tBox (string type, double x1,double y1,double x2,double y2,double alpha) : type(type),x1(x1),y1(y1),x2(x2),y2(y2),alpha(alpha) {} }; // holding ground truth data struct tGroundtruth { tBox box; // object type, box, orientation double truncation; // truncation 0..1 int32_t occlusion; // occlusion 0,1,2 (non, partly, fully) double ry; double t1, t2, t3; double h, w, l; tGroundtruth () : box(tBox("invalild",-1,-1,-1,-1,-10)),truncation(-1),occlusion(-1) {} tGroundtruth (tBox box,double truncation,int32_t occlusion) : box(box),truncation(truncation),occlusion(occlusion) {} tGroundtruth (string type,double x1,double y1,double x2,double y2,double alpha,double truncation,int32_t occlusion) : box(tBox(type,x1,y1,x2,y2,alpha)),truncation(truncation),occlusion(occlusion) {} }; // holding detection data struct tDetection { tBox box; // object type, box, orientation double thresh; // detection score double ry; double t1, t2, t3; double h, w, l; tDetection (): box(tBox("invalid",-1,-1,-1,-1,-10)),thresh(-1000) {} tDetection (tBox box,double thresh) : box(box),thresh(thresh) {} tDetection (string type,double x1,double y1,double x2,double y2,double alpha,double thresh) : box(tBox(type,x1,y1,x2,y2,alpha)),thresh(thresh) {} }; /*======================================================================= FUNCTIONS TO LOAD DETECTION AND GROUND TRUTH DATA ONCE, SAVE RESULTS =======================================================================*/ vector indices; vector loadDetections(string file_name, bool &compute_aos, vector &eval_image, vector &eval_ground, vector &eval_3d, bool &success) { // holds all detections (ignored detections are indicated by an index vector vector detections; FILE *fp = fopen(file_name.c_str(),"r"); if (!fp) { success = false; return detections; } while (!feof(fp)) { tDetection d; double trash; char str[255]; if 
(fscanf(fp, "%s %lf %lf %lf %lf %lf %lf %lf %lf %lf %lf %lf %lf %lf %lf %lf", str, &trash, &trash, &d.box.alpha, &d.box.x1, &d.box.y1, &d.box.x2, &d.box.y2, &d.h, &d.w, &d.l, &d.t1, &d.t2, &d.t3, &d.ry, &d.thresh)==16) { // d.thresh = 1; d.box.type = str; detections.push_back(d); // orientation=-10 is invalid, AOS is not evaluated if at least one orientation is invalid if(d.box.alpha == -10) compute_aos = false; // a class is only evaluated if it is detected at least once for (int c = 0; c < NUM_CLASS; c++) { if (!strcasecmp(d.box.type.c_str(), CLASS_NAMES[c].c_str())) { if (!eval_image[c] && d.box.x1 >= 0) eval_image[c] = true; if (!eval_ground[c] && d.t1 != -1000) eval_ground[c] = true; if (!eval_3d[c] && d.t2 != -1000) eval_3d[c] = true; break; } } } } fclose(fp); success = true; return detections; } vector loadGroundtruth(string file_name,bool &success) { // holds all ground truth (ignored ground truth is indicated by an index vector vector groundtruth; FILE *fp = fopen(file_name.c_str(),"r"); if (!fp) { success = false; return groundtruth; } while (!feof(fp)) { tGroundtruth g; char str[255]; if (fscanf(fp, "%s %lf %d %lf %lf %lf %lf %lf %lf %lf %lf %lf %lf %lf %lf", str, &g.truncation, &g.occlusion, &g.box.alpha, &g.box.x1, &g.box.y1, &g.box.x2, &g.box.y2, &g.h, &g.w, &g.l, &g.t1, &g.t2, &g.t3, &g.ry )==15) { g.box.type = str; groundtruth.push_back(g); } } fclose(fp); success = true; return groundtruth; } void saveStats (const vector &precision, const vector &aos, FILE *fp_det, FILE *fp_ori) { // save precision to file if(precision.empty()) return; for (int32_t i=0; i Polygon toPolygon(const T& g) { using namespace boost::numeric::ublas; using namespace boost::geometry; matrix mref(2, 2); mref(0, 0) = cos(g.ry); mref(0, 1) = sin(g.ry); mref(1, 0) = -sin(g.ry); mref(1, 1) = cos(g.ry); static int count = 0; matrix corners(2, 4); double data[] = {g.l / 2, g.l / 2, -g.l / 2, -g.l / 2, g.w / 2, -g.w / 2, -g.w / 2, g.w / 2}; std::copy(data, data + 8, 
corners.data().begin()); matrix gc = prod(mref, corners); for (int i = 0; i < 4; ++i) { gc(0, i) += g.t1; gc(1, i) += g.t3; } double points[][2] = {{gc(0, 0), gc(1, 0)},{gc(0, 1), gc(1, 1)},{gc(0, 2), gc(1, 2)},{gc(0, 3), gc(1, 3)},{gc(0, 0), gc(1, 0)}}; Polygon poly; append(poly, points); return poly; }
// Measure the overlap between two bird's eye view bounding boxes,
// parametrized by (ry, l, w, tx, tz).
//
// Both boxes are converted with toPolygon() and intersected.
//
// criterion selects the denominator of the returned ratio:
//   -1: area of the union of both polygons (intersection over union)
//    0: area of the detection d (bbox_a)
//    1: area of the ground truth g (bbox_b)
// Any other criterion, an empty union result, or a denominator of exactly
// zero yields 0 instead of an uninitialized or indeterminate value.
inline double groundBoxOverlap(tDetection d, tGroundtruth g, int32_t criterion = -1) {
  using namespace boost::geometry;

  Polygon gp = toPolygon(g);
  Polygon dp = toPolygon(d);

  std::vector<Polygon> in;
  intersection(gp, dp, in);
  double inter_area = in.empty() ? 0 : area(in.front());

  double denominator = 0.0;
  if(criterion==-1) {   // union
    // the union is only needed for this criterion; its output may be empty
    // (e.g. for degenerate boxes), so front() must not be called blindly
    std::vector<Polygon> un;
    union_(gp, dp, un);
    if(!un.empty())
      denominator = area(un.front());
  }
  else if(criterion==0) // bbox_a
    denominator = area(dp);
  else if(criterion==1) // bbox_b
    denominator = area(gp);

  // unknown criterion or degenerate boxes: report "no overlap"
  if(denominator == 0.0)
    return 0.0;
  return inter_area / denominator;
}
// measure overlap between 3D bounding boxes, parametrized by (ry, h, w, l, tx, ty, tz) inline double box3DOverlap(tDetection d, tGroundtruth g, int32_t criterion = -1) { using namespace boost::geometry; Polygon gp = toPolygon(g); Polygon dp = toPolygon(d); std::vector in, un; intersection(gp, dp, in); union_(gp, dp, un); double ymax = min(d.t2, g.t2); double ymin = max(d.t2 - d.h, g.t2 - g.h); double inter_area = in.empty() ?
0 : area(in.front()); double inter_vol = inter_area * max(0.0, ymax - ymin); double det_vol = d.h * d.l * d.w; double gt_vol = g.h * g.l * g.w; double o; if(criterion==-1) // union o = inter_vol / (det_vol + gt_vol - inter_vol); else if(criterion==0) // bbox_a o = inter_vol / det_vol; else if(criterion==1) // bbox_b o = inter_vol / gt_vol; return o; } vector getThresholds(vector &v, double n_groundtruth){ // holds scores needed to compute N_SAMPLE_PTS recall values vector t; // sort scores in descending order // (highest score is assumed to give best/most confident detections) sort(v.begin(), v.end(), greater()); // get scores for linearly spaced recall double current_recall = 0; for(int32_t i=0; i >, const vector &det, vector &ignored_gt, vector &dc, vector &ignored_det, int32_t &n_gt, DIFFICULTY difficulty){ // extract ground truth bounding boxes for current evaluation class for(int32_t i=0;iMAX_OCCLUSION[difficulty] || gt[i].truncation>MAX_TRUNCATION[difficulty] || height >, const vector &det, const vector &dc, const vector &ignored_gt, const vector &ignored_det, bool compute_fp, double (*boxoverlap)(tDetection, tGroundtruth, int32_t), METRIC metric, bool compute_aos=false, double thresh=0, bool debug=false){ tPrData stat = tPrData(); const double NO_DETECTION = -10000000; vector delta; // holds angular difference for TPs (needed for AOS evaluation) vector assigned_detection; // holds wether a detection was assigned to a valid or ignored ground truth assigned_detection.assign(det.size(), false); vector ignored_threshold; ignored_threshold.assign(det.size(), false); // holds detections with a threshold lower than thresh if FP are computed // detections with a low score are ignored for computing precision (needs FP) if(compute_fp) for(int32_t i=0; i 0.5) (logical len(det)) =======================================================================*/ int32_t det_idx = -1; double valid_detection = NO_DETECTION; double max_overlap = 0; // search for a possible detection 
bool assigned_ignored_det = false; for(int32_t j=0; jMIN_OVERLAP[metric][current_class] && det[j].thresh>valid_detection){ det_idx = j; valid_detection = det[j].thresh; } // for computing pr curve values, the candidate with the greatest overlap is considered // if the greatest overlap is an ignored detection (min_height), the overlapping detection is used else if(compute_fp && overlap>MIN_OVERLAP[metric][current_class] && (overlap>max_overlap || assigned_ignored_det) && ignored_det[j]==0){ max_overlap = overlap; det_idx = j; valid_detection = 1; assigned_ignored_det = false; } else if(compute_fp && overlap>MIN_OVERLAP[metric][current_class] && valid_detection==NO_DETECTION && ignored_det[j]==1){ det_idx = j; valid_detection = 1; assigned_ignored_det = true; } } /*======================================================================= compute TP, FP and FN =======================================================================*/ // nothing was assigned to this valid ground truth if(valid_detection==NO_DETECTION && ignored_gt[i]==0) { stat.fn++; } // only evaluate valid ground truth <=> detection assignments (considering difficulty level) else if(valid_detection!=NO_DETECTION && (ignored_gt[i]==1 || ignored_det[det_idx]==1)) assigned_detection[det_idx] = true; // found a valid true positive else if(valid_detection!=NO_DETECTION){ // write highest score to threshold vector stat.tp++; stat.v.push_back(det[det_idx].thresh); // compute angular difference of detection and ground truth if valid detection orientation was provided if(compute_aos) delta.push_back(gt[i].box.alpha - det[det_idx].box.alpha); // test use ry as the error measure //delta.push_back(gt[i].ry - det[det_idx].ry); // clean up assigned_detection[det_idx] = true; } } // if FP are requested, consider stuff area if(compute_fp){ // count fp for(int32_t i=0; iMIN_OVERLAP[metric][current_class]){ assigned_detection[j] = true; nstuff++; } } } // FP = no. 
of all not to ground truth assigned detections - detections assigned to stuff areas stat.fp -= nstuff; // if all orientation values are valid, the AOS is computed if(compute_aos){ vector tmp; // FP have a similarity of 0, for all TP compute AOS tmp.assign(stat.fp, 0); for(int32_t i=0; i0 || stat.fp>0) stat.similarity = accumulate(tmp.begin(), tmp.end(), 0.0); // there was neither a FP nor a TP, so the similarity is ignored in the evaluation else stat.similarity = -1; } } return stat; } /*======================================================================= EVALUATE CLASS-WISE =======================================================================*/ bool eval_class (FILE *fp_det, FILE *fp_ori, CLASSES current_class, const vector< vector > &groundtruth, const vector< vector > &detections, bool compute_aos, double (*boxoverlap)(tDetection, tGroundtruth, int32_t), vector &precision, vector &aos, DIFFICULTY difficulty, METRIC metric) { assert(groundtruth.size() == detections.size()); // init int32_t n_gt=0; // total no. 
of gt (denominator of recall) vector v, thresholds; // detection scores, evaluated for recall discretization vector< vector > ignored_gt, ignored_det; // index of ignored gt detection for current class/difficulty vector< vector > dontcare; // index of dontcare areas, included in ground truth // for all test images do for (int32_t i=0; i i_gt, i_det; vector dc; // only evaluate objects of current class and ignore occluded, truncated objects cleanData(current_class, groundtruth[i], detections[i], i_gt, dc, i_det, n_gt, difficulty); ignored_gt.push_back(i_gt); ignored_det.push_back(i_det); dontcare.push_back(dc); // compute statistics to get recall values tPrData pr_tmp = tPrData(); pr_tmp = computeStatistics(current_class, groundtruth[i], detections[i], dc, i_gt, i_det, false, boxoverlap, metric); // add detection scores to vector over all images for(int32_t j=0; j pr; pr.assign(thresholds.size(),tPrData()); for (int32_t i=0; i recall; precision.assign(N_SAMPLE_PTS, 0); if(compute_aos) aos.assign(N_SAMPLE_PTS, 0); double r=0; for (int32_t i=0; i vals[],bool is_aos){ char command[1024]; // save plot data to file FILE *fp = fopen((dir_name + "/" + file_name + ".txt").c_str(),"w"); printf("save %s\n", (dir_name + "/" + file_name + ".txt").c_str()); for (int32_t i=0; i<(int)N_SAMPLE_PTS; i++) fprintf(fp,"%f %f %f %f\n",(double)i/(N_SAMPLE_PTS-1.0),vals[0][i],vals[1][i],vals[2][i]); fclose(fp); float sum[3] = {0, 0, 0}; for (int v = 0; v < 3; ++v) for (int i = 0; i < vals[v].size(); i = i + 4) sum[v] += vals[v][i]; printf("%s AP: %f %f %f\n", file_name.c_str(), sum[0] / 11 * 100, sum[1] / 11 * 100, sum[2] / 11 * 100); // create png + eps for (int32_t j=0; j<2; j++) { // open file FILE *fp = fopen((dir_name + "/" + file_name + ".gp").c_str(),"w"); // save gnuplot instructions if (j==0) { fprintf(fp,"set term png size 450,315 font \"Helvetica\" 11\n"); fprintf(fp,"set output \"%s.png\"\n",file_name.c_str()); } else { fprintf(fp,"set term postscript eps enhanced color font 
\"Helvetica\" 20\n"); fprintf(fp,"set output \"%s.eps\"\n",file_name.c_str()); } // set labels and ranges fprintf(fp,"set size ratio 0.7\n"); fprintf(fp,"set xrange [0:1]\n"); fprintf(fp,"set yrange [0:1]\n"); fprintf(fp,"set xlabel \"Recall\"\n"); if (!is_aos) fprintf(fp,"set ylabel \"Precision\"\n"); else fprintf(fp,"set ylabel \"Orientation Similarity\"\n"); obj_type[0] = toupper(obj_type[0]); fprintf(fp,"set title \"%s\"\n",obj_type.c_str()); // line width int32_t lw = 5; if (j==0) lw = 3; // plot error curve fprintf(fp,"plot "); fprintf(fp,"\"%s.txt\" using 1:2 title 'Easy' with lines ls 1 lw %d,",file_name.c_str(),lw); fprintf(fp,"\"%s.txt\" using 1:3 title 'Moderate' with lines ls 2 lw %d,",file_name.c_str(),lw); fprintf(fp,"\"%s.txt\" using 1:4 title 'Hard' with lines ls 3 lw %d",file_name.c_str(),lw); // close file fclose(fp); // run gnuplot => create png + eps sprintf(command,"cd %s; gnuplot %s",dir_name.c_str(),(file_name + ".gp").c_str()); system(command); } // create pdf and crop sprintf(command,"cd %s; ps2pdf %s.eps %s_large.pdf",dir_name.c_str(),file_name.c_str(),file_name.c_str()); system(command); sprintf(command,"cd %s; pdfcrop %s_large.pdf %s.pdf",dir_name.c_str(),file_name.c_str(),file_name.c_str()); system(command); sprintf(command,"cd %s; rm %s_large.pdf",dir_name.c_str(),file_name.c_str()); system(command); } vector getEvalIndices(const string& result_dir) { DIR* dir; dirent* entity; dir = opendir(result_dir.c_str()); if (dir) { while (entity = readdir(dir)) { string path(entity->d_name); int32_t len = path.size(); if (len < 10) continue; int32_t index = atoi(path.substr(len - 10, 10).c_str()); indices.push_back(index); } } return indices; } bool eval(string gt_dir, string result_dir, Mail* mail){ // set some global parameters initGlobals(); // ground truth and result directories // string gt_dir = "data/object/label_2"; // string result_dir = "results/" + result_sha; string plot_dir = result_dir + "/plot"; // create output directories 
system(("mkdir " + plot_dir).c_str()); // hold detections and ground truth in memory vector< vector > groundtruth; vector< vector > detections; // holds wether orientation similarity shall be computed (might be set to false while loading detections) // and which labels where provided by this submission bool compute_aos=true; vector eval_image(NUM_CLASS, false); vector eval_ground(NUM_CLASS, false); vector eval_3d(NUM_CLASS, false); // for all images read groundtruth and detections mail->msg("Loading detections..."); std::vector indices = getEvalIndices(result_dir + "/data/"); printf("number of files for evaluation: %d\n", (int)indices.size()); for (int32_t i=0; i gt = loadGroundtruth(gt_dir + "/" + file_name,gt_success); vector det = loadDetections(result_dir + "/data/" + file_name, compute_aos, eval_image, eval_ground, eval_3d, det_success); groundtruth.push_back(gt); detections.push_back(det); // check for errors if (!gt_success) { mail->msg("ERROR: Couldn't read: %s of ground truth. Please write me an email!", file_name); return false; } if (!det_success) { mail->msg("ERROR: Couldn't read: %s", file_name); return false; } } mail->msg(" done."); // holds pointers for result files FILE *fp_det=0, *fp_ori=0; // eval image 2D bounding boxes for (int c = 0; c < NUM_CLASS; c++) { CLASSES cls = (CLASSES)c; if (eval_image[c]) { fp_det = fopen((result_dir + "/stats_" + CLASS_NAMES[c] + "_detection.txt").c_str(), "w"); if(compute_aos) fp_ori = fopen((result_dir + "/stats_" + CLASS_NAMES[c] + "_orientation.txt").c_str(),"w"); vector precision[3], aos[3]; if( !eval_class(fp_det, fp_ori, cls, groundtruth, detections, compute_aos, imageBoxOverlap, precision[0], aos[0], EASY, IMAGE) || !eval_class(fp_det, fp_ori, cls, groundtruth, detections, compute_aos, imageBoxOverlap, precision[1], aos[1], MODERATE, IMAGE) || !eval_class(fp_det, fp_ori, cls, groundtruth, detections, compute_aos, imageBoxOverlap, precision[2], aos[2], HARD, IMAGE)) { mail->msg("%s evaluation failed.", 
CLASS_NAMES[c].c_str()); return false; } fclose(fp_det); saveAndPlotPlots(plot_dir, CLASS_NAMES[c] + "_detection", CLASS_NAMES[c], precision, 0); if(compute_aos){ saveAndPlotPlots(plot_dir, CLASS_NAMES[c] + "_orientation", CLASS_NAMES[c], aos, 1); fclose(fp_ori); } } } // don't evaluate AOS for birdview boxes and 3D boxes compute_aos = false; // eval bird's eye view bounding boxes for (int c = 0; c < NUM_CLASS; c++) { CLASSES cls = (CLASSES)c; if (eval_ground[c]) { fp_det = fopen((result_dir + "/stats_" + CLASS_NAMES[c] + "_detection_ground.txt").c_str(), "w"); vector precision[3], aos[3]; if( !eval_class(fp_det, fp_ori, cls, groundtruth, detections, compute_aos, groundBoxOverlap, precision[0], aos[0], EASY, GROUND) || !eval_class(fp_det, fp_ori, cls, groundtruth, detections, compute_aos, groundBoxOverlap, precision[1], aos[1], MODERATE, GROUND) || !eval_class(fp_det, fp_ori, cls, groundtruth, detections, compute_aos, groundBoxOverlap, precision[2], aos[2], HARD, GROUND)) { mail->msg("%s evaluation failed.", CLASS_NAMES[c].c_str()); return false; } fclose(fp_det); saveAndPlotPlots(plot_dir, CLASS_NAMES[c] + "_detection_ground", CLASS_NAMES[c], precision, 0); } } // eval 3D bounding boxes for (int c = 0; c < NUM_CLASS; c++) { CLASSES cls = (CLASSES)c; if (eval_3d[c]) { fp_det = fopen((result_dir + "/stats_" + CLASS_NAMES[c] + "_detection_3d.txt").c_str(), "w"); vector precision[3], aos[3]; if( !eval_class(fp_det, fp_ori, cls, groundtruth, detections, compute_aos, box3DOverlap, precision[0], aos[0], EASY, BOX3D) || !eval_class(fp_det, fp_ori, cls, groundtruth, detections, compute_aos, box3DOverlap, precision[1], aos[1], MODERATE, BOX3D) || !eval_class(fp_det, fp_ori, cls, groundtruth, detections, compute_aos, box3DOverlap, precision[2], aos[2], HARD, BOX3D)) { mail->msg("%s evaluation failed.", CLASS_NAMES[c].c_str()); return false; } fclose(fp_det); saveAndPlotPlots(plot_dir, CLASS_NAMES[c] + "_detection_3d", CLASS_NAMES[c], precision, 0); } } // success return 
true; } int32_t main (int32_t argc,char *argv[]) { // we need 2 or 4 arguments! if (argc!=3) { cout << "Usage: ./eval_detection_3d_offline gt_dir result_dir" << endl; return 1; } // read arguments string gt_dir = argv[1]; string result_dir = argv[2]; // init notification mail Mail *mail; mail = new Mail(); mail->msg("Thank you for participating in our evaluation!"); // run evaluation if (eval(gt_dir, result_dir, mail)) { mail->msg("Your evaluation results are available at:"); mail->msg(result_dir.c_str()); } else { system(("rm -r " + result_dir + "/plot").c_str()); mail->msg("An error occured while processing your results."); } // send mail and exit delete mail; return 0; } ================================================ FILE: tools/kitti-eval/evaluate_object_3d_offline_r40.cpp ================================================ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "mail.h" BOOST_GEOMETRY_REGISTER_C_ARRAY_CS(cs::cartesian) typedef boost::geometry::model::polygon > Polygon; using namespace std; /*======================================================================= STATIC EVALUATION PARAMETERS =======================================================================*/ // holds the number of test images on the server const int32_t N_TESTIMAGES = 7518; // easy, moderate and hard evaluation level enum DIFFICULTY{EASY=0, MODERATE=1, HARD=2}; // evaluation metrics: image, ground or 3D enum METRIC{IMAGE=0, GROUND=1, BOX3D=2}; // evaluation parameter const int32_t MIN_HEIGHT[3] = {40, 25, 25}; // minimum height for evaluated groundtruth/detections const int32_t MAX_OCCLUSION[3] = {0, 1, 2}; // maximum occlusion level of the groundtruth used for evaluation const double MAX_TRUNCATION[3] = {0.15, 0.3, 0.5}; // maximum truncation level of the groundtruth used for evaluation // evaluated object classes enum CLASSES{CAR=0, PEDESTRIAN=1, CYCLIST=2}; const int NUM_CLASS = 3; 
// parameters varying per class vector CLASS_NAMES; // the minimum overlap required for 2D evaluation on the image/ground plane and 3D evaluation // const double MIN_OVERLAP[3][3] = {{0.7, 0.5, 0.5}, {0.5, 0.25, 0.25}, {0.5, 0.25, 0.25}}; const double MIN_OVERLAP[3][3] = {{0.7, 0.5, 0.5}, {0.7, 0.5, 0.5}, {0.7, 0.5, 0.5}}; // no. of recall steps that should be evaluated (discretized) const double N_SAMPLE_PTS = 41; // initialize class names void initGlobals () { CLASS_NAMES.push_back("car"); CLASS_NAMES.push_back("pedestrian"); CLASS_NAMES.push_back("cyclist"); } /*======================================================================= DATA TYPES FOR EVALUATION =======================================================================*/ // holding data needed for precision-recall and precision-aos struct tPrData { vector v; // detection score for computing score thresholds double similarity; // orientation similarity int32_t tp; // true positives int32_t fp; // false positives int32_t fn; // false negatives tPrData () : similarity(0), tp(0), fp(0), fn(0) {} }; // holding bounding boxes for ground truth and detections struct tBox { string type; // object type as car, pedestrian or cyclist,... 
double x1; // left corner double y1; // top corner double x2; // right corner double y2; // bottom corner double alpha; // image orientation tBox (string type, double x1,double y1,double x2,double y2,double alpha) : type(type),x1(x1),y1(y1),x2(x2),y2(y2),alpha(alpha) {} }; // holding ground truth data struct tGroundtruth { tBox box; // object type, box, orientation double truncation; // truncation 0..1 int32_t occlusion; // occlusion 0,1,2 (non, partly, fully) double ry; double t1, t2, t3; double h, w, l; tGroundtruth () : box(tBox("invalild",-1,-1,-1,-1,-10)),truncation(-1),occlusion(-1) {} tGroundtruth (tBox box,double truncation,int32_t occlusion) : box(box),truncation(truncation),occlusion(occlusion) {} tGroundtruth (string type,double x1,double y1,double x2,double y2,double alpha,double truncation,int32_t occlusion) : box(tBox(type,x1,y1,x2,y2,alpha)),truncation(truncation),occlusion(occlusion) {} }; // holding detection data struct tDetection { tBox box; // object type, box, orientation double thresh; // detection score double ry; double t1, t2, t3; double h, w, l; tDetection (): box(tBox("invalid",-1,-1,-1,-1,-10)),thresh(-1000) {} tDetection (tBox box,double thresh) : box(box),thresh(thresh) {} tDetection (string type,double x1,double y1,double x2,double y2,double alpha,double thresh) : box(tBox(type,x1,y1,x2,y2,alpha)),thresh(thresh) {} }; /*======================================================================= FUNCTIONS TO LOAD DETECTION AND GROUND TRUTH DATA ONCE, SAVE RESULTS =======================================================================*/ vector indices; vector loadDetections(string file_name, bool &compute_aos, vector &eval_image, vector &eval_ground, vector &eval_3d, bool &success) { // holds all detections (ignored detections are indicated by an index vector vector detections; FILE *fp = fopen(file_name.c_str(),"r"); if (!fp) { success = false; return detections; } while (!feof(fp)) { tDetection d; double trash; char str[255]; if 
(fscanf(fp, "%s %lf %lf %lf %lf %lf %lf %lf %lf %lf %lf %lf %lf %lf %lf %lf", str, &trash, &trash, &d.box.alpha, &d.box.x1, &d.box.y1, &d.box.x2, &d.box.y2, &d.h, &d.w, &d.l, &d.t1, &d.t2, &d.t3, &d.ry, &d.thresh)==16) { // d.thresh = 1; d.box.type = str; detections.push_back(d); // orientation=-10 is invalid, AOS is not evaluated if at least one orientation is invalid if(d.box.alpha == -10) compute_aos = false; // a class is only evaluated if it is detected at least once for (int c = 0; c < NUM_CLASS; c++) { if (!strcasecmp(d.box.type.c_str(), CLASS_NAMES[c].c_str())) { if (!eval_image[c] && d.box.x1 >= 0) eval_image[c] = true; if (!eval_ground[c] && d.t1 != -1000) eval_ground[c] = true; if (!eval_3d[c] && d.t2 != -1000) eval_3d[c] = true; break; } } } } fclose(fp); success = true; return detections; } vector loadGroundtruth(string file_name,bool &success) { // holds all ground truth (ignored ground truth is indicated by an index vector vector groundtruth; FILE *fp = fopen(file_name.c_str(),"r"); if (!fp) { success = false; return groundtruth; } while (!feof(fp)) { tGroundtruth g; char str[255]; if (fscanf(fp, "%s %lf %d %lf %lf %lf %lf %lf %lf %lf %lf %lf %lf %lf %lf", str, &g.truncation, &g.occlusion, &g.box.alpha, &g.box.x1, &g.box.y1, &g.box.x2, &g.box.y2, &g.h, &g.w, &g.l, &g.t1, &g.t2, &g.t3, &g.ry )==15) { g.box.type = str; groundtruth.push_back(g); } } fclose(fp); success = true; return groundtruth; } void saveStats (const vector &precision, const vector &aos, FILE *fp_det, FILE *fp_ori) { // save precision to file if(precision.empty()) return; for (int32_t i=0; i Polygon toPolygon(const T& g) { using namespace boost::numeric::ublas; using namespace boost::geometry; matrix mref(2, 2); mref(0, 0) = cos(g.ry); mref(0, 1) = sin(g.ry); mref(1, 0) = -sin(g.ry); mref(1, 1) = cos(g.ry); static int count = 0; matrix corners(2, 4); double data[] = {g.l / 2, g.l / 2, -g.l / 2, -g.l / 2, g.w / 2, -g.w / 2, -g.w / 2, g.w / 2}; std::copy(data, data + 8, 
corners.data().begin()); matrix gc = prod(mref, corners); for (int i = 0; i < 4; ++i) { gc(0, i) += g.t1; gc(1, i) += g.t3; } double points[][2] = {{gc(0, 0), gc(1, 0)},{gc(0, 1), gc(1, 1)},{gc(0, 2), gc(1, 2)},{gc(0, 3), gc(1, 3)},{gc(0, 0), gc(1, 0)}}; Polygon poly; append(poly, points); return poly; } // measure overlap between bird's eye view bounding boxes, parametrized by (ry, l, w, tx, tz) inline double groundBoxOverlap(tDetection d, tGroundtruth g, int32_t criterion = -1) { using namespace boost::geometry; Polygon gp = toPolygon(g); Polygon dp = toPolygon(d); std::vector in, un; intersection(gp, dp, in); union_(gp, dp, un); double inter_area = in.empty() ? 0 : area(in.front()); double union_area = area(un.front()); double o; if(criterion==-1) // union o = inter_area / union_area; else if(criterion==0) // bbox_a o = inter_area / area(dp); else if(criterion==1) // bbox_b o = inter_area / area(gp); return o; } // measure overlap between 3D bounding boxes, parametrized by (ry, h, w, l, tx, ty, tz) inline double box3DOverlap(tDetection d, tGroundtruth g, int32_t criterion = -1) { using namespace boost::geometry; Polygon gp = toPolygon(g); Polygon dp = toPolygon(d); std::vector in, un; intersection(gp, dp, in); union_(gp, dp, un); double ymax = min(d.t2, g.t2); double ymin = max(d.t2 - d.h, g.t2 - g.h); double inter_area = in.empty() ? 
0 : area(in.front()); double inter_vol = inter_area * max(0.0, ymax - ymin); double det_vol = d.h * d.l * d.w; double gt_vol = g.h * g.l * g.w; double o; if(criterion==-1) // union o = inter_vol / (det_vol + gt_vol - inter_vol); else if(criterion==0) // bbox_a o = inter_vol / det_vol; else if(criterion==1) // bbox_b o = inter_vol / gt_vol; return o; } vector getThresholds(vector &v, double n_groundtruth){ // holds scores needed to compute N_SAMPLE_PTS recall values vector t; // sort scores in descending order // (highest score is assumed to give best/most confident detections) sort(v.begin(), v.end(), greater()); // get scores for linearly spaced recall double current_recall = 0; for(int32_t i=0; i >, const vector &det, vector &ignored_gt, vector &dc, vector &ignored_det, int32_t &n_gt, DIFFICULTY difficulty){ // extract ground truth bounding boxes for current evaluation class for(int32_t i=0;iMAX_OCCLUSION[difficulty] || gt[i].truncation>MAX_TRUNCATION[difficulty] || height >, const vector &det, const vector &dc, const vector &ignored_gt, const vector &ignored_det, bool compute_fp, double (*boxoverlap)(tDetection, tGroundtruth, int32_t), METRIC metric, bool compute_aos=false, double thresh=0, bool debug=false){ tPrData stat = tPrData(); const double NO_DETECTION = -10000000; vector delta; // holds angular difference for TPs (needed for AOS evaluation) vector assigned_detection; // holds wether a detection was assigned to a valid or ignored ground truth assigned_detection.assign(det.size(), false); vector ignored_threshold; ignored_threshold.assign(det.size(), false); // holds detections with a threshold lower than thresh if FP are computed // detections with a low score are ignored for computing precision (needs FP) if(compute_fp) for(int32_t i=0; i 0.5) (logical len(det)) =======================================================================*/ int32_t det_idx = -1; double valid_detection = NO_DETECTION; double max_overlap = 0; // search for a possible detection 
bool assigned_ignored_det = false; for(int32_t j=0; jMIN_OVERLAP[metric][current_class] && det[j].thresh>valid_detection){ det_idx = j; valid_detection = det[j].thresh; } // for computing pr curve values, the candidate with the greatest overlap is considered // if the greatest overlap is an ignored detection (min_height), the overlapping detection is used else if(compute_fp && overlap>MIN_OVERLAP[metric][current_class] && (overlap>max_overlap || assigned_ignored_det) && ignored_det[j]==0){ max_overlap = overlap; det_idx = j; valid_detection = 1; assigned_ignored_det = false; } else if(compute_fp && overlap>MIN_OVERLAP[metric][current_class] && valid_detection==NO_DETECTION && ignored_det[j]==1){ det_idx = j; valid_detection = 1; assigned_ignored_det = true; } } /*======================================================================= compute TP, FP and FN =======================================================================*/ // nothing was assigned to this valid ground truth if(valid_detection==NO_DETECTION && ignored_gt[i]==0) { stat.fn++; } // only evaluate valid ground truth <=> detection assignments (considering difficulty level) else if(valid_detection!=NO_DETECTION && (ignored_gt[i]==1 || ignored_det[det_idx]==1)) assigned_detection[det_idx] = true; // found a valid true positive else if(valid_detection!=NO_DETECTION){ // write highest score to threshold vector stat.tp++; stat.v.push_back(det[det_idx].thresh); // compute angular difference of detection and ground truth if valid detection orientation was provided if(compute_aos) delta.push_back(gt[i].box.alpha - det[det_idx].box.alpha); // test use ry as the error measure //delta.push_back(gt[i].ry - det[det_idx].ry); // clean up assigned_detection[det_idx] = true; } } // if FP are requested, consider stuff area if(compute_fp){ // count fp for(int32_t i=0; iMIN_OVERLAP[metric][current_class]){ assigned_detection[j] = true; nstuff++; } } } // FP = no. 
of all not to ground truth assigned detections - detections assigned to stuff areas stat.fp -= nstuff; // if all orientation values are valid, the AOS is computed if(compute_aos){ vector tmp; // FP have a similarity of 0, for all TP compute AOS tmp.assign(stat.fp, 0); for(int32_t i=0; i0 || stat.fp>0) stat.similarity = accumulate(tmp.begin(), tmp.end(), 0.0); // there was neither a FP nor a TP, so the similarity is ignored in the evaluation else stat.similarity = -1; } } return stat; } /*======================================================================= EVALUATE CLASS-WISE =======================================================================*/ bool eval_class (FILE *fp_det, FILE *fp_ori, CLASSES current_class, const vector< vector > &groundtruth, const vector< vector > &detections, bool compute_aos, double (*boxoverlap)(tDetection, tGroundtruth, int32_t), vector &precision, vector &aos, DIFFICULTY difficulty, METRIC metric) { assert(groundtruth.size() == detections.size()); // init int32_t n_gt=0; // total no. 
of gt (denominator of recall) vector v, thresholds; // detection scores, evaluated for recall discretization vector< vector > ignored_gt, ignored_det; // index of ignored gt detection for current class/difficulty vector< vector > dontcare; // index of dontcare areas, included in ground truth // for all test images do for (int32_t i=0; i i_gt, i_det; vector dc; // only evaluate objects of current class and ignore occluded, truncated objects cleanData(current_class, groundtruth[i], detections[i], i_gt, dc, i_det, n_gt, difficulty); ignored_gt.push_back(i_gt); ignored_det.push_back(i_det); dontcare.push_back(dc); // compute statistics to get recall values tPrData pr_tmp = tPrData(); pr_tmp = computeStatistics(current_class, groundtruth[i], detections[i], dc, i_gt, i_det, false, boxoverlap, metric); // add detection scores to vector over all images for(int32_t j=0; j pr; pr.assign(thresholds.size(),tPrData()); for (int32_t i=0; i recall; precision.assign(N_SAMPLE_PTS, 0); if(compute_aos) aos.assign(N_SAMPLE_PTS, 0); double r=0; for (int32_t i=0; i vals[],bool is_aos){ char command[1024]; // save plot data to file FILE *fp = fopen((dir_name + "/" + file_name + ".txt").c_str(),"w"); printf("save %s\n", (dir_name + "/" + file_name + ".txt").c_str()); for (int32_t i=0; i<(int)N_SAMPLE_PTS; i++) fprintf(fp,"%f %f %f %f\n",(double)i/(N_SAMPLE_PTS-1.0),vals[0][i],vals[1][i],vals[2][i]); fclose(fp); float sum[3] = {0, 0, 0}; for (int v = 0; v < 3; ++v) for (int i = 1; i < vals[v].size(); i = i + 1) sum[v] += vals[v][i]; printf("%s AP: %f %f %f\n", file_name.c_str(), sum[0] / 40 * 100, sum[1] / 40 * 100, sum[2] / 40 * 100); // create png + eps for (int32_t j=0; j<2; j++) { // open file FILE *fp = fopen((dir_name + "/" + file_name + ".gp").c_str(),"w"); // save gnuplot instructions if (j==0) { fprintf(fp,"set term png size 450,315 font \"Helvetica\" 11\n"); fprintf(fp,"set output \"%s.png\"\n",file_name.c_str()); } else { fprintf(fp,"set term postscript eps enhanced color font 
\"Helvetica\" 20\n"); fprintf(fp,"set output \"%s.eps\"\n",file_name.c_str()); } // set labels and ranges fprintf(fp,"set size ratio 0.7\n"); fprintf(fp,"set xrange [0:1]\n"); fprintf(fp,"set yrange [0:1]\n"); fprintf(fp,"set xlabel \"Recall\"\n"); if (!is_aos) fprintf(fp,"set ylabel \"Precision\"\n"); else fprintf(fp,"set ylabel \"Orientation Similarity\"\n"); obj_type[0] = toupper(obj_type[0]); fprintf(fp,"set title \"%s\"\n",obj_type.c_str()); // line width int32_t lw = 5; if (j==0) lw = 3; // plot error curve fprintf(fp,"plot "); fprintf(fp,"\"%s.txt\" using 1:2 title 'Easy' with lines ls 1 lw %d,",file_name.c_str(),lw); fprintf(fp,"\"%s.txt\" using 1:3 title 'Moderate' with lines ls 2 lw %d,",file_name.c_str(),lw); fprintf(fp,"\"%s.txt\" using 1:4 title 'Hard' with lines ls 3 lw %d",file_name.c_str(),lw); // close file fclose(fp); // run gnuplot => create png + eps sprintf(command,"cd %s; gnuplot %s",dir_name.c_str(),(file_name + ".gp").c_str()); system(command); } // create pdf and crop sprintf(command,"cd %s; ps2pdf %s.eps %s_large.pdf",dir_name.c_str(),file_name.c_str(),file_name.c_str()); system(command); sprintf(command,"cd %s; pdfcrop %s_large.pdf %s.pdf",dir_name.c_str(),file_name.c_str(),file_name.c_str()); system(command); sprintf(command,"cd %s; rm %s_large.pdf",dir_name.c_str(),file_name.c_str()); system(command); } vector getEvalIndices(const string& result_dir) { DIR* dir; dirent* entity; dir = opendir(result_dir.c_str()); if (dir) { while (entity = readdir(dir)) { string path(entity->d_name); int32_t len = path.size(); if (len < 10) continue; int32_t index = atoi(path.substr(len - 10, 10).c_str()); indices.push_back(index); } } return indices; } bool eval(string gt_dir, string result_dir, Mail* mail){ // set some global parameters initGlobals(); // ground truth and result directories // string gt_dir = "data/object/label_2"; // string result_dir = "results/" + result_sha; string plot_dir = result_dir + "/plot"; // create output directories 
system(("mkdir " + plot_dir).c_str()); // hold detections and ground truth in memory vector< vector > groundtruth; vector< vector > detections; // holds wether orientation similarity shall be computed (might be set to false while loading detections) // and which labels where provided by this submission bool compute_aos=true; vector eval_image(NUM_CLASS, false); vector eval_ground(NUM_CLASS, false); vector eval_3d(NUM_CLASS, false); // for all images read groundtruth and detections mail->msg("Loading detections..."); std::vector indices = getEvalIndices(result_dir + "/data/"); printf("number of files for evaluation: %d\n", (int)indices.size()); for (int32_t i=0; i gt = loadGroundtruth(gt_dir + "/" + file_name,gt_success); vector det = loadDetections(result_dir + "/data/" + file_name, compute_aos, eval_image, eval_ground, eval_3d, det_success); groundtruth.push_back(gt); detections.push_back(det); // check for errors if (!gt_success) { mail->msg("ERROR: Couldn't read: %s of ground truth. Please write me an email!", file_name); return false; } if (!det_success) { mail->msg("ERROR: Couldn't read: %s", file_name); return false; } } mail->msg(" done."); // holds pointers for result files FILE *fp_det=0, *fp_ori=0; // eval image 2D bounding boxes for (int c = 0; c < NUM_CLASS; c++) { CLASSES cls = (CLASSES)c; if (eval_image[c]) { fp_det = fopen((result_dir + "/stats_" + CLASS_NAMES[c] + "_detection.txt").c_str(), "w"); if(compute_aos) fp_ori = fopen((result_dir + "/stats_" + CLASS_NAMES[c] + "_orientation.txt").c_str(),"w"); vector precision[3], aos[3]; if( !eval_class(fp_det, fp_ori, cls, groundtruth, detections, compute_aos, imageBoxOverlap, precision[0], aos[0], EASY, IMAGE) || !eval_class(fp_det, fp_ori, cls, groundtruth, detections, compute_aos, imageBoxOverlap, precision[1], aos[1], MODERATE, IMAGE) || !eval_class(fp_det, fp_ori, cls, groundtruth, detections, compute_aos, imageBoxOverlap, precision[2], aos[2], HARD, IMAGE)) { mail->msg("%s evaluation failed.", 
CLASS_NAMES[c].c_str()); return false; } fclose(fp_det); saveAndPlotPlots(plot_dir, CLASS_NAMES[c] + "_detection", CLASS_NAMES[c], precision, 0); if(compute_aos){ saveAndPlotPlots(plot_dir, CLASS_NAMES[c] + "_orientation", CLASS_NAMES[c], aos, 1); fclose(fp_ori); } } } // don't evaluate AOS for birdview boxes and 3D boxes compute_aos = false; // eval bird's eye view bounding boxes for (int c = 0; c < NUM_CLASS; c++) { CLASSES cls = (CLASSES)c; if (eval_ground[c]) { fp_det = fopen((result_dir + "/stats_" + CLASS_NAMES[c] + "_detection_ground.txt").c_str(), "w"); vector precision[3], aos[3]; if( !eval_class(fp_det, fp_ori, cls, groundtruth, detections, compute_aos, groundBoxOverlap, precision[0], aos[0], EASY, GROUND) || !eval_class(fp_det, fp_ori, cls, groundtruth, detections, compute_aos, groundBoxOverlap, precision[1], aos[1], MODERATE, GROUND) || !eval_class(fp_det, fp_ori, cls, groundtruth, detections, compute_aos, groundBoxOverlap, precision[2], aos[2], HARD, GROUND)) { mail->msg("%s evaluation failed.", CLASS_NAMES[c].c_str()); return false; } fclose(fp_det); saveAndPlotPlots(plot_dir, CLASS_NAMES[c] + "_detection_ground", CLASS_NAMES[c], precision, 0); } } // eval 3D bounding boxes for (int c = 0; c < NUM_CLASS; c++) { CLASSES cls = (CLASSES)c; if (eval_3d[c]) { fp_det = fopen((result_dir + "/stats_" + CLASS_NAMES[c] + "_detection_3d.txt").c_str(), "w"); vector precision[3], aos[3]; if( !eval_class(fp_det, fp_ori, cls, groundtruth, detections, compute_aos, box3DOverlap, precision[0], aos[0], EASY, BOX3D) || !eval_class(fp_det, fp_ori, cls, groundtruth, detections, compute_aos, box3DOverlap, precision[1], aos[1], MODERATE, BOX3D) || !eval_class(fp_det, fp_ori, cls, groundtruth, detections, compute_aos, box3DOverlap, precision[2], aos[2], HARD, BOX3D)) { mail->msg("%s evaluation failed.", CLASS_NAMES[c].c_str()); return false; } fclose(fp_det); saveAndPlotPlots(plot_dir, CLASS_NAMES[c] + "_detection_3d", CLASS_NAMES[c], precision, 0); } } // success return 
// Collects status messages, echoes them to stdout and, when a recipient
// address is given, also pipes them to sendmail as the body of an email.
class Mail {

public:

  // email: recipient address. When empty (the default) no mail process is
  // started and msg() only prints to stdout.
  Mail (std::string email = "") {
    mail = 0;
    if (!email.empty()) {
      mail = popen("/usr/lib/sendmail -t -f noreply@cvlibs.net","w");
      // popen may fail; in that case fall back to stdout-only operation
      // instead of writing the header through a null stream
      if (mail) {
        fprintf(mail,"To: %s\n", email.c_str());
        fprintf(mail,"From: noreply@cvlibs.net\n");
        fprintf(mail,"Subject: KITTI Evaluation Benchmark\n");
        fprintf(mail,"\n\n");
      }
    }
  }

  // closing the pipe lets the mail process finish and send the message
  ~Mail() {
    if (mail) {
      pclose(mail);
    }
  }

  // printf-style message: appended to the mail body (if any) and printed to
  // stdout, each followed by a newline.
  void msg (const char *format, ...) {
    va_list args;
    // a va_list is indeterminate after a v*printf call has consumed it, so
    // the argument list is restarted for each of the two consumers
    if (mail) {
      va_start(args,format);
      vfprintf(mail,format,args);
      va_end(args);
      fprintf(mail,"\n");
    }
    va_start(args,format);
    vprintf(format,args);
    va_end(args);
    printf("\n");
  }

private:

  FILE *mail; // pipe to the sendmail process, or 0 when no mail is sent

};
def _get_car_instance_module(cfgs):
    """
    Return the car_instance module of the dataset named in the configuration.
    """
    # attribute lookup instead of eval() on a string built from the configuration
    return getattr(dataset, cfgs['dataset']['name']).car_instance


def _select_metric(cfgs):
    """
    Return (metric function, save-debug-images flag) for cfgs['exp_type'].

    Raises ValueError for an experiment type that has no metric assigned.
    """
    exp_type = cfgs['exp_type']
    if exp_type in ['baselinealpha', 'baselinetheta']:
        return get_angle_error, False
    if exp_type == 'instanceto2d':
        return get_distance_src, cfgs['training_settings']['debug']['save']
    raise ValueError('Unsupported exp_type: {}'.format(exp_type))


def choose_loss_func(model_settings, cfgs):
    """
    Initialize the loss function used for training.

    model_settings: the model configuration; 'loss_type' selects the loss.
        For 'JointsCompositeLoss' the keys 'loss_spec_list',
        'loss_weight_list', 'input_size', 'heatmap_size' and
        'cr_loss_threshold' are read as well.
    cfgs: the full configuration; cfgs['training_settings']['use_target_weight']
        is passed to every loss other than the composite one.

    Returns the loss object after moving it to the GPU with .cuda().
    """
    loss_type = model_settings['loss_type']
    if loss_type == 'JointsCompositeLoss':
        func = loss_func.JointsCompositeLoss(
            spec_list=model_settings['loss_spec_list'],
            img_size=model_settings['input_size'],
            hm_size=model_settings['heatmap_size'],
            cr_loss_thres=model_settings['cr_loss_threshold'],
            loss_weights=model_settings['loss_weight_list'],
        )
    else:
        # look the class up by name; an unknown name still raises AttributeError
        loss_class = getattr(loss_func, loss_type)
        func = loss_class(use_target_weight=cfgs['training_settings']['use_target_weight'])
    # the order of the points is needed when computing the cross-ratio loss
    if model_settings['loss_spec_list'][2] != 'None':
        func.cr_indices = libs.dataset.KITTI.car_instance.cr_indices_dict['bbox12']
        func.target_cr = 4/3
    return func.cuda()


def train(model, model_settings, GPUs, cfgs, logger, final_output_dir):
    """
    The training method.

    Logs a model summary, wraps the model in DataParallel on the given GPUs,
    trains it with trainer.train and saves the final weights to
    <final_output_dir>/HC.pth.

    Raises ValueError when cfgs['exp_type'] has no metric assigned; this is
    checked first so a bad configuration fails before any data is prepared.
    """
    # metrics used for training error
    metric_function, save_debug_images = _select_metric(cfgs)

    # get model summary
    input_size = model_settings['input_size']
    input_channels = 5 if cfgs['heatmapModel']['add_xy'] else 3
    dump_input = torch.rand((1, input_channels, input_size[1], input_size[0]))
    logger.info(get_model_summary(model, dump_input))

    model = torch.nn.DataParallel(model, device_ids=GPUs).cuda()

    # specify loss function
    func = choose_loss_func(model_settings, cfgs)

    # dataset preparation
    train_dataset, valid_dataset = _get_car_instance_module(cfgs).prepare_data(cfgs, logger)

    # get the optimizer and learning rate scheduler
    optim, sche = optimizer.prepare_optim(model, cfgs)

    collate_fn = train_dataset.get_collate_fn()

    trainer.train(train_dataset=train_dataset,
                  valid_dataset=valid_dataset,
                  model=model,
                  loss_func=func,
                  optim=optim,
                  sche=sche,
                  metric_func=metric_function,
                  cfgs=cfgs,
                  logger=logger,
                  collate_fn=collate_fn,
                  save_debug=save_debug_images
                  )

    final_model_state_file = os.path.join(final_output_dir, 'HC.pth')
    logger.info('=> saving final model state to {}'.format(final_model_state_file))
    # save the weights of the wrapped model, not of the DataParallel wrapper
    torch.save(model.module.state_dict(), final_model_state_file)
    return


def evaluate(model, model_settings, GPUs, cfgs, logger, final_output_dir, eval_train=False):
    """
    Evaluate a saved model on the validation split.

    The weights are loaded from cfgs['dirs']['load_hm_model']. When
    eval_train is True the training split is evaluated as well.
    """
    saved_path = cfgs['dirs']['load_hm_model']
    # load onto the CPU first so a checkpoint written from another GPU index
    # can be restored; the model is moved to the requested GPUs right after
    model.load_state_dict(torch.load(saved_path, map_location='cpu'))
    model = torch.nn.DataParallel(model, device_ids=GPUs).cuda()

    evaluator = Evaluator(cfgs['testing_settings']['eval_metrics'], cfgs)

    # define loss function (criterion)
    loss_function = choose_loss_func(model_settings, cfgs)

    # dataset preparation
    train_dataset, valid_dataset = _get_car_instance_module(cfgs).prepare_data(cfgs, logger)
    collate_fn = valid_dataset.get_collate_fn()

    logger.info("Evaluation on the validation split:")
    trainer.evaluate(valid_dataset, model, loss_function, cfgs, logger, evaluator, collate_fn=collate_fn)

    if eval_train:
        logger.info("Evaluation on the training split:")
        trainer.evaluate(train_dataset, model, loss_function, cfgs, logger, evaluator, collate_fn=collate_fn)
    return


def main():
    """
    Parse the configuration, build the heatmap model and train or evaluate it.

    Raises RuntimeError when no GPU can be used: the model and the loss are
    moved to the GPU unconditionally, so the script cannot run without one.
    """
    # experiment configurations
    cfgs = parse.parse_args()

    # logging
    logger, final_output_dir = liblogger.get_logger(cfgs)

    # Set GPU
    if cfgs['use_gpu'] and torch.cuda.is_available():
        GPUs = cfgs['gpu_id']
    else:
        logger.info("GPU acceleration is disabled.")
        # previously execution continued and failed on the undefined GPU list
        raise RuntimeError('This script requires a CUDA device: enable use_gpu '
                           'and make sure CUDA is available.')
    if len(GPUs) == 1:
        torch.cuda.set_device(GPUs[0])

    # cudnn related setting
    torch.backends.cudnn.benchmark = cfgs['cudnn']['benchmark']
    torch.backends.cudnn.deterministic = cfgs['cudnn']['deterministic']
    torch.backends.cudnn.enabled = cfgs['cudnn']['enabled']

    # model initialization
    model_settings = cfgs['heatmapModel']
    model_name = model_settings['name']
    get_pose_net = getattr(models.heatmapModel, model_name).get_pose_net
    model = get_pose_net(cfgs, is_train=cfgs['train'])

    if cfgs['train']:
        train(model, model_settings, GPUs, cfgs, logger, final_output_dir)
    elif cfgs['evaluate']:
        evaluate(model, model_settings, GPUs, cfgs, logger, final_output_dir)
def main():
    """
    Train, save, and/or evaluate the lifting sub-network.

    The steps are selected by the configuration flags 'train', 'save',
    'visualize' and 'evaluate'.

    Returns the record produced by trainer.train_cascade, or None when no
    training was requested (previously this case raised UnboundLocalError).
    """
    # experiment configurations
    cfgs = parse.parse_args()

    # logging
    logger, final_output_dir = liblogger.get_logger(cfgs)

    # Set GPU
    use_cuda = cfgs['use_gpu'] and torch.cuda.is_available()
    if not use_cuda:
        logger.info("GPU acceleration is disabled.")

    # load datasets
    train_dataset, eval_dataset = car_instance.prepare_data(cfgs, logger)
    logger.info("Finished preparing datasets...")

    # training
    record = None
    cascade = None
    if cfgs['train']:
        record = trainer.train_cascade(train_dataset, eval_dataset, cfgs, logger)
        cascade = record['cascade']

    # only a model trained in this run can be saved
    if cfgs['save'] and cascade is not None:
        save_path = os.path.join(cfgs['dirs']['output'], "KITTI")
        # also creates a missing output directory and tolerates an existing one
        os.makedirs(save_path, exist_ok=True)
        # save the model and the normalization statistics
        torch.save(cascade[0].cpu().state_dict(),
                   os.path.join(save_path, 'L.pth')
                   )
        np.save(os.path.join(save_path, 'LS.npy'), train_dataset.statistics)
        logger.info('=> saving final model state to {}'.format(save_path))

    if cfgs['visualize'] or cfgs['evaluate']:
        # the model is always re-loaded from disk here, even after training
        cascade = torch.load(cfgs['load_model_path'])
        # move to the GPU only when one is actually usable
        if use_cuda:
            cascade.cuda()
        if cfgs['evaluate']:
            trainer.evaluate_cascade(cascade, eval_dataset, cfgs)
    return record