Repository: haomo-ai/Cam4DOcc Branch: main Commit: 542f14a9d9e1 Files: 110 Total size: 543.7 KB Directory structure: gitextract_hgf40bk9/ ├── LICENSE ├── README.md ├── data/ │ ├── README.md │ ├── cam4docc/ │ │ ├── .gitkeep │ │ ├── GMO/ │ │ │ └── .gitkeep │ │ ├── GMO_lyft/ │ │ │ └── .gitkeep │ │ ├── MMO/ │ │ │ └── .gitkeep │ │ └── MMO_lyft/ │ │ └── .gitkeep │ └── nuscenes/ │ └── .gitkeep ├── other_baselines/ │ ├── README.md │ ├── lifted_2d/ │ │ └── eval_lifted_2d.py │ ├── static_world/ │ │ └── eval_static_world.py │ └── voxel_pcp/ │ └── eval_voxel_pcp.py ├── projects/ │ ├── __init__.py │ ├── configs/ │ │ ├── _base_/ │ │ │ ├── datasets/ │ │ │ │ ├── custom_lyft-3d.py │ │ │ │ ├── custom_nus-3d.py │ │ │ │ └── custom_waymo-3d.py │ │ │ ├── default_runtime.py │ │ │ └── schedules/ │ │ │ ├── cosine.py │ │ │ ├── cyclic_20e.py │ │ │ ├── cyclic_40e.py │ │ │ ├── mmdet_schedule_1x.py │ │ │ ├── schedule_2x.py │ │ │ ├── schedule_3x.py │ │ │ ├── seg_cosine_150e.py │ │ │ ├── seg_cosine_200e.py │ │ │ └── seg_cosine_50e.py │ │ ├── baselines/ │ │ │ ├── OCFNet_in_Cam4DOcc_V1.1.py │ │ │ ├── OCFNet_in_Cam4DOcc_V1.1_lyft.py │ │ │ ├── OCFNet_in_Cam4DOcc_V1.2.py │ │ │ └── OCFNet_in_Cam4DOcc_V1.2_lyft.py │ │ └── datasets/ │ │ └── custom_nus-3d.py │ └── occ_plugin/ │ ├── __init__.py │ ├── core/ │ │ ├── __init__.py │ │ ├── evaluation/ │ │ │ ├── __init__.py │ │ │ ├── efficiency_hooks.py │ │ │ └── eval_hooks.py │ │ └── visualizer/ │ │ ├── __init__.py │ │ └── show_occ.py │ ├── datasets/ │ │ ├── __init__.py │ │ ├── builder.py │ │ ├── cam4docc_dataset.py │ │ ├── cam4docc_lyft_dataset.py │ │ ├── nuscenes_dataset.py │ │ ├── pipelines/ │ │ │ ├── __init__.py │ │ │ ├── formating.py │ │ │ ├── loading_bevdet.py │ │ │ ├── loading_instance.py │ │ │ ├── loading_occupancy.py │ │ │ └── transform_3d.py │ │ └── samplers/ │ │ ├── __init__.py │ │ ├── distributed_sampler.py │ │ ├── group_sampler.py │ │ └── sampler.py │ ├── occupancy/ │ │ ├── __init__.py │ │ ├── apis/ │ │ │ ├── __init__.py │ │ │ ├── mmdet_train.py │ │ │ ├── test.py │ │ │ └── train.py │ │ ├── backbones/ │ │ │ ├── __init__.py │ │ │ ├── pred_block.py │ │ │ └── resnet3d.py │ │ ├── dense_heads/ │ │ │ ├── __init__.py │ │ │ ├── flow_head.py │ │ │ ├── lovasz_softmax.py │ │ │ ├── occ_head.py │ │ │ └── utils.py │ │ ├── detectors/ │ │ │ ├── __init__.py │ │ │ ├── bevdepth.py │ │ │ └── ocfnet.py │ │ ├── fuser/ │ │ │ ├── __init__.py │ │ │ ├── addfuse.py │ │ │ ├── convfuse.py │ │ │ └── visfuse.py │ │ ├── image2bev/ │ │ │ ├── ViewTransformerLSSBEVDepth.py │ │ │ ├── ViewTransformerLSSVoxel.py │ │ │ └── __init__.py │ │ ├── necks/ │ │ │ ├── __init__.py │ │ │ ├── fpn3d.py │ │ │ └── second_fpn_3d.py │ │ └── voxel_encoder/ │ │ ├── __init__.py │ │ └── sparse_lidar_enc.py │ ├── ops/ │ │ ├── __init__.py │ │ └── occ_pooling/ │ │ ├── OCC_Pool.py │ │ ├── __init__.py │ │ └── src/ │ │ ├── occ_pool.cpp │ │ └── occ_pool_cuda.cu │ └── utils/ │ ├── __init__.py │ ├── coordinate_transform.py │ ├── formating.py │ ├── gaussian.py │ ├── geometry.py │ ├── metric_util.py │ ├── nusc_param.py │ ├── semkitti.py │ └── voxel_to_points.py ├── run.sh ├── run_eval.sh ├── setup.py ├── tools/ │ ├── dist_test.sh │ ├── dist_train.sh │ ├── gen_data/ │ │ └── gen_depth_gt.py │ ├── misc/ │ │ ├── browse_dataset.py │ │ ├── fuse_conv_bn.py │ │ ├── print_config.py │ │ └── visualize_results.py │ ├── test.py │ └── train.py └── viz/ ├── viz_gt.py └── viz_pred.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: LICENSE 
================================================
MIT License

Copyright (c) 2023 HAOMO.AI

Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

================================================
FILE: README.md
================================================
# Cam4DOcc

The official code and data for the benchmark with baselines for our paper: [Cam4DOcc: Benchmark for Camera-Only 4D Occupancy Forecasting in Autonomous Driving Applications](https://arxiv.org/abs/2311.17663)

This work has been accepted by CVPR 2024 :tada:

[Junyi Ma#](https://github.com/BIT-MJY), [Xieyuanli Chen#](https://github.com/Chen-Xieyuanli), Jiawei Huang, [Jingyi Xu](https://github.com/BIT-XJY), [Zhen Luo](https://github.com/Blurryface0814), Jintao Xu, Weihao Gu, Rui Ai, [Hesheng Wang*](https://scholar.google.com/citations?hl=en&user=q6AY9XsAAAAJ)

## Citation

If you use Cam4DOcc in an academic work, please cite our paper:

    @inproceedings{ma2024cvpr,
      author = {Junyi Ma and Xieyuanli Chen and Jiawei Huang and Jingyi Xu and Zhen Luo and Jintao Xu and Weihao Gu and Rui Ai and Hesheng Wang},
      title = {{Cam4DOcc: Benchmark for Camera-Only 4D Occupancy Forecasting in Autonomous Driving Applications}},
      booktitle = {Proc.~of the IEEE/CVF Conf.~on Computer Vision and Pattern Recognition (CVPR)},
      year = 2024
    }

## Installation
We follow the installation instructions of our codebase OpenOccupancy, which are also posted here.

* Create a conda virtual environment and activate it
```bash
conda create -n cam4docc python=3.7 -y
conda activate cam4docc
```
* Install PyTorch and torchvision (tested on torch==1.10.1 & cuda=11.3)
```bash
conda install pytorch==1.10.1 torchvision==0.11.2 torchaudio==0.10.1 cudatoolkit=11.3 -c pytorch -c conda-forge
```
* Install gcc>=5 in the conda env
```bash
conda install -c omgarcia gcc-6
```
* Install mmcv, mmdet, and mmseg
```bash
pip install mmcv-full==1.4.0
pip install mmdet==2.14.0
pip install mmsegmentation==0.14.1
```
* Install mmdet3d from the source code
```bash
git clone https://github.com/open-mmlab/mmdetection3d.git
cd mmdetection3d
git checkout v0.17.1 # Other versions may not be compatible.
python setup.py install
```
* Install other dependencies
```bash
pip install timm
pip install open3d-python
pip install PyMCubes
pip install spconv-cu113
pip install fvcore
pip install setuptools==59.5.0
pip install lyft_dataset_sdk # for lyft dataset
```
* Install occupancy pooling
```bash
git clone git@github.com:haomo-ai/Cam4DOcc.git
cd Cam4DOcc
export PYTHONPATH="."
python setup.py develop
```
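After installation, a quick sanity check can save time before launching training. The snippet below is a minimal sketch (not part of the official instructions) that only prints the installed versions pinned above and confirms CUDA is visible.

```python
# Minimal environment sanity check for the pinned versions above (a sketch,
# not an official script): print library versions and confirm CUDA is visible.
import torch
import mmcv
import mmdet
import mmseg
import mmdet3d

print("torch:", torch.__version__, "| CUDA available:", torch.cuda.is_available())
print("mmcv-full:", mmcv.__version__)        # expected 1.4.0
print("mmdet:", mmdet.__version__)           # expected 2.14.0
print("mmsegmentation:", mmseg.__version__)  # expected 0.14.1
print("mmdet3d:", mmdet3d.__version__)       # expected 0.17.1
```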
## Data Structure ### nuScenes dataset * Please link your [nuScenes V1.0 full dataset](https://www.nuscenes.org/nuscenes#download) to the data folder. * [nuScenes-Occupancy](https://drive.google.com/file/d/1vTbgddMzUN6nLyWSsCZMb9KwihS7nPoH/view?usp=sharing), [nuscenes_occ_infos_train.pkl](https://github.com/JeffWang987/OpenOccupancy/releases/tag/train_pkl), and [nuscenes_occ_infos_val.pkl](https://github.com/JeffWang987/OpenOccupancy/releases/tag/val_pkl) are also provided by the previous work. If you only want to reproduce the forecasting results with "inflated" form, nuScenes dataset and Cam4DOcc are all you need. ### Lyft dataset * Please link your [Lyft dataset](https://www.kaggle.com/c/3d-object-detection-for-autonomous-vehicles/data) to the data folder. * The required folders are listed below. Note that the folders under `cam4docc` will be generated automatically once you first run our training or evaluation scripts. ```bash Cam4DOcc ├── data/ │ ├── nuscenes/ │ │ ├── maps/ │ │ ├── samples/ │ │ ├── sweeps/ │ │ ├── lidarseg/ │ │ ├── v1.0-test/ │ │ ├── v1.0-trainval/ │ │ ├── nuscenes_occ_infos_train.pkl │ │ ├── nuscenes_occ_infos_val.pkl │ ├── nuScenes-Occupancy/ │ ├── lyft/ │ │ ├── maps/ │ │ ├── train_data/ │ │ ├── images/ # from train images, containing xxx.jpeg │ ├── cam4docc │ │ ├── GMO/ │ │ │ ├── segmentation/ │ │ │ ├── instance/ │ │ │ ├── flow/ │ │ ├── MMO/ │ │ │ ├── segmentation/ │ │ │ ├── instance/ │ │ │ ├── flow/ │ │ ├── GMO_lyft/ │ │ │ ├── ... │ │ ├── MMO_lyft/ │ │ │ ├── ... ``` Alternatively, you could manually modify the path parameters in the [config files](https://github.com/haomo-ai/Cam4DOcc/tree/main/projects/configs/baselines) instead of using the default data structure, which are also listed here: ``` occ_path = "./data/nuScenes-Occupancy" depth_gt_path = './data/depth_gt' train_ann_file = "./data/nuscenes/nuscenes_occ_infos_train.pkl" val_ann_file = "./data/nuscenes/nuscenes_occ_infos_val.pkl" cam4docc_dataset_path = "./data/cam4docc/" nusc_root = './data/nuscenes/' ``` ## Training and Evaluation We directly integrate the Cam4DOcc dataset generation pipeline into the dataloader, so you can directly run training or evaluate scripts and just wait :smirk: Optionally, you can set `only_generate_dataset=True` in the [config files](https://github.com/haomo-ai/Cam4DOcc/tree/main/projects/configs/baselines) to only generate the Cam4DOcc data without model training and inference. ### Train OCFNetV1.1 with 8 GPUs OCFNetV1.1 can forecast inflated GMO and others. In this case, _vehicle_ and _human_ are considered as one unified category. For the nuScenes dataset, please run ```bash bash run.sh ./projects/configs/baselines/OCFNet_in_Cam4DOcc_V1.1.py 8 ``` For the Lyft dataset, please run ```bash bash run.sh ./projects/configs/baselines/OCFNet_in_Cam4DOcc_V1.1_lyft.py 8 ``` ### Train OCFNetV1.2 with 8 GPUs OCFNetV1.2 can forecast inflated GMO including _bicycle_, _bus_, _car_, _construction_, _motorcycle_, _trailer_, _truck_, _pedestrian_, and others. In this case, _vehicle_ and _human_ are divided into multiple categories for clearer evaluation on forecasting performance. For the nuScenes dataset, please run ```bash bash run.sh ./projects/configs/baselines/OCFNet_in_Cam4DOcc_V1.2.py 8 ``` For the Lyft dataset, please run ```bash bash run.sh ./projects/configs/baselines/OCFNet_in_Cam4DOcc_V1.2_lyft.py 8 ``` * The training/test process will be accelerated several times after you generate datasets by the first epoch. 
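Once the first epoch has written the Cam4DOcc files, you can inspect one generated sequence offline. The sketch below is a minimal example based on how the evaluation scripts in `other_baselines/` read these files back (each `.npz` stores, under `arr_0`, one array per frame whose rows are `[x_idx, y_idx, z_idx, label]` voxels); the file name shown is a placeholder following the `{scene_token}_{lidar_token}.npz` naming used by those scripts.

```python
# A minimal sketch for inspecting one generated Cam4DOcc GMO sequence.
# Assumption (from other_baselines/ eval scripts): 'arr_0' holds one array per
# frame whose rows are [x_idx, y_idx, z_idx, label] sparse voxels.
import numpy as np

occ_size = (512, 512, 40)  # matches occ_size in the baseline configs
seq_file = "./data/cam4docc/GMO/segmentation/{scene_token}_{lidar_token}.npz"  # placeholder name

def sparse_to_dense(frame_voxels, grid_size=occ_size):
    """Scatter sparse [x, y, z, label] rows into a dense voxel grid."""
    frame_voxels = np.asarray(frame_voxels)  # tolerate torch tensors stored in the archive
    dense = np.zeros(grid_size, dtype=np.uint8)
    idx = frame_voxels[:, :3].astype(int)
    dense[idx[:, 0], idx[:, 1], idx[:, 2]] = frame_voxels[:, -1].astype(np.uint8)
    return dense

seq = np.load(seq_file, allow_pickle=True)["arr_0"]
print("frames in sequence:", len(seq))
dense_t0 = sparse_to_dense(seq[0])
print("occupied GMO voxels at t=0:", int((dense_t0 > 0).sum()))
```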
### Test OCFNet for different tasks If you only want to test the performance of occupancy prediction for the present frame (current observation), please set `test_present=True` in the [config files](https://github.com/haomo-ai/Cam4DOcc/tree/main/projects/configs/baselines). Otherwise, forecasting performance on the future interval is evaluated. ```bash bash run_eval.sh $PATH_TO_CFG $PATH_TO_CKPT $GPU_NUM # e.g. bash run_eval.sh ./projects/configs/baselines/OCFNet_in_Cam4DOcc_V1.1.py ./work_dirs/OCFNet_in_Cam4DOcc_V1.1/epoch_20.pth 8 ``` Please set `save_pred` and `save_path` in the config files once saving prediction results is needed. `VPQ` evaluation of 3D instance prediction will be refined in the future. ### Visualization Please install the dependencies as follows: ```bash sudo apt-get install Xvfb pip install xvfbwrapper pip install mayavi ``` where `Xvfb` may be needed for visualization in your server. **Visualize ground-truth occupancy labels**. Set `show_time_change = True` if you want to show the changing state of occupancy in time intervals. ```bash cd viz python viz_gt.py ``` **Visualize occupancy forecasting results**. Set `show_time_change = True` if you want to show the changing state of occupancy in time intervals. ```bash cd viz python viz_pred.py ``` There is still room for improvement. Camera-only 4D occupancy forecasting remains challenging, especially for predicting over longer time intervals with many moving objects. We envision this benchmark as a valuable evaluation tool, and our OCFNet can serve as a foundational codebase for future research on 4D occupancy forecasting. ## Basic Information Some basic information as well as key parameters for our current version. | Type | Info | Parameter | | :----: | :----: | :----: | | train | 23,930 sequences | train_capacity | | val | 5,119 frames | test_capacity | | voxel size | 0.2m | voxel_x/y/z | | range | [-51.2m, -51.2m, -5m, 51.2m, 51.2m, 3m]| point_cloud_range | | volume size | [512, 512, 40]| occ_size | | classes | 2 for V1.1 / 9 for V1.2 | num_cls | | observation frames | 3 | time_receptive_field | | future frames | 4 | n_future_frames | | extension frames | 6 | n_future_frames_plus | Our proposed OCFNet can still perform well while being trained with partial data. Please try to decrease `train_capacity` if you want to explore more details with sparser supervision signals. In addition, please make sure that `n_future_frames_plus <= time_receptive_field + n_future_frames` because `n_future_frames_plus` means the real prediction number. We estimate more frames including the past ones rather than only `n_future_frames`. ## Pretrained Models We will provide our pretrained models of the erratum version. Your patience is appreciated. 
**Deprecated:** ~~Please download our pretrained models (for epoch=20) to resume training or reproduce results.~~

| Version | Google Drive | Baidu Cloud | Config |
| :---: | :---: | :---: | :---: |
| ~~V1.0~~ | ~~link~~ | ~~link~~ | ~~only vehicle~~ |
| V1.1 | [link](https://drive.google.com/file/d/1IXRqOQk3RKpIjGgBBqV9D9vgSt58QDr8/view?usp=sharing) | [link](https://pan.baidu.com/s/18gODsVnBAXEJ4pzv2-LqGA?pwd=m99b) | [OCFNet_in_Cam4DOcc_V1.1.py](https://github.com/haomo-ai/Cam4DOcc/blob/main/projects/configs/baselines/OCFNet_in_Cam4DOcc_V1.1.py) |
| V1.2 | [link](https://drive.google.com/file/d/1q1XnRt0wYE3oq6YBMBnagpGL7h2I46uN/view?usp=sharing) | [link](https://pan.baidu.com/s/1OPc1-a2McOO_0QPX63J7WQ?pwd=adic) | [OCFNet_in_Cam4DOcc_V1.2.py](https://github.com/haomo-ai/Cam4DOcc/blob/main/projects/configs/baselines/OCFNet_in_Cam4DOcc_V1.2.py) |

## Other Baselines

We also provide the evaluation of the forecasting performance of [other baselines](https://github.com/haomo-ai/Cam4DOcc/tree/main/other_baselines) in Cam4DOcc.

## TODO

The tutorial is being updated ...

We will release our pretrained models as soon as possible. OCFNetV1.3 and OCFNetV2 are on their way ...

### Acknowledgement

We thank the fantastic works [OpenOccupancy](https://github.com/JeffWang987/OpenOccupancy), [PowerBEV](https://github.com/EdwardLeeLPZ/PowerBEV), and [FIERY](https://anthonyhu.github.io/fiery) for their pioneering code releases, which provide the codebase for this benchmark.

================================================
FILE: data/README.md
================================================
### Data Structure

Please link your [nuScenes V1.0 full dataset](https://www.nuscenes.org/nuscenes#download) to the data folder.

[nuScenes-Occupancy](https://drive.google.com/file/d/1vTbgddMzUN6nLyWSsCZMb9KwihS7nPoH/view?usp=sharing), [nuscenes_occ_infos_train.pkl](https://github.com/JeffWang987/OpenOccupancy/releases/tag/train_pkl), and [nuscenes_occ_infos_val.pkl](https://github.com/JeffWang987/OpenOccupancy/releases/tag/val_pkl) are also provided by the previous work. If you only want to reproduce the forecasting results in the "inflated" form, the nuScenes dataset and Cam4DOcc are all you need.

Note that the folders under `cam4docc` will be generated automatically the first time you run our training or evaluation scripts.

```bash
Cam4DOcc
├── data/
│   ├── nuscenes/
│   │   ├── maps/
│   │   ├── samples/
│   │   ├── sweeps/
│   │   ├── lidarseg/
│   │   ├── v1.0-test/
│   │   ├── v1.0-trainval/
│   │   ├── nuscenes_occ_infos_train.pkl
│   │   ├── nuscenes_occ_infos_val.pkl
│   ├── nuScenes-Occupancy/
│   ├── cam4docc/
│   │   ├── GMO/
│   │   │   ├── segmentation/
│   │   │   ├── instance/
│   │   │   ├── flow/
│   │   ├── MMO/
│   │   │   ├── segmentation/
│   │   │   ├── instance/
│   │   │   ├── flow/
```

The GMO folder contains the data where vehicle and human are treated as one unified category. The MMO folder contains the data where vehicle and human are divided into multiple categories for a clearer evaluation of forecasting performance. In the near future, we will unify GMO and MMO for easier usage.
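Until GMO and MMO are unified, the relation between the two label sets can be illustrated with a short sketch. It assumes the MMO convention from the V1.2 config (label 0 is the background/empty index, labels 1..8 are the separate movable classes), so collapsing every non-zero label reproduces the unified GMO label.

```python
# A sketch of the GMO/MMO relation, assuming MMO uses 0 for background
# (empty_idx) and 1..8 for the separate movable classes, while GMO keeps a
# single unified "movable" label.
import numpy as np

mmo_labels = np.array([0, 3, 8, 0, 1])          # toy per-voxel MMO labels
gmo_labels = (mmo_labels > 0).astype(np.uint8)  # collapse to the unified GMO label
print(gmo_labels)  # [0 1 1 0 1]
```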
================================================ FILE: data/cam4docc/.gitkeep ================================================ ================================================ FILE: data/cam4docc/GMO/.gitkeep ================================================ ================================================ FILE: data/cam4docc/GMO_lyft/.gitkeep ================================================ ================================================ FILE: data/cam4docc/MMO/.gitkeep ================================================ ================================================ FILE: data/cam4docc/MMO_lyft/.gitkeep ================================================ ================================================ FILE: data/nuscenes/.gitkeep ================================================ ================================================ FILE: other_baselines/README.md ================================================ ## I. Static World The static world model is built based on the identity hypothesis. ```bash cd other_baselines/static_world python ./eval_static_world.py ``` #### Parameters: * **test_idx_dir**: Path of test indexes, which is generated by the standard OCFNet evaluation process. * **test_results_dir**: Path of occupancy prediction results. Here we simply set it to the path of OCFNet forecasting results and use the present occupancy prediction results for evaluation. You can also replace them with [OpenOccupancy](https://github.com/JeffWang987/OpenOccupancy) estimation results. * **gt_dir**: Path of ground-truth segmentations. ## II. Voxelization of PCP Voxelization of point cloud prediction requires the outputs of [PCPNet](https://github.com/Blurryface0814/PCPNet). Here we use nuScenes-Occupancy as ground-truth since predicted points are limited by sparsity. ```bash cd other_baselines/voxel_pcp python ./eval_voxel_pcp.py ``` #### Parameters: * **test_idx_dir**: Path of test indexes, which is generated by the standard OCFNet evaluation process. * **occ_path**: Path of nuScenes-Occupancy. * **test_results_dir**: Path of point cloud prediction results. The data is organized as follows: ```bash Cam4DOcc ├── data/ │ ├── cam4docc/ │ │ ├── pcpnet_results/ │ │ │ ├── point_clouds/ │ │ │ │ ├── past/ │ │ │ │ │ ├── 000000.ply │ │ │ │ │ ├── 000001.ply │ │ │ │ │ ├── 000002.ply │ │ │ │ │ ├── 000003.ply │ │ │ │ ├── pred/ │ │ │ │ │ ├── 000000.ply │ │ │ │ │ ├── ... │ │ │ ├── saved_labels/ │ │ │ │ ├── past/ │ │ │ │ │ ├── 000000.label │ │ │ │ │ ├── 000001.label │ │ │ │ │ ├── 000002.label │ │ │ │ │ ├── 000003.label │ │ │ │ ├── pred/ │ │ │ │ │ ├── 000000.ply │ │ │ │ │ ├── ... ``` We will provide our PCPNet predictions soon and please open an issue [here](https://github.com/Blurryface0814/PCPNet) if you have questions about how PCPNet is implemented for points forecasting. ## III. 2D-3D Lifted Prediction 2D-3D lifted prediction requires the outputs of [PowerBEV](https://github.com/EdwardLeeLPZ/PowerBEV). ```bash cd other_baselines/lifted_2d python ./eval_lifted_2d.py ``` #### Parameters: * **test_idx_dir**: Path of test indexes, which is generated by the standard OCFNet evaluation process. * **gt_dir**: Path of ground-truth segmentations. * **hmin**: minimum height for lifting operation. * **hmax**: maximum height for lifting operation. * **test_results_dir**: Path of point cloud prediction results. The data is organized as follows: ```bash Cam4DOcc ├── data/ │ ├── cam4docc/ │ │ ├── powerbev_results/ │ │ │ ├── {scene_token}_{lidar_token}.npz │ │ │ ├── ... 
``` We have provided our [PowerBEV predictions](https://drive.google.com/file/d/1X_N-GwU2ZB65UI9-EYpeQrb2BzS44VVX/view?usp=sharing) and please open an issue [here](https://github.com/EdwardLeeLPZ/PowerBEV) if you have questions about how PowerBEV is implemented for BEV-based instance prediction. More refinement strategies for the baselines will be released ... Before that, please simply use the scripts here for fast evaluation. ## Publications If you use our proposed baselines in your work, please cite as: * Cam4DOcc ``` @inproceedings{ma2024cvpr, author = {Junyi Ma and Xieyuanli Chen and Jiawei Huang and Jingyi Xu and Zhen Luo and Jintao Xu and Weihao Gu and Rui Ai and Hesheng Wang}, title = {{Cam4DOcc: Benchmark for Camera-Only 4D Occupancy Forecasting in Autonomous Driving Applications}}, booktitle = {Proc.~of the IEEE/CVF Conf.~on Computer Vision and Pattern Recognition (CVPR)}, year = 2024 } ``` * OpenOccupancy ``` @article{wang2023openoccupancy, title={Openoccupancy: A large scale benchmark for surrounding semantic occupancy perception}, author={Wang, Xiaofeng and Zhu, Zheng and Xu, Wenbo and Zhang, Yunpeng and Wei, Yi and Chi, Xu and Ye, Yun and Du, Dalong and Lu, Jiwen and Wang, Xingang}, journal={arXiv preprint arXiv:2303.03991}, year={2023} } ``` * PCPNet ``` @ARTICLE{10141631, author={Luo, Zhen and Ma, Junyi and Zhou, Zijie and Xiong, Guangming}, journal={IEEE Robotics and Automation Letters}, title={PCPNet: An Efficient and Semantic-Enhanced Transformer Network for Point Cloud Prediction}, year={2023}, volume={8}, number={7}, pages={4267-4274}, doi={10.1109/LRA.2023.3281937}} ``` * PowerBEV ``` @inproceedings{ijcai2023p120, title = {PowerBEV: A Powerful Yet Lightweight Framework for Instance Prediction in Bird’s-Eye View}, author = {Li, Peizheng and Ding, Shuxiao and Chen, Xieyuanli and Hanselmann, Niklas and Cordts, Marius and Gall, Juergen}, booktitle = {Proceedings of the Thirty-Second International Joint Conference on Artificial Intelligence, {IJCAI-23}}, pages = {1080--1088}, year = {2023}, month = {8}, doi = {10.24963/ijcai.2023/120}, } ``` ================================================ FILE: other_baselines/lifted_2d/eval_lifted_2d.py ================================================ # Developed by Junyi Ma # Cam4DOcc: Benchmark for Camera-Only 4D Occupancy Forecasting in Autonomous Driving Applications # https://github.com/haomo-ai/Cam4DOcc from tqdm import trange import numpy as np from nuscenes import NuScenes import os import torch import torch.nn.functional as F import copy from pyquaternion import Quaternion # Setups ================================================================================================= test_idx_dir = "../../data/cam4docc/test_ids/" test_results_dir = "../../data/cam4docc/powerbev_results/" gt_dir = "../../data/cam4docc/MMO/segmentation/" test_seqs = os.listdir(test_idx_dir) test_segmentations = os.listdir(test_results_dir) dimension = [512, 512, 40] future_ious = [0, 0, 0, 0] voxel_size = np.array([0.2,0.2,0.2]) pc_range = np.array([-50, -50, 0, 50, 50, 0]) voxel_size_new = np.array([0.2,0.2,0.2]) pc_range_new = np.array([-51.2, -51.2, -5, 51.2, 51.2, 3]) # 10*0.2=2m # You can modify the parameters to show the changes with variable heights for lifting hmin = -1 hmax = 9 nusc = NuScenes(version='v1.0-trainval', dataroot="../../data/nuscenes", verbose=False) # ======================================================================================================== def cm_to_ious(cm): mean_ious = [] cls_num = len(cm) for i in 
range(cls_num): tp = cm[i, i] p = cm[:, i].sum() g = cm[i, :].sum() union = p + g - tp mean_ious.append(tp / union) return mean_ious def fast_hist(pred, label, max_label=18): pred = copy.deepcopy(pred.flatten()) label = copy.deepcopy(label.flatten()) bin_count = np.bincount(max_label * label.astype(int) + pred, minlength=max_label ** 2) iou_per_pred = (bin_count[-1]/(bin_count[-1]+bin_count[1]+bin_count[2])) return bin_count[:max_label ** 2].reshape(max_label, max_label),iou_per_pred for i in trange(len(test_seqs)): segmentation_file = test_results_dir + test_seqs[i] instance_seq = np.load(segmentation_file)['arr_0'] instance_seq = torch.from_numpy(instance_seq) test_seqs_idxs = np.load(test_idx_dir+test_seqs[i])["arr_0"] gt_segmentation_file = os.path.join(gt_dir, test_seqs[i]) gt_segmentation_seqs = np.load(gt_segmentation_file, allow_pickle=True)['arr_0'] for t in range(3, 7): scene_token_cur = test_seqs_idxs[t].split("_")[0] lidar_token_cur = test_seqs_idxs[t].split("_")[1] instance_ = instance_seq[0,(t-1)].unsqueeze(0) # t-1 -> t instance_ = instance_.unsqueeze(0) instance_ = F.interpolate(instance_.float(), size=[500, 500], mode='nearest').contiguous() # Note: default PowerBEV has different ranges with OCFNet instance_ = instance_.squeeze(0) x_grid = torch.linspace(0, 500-1, 500, dtype=torch.float) x_grid = x_grid.view(500, 1).expand(500,500) y_grid = torch.linspace(0, 500-1,500, dtype=torch.float) y_grid = y_grid.view(1, 500).expand(500,500) mesh_grid_2d = torch.stack((x_grid, y_grid), -1) mesh_grid_2d = mesh_grid_2d.view(-1, 2) instance_ = instance_.view(-1, 1) semantics_lifted = [] for ii in range(hmin, hmax): semantics_lifted_ = torch.cat((mesh_grid_2d, ii*torch.ones_like(mesh_grid_2d[:,0:1])),dim=-1) semantics_lifted_ = torch.cat((semantics_lifted_, instance_),dim=-1) semantics_lifted.append(semantics_lifted_) semantics_lifted = np.array(torch.cat(semantics_lifted, dim=0)) kept = semantics_lifted[:,-1]!=0 semantics_lifted = semantics_lifted[kept] if semantics_lifted.shape[0] == 0: semantics_lifted = np.zeros((1,4)) lidar_sample = nusc.get('sample_data', lidar_token_cur) lidar_sample_calib = nusc.get('calibrated_sensor', lidar_sample['calibrated_sensor_token']) lidar_sensor_rotation = Quaternion(lidar_sample_calib['rotation']) lidar_sensor_translation = np.array(lidar_sample_calib['translation'])[:, None] lidar_to_lidarego = np.vstack([ np.hstack((lidar_sensor_rotation.rotation_matrix, lidar_sensor_translation)), np.array([0, 0, 0, 1]) ]) lidarego_to_lidar = np.linalg.inv(lidar_to_lidarego) points = np.ones_like(semantics_lifted) points[:,:3] = semantics_lifted[:,:3] points[:,:3] = points[:,:3] * voxel_size[None, :] + pc_range[:3][None, :] points = lidarego_to_lidar @ points.T semantics_lifted_transformed = np.ones_like(semantics_lifted) semantics_lifted_transformed[:,:3] = (points.T)[:,:3] semantics_lifted_transformed[:,-1] = semantics_lifted[:,-1] semantics_lifted_transformed[:,:3] = (semantics_lifted_transformed[:,:3] - pc_range_new[:3][None, :]) / voxel_size_new[None, :] pred_segmentation = np.zeros((dimension[0], dimension[1], dimension[2])) for j in range(semantics_lifted_transformed.shape[0]): cur_ind = semantics_lifted_transformed[j, :3].astype(int) cur_label = semantics_lifted_transformed[j, -1] if cur_label != 0: pred_segmentation[cur_ind[0],cur_ind[1],cur_ind[2]] = 1 gt_segmentation = np.zeros((dimension[0], dimension[1], dimension[2])) gt_segmentation_raw = gt_segmentation_seqs[t].cpu().numpy() 
gt_segmentation[gt_segmentation_raw[:,0].astype(int),gt_segmentation_raw[:,1].astype(int),gt_segmentation_raw[:,2].astype(int)] = gt_segmentation_raw[:, -1] hist_cur, iou_per_pred = fast_hist(pred_segmentation.astype(int), gt_segmentation.astype(int), max_label=2) if t <= 3: future_ious[0] = future_ious[0] + hist_cur if t <= 4: future_ious[1] = future_ious[1] + hist_cur if t <= 5: future_ious[2] = future_ious[2] + hist_cur if t <= 6: future_ious[3] = future_ious[3] + hist_cur for t in range(len(future_ious)): print("iou for step "+str(t), cm_to_ious(future_ious[t])) ================================================ FILE: other_baselines/static_world/eval_static_world.py ================================================ # Developed by Junyi Ma # Cam4DOcc: Benchmark for Camera-Only 4D Occupancy Forecasting in Autonomous Driving Applications # https://github.com/haomo-ai/Cam4DOcc import numpy as np import os import copy from tqdm import trange # Setups ================================================================================================= test_idx_dir = "../../data/cam4docc/test_ids/" test_results_dir = "../../data/cam4docc/results/" gt_dir = "../../data/cam4docc/MMO/segmentation/" objects_max_label = 9 test_seqs = os.listdir(test_idx_dir) test_segmentations = os.listdir(test_results_dir) dimension = [512, 512, 40] future_ious = [0, 0, 0, 0] # ======================================================================================================== def cm_to_ious(cm): mean_ious = [] cls_num = len(cm) for i in range(cls_num): tp = cm[i, i] p = cm[:, i].sum() g = cm[i, :].sum() union = p + g - tp mean_ious.append(tp / union) return mean_ious def fast_hist(pred, label, max_label=18): pred = copy.deepcopy(pred.flatten()) label = copy.deepcopy(label.flatten()) bin_count = np.bincount(max_label * label.astype(int) + pred, minlength=max_label ** 2) iou_per_pred = (bin_count[-1]/(bin_count[-1]+bin_count[1]+bin_count[2])) return bin_count[:max_label ** 2].reshape(max_label, max_label),iou_per_pred for i in trange(len(test_seqs)): segmentation_file = test_results_dir + test_seqs[i] if not os.path.exists(segmentation_file): continue segmentation = np.load(segmentation_file,allow_pickle=True)['arr_0'] test_seqs_idxs = np.load(os.path.join(test_idx_dir, test_seqs[i]))["arr_0"] gt_segmentation_file = os.path.join(gt_dir, test_seqs[i]) gt_segmentation_seqs = np.load(gt_segmentation_file,allow_pickle=True)['arr_0'] # hard coding for input:3 output:4 for t in range(3,7): # static world using present predictions segmentation_t = segmentation[0] pred_segmentation = np.zeros((dimension[0], dimension[1], dimension[2])) for j in range(segmentation_t.shape[0]): cur_ind = segmentation_t[j, :3].astype(int) cur_label = segmentation_t[j, -1] pred_segmentation[cur_ind[0],cur_ind[1],cur_ind[2]] = cur_label gt_segmentation = np.zeros((dimension[0], dimension[1], dimension[2])) gt_segmentation_raw = gt_segmentation_seqs[t] for k in range(gt_segmentation_raw.shape[0]): cur_ind = gt_segmentation_raw[k, :3].astype(int) cur_label = gt_segmentation_raw[k, -1] gt_segmentation[cur_ind[0],cur_ind[1],cur_ind[2]] = cur_label hist_cur, iou_per_pred = fast_hist(pred_segmentation.astype(int), gt_segmentation.astype(int), max_label=objects_max_label) if t <= 3: future_ious[0] = future_ious[0] + hist_cur if t <= 4: future_ious[1] = future_ious[1] + hist_cur if t <= 5: future_ious[2] = future_ious[2] + hist_cur if t <= 6: future_ious[3] = future_ious[3] + hist_cur for t in range(len(future_ious)): print("iou for step "+str(t), 
cm_to_ious(future_ious[t])) ================================================ FILE: other_baselines/voxel_pcp/eval_voxel_pcp.py ================================================ # Developed by Junyi Ma # Cam4DOcc: Benchmark for Camera-Only 4D Occupancy Forecasting in Autonomous Driving Applications # https://github.com/haomo-ai/Cam4DOcc import numpy as np import os import copy from tqdm import trange import open3d as o3d from nuscenes import NuScenes from pyquaternion import Quaternion # Setups ================================================================================================= test_idx_dir = "../../data/cam4docc/test_ids/" test_results_dir = "../../data/cam4docc/pcpnet_results/" occ_path = "../../data/nuScenes-Occupancy" test_seqs = os.listdir(test_idx_dir) test_segmentations = os.listdir(test_results_dir) pc_range= np.array([-51.2, -51.2, -5.0, 51.2, 51.2, 3.0]) dimension = [512, 512, 40] grid_size= np.array(dimension) voxel_size = (pc_range[3:] -pc_range[:3]) / grid_size future_ious = [0, 0, 0, 0] nusc = NuScenes(version='v1.0-trainval', dataroot="../../data/nuscenes", verbose=False) # ======================================================================================================== lidar_token2sample_token = {} for i in range(len(nusc.sample)): my_sample = nusc.sample[i] frame_token = my_sample['token'] lidar_token = my_sample['data']['LIDAR_TOP'] lidar_token2sample_token[lidar_token] = frame_token def voxel2world(voxel): """ voxel: [N, 3] """ return voxel *voxel_size[None, :] + pc_range[:3][None, :] def world2voxel(world): """ world: [N, 3] """ return (world - pc_range[:3][None, :]) / voxel_size[None, :] def cm_to_ious(cm): mean_ious = [] cls_num = len(cm) for i in range(cls_num): tp = cm[i, i] p = cm[:, i].sum() g = cm[i, :].sum() union = p + g - tp mean_ious.append(tp / union) return mean_ious def fast_hist(pred, label, max_label=18): pred = copy.deepcopy(pred.flatten()) label = copy.deepcopy(label.flatten()) bin_count = np.bincount(max_label * label.astype(int) + pred, minlength=max_label ** 2) iou_per_pred = (bin_count[-1]/(bin_count[-1]+bin_count[1]+bin_count[2])) return bin_count[:max_label ** 2].reshape(max_label, max_label),iou_per_pred def nb_process_label(processed_label, sorted_label_voxel_pair): label_size = 256 counter = np.zeros((label_size,), dtype=np.uint16) counter[sorted_label_voxel_pair[0, 3]] = 1 cur_sear_ind = sorted_label_voxel_pair[0, :3] for i in range(1, sorted_label_voxel_pair.shape[0]): cur_ind = sorted_label_voxel_pair[i, :3] if not np.all(np.equal(cur_ind, cur_sear_ind)): processed_label[cur_sear_ind[0], cur_sear_ind[1], cur_sear_ind[2]] = np.argmax(counter) counter = np.zeros((label_size,), dtype=np.uint16) cur_sear_ind = cur_ind counter[sorted_label_voxel_pair[i, 3]] += 1 processed_label[cur_sear_ind[0], cur_sear_ind[1], cur_sear_ind[2]] = np.argmax(counter) return processed_label def get_ego2lidar_pose(rec): lidar_top_data = nusc.get('sample_data', rec['data']['LIDAR_TOP']) lidar2ego_translation = nusc.get('calibrated_sensor', lidar_top_data['calibrated_sensor_token'])['translation'] lidar2ego_rotation = nusc.get('calibrated_sensor', lidar_top_data['calibrated_sensor_token'])['rotation'] trans = -np.array(lidar2ego_translation) rot = Quaternion(lidar2ego_rotation).inverse return trans, rot def get_lidar_pose(rec): current_sample = nusc.get('sample', rec['token']) egopose = nusc.get('ego_pose', nusc.get('sample_data', current_sample['data']['LIDAR_TOP'])['ego_pose_token']) trans = -np.array(egopose['translation']) rot = 
Quaternion(egopose['rotation']).inverse return trans, rot for i in trange(len(test_seqs)): test_seqs_idxs = np.load(os.path.join(test_idx_dir, test_seqs[i]))['arr_0'] scene_token_present = test_seqs[i].split("_")[0] lidar_token_present = test_seqs[i].split("_")[1][:-4] # transform past point clouds to the present frame # point cloud prediction baseline is limited by sparsity of laser points, so we aggregate # past point clouds to mitigate in this version # More reasonable versions will be released past_voxels = [] for t in range(1, 4): scene_token_ = test_seqs_idxs[t-1].split("_")[0] lidar_token_ = test_seqs_idxs[t-1].split("_")[1] point_file = test_results_dir+"point_clouds/"+scene_token_present+"_"+lidar_token_present+"/past/00000"+str(t)+".ply" label_file = test_results_dir+"saved_labels/"+scene_token_present+"_"+lidar_token_present+"/past/00000"+str(t)+".label" pcd_load = o3d.io.read_point_cloud(point_file) xyz_load = np.asarray(pcd_load.points) sample_token_present = lidar_token2sample_token[lidar_token_present] rec_present = nusc.get('sample', sample_token_present) translation_present, rotation_present = get_lidar_pose(rec_present) ego2lidar_translation_present, ego2lidar_rotation_present = get_ego2lidar_pose(rec_present) sample_token_ = lidar_token2sample_token[lidar_token_] rec_ = nusc.get('sample', sample_token_) translation_, rotation_ = get_lidar_pose(rec_) ego2lidar_translation_, ego2lidar_rotation_ = get_ego2lidar_pose(rec_) present_global2ego = [translation_present, rotation_present] present_ego2lidar = [ego2lidar_translation_present, ego2lidar_rotation_present] cur_global2ego = [translation_, rotation_] cur_ego2lidar = [ego2lidar_translation_, ego2lidar_rotation_] pcd_np_cor = np.dot(cur_ego2lidar[1].inverse.rotation_matrix, xyz_load.T) pcd_np_cor = pcd_np_cor.T pcd_np_cor = pcd_np_cor - cur_ego2lidar[0] pcd_np_cor = np.dot(cur_global2ego[1].inverse.rotation_matrix, pcd_np_cor.T) pcd_np_cor = pcd_np_cor.T pcd_np_cor = pcd_np_cor - cur_global2ego[0] pcd_np_cor = pcd_np_cor + present_global2ego[0] pcd_np_cor = np.dot(present_global2ego[1].rotation_matrix, pcd_np_cor.T) pcd_np_cor = pcd_np_cor.T pcd_np_cor = pcd_np_cor + present_ego2lidar[0] # trans pcd_np_cor = np.dot(present_ego2lidar[1].rotation_matrix, pcd_np_cor.T) xyz_load = pcd_np_cor.T xyz_load = world2voxel(xyz_load) label = np.fromfile(label_file, dtype=np.uint32) label = label.reshape((-1,1)) segmentation_t = np.concatenate((xyz_load, label), axis=-1) kept = (segmentation_t[:,0]>0) & (segmentation_t[:,0]0) & (segmentation_t[:,1]0) & (segmentation_t[:,2]0) & (segmentation_t[:,0]0) & (segmentation_t[:,1]0) & (segmentation_t[:,2] n_future_frames has to be set time_receptive_field = 3 n_future_frames = 4 n_future_frames_plus = 6 iou_thresh_for_vpq = 0.2 test_present = False # Occupancy-related params ****************************************** point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0] occ_size = [512, 512, 40] lss_downsample = [4, 4, 4] voxel_x = (point_cloud_range[3] - point_cloud_range[0]) / occ_size[0] voxel_y = (point_cloud_range[4] - point_cloud_range[1]) / occ_size[1] voxel_z = (point_cloud_range[5] - point_cloud_range[2]) / occ_size[2] empty_idx = 0 if use_separate_classes: num_cls = len(class_names) + 1 else: num_cls = 2 img_norm_cfg = None # Save params ****************************************** save_pred = False save_path = "./data/cam4docc/results" # Data-generation and pipeline params ****************************************** dataset_type = 'Cam4DOccDataset' file_client_args = 
dict(backend='disk') data_config={ 'cams': ['CAM_FRONT_LEFT', 'CAM_FRONT', 'CAM_FRONT_RIGHT', 'CAM_BACK_LEFT', 'CAM_BACK', 'CAM_BACK_RIGHT'], 'Ncams': 6, 'input_size': (896, 1600), 'src_size': (900, 1600), # image-view augmentation 'resize': (-0.06, 0.11), 'rot': (-5.4, 5.4), 'flip': False, 'crop_h': (0.0, 0.0), 'resize_test': 0.00, } bda_aug_conf = dict( rot_lim=(-0, 0), scale_lim=(0.95, 1.05), flip_dx_ratio=0.5, flip_dy_ratio=0.5) train_capacity = 23930 # default: use all sequences test_capacity = 5119 # default: use all sequences train_pipeline = [ dict(type='LoadInstanceWithFlow', cam4docc_dataset_path=cam4docc_dataset_path, grid_size=occ_size, use_flow=True, background=empty_idx, pc_range=point_cloud_range, use_separate_classes=use_separate_classes), dict(type='LoadMultiViewImageFromFiles_BEVDet', is_train=True, data_config=data_config, sequential=False, aligned=True, trans_only=False, depth_gt_path=depth_gt_path, data_root=nusc_root, mmlabnorm=True, load_depth=True, img_norm_cfg=img_norm_cfg), dict(type='LoadOccupancy', to_float32=True, occ_path=occ_path, grid_size=occ_size, unoccupied=empty_idx, pc_range=point_cloud_range, use_fine_occ=use_fine_occ, test_mode=False), dict(type='OccDefaultFormatBundle3D', class_names=class_names), dict(type='Collect3D', keys=['img_inputs_seq', 'gt_occ', 'segmentation', 'instance', 'flow', 'future_egomotion']), ] test_pipeline = [ dict(type='LoadInstanceWithFlow', cam4docc_dataset_path=cam4docc_dataset_path, grid_size=occ_size, use_flow=True, background=empty_idx, pc_range=point_cloud_range, use_separate_classes=use_separate_classes), dict(type='LoadMultiViewImageFromFiles_BEVDet', data_config=data_config, depth_gt_path=depth_gt_path, data_root=nusc_root, sequential=False, aligned=True, trans_only=False, mmlabnorm=True, img_norm_cfg=img_norm_cfg, test_mode=True), dict(type='LoadOccupancy', to_float32=True, occ_path=occ_path, grid_size=occ_size, unoccupied=empty_idx, pc_range=point_cloud_range, use_fine_occ=use_fine_occ, test_mode=True), dict(type='OccDefaultFormatBundle3D', class_names=class_names, with_label=False), dict(type='Collect3D', keys=['img_inputs_seq', 'gt_occ', 'segmentation', 'instance', 'flow', 'future_egomotion'], meta_keys=['pc_range', 'occ_size', 'scene_token', 'lidar_token']), ] train_config=dict( type=dataset_type, data_root=nusc_root, occ_root=occ_path, idx_root=cam4docc_dataset_path, ori_data_root=cam4docc_dataset_path, ann_file=train_ann_file, pipeline=train_pipeline, classes=class_names, use_separate_classes=use_separate_classes, modality=input_modality, test_mode=False, use_valid_flag=True, occ_size=occ_size, pc_range=point_cloud_range, box_type_3d='LiDAR', time_receptive_field=time_receptive_field, n_future_frames=n_future_frames, train_capacity=train_capacity, test_capacity=test_capacity, ) test_config=dict( type=dataset_type, occ_root=occ_path, data_root=nusc_root, idx_root=cam4docc_dataset_path, ori_data_root=cam4docc_dataset_path, ann_file=val_ann_file, pipeline=test_pipeline, classes=class_names, use_separate_classes=use_separate_classes, modality=input_modality, occ_size=occ_size, pc_range=point_cloud_range, time_receptive_field=time_receptive_field, n_future_frames=n_future_frames, train_capacity=train_capacity, test_capacity=test_capacity, ) # in our work we use 8 NVIDIA A100 GPUs data = dict( samples_per_gpu=1, workers_per_gpu=1, train=train_config, val=test_config, test=test_config, shuffler_sampler=dict(type='DistributedGroupSampler'), nonshuffler_sampler=dict(type='DistributedSampler'), ) # Model params 
****************************************** grid_config = { 'xbound': [point_cloud_range[0], point_cloud_range[3], voxel_x*lss_downsample[0]], 'ybound': [point_cloud_range[1], point_cloud_range[4], voxel_y*lss_downsample[1]], 'zbound': [point_cloud_range[2], point_cloud_range[5], voxel_z*lss_downsample[2]], 'dbound': [2.0, 58.0, 0.5], } voxel_channels = [32*(time_receptive_field), 32*2*(time_receptive_field), 32*4*(time_receptive_field), 32*8*(time_receptive_field)] pred_channels = [32, 32*2, 32*4, 32*8] decoder_channels = [32*(n_future_frames_plus), 32*2*(n_future_frames_plus), 32*4*(n_future_frames_plus), 32*8*(n_future_frames_plus)] numC_Trans = 64 occ_encoder_input_channel = (numC_Trans+6)*time_receptive_field voxel_out_channel = 32*(n_future_frames_plus) flow_out_channel = 32*(n_future_frames_plus) voxel_out_channel_per_frame = 32 voxel_out_indices = (0, 1, 2, 3) my_voxel_out_indices = (0, 1, 2, 3) model = dict( type='OCFNet', only_generate_dataset=only_generate_dataset, loss_norm=False, disable_loss_depth=True, point_cloud_range=point_cloud_range, time_receptive_field=time_receptive_field, n_future_frames=n_future_frames, n_future_frames_plus=n_future_frames_plus, max_label=num_cls, iou_thresh_for_vpq=iou_thresh_for_vpq, test_present=test_present, record_time=False, save_pred=save_pred, save_path=save_path, img_backbone=dict( pretrained='torchvision://resnet50', type='ResNet', depth=50, num_stages=4, out_indices=(0, 1, 2, 3), frozen_stages=0, with_cp=False, norm_cfg=dict(type='SyncBN', requires_grad=True), norm_eval=False, style='pytorch'), img_neck=dict( type='SECONDFPN', in_channels=[256, 512, 1024, 2048], upsample_strides=[0.25, 0.5, 1, 2], out_channels=[128, 128, 128, 128]), img_view_transformer=dict(type='ViewTransformerLiftSplatShootVoxel', norm_cfg=dict(type='SyncBN', requires_grad=True), loss_depth_weight=3., loss_depth_type='kld', grid_config=grid_config, data_config=data_config, numC_Trans=numC_Trans, vp_megvii=False), occ_encoder_backbone=dict( type='CustomResNet3D', depth=18, n_input_channels=occ_encoder_input_channel, block_inplanes=voxel_channels, out_indices=my_voxel_out_indices, norm_cfg=dict(type='SyncBN', requires_grad=True), ), occ_predictor=dict( type='Predictor', n_input_channels=pred_channels, in_timesteps=time_receptive_field, out_timesteps=n_future_frames_plus, norm_cfg=dict(type='SyncBN', requires_grad=True), ), occ_encoder_neck=dict( type='FPN3D', with_cp=False, in_channels=decoder_channels, out_channels=voxel_out_channel, norm_cfg=dict(type='SyncBN', requires_grad=True), ), flow_encoder_backbone=dict( type='CustomResNet3D', depth=18, n_input_channels=occ_encoder_input_channel, block_inplanes=voxel_channels, out_indices=my_voxel_out_indices, norm_cfg=dict(type='SyncBN', requires_grad=True), ), flow_predictor=dict( type='Predictor', n_input_channels=pred_channels, in_timesteps=time_receptive_field, out_timesteps=n_future_frames_plus, norm_cfg=dict(type='SyncBN', requires_grad=True), ), flow_encoder_neck=dict( type='FPN3D', with_cp=False, in_channels=decoder_channels, out_channels=flow_out_channel, norm_cfg=dict(type='SyncBN', requires_grad=True), ), flow_head=dict( type='FlowHead', norm_cfg=dict(type='SyncBN', requires_grad=True), soft_weights=True, final_occ_size=occ_size, fine_topk=15000, empty_idx=empty_idx, num_level=len(my_voxel_out_indices), in_channels=[voxel_out_channel_per_frame] * len(my_voxel_out_indices), out_channel=3, # 3-dim flow point_cloud_range=point_cloud_range, ), pts_bbox_head=dict( type='OccHead', norm_cfg=dict(type='SyncBN', 
requires_grad=True), soft_weights=True, final_occ_size=occ_size, fine_topk=15000, empty_idx=empty_idx, num_level=len(my_voxel_out_indices), in_channels=[voxel_out_channel_per_frame] * len(my_voxel_out_indices), out_channel=num_cls, point_cloud_range=point_cloud_range, loss_weight_cfg=dict( loss_voxel_ce_weight=1.0, loss_voxel_sem_scal_weight=1.0, loss_voxel_geo_scal_weight=1.0, loss_voxel_lovasz_weight=1.0, ), ), empty_idx=empty_idx, ) # Learning policy params ****************************************** optimizer = dict( type='AdamW', lr=3e-4, paramwise_cfg=dict( custom_keys={ 'img_backbone': dict(lr_mult=0.1), }), weight_decay=0.01) optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) lr_config = dict( policy='CosineAnnealing', warmup='linear', warmup_iters=500, warmup_ratio=1.0 / 3, min_lr_ratio=1e-3) runner = dict(type='EpochBasedRunner', max_epochs=24) evaluation = dict( interval=1, pipeline=test_pipeline, save_best='SSC_mean', rule='greater', ) custom_hooks = [ dict(type='OccEfficiencyHook'), ] ================================================ FILE: projects/configs/baselines/OCFNet_in_Cam4DOcc_V1.1_lyft.py ================================================ # Cam4DOcc: Benchmark for Camera-Only 4D Occupancy Forecasting in Autonomous Driving Applications # https://github.com/haomo-ai/Cam4DOcc # 2 classes: inflated GMO and others # Basic params ****************************************** _base_ = [ '../datasets/custom_nus-3d.py', '../_base_/default_runtime.py' ] find_unused_parameters = True # whether training and test together with dataset generation only_generate_dataset = False # we only consider use_camera in Cam4DOcc in the current version input_modality = dict( use_lidar=False, use_camera=True, use_radar=False, use_map=False, use_external=False) plugin = True plugin_dir = "projects/occ_plugin/" # path unused for lyft occ_path = " " depth_gt_path = " " train_ann_file = " " val_ann_file = " " cam4docc_dataset_path = "./data/cam4docc/" nusc_root = './data/lyft/' # GMO class names class_names = ['vehicle', 'human'] use_separate_classes = False use_fine_occ = False # Forecasting-related params ****************************************** # we use *time_receptive_field* past frames to forecast future *n_future_frames* frames # for 3D instance prediction, n_future_frames_plus > n_future_frames has to be set time_receptive_field = 3 n_future_frames = 4 n_future_frames_plus = 6 iou_thresh_for_vpq = 0.2 test_present = False # Occupancy-related params ****************************************** point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0] occ_size = [512, 512, 40] lss_downsample = [4, 4, 4] voxel_x = (point_cloud_range[3] - point_cloud_range[0]) / occ_size[0] voxel_y = (point_cloud_range[4] - point_cloud_range[1]) / occ_size[1] voxel_z = (point_cloud_range[5] - point_cloud_range[2]) / occ_size[2] empty_idx = 0 if use_separate_classes: num_cls = len(class_names) + 1 else: num_cls = 2 img_norm_cfg = None # Save params ****************************************** save_pred = False save_path = "./data/cam4docc/results" # Data-generation and pipeline params ****************************************** dataset_type = 'Cam4DOccLyftDataset' file_client_args = dict(backend='disk') data_config={ 'cams': ['CAM_FRONT_LEFT', 'CAM_FRONT', 'CAM_FRONT_RIGHT', 'CAM_BACK_LEFT', 'CAM_BACK', 'CAM_BACK_RIGHT'], 'Ncams': 6, 'input_size': (896, 1600), 'src_size': (900, 1600), # image-view augmentation 'resize': (-0.06, 0.11), 'rot': (-5.4, 5.4), 'flip': False, 'crop_h': (0.0, 0.0), 'resize_test': 
0.00, } bda_aug_conf = dict( rot_lim=(-0, 0), scale_lim=(0.95, 1.05), flip_dx_ratio=0.5, flip_dy_ratio=0.5) train_capacity = 15720 # default: use all sequences test_capacity = 5880 # default: use all sequences train_pipeline = [ dict(type='LoadInstanceWithFlow', cam4docc_dataset_path=cam4docc_dataset_path, grid_size=occ_size, use_flow=True, background=empty_idx, pc_range=point_cloud_range, use_separate_classes=use_separate_classes, use_lyft=True), dict(type='LoadMultiViewImageFromFiles_BEVDet', is_train=True, data_config=data_config, sequential=False, aligned=True, trans_only=False, depth_gt_path=depth_gt_path, data_root=nusc_root, mmlabnorm=True, load_depth=True, img_norm_cfg=img_norm_cfg, use_lyft=True), dict(type='LoadOccupancy', to_float32=True, occ_path=occ_path, grid_size=occ_size, unoccupied=empty_idx, pc_range=point_cloud_range, use_fine_occ=use_fine_occ, test_mode=False), dict(type='OccDefaultFormatBundle3D', class_names=class_names), dict(type='Collect3D', keys=['img_inputs_seq', 'gt_occ', 'segmentation', 'instance', 'flow', 'future_egomotion']), ] test_pipeline = [ dict(type='LoadInstanceWithFlow', cam4docc_dataset_path=cam4docc_dataset_path, grid_size=occ_size, use_flow=True, background=empty_idx, pc_range=point_cloud_range, use_separate_classes=use_separate_classes, use_lyft=True), dict(type='LoadMultiViewImageFromFiles_BEVDet', data_config=data_config, depth_gt_path=depth_gt_path, data_root=nusc_root, sequential=False, aligned=True, trans_only=False, mmlabnorm=True, img_norm_cfg=img_norm_cfg, test_mode=True, use_lyft=True), dict(type='LoadOccupancy', to_float32=True, occ_path=occ_path, grid_size=occ_size, unoccupied=empty_idx, pc_range=point_cloud_range, use_fine_occ=use_fine_occ, test_mode=True), dict(type='OccDefaultFormatBundle3D', class_names=class_names, with_label=False), dict(type='Collect3D', keys=['img_inputs_seq', 'gt_occ', 'segmentation', 'instance', 'flow', 'future_egomotion'], meta_keys=['pc_range', 'occ_size', 'scene_token', 'lidar_token']), ] train_config=dict( type=dataset_type, data_root=nusc_root, occ_root=occ_path, idx_root=cam4docc_dataset_path, ori_data_root=cam4docc_dataset_path, ann_file=train_ann_file, pipeline=train_pipeline, classes=class_names, use_separate_classes=use_separate_classes, modality=input_modality, test_mode=False, use_valid_flag=True, occ_size=occ_size, pc_range=point_cloud_range, box_type_3d='LiDAR', time_receptive_field=time_receptive_field, n_future_frames=n_future_frames, train_capacity=train_capacity, test_capacity=test_capacity, ) test_config=dict( type=dataset_type, occ_root=occ_path, data_root=nusc_root, idx_root=cam4docc_dataset_path, ori_data_root=cam4docc_dataset_path, ann_file=val_ann_file, pipeline=test_pipeline, classes=class_names, use_separate_classes=use_separate_classes, modality=input_modality, occ_size=occ_size, pc_range=point_cloud_range, time_receptive_field=time_receptive_field, n_future_frames=n_future_frames, train_capacity=train_capacity, test_capacity=test_capacity, ) # in our work we use 8 NVIDIA A100 GPUs data = dict( samples_per_gpu=1, workers_per_gpu=1, train=train_config, val=test_config, test=test_config, shuffler_sampler=dict(type='DistributedGroupSampler'), nonshuffler_sampler=dict(type='DistributedSampler'), ) # Model params ****************************************** grid_config = { 'xbound': [point_cloud_range[0], point_cloud_range[3], voxel_x*lss_downsample[0]], 'ybound': [point_cloud_range[1], point_cloud_range[4], voxel_y*lss_downsample[1]], 'zbound': [point_cloud_range[2], point_cloud_range[5], 
voxel_z*lss_downsample[2]], 'dbound': [2.0, 58.0, 0.5], } voxel_channels = [32*(time_receptive_field), 32*2*(time_receptive_field), 32*4*(time_receptive_field), 32*8*(time_receptive_field)] pred_channels = [32, 32*2, 32*4, 32*8] decoder_channels = [32*(n_future_frames_plus), 32*2*(n_future_frames_plus), 32*4*(n_future_frames_plus), 32*8*(n_future_frames_plus)] numC_Trans = 64 occ_encoder_input_channel = (numC_Trans+6)*time_receptive_field voxel_out_channel = 32*(n_future_frames_plus) flow_out_channel = 32*(n_future_frames_plus) voxel_out_channel_per_frame = 32 voxel_out_indices = (0, 1, 2, 3) my_voxel_out_indices = (0, 1, 2, 3) model = dict( type='OCFNet', only_generate_dataset=only_generate_dataset, loss_norm=False, disable_loss_depth=True, point_cloud_range=point_cloud_range, time_receptive_field=time_receptive_field, n_future_frames=n_future_frames, n_future_frames_plus=n_future_frames_plus, max_label=num_cls, iou_thresh_for_vpq=iou_thresh_for_vpq, test_present=test_present, record_time=False, save_pred=save_pred, save_path=save_path, img_backbone=dict( pretrained='torchvision://resnet50', type='ResNet', depth=50, num_stages=4, out_indices=(0, 1, 2, 3), frozen_stages=0, with_cp=False, norm_cfg=dict(type='SyncBN', requires_grad=True), norm_eval=False, style='pytorch'), img_neck=dict( type='SECONDFPN', in_channels=[256, 512, 1024, 2048], upsample_strides=[0.25, 0.5, 1, 2], out_channels=[128, 128, 128, 128]), img_view_transformer=dict(type='ViewTransformerLiftSplatShootVoxel', norm_cfg=dict(type='SyncBN', requires_grad=True), loss_depth_weight=3., loss_depth_type='kld', grid_config=grid_config, data_config=data_config, numC_Trans=numC_Trans, vp_megvii=False), occ_encoder_backbone=dict( type='CustomResNet3D', depth=18, n_input_channels=occ_encoder_input_channel, block_inplanes=voxel_channels, out_indices=my_voxel_out_indices, norm_cfg=dict(type='SyncBN', requires_grad=True), ), occ_predictor=dict( type='Predictor', n_input_channels=pred_channels, in_timesteps=time_receptive_field, out_timesteps=n_future_frames_plus, norm_cfg=dict(type='SyncBN', requires_grad=True), ), occ_encoder_neck=dict( type='FPN3D', with_cp=False, in_channels=decoder_channels, out_channels=voxel_out_channel, norm_cfg=dict(type='SyncBN', requires_grad=True), ), flow_encoder_backbone=dict( type='CustomResNet3D', depth=18, n_input_channels=occ_encoder_input_channel, block_inplanes=voxel_channels, out_indices=my_voxel_out_indices, norm_cfg=dict(type='SyncBN', requires_grad=True), ), flow_predictor=dict( type='Predictor', n_input_channels=pred_channels, in_timesteps=time_receptive_field, out_timesteps=n_future_frames_plus, norm_cfg=dict(type='SyncBN', requires_grad=True), ), flow_encoder_neck=dict( type='FPN3D', with_cp=False, in_channels=decoder_channels, out_channels=flow_out_channel, norm_cfg=dict(type='SyncBN', requires_grad=True), ), flow_head=dict( type='FlowHead', norm_cfg=dict(type='SyncBN', requires_grad=True), soft_weights=True, final_occ_size=occ_size, fine_topk=15000, empty_idx=empty_idx, num_level=len(my_voxel_out_indices), in_channels=[voxel_out_channel_per_frame] * len(my_voxel_out_indices), out_channel=3, # 3-dim flow point_cloud_range=point_cloud_range, ), pts_bbox_head=dict( type='OccHead', norm_cfg=dict(type='SyncBN', requires_grad=True), soft_weights=True, final_occ_size=occ_size, fine_topk=15000, empty_idx=empty_idx, num_level=len(my_voxel_out_indices), in_channels=[voxel_out_channel_per_frame] * len(my_voxel_out_indices), out_channel=num_cls, point_cloud_range=point_cloud_range, loss_weight_cfg=dict( 
loss_voxel_ce_weight=1.0, loss_voxel_sem_scal_weight=1.0, loss_voxel_geo_scal_weight=1.0, loss_voxel_lovasz_weight=1.0, ), ), empty_idx=empty_idx, ) # Learning policy params ****************************************** optimizer = dict( type='AdamW', lr=3e-4, paramwise_cfg=dict( custom_keys={ 'img_backbone': dict(lr_mult=0.1), }), weight_decay=0.01) optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) lr_config = dict( policy='CosineAnnealing', warmup='linear', warmup_iters=500, warmup_ratio=1.0 / 3, min_lr_ratio=1e-3) runner = dict(type='EpochBasedRunner', max_epochs=24) evaluation = dict( interval=1, pipeline=test_pipeline, save_best='SSC_mean', rule='greater', ) custom_hooks = [ dict(type='OccEfficiencyHook'), ] ================================================ FILE: projects/configs/baselines/OCFNet_in_Cam4DOcc_V1.2.py ================================================ # Cam4DOcc: Benchmark for Camera-Only 4D Occupancy Forecasting in Autonomous Driving Applications # https://github.com/haomo-ai/Cam4DOcc # multiple classes: inflated multiple MO classes # Basic params ****************************************** _base_ = [ '../datasets/custom_nus-3d.py', '../_base_/default_runtime.py' ] find_unused_parameters = True # whether training and test together with dataset generation only_generate_dataset = False # we only consider use_camera in Cam4DOcc in the current version input_modality = dict( use_lidar=False, use_camera=True, use_radar=False, use_map=False, use_external=False) plugin = True plugin_dir = "projects/occ_plugin/" occ_path = "./data/nuScenes-Occupancy" depth_gt_path = './data/depth_gt' train_ann_file = "./data/nuscenes/nuscenes_occ_infos_train.pkl" val_ann_file = "./data/nuscenes/nuscenes_occ_infos_val.pkl" cam4docc_dataset_path = "./data/cam4docc/" nusc_root = './data/nuscenes/' # GMO class names class_names = [ 'vehicle.bicycle', 'bus', 'car', 'construction', 'motorcycle', 'trailer', 'truck', 'pedestrian' ] use_separate_classes = True use_fine_occ = False # Forecasting-related params ****************************************** # we use *time_receptive_field* past frames to forecast future *n_future_frames* frames # for 3D instance prediction, n_future_frames_plus > n_future_frames has to be set time_receptive_field = 3 n_future_frames = 4 n_future_frames_plus = 6 iou_thresh_for_vpq = 0.2 test_present = False # Occupancy-related params ****************************************** point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0] occ_size = [512, 512, 40] lss_downsample = [4, 4, 4] voxel_x = (point_cloud_range[3] - point_cloud_range[0]) / occ_size[0] voxel_y = (point_cloud_range[4] - point_cloud_range[1]) / occ_size[1] voxel_z = (point_cloud_range[5] - point_cloud_range[2]) / occ_size[2] empty_idx = 0 if use_separate_classes: num_cls = len(class_names) + 1 else: num_cls = 2 img_norm_cfg = None # Save params ****************************************** save_pred = False save_path = "./data/cam4docc/results" # Data-generation and pipeline params ****************************************** dataset_type = 'Cam4DOccDataset' file_client_args = dict(backend='disk') data_config={ 'cams': ['CAM_FRONT_LEFT', 'CAM_FRONT', 'CAM_FRONT_RIGHT', 'CAM_BACK_LEFT', 'CAM_BACK', 'CAM_BACK_RIGHT'], 'Ncams': 6, 'input_size': (896, 1600), 'src_size': (900, 1600), # image-view augmentation 'resize': (-0.06, 0.11), 'rot': (-5.4, 5.4), 'flip': False, 'crop_h': (0.0, 0.0), 'resize_test': 0.00, } bda_aug_conf = dict( rot_lim=(-0, 0), scale_lim=(0.95, 1.05), flip_dx_ratio=0.5, flip_dy_ratio=0.5) 
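# bda_aug_conf above sets the BEV-space augmentation ranges (rotation disabled here,
# mild random scaling, random flips along x and y); train_capacity / test_capacity
# below default to all 23,930 training sequences and 5,119 validation samples listed
# in the README's Basic Information table, and can be lowered to train with sparser
# supervision.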
train_capacity = 23930 # default: use all sequences test_capacity = 5119 # default: use all sequences train_pipeline = [ dict(type='LoadInstanceWithFlow', cam4docc_dataset_path=cam4docc_dataset_path, grid_size=occ_size, use_flow=True, background=empty_idx, pc_range=point_cloud_range, use_separate_classes=use_separate_classes), dict(type='LoadMultiViewImageFromFiles_BEVDet', is_train=True, data_config=data_config, sequential=False, aligned=True, trans_only=False, depth_gt_path=depth_gt_path, data_root=nusc_root, mmlabnorm=True, load_depth=True, img_norm_cfg=img_norm_cfg), dict(type='LoadOccupancy', to_float32=True, occ_path=occ_path, grid_size=occ_size, unoccupied=empty_idx, pc_range=point_cloud_range, use_fine_occ=use_fine_occ, test_mode=False), dict(type='OccDefaultFormatBundle3D', class_names=class_names), dict(type='Collect3D', keys=['img_inputs_seq', 'gt_occ', 'segmentation', 'instance', 'flow', 'future_egomotion']), ] test_pipeline = [ dict(type='LoadInstanceWithFlow', cam4docc_dataset_path=cam4docc_dataset_path, grid_size=occ_size, use_flow=True, background=empty_idx, pc_range=point_cloud_range, use_separate_classes=use_separate_classes), dict(type='LoadMultiViewImageFromFiles_BEVDet', data_config=data_config, depth_gt_path=depth_gt_path, data_root=nusc_root, sequential=False, aligned=True, trans_only=False, mmlabnorm=True, img_norm_cfg=img_norm_cfg, test_mode=True), dict(type='LoadOccupancy', to_float32=True, occ_path=occ_path, grid_size=occ_size, unoccupied=empty_idx, pc_range=point_cloud_range, use_fine_occ=use_fine_occ, test_mode=True), dict(type='OccDefaultFormatBundle3D', class_names=class_names, with_label=False), dict(type='Collect3D', keys=['img_inputs_seq', 'gt_occ', 'segmentation', 'instance', 'flow', 'future_egomotion'], meta_keys=['pc_range', 'occ_size', 'scene_token', 'lidar_token']), ] train_config=dict( type=dataset_type, data_root=nusc_root, occ_root=occ_path, idx_root=cam4docc_dataset_path, ori_data_root=cam4docc_dataset_path, ann_file=train_ann_file, pipeline=train_pipeline, classes=class_names, use_separate_classes=use_separate_classes, modality=input_modality, test_mode=False, use_valid_flag=True, occ_size=occ_size, pc_range=point_cloud_range, box_type_3d='LiDAR', time_receptive_field=time_receptive_field, n_future_frames=n_future_frames, train_capacity=train_capacity, test_capacity=test_capacity, ) test_config=dict( type=dataset_type, occ_root=occ_path, data_root=nusc_root, idx_root=cam4docc_dataset_path, ori_data_root=cam4docc_dataset_path, ann_file=val_ann_file, pipeline=test_pipeline, classes=class_names, use_separate_classes=use_separate_classes, modality=input_modality, occ_size=occ_size, pc_range=point_cloud_range, time_receptive_field=time_receptive_field, n_future_frames=n_future_frames, train_capacity=train_capacity, test_capacity=test_capacity, ) # in our work we use 8 NVIDIA A100 GPUs data = dict( samples_per_gpu=1, workers_per_gpu=1, train=train_config, val=test_config, test=test_config, shuffler_sampler=dict(type='DistributedGroupSampler'), nonshuffler_sampler=dict(type='DistributedSampler'), ) # Model params ****************************************** grid_config = { 'xbound': [point_cloud_range[0], point_cloud_range[3], voxel_x*lss_downsample[0]], 'ybound': [point_cloud_range[1], point_cloud_range[4], voxel_y*lss_downsample[1]], 'zbound': [point_cloud_range[2], point_cloud_range[5], voxel_z*lss_downsample[2]], 'dbound': [2.0, 58.0, 0.5], } voxel_channels = [32*(time_receptive_field), 32*2*(time_receptive_field), 32*4*(time_receptive_field), 
32*8*(time_receptive_field)] pred_channels = [32, 32*2, 32*4, 32*8] decoder_channels = [32*(n_future_frames_plus), 32*2*(n_future_frames_plus), 32*4*(n_future_frames_plus), 32*8*(n_future_frames_plus)] numC_Trans = 64 occ_encoder_input_channel = (numC_Trans+6)*time_receptive_field voxel_out_channel = 32*(n_future_frames_plus) flow_out_channel = 32*(n_future_frames_plus) voxel_out_channel_per_frame = 32 voxel_out_indices = (0, 1, 2, 3) my_voxel_out_indices = (0, 1, 2, 3) model = dict( type='OCFNet', only_generate_dataset=only_generate_dataset, loss_norm=False, disable_loss_depth=True, point_cloud_range=point_cloud_range, time_receptive_field=time_receptive_field, n_future_frames=n_future_frames, n_future_frames_plus=n_future_frames_plus, max_label=num_cls, iou_thresh_for_vpq=iou_thresh_for_vpq, test_present=test_present, record_time=False, save_pred=save_pred, save_path=save_path, img_backbone=dict( pretrained='torchvision://resnet50', type='ResNet', depth=50, num_stages=4, out_indices=(0, 1, 2, 3), frozen_stages=0, with_cp=False, norm_cfg=dict(type='SyncBN', requires_grad=True), norm_eval=False, style='pytorch'), img_neck=dict( type='SECONDFPN', in_channels=[256, 512, 1024, 2048], upsample_strides=[0.25, 0.5, 1, 2], out_channels=[128, 128, 128, 128]), img_view_transformer=dict(type='ViewTransformerLiftSplatShootVoxel', norm_cfg=dict(type='SyncBN', requires_grad=True), loss_depth_weight=3., loss_depth_type='kld', grid_config=grid_config, data_config=data_config, numC_Trans=numC_Trans, vp_megvii=False), occ_encoder_backbone=dict( type='CustomResNet3D', depth=18, n_input_channels=occ_encoder_input_channel, block_inplanes=voxel_channels, out_indices=my_voxel_out_indices, norm_cfg=dict(type='SyncBN', requires_grad=True), ), occ_predictor=dict( type='Predictor', n_input_channels=pred_channels, in_timesteps=time_receptive_field, out_timesteps=n_future_frames_plus, norm_cfg=dict(type='SyncBN', requires_grad=True), ), occ_encoder_neck=dict( type='FPN3D', with_cp=False, in_channels=decoder_channels, out_channels=voxel_out_channel, norm_cfg=dict(type='SyncBN', requires_grad=True), ), flow_encoder_backbone=dict( type='CustomResNet3D', depth=18, n_input_channels=occ_encoder_input_channel, block_inplanes=voxel_channels, out_indices=my_voxel_out_indices, norm_cfg=dict(type='SyncBN', requires_grad=True), ), flow_predictor=dict( type='Predictor', n_input_channels=pred_channels, in_timesteps=time_receptive_field, out_timesteps=n_future_frames_plus, norm_cfg=dict(type='SyncBN', requires_grad=True), ), flow_encoder_neck=dict( type='FPN3D', with_cp=False, in_channels=decoder_channels, out_channels=flow_out_channel, norm_cfg=dict(type='SyncBN', requires_grad=True), ), flow_head=dict( type='FlowHead', norm_cfg=dict(type='SyncBN', requires_grad=True), soft_weights=True, final_occ_size=occ_size, fine_topk=15000, empty_idx=empty_idx, num_level=len(my_voxel_out_indices), in_channels=[voxel_out_channel_per_frame] * len(my_voxel_out_indices), out_channel=3, # 3-dim flow point_cloud_range=point_cloud_range, ), pts_bbox_head=dict( type='OccHead', norm_cfg=dict(type='SyncBN', requires_grad=True), soft_weights=True, final_occ_size=occ_size, fine_topk=15000, empty_idx=empty_idx, num_level=len(my_voxel_out_indices), in_channels=[voxel_out_channel_per_frame] * len(my_voxel_out_indices), out_channel=num_cls, point_cloud_range=point_cloud_range, loss_weight_cfg=dict( loss_voxel_ce_weight=1.0, loss_voxel_sem_scal_weight=1.0, loss_voxel_geo_scal_weight=1.0, loss_voxel_lovasz_weight=1.0, ), ), empty_idx=empty_idx, ) # Learning 
policy params ****************************************** optimizer = dict( type='AdamW', lr=3e-4, paramwise_cfg=dict( custom_keys={ 'img_backbone': dict(lr_mult=0.1), }), weight_decay=0.01) optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) lr_config = dict( policy='CosineAnnealing', warmup='linear', warmup_iters=500, warmup_ratio=1.0 / 3, min_lr_ratio=1e-3) runner = dict(type='EpochBasedRunner', max_epochs=24) evaluation = dict( interval=1, pipeline=test_pipeline, save_best='SSC_mean', rule='greater', ) custom_hooks = [ dict(type='OccEfficiencyHook'), ] ================================================ FILE: projects/configs/baselines/OCFNet_in_Cam4DOcc_V1.2_lyft.py ================================================ # Cam4DOcc: Benchmark for Camera-Only 4D Occupancy Forecasting in Autonomous Driving Applications # https://github.com/haomo-ai/Cam4DOcc # multiple classes: inflated multiple MO classes # Basic params ****************************************** _base_ = [ '../datasets/custom_nus-3d.py', '../_base_/default_runtime.py' ] find_unused_parameters = True # whether training and test together with dataset generation only_generate_dataset = False # we only consider use_camera in Cam4DOcc in the current version input_modality = dict( use_lidar=False, use_camera=True, use_radar=False, use_map=False, use_external=False) plugin = True plugin_dir = "projects/occ_plugin/" # path unused for lyft occ_path = " " depth_gt_path = " " train_ann_file = " " val_ann_file = " " cam4docc_dataset_path = "./data/cam4docc/" nusc_root = './data/lyft/' # GMO class names # refine the classes for lyft datasets according to your needs class_names = [ 'bicycle', 'bus', 'car', 'construction', 'motorcycle', 'trailer', 'truck', 'pedestrian' ] use_separate_classes = True use_fine_occ = False # Forecasting-related params ****************************************** # we use *time_receptive_field* past frames to forecast future *n_future_frames* frames # for 3D instance prediction, n_future_frames_plus > n_future_frames has to be set time_receptive_field = 3 n_future_frames = 4 n_future_frames_plus = 6 iou_thresh_for_vpq = 0.2 test_present = False # Occupancy-related params ****************************************** point_cloud_range = [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0] occ_size = [512, 512, 40] lss_downsample = [4, 4, 4] voxel_x = (point_cloud_range[3] - point_cloud_range[0]) / occ_size[0] voxel_y = (point_cloud_range[4] - point_cloud_range[1]) / occ_size[1] voxel_z = (point_cloud_range[5] - point_cloud_range[2]) / occ_size[2] empty_idx = 0 if use_separate_classes: num_cls = len(class_names) + 1 else: num_cls = 2 img_norm_cfg = None # Save params ****************************************** save_pred = False save_path = "./data/cam4docc/results" # Data-generation and pipeline params ****************************************** dataset_type = 'Cam4DOccLyftDataset' file_client_args = dict(backend='disk') data_config={ 'cams': ['CAM_FRONT_LEFT', 'CAM_FRONT', 'CAM_FRONT_RIGHT', 'CAM_BACK_LEFT', 'CAM_BACK', 'CAM_BACK_RIGHT'], 'Ncams': 6, 'input_size': (896, 1600), 'src_size': (900, 1600), # image-view augmentation 'resize': (-0.06, 0.11), 'rot': (-5.4, 5.4), 'flip': False, 'crop_h': (0.0, 0.0), 'resize_test': 0.00, } bda_aug_conf = dict( rot_lim=(-0, 0), scale_lim=(0.95, 1.05), flip_dx_ratio=0.5, flip_dy_ratio=0.5) train_capacity = 15720 # default: use all sequences test_capacity = 5880 # default: use all sequences train_pipeline = [ dict(type='LoadInstanceWithFlow', cam4docc_dataset_path=cam4docc_dataset_path, 
grid_size=occ_size, use_flow=True, background=empty_idx, pc_range=point_cloud_range, use_separate_classes=use_separate_classes, use_lyft=True), dict(type='LoadMultiViewImageFromFiles_BEVDet', is_train=True, data_config=data_config, sequential=False, aligned=True, trans_only=False, depth_gt_path=depth_gt_path, data_root=nusc_root, mmlabnorm=True, load_depth=True, img_norm_cfg=img_norm_cfg, use_lyft=True), dict(type='LoadOccupancy', to_float32=True, occ_path=occ_path, grid_size=occ_size, unoccupied=empty_idx, pc_range=point_cloud_range, use_fine_occ=use_fine_occ, test_mode=False), dict(type='OccDefaultFormatBundle3D', class_names=class_names), dict(type='Collect3D', keys=['img_inputs_seq', 'gt_occ', 'segmentation', 'instance', 'flow', 'future_egomotion']), ] test_pipeline = [ dict(type='LoadInstanceWithFlow', cam4docc_dataset_path=cam4docc_dataset_path, grid_size=occ_size, use_flow=True, background=empty_idx, pc_range=point_cloud_range, use_separate_classes=use_separate_classes, use_lyft=True), dict(type='LoadMultiViewImageFromFiles_BEVDet', data_config=data_config, depth_gt_path=depth_gt_path, data_root=nusc_root, sequential=False, aligned=True, trans_only=False, mmlabnorm=True, img_norm_cfg=img_norm_cfg, test_mode=True, use_lyft=True), dict(type='LoadOccupancy', to_float32=True, occ_path=occ_path, grid_size=occ_size, unoccupied=empty_idx, pc_range=point_cloud_range, use_fine_occ=use_fine_occ, test_mode=True), dict(type='OccDefaultFormatBundle3D', class_names=class_names, with_label=False), dict(type='Collect3D', keys=['img_inputs_seq', 'gt_occ', 'segmentation', 'instance', 'flow', 'future_egomotion'], meta_keys=['pc_range', 'occ_size', 'scene_token', 'lidar_token']), ] train_config=dict( type=dataset_type, data_root=nusc_root, occ_root=occ_path, idx_root=cam4docc_dataset_path, ori_data_root=cam4docc_dataset_path, ann_file=train_ann_file, pipeline=train_pipeline, classes=class_names, use_separate_classes=use_separate_classes, modality=input_modality, test_mode=False, use_valid_flag=True, occ_size=occ_size, pc_range=point_cloud_range, box_type_3d='LiDAR', time_receptive_field=time_receptive_field, n_future_frames=n_future_frames, train_capacity=train_capacity, test_capacity=test_capacity, ) test_config=dict( type=dataset_type, occ_root=occ_path, data_root=nusc_root, idx_root=cam4docc_dataset_path, ori_data_root=cam4docc_dataset_path, ann_file=val_ann_file, pipeline=test_pipeline, classes=class_names, use_separate_classes=use_separate_classes, modality=input_modality, occ_size=occ_size, pc_range=point_cloud_range, time_receptive_field=time_receptive_field, n_future_frames=n_future_frames, train_capacity=train_capacity, test_capacity=test_capacity, ) # in our work we use 8 NVIDIA A100 GPUs data = dict( samples_per_gpu=1, workers_per_gpu=1, train=train_config, val=test_config, test=test_config, shuffler_sampler=dict(type='DistributedGroupSampler'), nonshuffler_sampler=dict(type='DistributedSampler'), ) # Model params ****************************************** grid_config = { 'xbound': [point_cloud_range[0], point_cloud_range[3], voxel_x*lss_downsample[0]], 'ybound': [point_cloud_range[1], point_cloud_range[4], voxel_y*lss_downsample[1]], 'zbound': [point_cloud_range[2], point_cloud_range[5], voxel_z*lss_downsample[2]], 'dbound': [2.0, 58.0, 0.5], } voxel_channels = [32*(time_receptive_field), 32*2*(time_receptive_field), 32*4*(time_receptive_field), 32*8*(time_receptive_field)] pred_channels = [32, 32*2, 32*4, 32*8] decoder_channels = [32*(n_future_frames_plus), 32*2*(n_future_frames_plus), 
32*4*(n_future_frames_plus), 32*8*(n_future_frames_plus)] numC_Trans = 64 occ_encoder_input_channel = (numC_Trans+6)*time_receptive_field voxel_out_channel = 32*(n_future_frames_plus) flow_out_channel = 32*(n_future_frames_plus) voxel_out_channel_per_frame = 32 voxel_out_indices = (0, 1, 2, 3) my_voxel_out_indices = (0, 1, 2, 3) model = dict( type='OCFNet', only_generate_dataset=only_generate_dataset, loss_norm=False, disable_loss_depth=True, point_cloud_range=point_cloud_range, time_receptive_field=time_receptive_field, n_future_frames=n_future_frames, n_future_frames_plus=n_future_frames_plus, max_label=num_cls, iou_thresh_for_vpq=iou_thresh_for_vpq, test_present=test_present, record_time=False, save_pred=save_pred, save_path=save_path, img_backbone=dict( pretrained='torchvision://resnet50', type='ResNet', depth=50, num_stages=4, out_indices=(0, 1, 2, 3), frozen_stages=0, with_cp=False, norm_cfg=dict(type='SyncBN', requires_grad=True), norm_eval=False, style='pytorch'), img_neck=dict( type='SECONDFPN', in_channels=[256, 512, 1024, 2048], upsample_strides=[0.25, 0.5, 1, 2], out_channels=[128, 128, 128, 128]), img_view_transformer=dict(type='ViewTransformerLiftSplatShootVoxel', norm_cfg=dict(type='SyncBN', requires_grad=True), loss_depth_weight=3., loss_depth_type='kld', grid_config=grid_config, data_config=data_config, numC_Trans=numC_Trans, vp_megvii=False), occ_encoder_backbone=dict( type='CustomResNet3D', depth=18, n_input_channels=occ_encoder_input_channel, block_inplanes=voxel_channels, out_indices=my_voxel_out_indices, norm_cfg=dict(type='SyncBN', requires_grad=True), ), occ_predictor=dict( type='Predictor', n_input_channels=pred_channels, in_timesteps=time_receptive_field, out_timesteps=n_future_frames_plus, norm_cfg=dict(type='SyncBN', requires_grad=True), ), occ_encoder_neck=dict( type='FPN3D', with_cp=False, in_channels=decoder_channels, out_channels=voxel_out_channel, norm_cfg=dict(type='SyncBN', requires_grad=True), ), flow_encoder_backbone=dict( type='CustomResNet3D', depth=18, n_input_channels=occ_encoder_input_channel, block_inplanes=voxel_channels, out_indices=my_voxel_out_indices, norm_cfg=dict(type='SyncBN', requires_grad=True), ), flow_predictor=dict( type='Predictor', n_input_channels=pred_channels, in_timesteps=time_receptive_field, out_timesteps=n_future_frames_plus, norm_cfg=dict(type='SyncBN', requires_grad=True), ), flow_encoder_neck=dict( type='FPN3D', with_cp=False, in_channels=decoder_channels, out_channels=flow_out_channel, norm_cfg=dict(type='SyncBN', requires_grad=True), ), flow_head=dict( type='FlowHead', norm_cfg=dict(type='SyncBN', requires_grad=True), soft_weights=True, final_occ_size=occ_size, fine_topk=15000, empty_idx=empty_idx, num_level=len(my_voxel_out_indices), in_channels=[voxel_out_channel_per_frame] * len(my_voxel_out_indices), out_channel=3, # 3-dim flow point_cloud_range=point_cloud_range, ), pts_bbox_head=dict( type='OccHead', norm_cfg=dict(type='SyncBN', requires_grad=True), soft_weights=True, final_occ_size=occ_size, fine_topk=15000, empty_idx=empty_idx, num_level=len(my_voxel_out_indices), in_channels=[voxel_out_channel_per_frame] * len(my_voxel_out_indices), out_channel=num_cls, point_cloud_range=point_cloud_range, loss_weight_cfg=dict( loss_voxel_ce_weight=1.0, loss_voxel_sem_scal_weight=1.0, loss_voxel_geo_scal_weight=1.0, loss_voxel_lovasz_weight=1.0, ), ), empty_idx=empty_idx, ) # Learning policy params ****************************************** optimizer = dict( type='AdamW', lr=3e-4, paramwise_cfg=dict( custom_keys={ 
'img_backbone': dict(lr_mult=0.1), }), weight_decay=0.01) optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2)) lr_config = dict( policy='CosineAnnealing', warmup='linear', warmup_iters=500, warmup_ratio=1.0 / 3, min_lr_ratio=1e-3) runner = dict(type='EpochBasedRunner', max_epochs=24) evaluation = dict( interval=1, pipeline=test_pipeline, save_best='SSC_mean', rule='greater', ) custom_hooks = [ dict(type='OccEfficiencyHook'), ] ================================================ FILE: projects/configs/datasets/custom_nus-3d.py ================================================ # If point cloud range is changed, the models should also change their point # cloud range accordingly point_cloud_range = [-50, -50, -5, 50, 50, 3] # For nuScenes we usually do 10-class detection class_names = [ 'car', 'truck', 'trailer', 'bus', 'construction_vehicle', 'bicycle', 'motorcycle', 'pedestrian', 'traffic_cone', 'barrier' ] dataset_type = 'NuScenesDataset_eval_modified' data_root = 'data/nuscenes/' # Input modality for nuScenes dataset, this is consistent with the submission # format which requires the information in input_modality. input_modality = dict( use_lidar=True, use_camera=False, use_radar=False, use_map=False, use_external=False) file_client_args = dict(backend='disk') # Uncomment the following if use ceph or other file clients. # See https://mmcv.readthedocs.io/en/latest/api.html#mmcv.fileio.FileClient # for more details. # file_client_args = dict( # backend='petrel', # path_mapping=dict({ # './data/nuscenes/': 's3://nuscenes/nuscenes/', # 'data/nuscenes/': 's3://nuscenes/nuscenes/' # })) train_pipeline = [ dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=5, use_dim=5, file_client_args=file_client_args), dict( type='LoadPointsFromMultiSweeps', sweeps_num=10, file_client_args=file_client_args), dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True), dict( type='GlobalRotScaleTrans', rot_range=[-0.3925, 0.3925], scale_ratio_range=[0.95, 1.05], translation_std=[0, 0, 0]), dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5), dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), dict(type='ObjectNameFilter', classes=class_names), dict(type='PointShuffle'), dict(type='DefaultFormatBundle3D', class_names=class_names), dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d']) ] test_pipeline = [ dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=5, use_dim=5, file_client_args=file_client_args), dict( type='LoadPointsFromMultiSweeps', sweeps_num=10, file_client_args=file_client_args), dict( type='MultiScaleFlipAug3D', img_scale=(1333, 800), pts_scale_ratio=1, flip=False, transforms=[ dict( type='GlobalRotScaleTrans', rot_range=[0, 0], scale_ratio_range=[1., 1.], translation_std=[0, 0, 0]), dict(type='RandomFlip3D'), dict( type='PointsRangeFilter', point_cloud_range=point_cloud_range), dict( type='DefaultFormatBundle3D', class_names=class_names, with_label=False), dict(type='Collect3D', keys=['points']) ]) ] # construct a pipeline for data and gt loading in show function # please keep its loading function consistent with test_pipeline (e.g. 
client) eval_pipeline = [ dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=5, use_dim=5, file_client_args=file_client_args), dict( type='LoadPointsFromMultiSweeps', sweeps_num=10, file_client_args=file_client_args), dict( type='DefaultFormatBundle3D', class_names=class_names, with_label=False), dict(type='Collect3D', keys=['points']) ] data = dict( samples_per_gpu=4, workers_per_gpu=4, train=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'nuscenes_infos_train.pkl', pipeline=train_pipeline, classes=class_names, modality=input_modality, test_mode=False, # we use box_type_3d='LiDAR' in kitti and nuscenes dataset # and box_type_3d='Depth' in sunrgbd and scannet dataset. box_type_3d='LiDAR'), val=dict( type=dataset_type, ann_file=data_root + 'nuscenes_infos_val.pkl', pipeline=test_pipeline, classes=class_names, modality=input_modality, test_mode=True, box_type_3d='LiDAR'), test=dict( type=dataset_type, data_root=data_root, ann_file=data_root + 'nuscenes_infos_val.pkl', pipeline=test_pipeline, classes=class_names, modality=input_modality, test_mode=True, box_type_3d='LiDAR')) # For nuScenes dataset, we usually evaluate the model at the end of training. # Since the models are trained by 24 epochs by default, we set evaluation # interval to be 24. Please change the interval accordingly if you do not # use a default schedule. evaluation = dict(interval=24, pipeline=eval_pipeline) ================================================ FILE: projects/occ_plugin/__init__.py ================================================ from .core.evaluation.eval_hooks import OccDistEvalHook, OccEvalHook from .core.evaluation.efficiency_hooks import OccEfficiencyHook from .core.visualizer import save_occ from .datasets.pipelines import ( PhotoMetricDistortionMultiViewImage, PadMultiViewImage, NormalizeMultiviewImage, CustomCollect3D) from .occupancy import * ================================================ FILE: projects/occ_plugin/core/__init__.py ================================================ from .evaluation import * from .visualizer import * ================================================ FILE: projects/occ_plugin/core/evaluation/__init__.py ================================================ from .eval_hooks import OccDistEvalHook, OccEvalHook from .efficiency_hooks import OccEfficiencyHook ================================================ FILE: projects/occ_plugin/core/evaluation/efficiency_hooks.py ================================================ import copy from mmcv.runner import HOOKS, Hook import time try: from mmcv.cnn import get_model_complexity_info except ImportError: raise ImportError('Please upgrade mmcv to >0.6.2') import torch import torch.distributed as dist @HOOKS.register_module() class OccEfficiencyHook(Hook): def __init__(self, dataloader, **kwargs): self.dataloader = dataloader self.warm_up = 5 def construct_input(self, DUMMY_SHAPE=None, m_info=None): if m_info is None: m_info = next(iter(self.dataloader)) img_metas = m_info['img_metas'].data input = dict( img_metas=img_metas, ) if 'img_inputs' in m_info.keys(): img_inputs = m_info['img_inputs'] for i in range(len(img_inputs)): if isinstance(img_inputs[i], list): for j in range(len(img_inputs[i])): img_inputs[i][j] = img_inputs[i][j].cuda() else: img_inputs[i] = img_inputs[i].cuda() input['img_inputs'] = img_inputs if 'points' in m_info.keys(): points = m_info['points'].data[0] points[0] = points[0].cuda() input['points'] = points return input def before_run(self, runner): torch.cuda.reset_peak_memory_stats() # 
model = copy.deepcopy(runner.model) # if hasattr(model, 'module'): # model = model.module # if hasattr(model, 'forward_dummy'): # model.forward_train = model.forward_dummy # model.forward_test = model.forward_dummy # model.eval() # else: # raise NotImplementedError( # 'FLOPs counter is currently not supported for {}'.format( # model.__class__.__name__)) # # inf time # pure_inf_time = 0 # itv_sample = 10 # for i, data in enumerate(self.dataloader): # torch.cuda.synchronize() # start_time = time.perf_counter() # with torch.no_grad(): # model(return_loss=False, rescale=True, **self.construct_input(m_info=data)) # torch.cuda.synchronize() # elapsed = time.perf_counter() - start_time # if i >= self.warm_up: # pure_inf_time += elapsed # if (i + 1) % itv_sample == 0: # fps = (i + 1 - self.warm_up) / pure_inf_time # if runner.rank == 0: # runner.logger.info(f'Done sample [{i + 1:<3}/ {itv_sample*5}], ' # f'fps: {fps:.1f} sample / s') # if (i + 1) == itv_sample*5: # pure_inf_time += elapsed # fps = (i + 1 - self.warm_up) / pure_inf_time # if runner.rank == 0: # runner.logger.info(f'Overall fps: {fps:.1f} sample / s') # break # # flops and params # if runner.rank == 0: # flops, params = get_model_complexity_info( # model, (None, None), input_constructor=self.construct_input) # split_line = '=' * 30 # gpu_measure = torch.cuda.max_memory_allocated() / 1024. / 1024. /1024. # runner.logger.info(f'{split_line}\n' f'Flops: {flops}\nParams: {params}\nGPU memory: {gpu_measure:.2f}GB\n{split_line}') if dist.is_available() and dist.is_initialized(): dist.barrier() def after_run(self, runner): pass def before_epoch(self, runner): pass def after_epoch(self, runner): pass def before_iter(self, runner): pass def after_iter(self, runner): pass ================================================ FILE: projects/occ_plugin/core/evaluation/eval_hooks.py ================================================ # Note: Considering that MMCV's EvalHook updated its interface in V1.3.16, # in order to avoid strong version dependency, we did not directly # inherit EvalHook but BaseDistEvalHook. import os.path as osp import torch.distributed as dist from mmcv.runner import DistEvalHook as BaseDistEvalHook from torch.nn.modules.batchnorm import _BatchNorm from mmcv.runner import EvalHook as BaseEvalHook class OccEvalHook(BaseEvalHook): def __init__(self, *args, **kwargs): super(OccEvalHook, self).__init__(*args, **kwargs) def _do_evaluate(self, runner): """perform evaluation and save ckpt.""" if not self._should_evaluate(runner): return from projects.occ_plugin.occupancy.apis.test import custom_single_gpu_test results = custom_single_gpu_test(runner.model, self.dataloader, show=False) runner.log_buffer.output['eval_iter_num'] = len(self.dataloader) key_score = self.evaluate(runner, results) if self.save_best: self._save_ckpt(runner, key_score) class OccDistEvalHook(BaseDistEvalHook): def __init__(self, *args, **kwargs): super(OccDistEvalHook, self).__init__(*args, **kwargs) def _do_evaluate(self, runner): """perform evaluation and save ckpt.""" # Synchronization of BatchNorm's buffer (running_mean # and running_var) is not supported in the DDP of pytorch, # which may cause the inconsistent performance of models in # different ranks, so we broadcast BatchNorm's buffers # of rank 0 to other ranks to avoid this. 
if self.broadcast_bn_buffer: model = runner.model for name, module in model.named_modules(): if isinstance(module, _BatchNorm) and module.track_running_stats: dist.broadcast(module.running_var, 0) dist.broadcast(module.running_mean, 0) if not self._should_evaluate(runner): return tmpdir = self.tmpdir if tmpdir is None: tmpdir = osp.join(runner.work_dir, '.eval_hook') from projects.occ_plugin.occupancy.apis.test import custom_multi_gpu_test # to solve circlur import results = custom_multi_gpu_test( runner.model, self.dataloader, tmpdir=tmpdir, gpu_collect=self.gpu_collect) if runner.rank == 0: print('\n') runner.log_buffer.output['eval_iter_num'] = len(self.dataloader) key_score = self.evaluate(runner, results) if self.save_best: self._save_ckpt(runner, key_score) ================================================ FILE: projects/occ_plugin/core/visualizer/__init__.py ================================================ from .show_occ import save_occ ================================================ FILE: projects/occ_plugin/core/visualizer/show_occ.py ================================================ import torch.nn.functional as F import torch import numpy as np from os import path as osp import os def save_occ(pred_c, pred_f, img_metas, path, visible_mask=None, gt_occ=None, free_id=0, thres_low=0.4, thres_high=0.99): """ visualization saving for paper: 1. gt 2. pred_f pred_c 3. gt visible 4. pred_f visible """ pred_f = F.softmax(pred_f, dim=1) pred_f = pred_f[0].cpu().numpy() # C W H D pred_c = F.softmax(pred_c, dim=1) pred_c = pred_c[0].cpu().numpy() # C W H D visible_mask = visible_mask[0].cpu().numpy().reshape(-1) > 0 # WHD gt_occ = gt_occ.data[0][0].cpu().numpy() # W H D gt_occ[gt_occ==255] = 0 _, W, H, D = pred_f.shape coordinates_3D_f = np.stack(np.meshgrid(np.arange(W), np.arange(H), np.arange(D), indexing='ij'), axis=-1).reshape(-1, 3) # (W*H*D, 3) _, W, H, D = pred_c.shape coordinates_3D_c = np.stack(np.meshgrid(np.arange(W), np.arange(H), np.arange(D), indexing='ij'), axis=-1).reshape(-1, 3) # (W*H*D, 3) pred_f = np.argmax(pred_f, axis=0) # (W, H, D) pred_c = np.argmax(pred_c, axis=0) # (W, H, D) occ_pred_f_mask = (pred_f.reshape(-1))!=free_id occ_pred_c_mask = (pred_c.reshape(-1))!=free_id occ_gt_mask = (gt_occ.reshape(-1))!=free_id pred_f_save = np.concatenate([coordinates_3D_f[occ_pred_f_mask], pred_f.reshape(-1)[occ_pred_f_mask].reshape(-1, 1)], axis=1)[:, [2,1,0,3]] # zyx cls pred_c_save = np.concatenate([coordinates_3D_c[occ_pred_c_mask], pred_c.reshape(-1)[occ_pred_c_mask].reshape(-1, 1)], axis=1)[:, [2,1,0,3]] # zyx cls pred_f_visible_save = np.concatenate([coordinates_3D_f[occ_pred_f_mask&visible_mask], pred_f.reshape(-1)[occ_pred_f_mask&visible_mask].reshape(-1, 1)], axis=1)[:, [2,1,0,3]] # zyx cls gt_save = np.concatenate([coordinates_3D_f[occ_gt_mask], gt_occ.reshape(-1)[occ_gt_mask].reshape(-1, 1)], axis=1)[:, [2,1,0,3]] # zyx cls gt_visible_save = np.concatenate([coordinates_3D_f[occ_gt_mask&visible_mask], gt_occ.reshape(-1)[occ_gt_mask&visible_mask].reshape(-1, 1)], axis=1)[:, [2,1,0,3]] # zyx cls scene_token = img_metas.data[0][0]['scene_token'] lidar_token = img_metas.data[0][0]['lidar_token'] save_path = osp.join(path, scene_token, lidar_token) if not osp.exists(save_path): os.makedirs(save_path) save_pred_f_path = osp.join(save_path, 'pred_f.npy') save_pred_c_path = osp.join(save_path, 'pred_c.npy') save_pred_f_v_path = osp.join(save_path, 'pred_f_visible.npy') save_gt_path = osp.join(save_path, 'gt.npy') save_gt_v_path = osp.join(save_path, 'gt_visible.npy') 
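# (Added note) Each array saved below has shape (N, 4): the indices of occupied voxels in
# the (z, y, x) order noted above, followed by the semantic class id.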
np.save(save_pred_f_path, pred_f_save) np.save(save_pred_c_path, pred_c_save) np.save(save_pred_f_v_path, pred_f_visible_save) np.save(save_gt_path, gt_save) np.save(save_gt_v_path, gt_visible_save) ================================================ FILE: projects/occ_plugin/datasets/__init__.py ================================================ from .nuscenes_dataset import CustomNuScenesDataset from .cam4docc_dataset import Cam4DOccDataset from .cam4docc_lyft_dataset import Cam4DOccLyftDataset from .builder import custom_build_dataset __all__ = [ 'CustomNuScenesDataset', 'NuscOCCDataset' ] ================================================ FILE: projects/occ_plugin/datasets/builder.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import copy import platform import random from functools import partial import numpy as np from mmcv.parallel import collate from mmcv.runner import get_dist_info from mmcv.utils import Registry, build_from_cfg from torch.utils.data import DataLoader from mmdet.datasets.samplers import GroupSampler from projects.occ_plugin.datasets.samplers.group_sampler import DistributedGroupSampler from projects.occ_plugin.datasets.samplers.distributed_sampler import DistributedSampler from projects.occ_plugin.datasets.samplers.sampler import build_sampler def build_dataloader(dataset, samples_per_gpu, workers_per_gpu, num_gpus=1, dist=True, shuffle=True, seed=None, shuffler_sampler=None, nonshuffler_sampler=None, **kwargs): """Build PyTorch DataLoader. In distributed training, each GPU/process has a dataloader. In non-distributed training, there is only one dataloader for all GPUs. Args: dataset (Dataset): A PyTorch dataset. samples_per_gpu (int): Number of training samples on each GPU, i.e., batch size of each GPU. workers_per_gpu (int): How many subprocesses to use for data loading for each GPU. num_gpus (int): Number of GPUs. Only used in non-distributed training. dist (bool): Distributed training/test or not. Default: True. shuffle (bool): Whether to shuffle the data at every epoch. Default: True. kwargs: any keyword argument to be used to initialize DataLoader Returns: DataLoader: A PyTorch dataloader. 
""" rank, world_size = get_dist_info() if dist: # DistributedGroupSampler will definitely shuffle the data to satisfy # that images on each GPU are in the same group if shuffle: sampler = build_sampler(shuffler_sampler if shuffler_sampler is not None else dict(type='DistributedGroupSampler'), dict( dataset=dataset, samples_per_gpu=samples_per_gpu, num_replicas=world_size, rank=rank, seed=seed) ) else: sampler = build_sampler(nonshuffler_sampler if nonshuffler_sampler is not None else dict(type='DistributedSampler'), dict( dataset=dataset, num_replicas=world_size, rank=rank, shuffle=shuffle, seed=seed) ) batch_size = samples_per_gpu num_workers = workers_per_gpu else: print('WARNING!!!!, Only can be used for obtain inference speed!!!!') sampler = GroupSampler(dataset, samples_per_gpu) if shuffle else None batch_size = num_gpus * samples_per_gpu num_workers = num_gpus * workers_per_gpu init_fn = partial( worker_init_fn, num_workers=num_workers, rank=rank, seed=seed) if seed is not None else None data_loader = DataLoader( dataset, batch_size=batch_size, sampler=sampler, num_workers=num_workers, collate_fn=partial(collate, samples_per_gpu=samples_per_gpu), pin_memory=False, worker_init_fn=init_fn, **kwargs) return data_loader def worker_init_fn(worker_id, num_workers, rank, seed): # The seed of each worker equals to # num_worker * rank + worker_id + user_seed worker_seed = num_workers * rank + worker_id + seed np.random.seed(worker_seed) random.seed(worker_seed) # Copyright (c) OpenMMLab. All rights reserved. import platform from mmcv.utils import Registry, build_from_cfg from mmdet.datasets import DATASETS from mmdet.datasets.builder import _concat_dataset if platform.system() != 'Windows': # https://github.com/pytorch/pytorch/issues/973 import resource rlimit = resource.getrlimit(resource.RLIMIT_NOFILE) base_soft_limit = rlimit[0] hard_limit = rlimit[1] soft_limit = min(max(4096, base_soft_limit), hard_limit) resource.setrlimit(resource.RLIMIT_NOFILE, (soft_limit, hard_limit)) OBJECTSAMPLERS = Registry('Object sampler') def custom_build_dataset(cfg, default_args=None): from mmdet3d.datasets.dataset_wrappers import CBGSDataset from mmdet.datasets.dataset_wrappers import (ClassBalancedDataset, ConcatDataset, RepeatDataset) if isinstance(cfg, (list, tuple)): dataset = ConcatDataset([custom_build_dataset(c, default_args) for c in cfg]) elif cfg['type'] == 'ConcatDataset': dataset = ConcatDataset( [custom_build_dataset(c, default_args) for c in cfg['datasets']], cfg.get('separate_eval', True)) elif cfg['type'] == 'RepeatDataset': dataset = RepeatDataset( custom_build_dataset(cfg['dataset'], default_args), cfg['times']) elif cfg['type'] == 'ClassBalancedDataset': dataset = ClassBalancedDataset( custom_build_dataset(cfg['dataset'], default_args), cfg['oversample_thr']) elif cfg['type'] == 'CBGSDataset': dataset = CBGSDataset(custom_build_dataset(cfg['dataset'], default_args)) elif isinstance(cfg.get('ann_file'), (list, tuple)): dataset = _concat_dataset(cfg, default_args) else: dataset = build_from_cfg(cfg, DATASETS, default_args) return dataset ================================================ FILE: projects/occ_plugin/datasets/cam4docc_dataset.py ================================================ # Developed by Junyi Ma based on the codebase of OpenOccupancy and PowerBEV # Cam4DOcc: Benchmark for Camera-Only 4D Occupancy Forecasting in Autonomous Driving Applications # https://github.com/haomo-ai/Cam4DOcc import numpy as np from mmcv.runner import get_dist_info from mmdet.datasets import DATASETS 
from mmdet3d.datasets import NuScenesDataset import os from nuscenes.eval.common.utils import quaternion_yaw, Quaternion from projects.occ_plugin.utils.formating import cm_to_ious, format_iou_results from projects.occ_plugin.utils.geometry import convert_egopose_to_matrix_numpy, invert_matrix_egopose_numpy from nuscenes import NuScenes from pyquaternion import Quaternion import torch import random import time @DATASETS.register_module() class Cam4DOccDataset(NuScenesDataset): def __init__(self, occ_size, pc_range, occ_root, idx_root, ori_data_root, data_root, time_receptive_field, n_future_frames, classes, use_separate_classes, train_capacity, test_capacity, **kwargs): ''' Cam4DOccDataset contains sequential occupancy states as well as instance flow for training occupancy forecasting models. We unify the related operations in the LiDAR coordinate system following OpenOccupancy. occ_size: number of grids along H W L, default: [512, 512, 40] pc_range: predefined ranges along H W L, default: [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0] occ_root: data path of nuScenes-Occupancy idx_root: save path of test indexes time_receptive_field: number of historical frames used for forecasting (including the present one), default: 3 n_future_frames: number of forecasted future frames, default: 4 classes: predefiend categories in GMO use_separate_classes: separate movable objects instead of the general one train_capacity: number of sequences used for training, default: 23930 test_capacity: number of sequences used for testing, default: 5119 ''' self.train_capacity = train_capacity self.test_capacity = test_capacity super().__init__(**kwargs) rank, world_size = get_dist_info() self.time_receptive_field = time_receptive_field self.n_future_frames = n_future_frames self.sequence_length = time_receptive_field + n_future_frames if rank == 0: print("-------------") print("use past " + str(self.time_receptive_field) + " frames to forecast future " + str(self.n_future_frames) + " frames") print("-------------") self.data_infos = list(sorted(self.data_infos, key=lambda e: e['timestamp'])) self.data_infos = self.data_infos[::self.load_interval] self.occ_size = occ_size self.pc_range = pc_range self.occ_root = occ_root self.idx_root = idx_root self.ori_data_root = ori_data_root self.data_root = data_root self.classes = classes self.use_separate_classes = use_separate_classes self.indices = self.get_indices() self.present_scene_lidar_token = " " self._set_group_flag() # load origin nusc dataset for instance annotation self.nusc = NuScenes(version='v1.0-trainval', dataroot=self.data_root, verbose=False) if self.test_mode: self.chosen_list = random.sample(range(0, self.test_capacity) , self.test_capacity) self.chosen_list_num = len(self.chosen_list) else: self.chosen_list = random.sample(range(0, self.train_capacity) , self.train_capacity) self.chosen_list_num = len(self.chosen_list) def _set_group_flag(self): if self.test_mode: self.flag = np.zeros(self.test_capacity, dtype=np.uint8) else: self.flag = np.zeros(self.train_capacity, dtype=np.uint8) def __len__(self): if self.test_mode: return self.test_capacity else: return self.train_capacity def __getitem__(self, idx): idx = int(self.chosen_list[idx]) self.egopose_list = [] self.ego2lidar_list = [] self.visible_instance_set = set() self.instance_dict = {} if self.test_mode: return self.prepare_test_data(idx) while True: data = self.prepare_train_data(idx) if data is None: idx = self._rand_another(idx) idx = int(self.chosen_list[idx]) continue return data def 
get_indices(self): ''' Generate sequential indexes for training and testing ''' indices = [] for index in range(len(self.data_infos)): is_valid_data = True previous_rec = None current_indices = [] for t in range(self.sequence_length): index_t = index + t # Going over the dataset size limit. if index_t >= len(self.data_infos): is_valid_data = False break rec = self.data_infos[index_t] # Check if scene is the same if (previous_rec is not None) and (rec['scene_token'] != previous_rec['scene_token']): is_valid_data = False break current_indices.append(index_t) previous_rec = rec if is_valid_data: indices.append(current_indices) return np.asarray(indices) def get_lidar_pose(self, rec): ''' Get global poses for following bbox transforming ''' ego2global_translation = rec['ego2global_translation'] ego2global_rotation = rec['ego2global_rotation'] trans = -np.array(ego2global_translation) rot = Quaternion(ego2global_rotation).inverse return trans, rot def get_ego2lidar_pose(self, rec): ''' Get LiDAR poses in ego system ''' lidar2ego_translation = rec['lidar2ego_translation'] lidar2ego_rotation = rec['lidar2ego_rotation'] trans = -np.array(lidar2ego_translation) rot = Quaternion(lidar2ego_rotation).inverse return trans, rot def record_instance(self, idx, instance_map): """ Record information about each visible instance in the sequence and assign a unique ID to it """ rec = self.data_infos[idx] translation, rotation = self.get_lidar_pose(rec) self.egopose_list.append([translation, rotation]) ego2lidar_translation, ego2lidar_rotation = self.get_ego2lidar_pose(rec) self.ego2lidar_list.append([ego2lidar_translation, ego2lidar_rotation]) current_sample = self.nusc.get('sample', rec['token']) for annotation_token in current_sample['anns']: annotation = self.nusc.get('sample_annotation', annotation_token) # Instance extraction for Cam4DOcc-V1 # Filter out all non vehicle instances # if 'vehicle' not in annotation['category_name']: # continue gmo_flag = False for class_name in self.classes: if class_name in annotation['category_name']: gmo_flag = True break if not gmo_flag: continue # Specify semantic id if use_separate_classes semantic_id = 1 if self.use_separate_classes: if 'vehicle.bicycle' in annotation['category_name']: # rm static_object.bicycle_rack semantic_id = 1 elif 'bus' in annotation['category_name']: semantic_id = 2 elif 'car' in annotation['category_name']: semantic_id = 3 elif 'construction' in annotation['category_name']: semantic_id = 4 elif 'motorcycle' in annotation['category_name']: semantic_id = 5 elif 'trailer' in annotation['category_name']: semantic_id = 6 elif 'truck' in annotation['category_name']: semantic_id = 7 elif 'pedestrian' in annotation['category_name']: semantic_id = 8 # Filter out invisible vehicles FILTER_INVISIBLE_VEHICLES = True if FILTER_INVISIBLE_VEHICLES and int(annotation['visibility_token']) == 1 and annotation['instance_token'] not in self.visible_instance_set: continue # Filter out vehicles that have not been seen in the past if self.counter >= self.time_receptive_field and annotation['instance_token'] not in self.visible_instance_set: continue self.visible_instance_set.add(annotation['instance_token']) if annotation['instance_token'] not in instance_map: instance_map[annotation['instance_token']] = len(instance_map) + 1 instance_id = instance_map[annotation['instance_token']] instance_attribute = int(annotation['visibility_token']) if annotation['instance_token'] not in self.instance_dict: # For the first occurrence of an instance 
self.instance_dict[annotation['instance_token']] = { 'timestep': [self.counter], 'translation': [annotation['translation']], 'rotation': [annotation['rotation']], 'size': annotation['size'], 'instance_id': instance_id, 'semantic_id': semantic_id, 'attribute_label': [instance_attribute], } else: # For the instance that have appeared before self.instance_dict[annotation['instance_token']]['timestep'].append(self.counter) self.instance_dict[annotation['instance_token']]['translation'].append(annotation['translation']) self.instance_dict[annotation['instance_token']]['rotation'].append(annotation['rotation']) self.instance_dict[annotation['instance_token']]['attribute_label'].append(instance_attribute) return instance_map def get_future_egomotion(self, idx): ''' Calculate LiDAR pose updates between idx and idx+1 ''' rec_t0 = self.data_infos[idx] future_egomotion = np.eye(4, dtype=np.float32) if idx < len(self.data_infos) - 1: rec_t1 = self.data_infos[idx + 1] if rec_t0['scene_token'] == rec_t1['scene_token']: egopose_t0_trans = rec_t0['ego2global_translation'] egopose_t0_rot = rec_t0['ego2global_rotation'] egopose_t1_trans = rec_t1['ego2global_translation'] egopose_t1_rot = rec_t1['ego2global_rotation'] egopose_t0 = convert_egopose_to_matrix_numpy(egopose_t0_trans, egopose_t0_rot) egopose_t1 = convert_egopose_to_matrix_numpy(egopose_t1_trans, egopose_t1_rot) lidar2ego_t0_trans = rec_t0['lidar2ego_translation'] lidar2ego_t0_rot = rec_t0['lidar2ego_rotation'] lidar2ego_t1_trans = rec_t1['lidar2ego_translation'] lidar2ego_t1_rot = rec_t1['lidar2ego_rotation'] lidar2ego_t0 = convert_egopose_to_matrix_numpy(lidar2ego_t0_trans, lidar2ego_t0_rot) lidar2ego_t1 = convert_egopose_to_matrix_numpy(lidar2ego_t1_trans, lidar2ego_t1_rot) future_egomotion = invert_matrix_egopose_numpy(lidar2ego_t1).dot(invert_matrix_egopose_numpy(egopose_t1)).dot(egopose_t0).dot(lidar2ego_t0) future_egomotion = torch.Tensor(future_egomotion).float() # Convert to 6DoF vector return future_egomotion.unsqueeze(0) @staticmethod def _check_consistency(translation, prev_translation, threshold=1.0): """ Check for significant displacement of the instance adjacent moments """ x, y = translation[:2] prev_x, prev_y = prev_translation[:2] if abs(x - prev_x) > threshold or abs(y - prev_y) > threshold: return False return True def refine_instance_poly(self, instance): """ Fix the missing frames and disturbances of ground truth caused by noise """ pointer = 1 for i in range(instance['timestep'][0] + 1, self.sequence_length): # Fill in the missing frames if i not in instance['timestep']: instance['timestep'].insert(pointer, i) instance['translation'].insert(pointer, instance['translation'][pointer-1]) instance['rotation'].insert(pointer, instance['rotation'][pointer-1]) instance['attribute_label'].insert(pointer, instance['attribute_label'][pointer-1]) pointer += 1 continue # Eliminate observation disturbances if self._check_consistency(instance['translation'][pointer], instance['translation'][pointer-1]): instance['translation'][pointer] = instance['translation'][pointer-1] instance['rotation'][pointer] = instance['rotation'][pointer-1] instance['attribute_label'][pointer] = instance['attribute_label'][pointer-1] pointer += 1 return instance def prepare_train_data(self, index): ''' Generate a training sequence ''' input_dict = self.get_data_info(index) if input_dict is None: return None example = self.prepare_sequential_data(index) return example def prepare_test_data(self, index): ''' Generate a test sequence TODO: Give additional 
functions here such as visualization ''' input_dict = self.get_data_info(index) if input_dict is None: return None example = self.prepare_sequential_data(index) # TODO: visualize example data return example def prepare_sequential_data(self, index): ''' Use the predefined pipeline to generate inputs of the baseline network and ground truth for the standard evaluation protocol in Cam4DOcc ''' instance_map = {} input_seq_data = {} keys = ['input_dict','future_egomotion', 'sample_token'] for key in keys: input_seq_data[key] = [] scene_lidar_token = [] for self.counter, index_t in enumerate(self.indices[index]): input_dict_per_frame = self.get_data_info(index_t) if input_dict_per_frame is None: return None input_seq_data['input_dict'].append(input_dict_per_frame) input_seq_data['sample_token'].append(input_dict_per_frame['sample_idx']) instance_map = self.record_instance(index_t, instance_map) future_egomotion = self.get_future_egomotion(index_t) input_seq_data['future_egomotion'].append(future_egomotion) scene_lidar_token.append(input_dict_per_frame['scene_token']+"_"+input_dict_per_frame['lidar_token']) if self.counter == self.time_receptive_field - 1: self.present_scene_lidar_token = input_dict_per_frame['scene_token']+"_"+input_dict_per_frame['lidar_token'] # save sequential test indexes for possible evaluation if self.test_mode: test_idx_path = os.path.join(self.idx_root, "test_ids") if not os.path.exists(test_idx_path): os.mkdir(test_idx_path) np.savez(os.path.join(test_idx_path, self.present_scene_lidar_token), scene_lidar_token) for token in self.instance_dict.keys(): self.instance_dict[token] = self.refine_instance_poly(self.instance_dict[token]) input_seq_data.update( dict( time_receptive_field=self.time_receptive_field, sequence_length=self.sequence_length, egopose_list=self.egopose_list, ego2lidar_list=self.ego2lidar_list, instance_dict=self.instance_dict, instance_map=instance_map, indices=self.indices[index], scene_token=self.present_scene_lidar_token, )) example = self.pipeline(input_seq_data) return example def get_data_info(self, index): ''' get_data_info from .pkl also used by OpenOccupancy ''' info = self.data_infos[index] # standard protocal modified from SECOND.Pytorch input_dict = dict( sample_idx=info['token'], pts_filename=info['lidar_path'], sweeps=info['sweeps'], lidar2ego_translation=info['lidar2ego_translation'], lidar2ego_rotation=info['lidar2ego_rotation'], ego2global_translation=info['ego2global_translation'], ego2global_rotation=info['ego2global_rotation'], prev_idx=info['prev'], next_idx=info['next'], scene_token=info['scene_token'], can_bus=info['can_bus'], # frame_idx=info['frame_idx'], timestamp=info['timestamp'] / 1e6, occ_size = np.array(self.occ_size), pc_range = np.array(self.pc_range), lidar_token=info['lidar_token'], lidarseg=info['lidarseg'], curr=info, ) if self.modality['use_camera']: image_paths = [] lidar2img_rts = [] lidar2cam_rts = [] cam_intrinsics = [] lidar2cam_dic = {} for cam_type, cam_info in info['cams'].items(): image_paths.append(cam_info['data_path']) # obtain lidar to image transformation matrix lidar2cam_r = np.linalg.inv(cam_info['sensor2lidar_rotation']) lidar2cam_t = cam_info[ 'sensor2lidar_translation'] @ lidar2cam_r.T lidar2cam_rt = np.eye(4) lidar2cam_rt[:3, :3] = lidar2cam_r.T lidar2cam_rt[3, :3] = -lidar2cam_t intrinsic = cam_info['cam_intrinsic'] viewpad = np.eye(4) viewpad[:intrinsic.shape[0], :intrinsic.shape[1]] = intrinsic lidar2img_rt = (viewpad @ lidar2cam_rt.T) lidar2img_rts.append(lidar2img_rt) 
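# (Added note) lidar2img_rt composes the zero-padded camera intrinsics (viewpad) with the
# LiDAR-to-camera extrinsics, so it maps homogeneous LiDAR coordinates directly to image pixels.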
cam_intrinsics.append(viewpad) lidar2cam_rts.append(lidar2cam_rt.T) lidar2cam_dic[cam_type] = lidar2cam_rt.T input_dict.update( dict( img_filename=image_paths, lidar2img=lidar2img_rts, cam_intrinsic=cam_intrinsics, lidar2cam=lidar2cam_rts, lidar2cam_dic=lidar2cam_dic, )) return input_dict def evaluate(self, results, logger=None, **kawrgs): ''' Evaluate by IOU and VPQ metrics for model evaluation ''' eval_results = {} ''' calculate IOU ''' hist_for_iou = sum(results['hist_for_iou']) ious = cm_to_ious(hist_for_iou) res_table, res_dic = format_iou_results(ious, return_dic=True) for key, val in res_dic.items(): eval_results['IOU_{}'.format(key)] = val if logger is not None: logger.info('IOU Evaluation') logger.info(res_table) ''' calculate VPQ ''' if 'vpq_metric' in results.keys() and 'vpq_len' in results.keys(): vpq_sum = sum(results['vpq_metric']) eval_results['VPQ'] = vpq_sum/results['vpq_len'] return eval_results ================================================ FILE: projects/occ_plugin/datasets/cam4docc_lyft_dataset.py ================================================ # Developed by Junyi Ma based on the codebase of OpenOccupancy and PowerBEV # Cam4DOcc: Benchmark for Camera-Only 4D Occupancy Forecasting in Autonomous Driving Applications # https://github.com/haomo-ai/Cam4DOcc import numpy as np from mmcv.runner import get_dist_info from mmdet.datasets import DATASETS from mmdet3d.datasets import NuScenesDataset from mmdet3d.datasets.pipelines import Compose from torch.utils.data import Dataset from lyft_dataset_sdk.lyftdataset import LyftDataset import os from nuscenes.eval.common.utils import quaternion_yaw, Quaternion from projects.occ_plugin.utils.formating import cm_to_ious, format_iou_results from projects.occ_plugin.utils.geometry import convert_egopose_to_matrix_numpy, invert_matrix_egopose_numpy from nuscenes import NuScenes from pyquaternion import Quaternion import torch import random import time @DATASETS.register_module() class Cam4DOccLyftDataset(Dataset): def __init__(self, occ_size, pc_range, occ_root, idx_root, ori_data_root, data_root, time_receptive_field, n_future_frames, classes, use_separate_classes, train_capacity, test_capacity, test_mode=False, pipeline=None, **kwargs): ''' Cam4DOccLyftDataset contains sequential occupancy states as well as instance flow for training occupancy forecasting models. We unify the related operations in the LiDAR coordinate system following OpenOccupancy. 
occ_size: number of grids along H W L, default: [512, 512, 40] pc_range: predefined ranges along H W L, default: [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0] occ_root: data path of nuScenes-Occupancy idx_root: save path of test indexes time_receptive_field: number of historical frames used for forecasting (including the present one), default: 3 n_future_frames: number of forecasted future frames, default: 4 classes: predefiend categories in GMO use_separate_classes: separate movable objects instead of the general one train_capacity: number of sequences used for training, default: 23930 test_capacity: number of sequences used for testing, default: 5119 ''' self.test_mode = test_mode self.CLASSES = classes self.train_capacity = train_capacity self.test_capacity = test_capacity super().__init__() # training and test indexes following PowerBEV self.TRAIN_LYFT_INDICES = [1, 3, 5, 6, 7, 8, 9, 10, 11, 12, 14, 15, 16, 17, 18, 19, 20, 21, 23, 24, 27, 28, 29, 30, 31, 32, 33, 35, 36, 37, 39, 41, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 55, 56, 59, 60, 62, 63, 65, 68, 69, 70, 71, 72, 73, 74, 75, 76, 78, 79, 81, 82, 83, 84, 86, 87, 88, 89, 93, 95, 97, 98, 99, 103, 104, 107, 108, 109, 110, 111, 113, 114, 115, 116, 117, 118, 119, 121, 122, 124, 127, 128, 130, 131, 132, 134, 135, 136, 137, 138, 139, 143, 144, 146, 147, 148, 149, 150, 151, 152, 153, 154, 156, 157, 158, 159, 161, 162, 165, 166, 167, 171, 172, 173, 174, 175, 176, 177, 178, 179] self.VAL_LYFT_INDICES = [0, 2, 4, 13, 22, 25, 26, 34, 38, 40, 42, 54, 57, 58, 61, 64, 66, 67, 77, 80, 85, 90, 91, 92, 94, 96, 100, 101, 102, 105, 106, 112, 120, 123, 125, 126, 129, 133, 140, 141, 142, 145, 155, 160, 163, 164, 168, 169, 170] rank, world_size = get_dist_info() self.time_receptive_field = time_receptive_field self.n_future_frames = n_future_frames self.sequence_length = time_receptive_field + n_future_frames if rank == 0: print("-------------") print("use past " + str(self.time_receptive_field) + " frames to forecast future " + str(self.n_future_frames) + " frames") print("-------------") self.occ_size = occ_size self.pc_range = pc_range self.occ_root = occ_root self.idx_root = idx_root self.ori_data_root = ori_data_root self.data_root = data_root self.classes = classes self.use_separate_classes = use_separate_classes self.pipeline = Compose(pipeline) # load origin nusc dataset for instance annotation self.lyft = LyftDataset(data_path=self.data_root, json_path=os.path.join(self.data_root, 'train_data'), verbose=False) self.scenes = self.get_scenes() self.ixes = self.get_samples() self.indices = self.get_indices() self.present_scene_lidar_token = " " self._set_group_flag() if self.test_mode: self.chosen_list = random.sample(range(0, self.test_capacity) , self.test_capacity) self.chosen_list_num = len(self.chosen_list) else: self.chosen_list = random.sample(range(0, self.train_capacity) , self.train_capacity) self.chosen_list_num = len(self.chosen_list) def _set_group_flag(self): if self.test_mode: self.flag = np.zeros(self.test_capacity, dtype=np.uint8) else: self.flag = np.zeros(self.train_capacity, dtype=np.uint8) def __len__(self): if self.test_mode: return self.test_capacity else: return self.train_capacity def __getitem__(self, idx): idx = int(self.chosen_list[idx]) self.egopose_list = [] self.ego2lidar_list = [] self.visible_instance_set = set() self.instance_dict = {} if self.test_mode: return self.prepare_test_data(idx) while True: data = self.prepare_train_data(idx) if data is None: idx = self._rand_another(idx) idx = int(self.chosen_list[idx]) 
continue return data def get_scenes(self): """ Obtain the list of scenes names in the given split. """ scenes = [row['name'] for row in self.lyft.scene] # split in train/val indices = self.VAL_LYFT_INDICES if self.test_mode else self.TRAIN_LYFT_INDICES scenes = [scenes[i] for i in indices] return scenes def get_samples(self): """ Find and sort the samples in the given split by scene. """ samples = [sample for sample in self.lyft.sample] # remove samples that aren't in this split samples = [sample for sample in samples if self.lyft.get('scene', sample['scene_token'])['name'] in self.scenes] # sort by scene, timestamp (only to make chronological viz easier) samples.sort(key=lambda x: (x['scene_token'], x['timestamp'])) return samples def get_indices(self): ''' Generate sequential indexes for training and testing ''' indices = [] for index in range(len(self.ixes)): is_valid_data = True previous_rec = None current_indices = [] for t in range(self.sequence_length): index_t = index + t # Going over the dataset size limit. if index_t >= len(self.ixes): is_valid_data = False break rec = self.ixes[index_t] # Check if scene is the same if (previous_rec is not None) and (rec['scene_token'] != previous_rec['scene_token']): is_valid_data = False break current_indices.append(index_t) previous_rec = rec if is_valid_data: indices.append(current_indices) return np.asarray(indices) def get_lidar_pose(self, rec): ''' Get global poses for following bbox transforming ''' current_sample = self.lyft.get('sample', rec['token']) egopose = self.lyft.get('ego_pose', self.lyft.get('sample_data', current_sample['data']['LIDAR_TOP'])['ego_pose_token']) ego2global_translation = egopose['translation'] ego2global_rotation = egopose['rotation'] trans = -np.array(ego2global_translation) rot = Quaternion(ego2global_rotation).inverse return trans, rot def get_ego2lidar_pose(self, rec): ''' Get LiDAR poses in ego system ''' current_sample = self.lyft.get('sample', rec['token']) lidar_top_data = self.lyft.get('sample_data', current_sample['data']['LIDAR_TOP']) lidar2ego_translation = self.lyft.get('calibrated_sensor', lidar_top_data['calibrated_sensor_token'])['translation'] lidar2ego_rotation = self.lyft.get('calibrated_sensor', lidar_top_data['calibrated_sensor_token'])['rotation'] trans = -np.array(lidar2ego_translation) rot = Quaternion(lidar2ego_rotation).inverse return trans, rot def record_instance(self, idx, instance_map): """ Record information about each visible instance in the sequence and assign a unique ID to it """ rec = self.ixes[idx] translation, rotation = self.get_lidar_pose(rec) self.egopose_list.append([translation, rotation]) ego2lidar_translation, ego2lidar_rotation = self.get_ego2lidar_pose(rec) self.ego2lidar_list.append([ego2lidar_translation, ego2lidar_rotation]) current_sample = self.lyft.get('sample', rec['token']) for annotation_token in current_sample['anns']: annotation = self.lyft.get('sample_annotation', annotation_token) # Instance extraction for Cam4DOcc-V1 # Filter out all non vehicle instances # if 'vehicle' not in annotation['category_name']: # continue gmo_flag = False for class_name in self.classes: if class_name in annotation['category_name']: gmo_flag = True break if not gmo_flag: continue # Specify semantic id if use_separate_classes semantic_id = 1 if self.use_separate_classes: if 'bicycle' in annotation['category_name']: semantic_id = 1 elif 'bus' in annotation['category_name']: semantic_id = 2 elif 'car' in annotation['category_name']: semantic_id = 3 elif 'construction' in 
annotation['category_name']: semantic_id = 4 elif 'motorcycle' in annotation['category_name']: semantic_id = 5 elif 'trailer' in annotation['category_name']: semantic_id = 6 elif 'truck' in annotation['category_name']: semantic_id = 7 elif 'pedestrian' in annotation['category_name']: semantic_id = 8 if annotation['instance_token'] not in instance_map: instance_map[annotation['instance_token']] = len(instance_map) + 1 instance_id = instance_map[annotation['instance_token']] instance_attribute = 1 # deprecated if annotation['instance_token'] not in self.instance_dict: # For the first occurrence of an instance self.instance_dict[annotation['instance_token']] = { 'timestep': [self.counter], 'translation': [annotation['translation']], 'rotation': [annotation['rotation']], 'size': annotation['size'], 'instance_id': instance_id, 'semantic_id': semantic_id, 'attribute_label': [instance_attribute], } else: # For the instance that have appeared before self.instance_dict[annotation['instance_token']]['timestep'].append(self.counter) self.instance_dict[annotation['instance_token']]['translation'].append(annotation['translation']) self.instance_dict[annotation['instance_token']]['rotation'].append(annotation['rotation']) self.instance_dict[annotation['instance_token']]['attribute_label'].append(instance_attribute) return instance_map def get_future_egomotion(self, idx): ''' Calculate LiDAR pose updates between idx and idx+1 ''' rec_t0 = self.ixes[idx] future_egomotion = np.eye(4, dtype=np.float32) if idx < len(self.ixes) - 1: rec_t1 = self.ixes[idx + 1] if rec_t0['scene_token'] == rec_t1['scene_token']: egopose_t0 = self.lyft.get('ego_pose', self.lyft.get('sample_data', rec_t0['data']['LIDAR_TOP'])['ego_pose_token']) egopose_t0_trans = egopose_t0['translation'] egopose_t0_rot = egopose_t0['rotation'] egopose_t1 = self.lyft.get('ego_pose', self.lyft.get('sample_data', rec_t1['data']['LIDAR_TOP'])['ego_pose_token']) egopose_t1_trans = egopose_t1['translation'] egopose_t1_rot = egopose_t1['rotation'] egopose_t0 = convert_egopose_to_matrix_numpy(egopose_t0_trans, egopose_t0_rot) egopose_t1 = convert_egopose_to_matrix_numpy(egopose_t1_trans, egopose_t1_rot) lidar_top_data_t0 = self.lyft.get('sample_data', rec_t0['data']['LIDAR_TOP']) lidar2ego_t0_trans = self.lyft.get('calibrated_sensor', lidar_top_data_t0['calibrated_sensor_token'])['translation'] lidar2ego_t0_rot = self.lyft.get('calibrated_sensor', lidar_top_data_t0['calibrated_sensor_token'])['rotation'] lidar_top_data_t1 = self.lyft.get('sample_data', rec_t1['data']['LIDAR_TOP']) lidar2ego_t1_trans = self.lyft.get('calibrated_sensor', lidar_top_data_t1['calibrated_sensor_token'])['translation'] lidar2ego_t1_rot = self.lyft.get('calibrated_sensor', lidar_top_data_t1['calibrated_sensor_token'])['rotation'] lidar2ego_t0 = convert_egopose_to_matrix_numpy(lidar2ego_t0_trans, lidar2ego_t0_rot) lidar2ego_t1 = convert_egopose_to_matrix_numpy(lidar2ego_t1_trans, lidar2ego_t1_rot) future_egomotion = invert_matrix_egopose_numpy(lidar2ego_t1).dot(invert_matrix_egopose_numpy(egopose_t1)).dot(egopose_t0).dot(lidar2ego_t0) future_egomotion = torch.Tensor(future_egomotion).float() return future_egomotion.unsqueeze(0) @staticmethod def _check_consistency(translation, prev_translation, threshold=1.0): """ Check for significant displacement of the instance adjacent moments """ x, y = translation[:2] prev_x, prev_y = prev_translation[:2] if abs(x - prev_x) > threshold or abs(y - prev_y) > threshold: return False return True def refine_instance_poly(self, instance): """ 
Fix the missing frames and disturbances of ground truth caused by noise """ pointer = 1 for i in range(instance['timestep'][0] + 1, self.sequence_length): # Fill in the missing frames if i not in instance['timestep']: instance['timestep'].insert(pointer, i) instance['translation'].insert(pointer, instance['translation'][pointer-1]) instance['rotation'].insert(pointer, instance['rotation'][pointer-1]) instance['attribute_label'].insert(pointer, instance['attribute_label'][pointer-1]) pointer += 1 continue # Eliminate observation disturbances if self._check_consistency(instance['translation'][pointer], instance['translation'][pointer-1]): instance['translation'][pointer] = instance['translation'][pointer-1] instance['rotation'][pointer] = instance['rotation'][pointer-1] instance['attribute_label'][pointer] = instance['attribute_label'][pointer-1] pointer += 1 return instance def prepare_train_data(self, index): ''' Generate a training sequence ''' example = self.prepare_sequential_data(index) return example def prepare_test_data(self, index): ''' Generate a test sequence TODO: Give additional functions here such as visualization ''' example = self.prepare_sequential_data(index) # TODO: visualize example data return example def prepare_sequential_data(self, index): ''' Use the predefined pipeline to generate inputs of the baseline network and ground truth for the standard evaluation protocol in Cam4DOcc ''' instance_map = {} input_seq_data = {} keys = ['input_dict','future_egomotion', 'sample_token'] for key in keys: input_seq_data[key] = [] scene_lidar_token = [] for self.counter, index_t in enumerate(self.indices[index]): input_dict_per_frame = {} rec = self.ixes[index_t] # sample lidar_top_data = self.lyft.get('sample_data', rec['data']['LIDAR_TOP']) lidar2ego_translation = self.lyft.get('calibrated_sensor', lidar_top_data['calibrated_sensor_token'])['translation'] lidar2ego_rotation = self.lyft.get('calibrated_sensor', lidar_top_data['calibrated_sensor_token'])['rotation'] egopose = self.lyft.get('ego_pose', self.lyft.get('sample_data', rec['data']['LIDAR_TOP'])['ego_pose_token']) ego2global_translation = egopose['translation'] ego2global_rotation = egopose['rotation'] input_dict_per_frame['lidar2ego_translation'] = lidar2ego_translation input_dict_per_frame['lidar2ego_rotation'] = lidar2ego_rotation input_dict_per_frame['ego2global_translation'] = ego2global_translation input_dict_per_frame['ego2global_rotation'] = ego2global_rotation input_dict_per_frame['scene_token'] = rec['scene_token'] input_dict_per_frame['lidar_token'] = rec['data']['LIDAR_TOP'] input_dict_per_frame['occ_size'] = np.array(self.occ_size) input_dict_per_frame['pc_range'] = np.array(self.pc_range) input_dict_per_frame['sample_idx'] = rec['token'] image_paths = [] lidar2img_rts = [] lidar2cam_rts = [] cam_intrinsics = [] cam_intrinsics_ori = [] lidar2cam_dic = {} lidar_sample = self.lyft.get('sample_data', rec['data']['LIDAR_TOP']) lidar_pose = self.lyft.get('ego_pose', lidar_sample['ego_pose_token']) lidar_rotation = Quaternion(lidar_pose['rotation']) lidar_translation = np.array(lidar_pose['translation'])[:, None] lidar_to_world = np.vstack([ np.hstack((lidar_rotation.rotation_matrix, lidar_translation)), np.array([0, 0, 0, 1]) ]) lidar_sample_calib = self.lyft.get('calibrated_sensor', lidar_sample['calibrated_sensor_token']) lidar_sensor_rotation = Quaternion(lidar_sample_calib['rotation']) lidar_sensor_translation = np.array(lidar_sample_calib['translation'])[:, None] lidar_to_lidarego = np.vstack([ 
np.hstack((lidar_sensor_rotation.rotation_matrix, lidar_sensor_translation)), np.array([0, 0, 0, 1]) ]) cameras = ['CAM_FRONT_LEFT', 'CAM_FRONT', 'CAM_FRONT_RIGHT', 'CAM_BACK_LEFT', 'CAM_BACK', 'CAM_BACK_RIGHT'] for cam in cameras: camera_sample = self.lyft.get('sample_data', rec['data'][cam]) image_paths.append(os.path.join("/tos://haomo-algorithms/c6089dc67ff976615510d22b5eaaaa4e/mjy/cam4docc/data/lyft/", camera_sample['filename'])) car_egopose = self.lyft.get('ego_pose', camera_sample['ego_pose_token']) egopose_rotation = Quaternion(car_egopose['rotation']).inverse egopose_translation = -np.array(car_egopose['translation'])[:, None] world_to_car_egopose = np.vstack([ np.hstack((egopose_rotation.rotation_matrix, egopose_rotation.rotation_matrix @ egopose_translation)), np.array([0, 0, 0, 1]) ]) sensor_sample = self.lyft.get('calibrated_sensor', camera_sample['calibrated_sensor_token']) intrinsic = torch.Tensor(sensor_sample['camera_intrinsic']) cam_intrinsics_ori.append(intrinsic) sensor_rotation = Quaternion(sensor_sample['rotation']) sensor_translation = np.array(sensor_sample['translation'])[:, None] car_egopose_to_sensor = np.vstack([ np.hstack((sensor_rotation.rotation_matrix, sensor_translation)), np.array([0, 0, 0, 1]) ]) car_egopose_to_sensor = np.linalg.inv(car_egopose_to_sensor) lidar_to_sensor = car_egopose_to_sensor @ world_to_car_egopose @ lidar_to_world @ lidar_to_lidarego sensor_to_lidar =np.linalg.inv(lidar_to_sensor) lidar2cam_r = lidar_to_sensor[:3, :3] lidar2cam_t = sensor_to_lidar[:3, -1].reshape(1,3) @ lidar2cam_r.T lidar2cam_rt = np.eye(4) lidar2cam_rt[:3, :3] = lidar2cam_r.T lidar2cam_rt[3, :3] = -lidar2cam_t viewpad = np.eye(4) viewpad[:intrinsic.shape[0], :intrinsic.shape[1]] = intrinsic lidar2img_rt = (viewpad @ lidar2cam_rt.T) lidar2img_rts.append(lidar2img_rt) cam_intrinsics.append(viewpad) lidar2cam_rts.append(lidar2cam_rt.T) lidar2cam_dic[cam] = lidar2cam_rt.T input_dict_per_frame.update( dict( img_filename=image_paths, lidar2img=lidar2img_rts, cam_intrinsic=cam_intrinsics, cam_intrinsics=cam_intrinsics_ori, lidar2cam=lidar2cam_rts, lidar2cam_dic=lidar2cam_dic, )) input_seq_data['input_dict'].append(input_dict_per_frame) instance_map = self.record_instance(index_t, instance_map) future_egomotion = self.get_future_egomotion(index_t) input_seq_data['future_egomotion'].append(future_egomotion) input_seq_data['sample_token'].append(input_dict_per_frame['sample_idx']) scene_lidar_token.append(input_dict_per_frame['scene_token']+"_"+input_dict_per_frame['lidar_token']) if self.counter == self.time_receptive_field - 1: self.present_scene_lidar_token = input_dict_per_frame['scene_token']+"_"+input_dict_per_frame['lidar_token'] for token in self.instance_dict.keys(): self.instance_dict[token] = self.refine_instance_poly(self.instance_dict[token]) input_seq_data.update( dict( time_receptive_field=self.time_receptive_field, sequence_length=self.sequence_length, egopose_list=self.egopose_list, ego2lidar_list=self.ego2lidar_list, instance_dict=self.instance_dict, instance_map=instance_map, indices=self.indices[index], scene_token=self.present_scene_lidar_token, )) example = self.pipeline(input_seq_data) return example def evaluate(self, results, logger=None, **kawrgs): ''' Evaluate by IOU and VPQ metrics for model evaluation ''' eval_results = {} ''' calculate IOU ''' hist_for_iou = sum(results['hist_for_iou']) ious = cm_to_ious(hist_for_iou) res_table, res_dic = format_iou_results(ious, return_dic=True) for key, val in res_dic.items(): eval_results['IOU_{}'.format(key)] 
= val if logger is not None: logger.info('IOU Evaluation') logger.info(res_table) ''' calculate VPQ ''' if 'vpq_metric' in results.keys() and 'vpq_len' in results.keys(): vpq_sum = sum(results['vpq_metric']) eval_results['VPQ'] = vpq_sum/results['vpq_len'] return eval_results ================================================ FILE: projects/occ_plugin/datasets/nuscenes_dataset.py ================================================ import copy import numpy as np from mmdet.datasets import DATASETS from mmdet3d.datasets import NuScenesDataset import mmcv from os import path as osp from mmdet.datasets import DATASETS import torch import numpy as np from nuscenes.eval.common.utils import quaternion_yaw, Quaternion from mmcv.parallel import DataContainer as DC import random @DATASETS.register_module() class CustomNuScenesDataset(NuScenesDataset): r"""NuScenes Dataset. This datset only add camera intrinsics and extrinsics to the results. """ def __init__(self, queue_length=4, bev_size=(200, 200), overlap_test=False, *args, **kwargs): super().__init__(*args, **kwargs) self.queue_length = queue_length self.overlap_test = overlap_test self.bev_size = bev_size def prepare_train_data(self, index): """ Training data preparation. Args: index (int): Index for accessing the target data. Returns: dict: Training data dict of the corresponding index. """ queue = [] index_list = list(range(index-self.queue_length, index)) random.shuffle(index_list) index_list = sorted(index_list[1:]) index_list.append(index) for i in index_list: i = max(0, i) input_dict = self.get_data_info(i) if input_dict is None: return None self.pre_pipeline(input_dict) example = self.pipeline(input_dict) if self.filter_empty_gt and \ (example is None or ~(example['gt_labels_3d']._data != -1).any()): return None queue.append(example) return self.union2one(queue) def union2one(self, queue): imgs_list = [each['img'].data for each in queue] metas_map = {} prev_scene_token = None prev_pos = None prev_angle = None for i, each in enumerate(queue): metas_map[i] = each['img_metas'].data if metas_map[i]['scene_token'] != prev_scene_token: metas_map[i]['prev_bev_exists'] = False prev_scene_token = metas_map[i]['scene_token'] prev_pos = copy.deepcopy(metas_map[i]['can_bus'][:3]) prev_angle = copy.deepcopy(metas_map[i]['can_bus'][-1]) metas_map[i]['can_bus'][:3] = 0 metas_map[i]['can_bus'][-1] = 0 else: metas_map[i]['prev_bev_exists'] = True tmp_pos = copy.deepcopy(metas_map[i]['can_bus'][:3]) tmp_angle = copy.deepcopy(metas_map[i]['can_bus'][-1]) metas_map[i]['can_bus'][:3] -= prev_pos metas_map[i]['can_bus'][-1] -= prev_angle prev_pos = copy.deepcopy(tmp_pos) prev_angle = copy.deepcopy(tmp_angle) queue[-1]['img'] = DC(torch.stack(imgs_list), cpu_only=False, stack=True) queue[-1]['img_metas'] = DC(metas_map, cpu_only=True) queue = queue[-1] return queue def get_data_info(self, index): """Get data info according to the given index. Args: index (int): Index of the sample data to get. Returns: dict: Data information that will be passed to the data \ preprocessing pipelines. It includes the following keys: - sample_idx (str): Sample index. - pts_filename (str): Filename of point clouds. - sweeps (list[dict]): Infos of sweeps. - timestamp (float): Sample timestamp. - img_filename (str, optional): Image filename. - lidar2img (list[np.ndarray], optional): Transformations \ from lidar to different cameras. - ann_info (dict): Annotation info. 
""" info = self.data_infos[index] # standard protocal modified from SECOND.Pytorch input_dict = dict( sample_idx=info['token'], pts_filename=info['lidar_path'], sweeps=info['sweeps'], ego2global_translation=info['ego2global_translation'], ego2global_rotation=info['ego2global_rotation'], prev_idx=info['prev'], next_idx=info['next'], scene_token=info['scene_token'], can_bus=info['can_bus'], frame_idx=info['frame_idx'], timestamp=info['timestamp'] / 1e6, ) if self.modality['use_camera']: image_paths = [] lidar2img_rts = [] lidar2cam_rts = [] cam_intrinsics = [] for cam_type, cam_info in info['cams'].items(): image_paths.append(cam_info['data_path']) # obtain lidar to image transformation matrix lidar2cam_r = np.linalg.inv(cam_info['sensor2lidar_rotation']) lidar2cam_t = cam_info[ 'sensor2lidar_translation'] @ lidar2cam_r.T lidar2cam_rt = np.eye(4) lidar2cam_rt[:3, :3] = lidar2cam_r.T lidar2cam_rt[3, :3] = -lidar2cam_t intrinsic = cam_info['cam_intrinsic'] viewpad = np.eye(4) viewpad[:intrinsic.shape[0], :intrinsic.shape[1]] = intrinsic lidar2img_rt = (viewpad @ lidar2cam_rt.T) lidar2img_rts.append(lidar2img_rt) cam_intrinsics.append(viewpad) lidar2cam_rts.append(lidar2cam_rt.T) input_dict.update( dict( img_filename=image_paths, lidar2img=lidar2img_rts, cam_intrinsic=cam_intrinsics, lidar2cam=lidar2cam_rts, )) if not self.test_mode: annos = self.get_ann_info(index) input_dict['ann_info'] = annos rotation = Quaternion(input_dict['ego2global_rotation']) translation = input_dict['ego2global_translation'] can_bus = input_dict['can_bus'] can_bus[:3] = translation can_bus[3:7] = rotation patch_angle = quaternion_yaw(rotation) / np.pi * 180 if patch_angle < 0: patch_angle += 360 can_bus[-2] = patch_angle / 180 * np.pi can_bus[-1] = patch_angle return input_dict def __getitem__(self, idx): """Get item from infos according to the given index. Returns: dict: Data dictionary of the corresponding index. """ if self.test_mode: return self.prepare_test_data(idx) while True: data = self.prepare_train_data(idx) if data is None: idx = self._rand_another(idx) continue return data ================================================ FILE: projects/occ_plugin/datasets/pipelines/__init__.py ================================================ from .transform_3d import ( PadMultiViewImage, NormalizeMultiviewImage, PhotoMetricDistortionMultiViewImage, CustomCollect3D, CustomOccCollect3D, RandomScaleImageMultiViewImage) from .formating import OccDefaultFormatBundle3D from .loading_occupancy import LoadOccupancy from .loading_bevdet import LoadAnnotationsBEVDepth, LoadMultiViewImageFromFiles_BEVDet from .loading_instance import LoadInstanceWithFlow __all__ = [ 'PadMultiViewImage', 'NormalizeMultiviewImage', 'CustomOccCollect3D', 'LoadAnnotationsBEVDepth', 'LoadMultiViewImageFromFiles_BEVDet', 'LoadOccupancy', 'PhotoMetricDistortionMultiViewImage', 'OccDefaultFormatBundle3D', 'CustomCollect3D', 'RandomScaleImageMultiViewImage', "LoadInstanceWithFlow", ] ================================================ FILE: projects/occ_plugin/datasets/pipelines/formating.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import numpy as np from mmcv.parallel import DataContainer as DC from mmdet.datasets.builder import PIPELINES from mmdet.datasets.pipelines import to_tensor from mmdet3d.datasets.pipelines import DefaultFormatBundle3D @PIPELINES.register_module() class OccDefaultFormatBundle3D(DefaultFormatBundle3D): """Default formatting bundle. 
It simplifies the pipeline of formatting common fields for voxels, including "proposals", "gt_bboxes", "gt_labels", "gt_masks" and "gt_semantic_seg". These fields are formatted as follows. - img: (1)transpose, (2)to tensor, (3)to DataContainer (stack=True) - proposals: (1)to tensor, (2)to DataContainer - gt_bboxes: (1)to tensor, (2)to DataContainer - gt_bboxes_ignore: (1)to tensor, (2)to DataContainer - gt_labels: (1)to tensor, (2)to DataContainer """ def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) def __call__(self, results): """Call function to transform and format common fields in results. Args: results (dict): Result dict contains the data to convert. Returns: dict: The result dict contains the data that is formatted with default bundle. """ # Format 3D data results = super(OccDefaultFormatBundle3D, self).__call__(results) if 'gt_occ' in results.keys(): results['gt_occ'] = DC(to_tensor(results['gt_occ']), stack=True) if 'gt_occ' in results.keys(): results['segmentation'] = DC(to_tensor(results['segmentation']), stack=True) if 'gt_occ' in results.keys(): results['instance'] = DC(to_tensor(results['instance']), stack=True) if 'gt_occ' in results.keys(): results['attribute_label'] = DC(to_tensor(results['attribute_label']), stack=True) if 'gt_occ' in results.keys(): results['flow'] = DC(to_tensor(results['flow']), stack=True) if 'gt_vel' in results.keys(): results['gt_vel'] = DC(to_tensor(results['gt_vel']), stack=False) return results ================================================ FILE: projects/occ_plugin/datasets/pipelines/loading_bevdet.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import mmcv import numpy as np from mmdet.datasets.builder import PIPELINES import os import torch from PIL import Image from pyquaternion import Quaternion from mmdet3d.core.bbox import LiDARInstance3DBoxes from numpy import random import pdb def mmlabNormalize(img, img_norm_cfg=None): from mmcv.image.photometric import imnormalize if img_norm_cfg is None: mean = np.array([123.675, 116.28, 103.53], dtype=np.float32) std = np.array([58.395, 57.12, 57.375], dtype=np.float32) to_rgb = True else: mean = np.array(img_norm_cfg['mean'], dtype=np.float32) std = np.array(img_norm_cfg['std'], dtype=np.float32) to_rgb = img_norm_cfg['to_rgb'] img = imnormalize(np.array(img), mean, std, to_rgb) img = torch.tensor(img).float().permute(2, 0, 1).contiguous() return img def depth_transform(cam_depth, resize, resize_dims, crop, flip, rotate): """Transform depth based on ida augmentation configuration. Args: cam_depth (np array): Nx3, 3: x,y,d. resize (float): Resize factor. resize_dims (list): Final dimension. crop (list): x1, y1, x2, y2 flip (bool): Whether to flip. rotate (float): Rotation value. 
Returns: np array: [h/down_ratio, w/down_ratio, d] """ H, W = resize_dims cam_depth[:, :2] = cam_depth[:, :2] * resize cam_depth[:, 0] -= crop[0] cam_depth[:, 1] -= crop[1] if flip: cam_depth[:, 0] = resize_dims[1] - cam_depth[:, 0] cam_depth[:, 0] -= W / 2.0 cam_depth[:, 1] -= H / 2.0 h = rotate / 180 * np.pi rot_matrix = [ [np.cos(h), np.sin(h)], [-np.sin(h), np.cos(h)], ] cam_depth[:, :2] = np.matmul(rot_matrix, cam_depth[:, :2].T).T cam_depth[:, 0] += W / 2.0 cam_depth[:, 1] += H / 2.0 depth_coords = cam_depth[:, :2].astype(np.int16) depth_map = np.zeros(resize_dims) valid_mask = ((depth_coords[:, 1] < resize_dims[0]) & (depth_coords[:, 0] < resize_dims[1]) & (depth_coords[:, 1] >= 0) & (depth_coords[:, 0] >= 0)) depth_map[depth_coords[valid_mask, 1], depth_coords[valid_mask, 0]] = cam_depth[valid_mask, 2] return torch.Tensor(depth_map) @PIPELINES.register_module() class LoadMultiViewImageFromFiles_BEVDet(object): """Load multi channel images from a list of separate channel files. Expects results['img_filename'] to be a list of filenames. Args: to_float32 (bool): Whether to convert the img to float32. Defaults to False. color_type (str): Color type of the file. Defaults to 'unchanged'. """ def __init__(self, data_config, is_train=False, using_ego=True, colorjitter=False, sequential=False, aligned=False, trans_only=True, img_norm_cfg=None, mmlabnorm=False, load_depth=False, depth_gt_path=None, data_root=None, test_mode=False, use_lyft=False): self.is_train = is_train self.data_config = data_config # using mean camera ego frame, rather than the lidar coordinates self.using_ego = using_ego self.normalize_img = mmlabNormalize self.img_norm_cfg = img_norm_cfg self.sequential = sequential self.aligned = aligned self.trans_only = trans_only self.load_depth = load_depth self.depth_gt_path = depth_gt_path self.data_root = data_root self.colorjitter = colorjitter self.pipeline_colorjitter = PhotoMetricDistortionMultiViewImage() self.test_mode = test_mode self.use_lyft = use_lyft def get_rot(self,h): return torch.Tensor([ [np.cos(h), np.sin(h)], [-np.sin(h), np.cos(h)], ]) def img_transform(self, img, post_rot, post_tran, resize, resize_dims, crop, flip, rotate): # adjust image img = self.img_transform_core(img, resize_dims, crop, flip, rotate) # post-homography transformation post_rot *= resize post_tran -= torch.Tensor(crop[:2]) if flip: A = torch.Tensor([[-1, 0], [0, 1]]) b = torch.Tensor([crop[2] - crop[0], 0]) post_rot = A.matmul(post_rot) post_tran = A.matmul(post_tran) + b A = self.get_rot(rotate / 180 * np.pi) b = torch.Tensor([crop[2] - crop[0], crop[3] - crop[1]]) / 2 b = A.matmul(-b) + b post_rot = A.matmul(post_rot) post_tran = A.matmul(post_tran) + b return img, post_rot, post_tran def img_transform_core(self, img, resize_dims, crop, flip, rotate): # adjust image img = img.resize(resize_dims) img = img.crop(crop) if flip: img = img.transpose(method=Image.FLIP_LEFT_RIGHT) img = img.rotate(rotate) return img def choose_cams(self): if self.is_train and self.data_config['Ncams'] < len(self.data_config['cams']): cam_names = np.random.choice(self.data_config['cams'], self.data_config['Ncams'], replace=False) else: cam_names = self.data_config['cams'] return cam_names def sample_augmentation(self, H , W, flip=None, scale=None): fH, fW = self.data_config['input_size'] if self.is_train: resize = float(fW)/float(W) resize += np.random.uniform(*self.data_config['resize']) resize_dims = (int(W * resize), int(H * resize)) newW, newH = resize_dims crop_h = int((1 - 
np.random.uniform(*self.data_config['crop_h'])) * newH) - fH crop_w = int(np.random.uniform(0, max(0, newW - fW))) crop = (crop_w, crop_h, crop_w + fW, crop_h + fH) # We do not use flip here to keep right forecasting flip = None rotate = 0 else: resize = float(fW)/float(W) resize += self.data_config.get('resize_test', 0.0) if scale is not None: resize = scale resize_dims = (int(W * resize), int(H * resize)) newW, newH = resize_dims crop_h = int((1 - np.mean(self.data_config['crop_h'])) * newH) - fH crop_w = int(max(0, newW - fW) / 2) crop = (crop_w, crop_h, crop_w + fW, crop_h + fH) flip = None rotate = 0 return resize, resize_dims, crop, flip, rotate def get_sensor2ego_transformation(self, cam_info, key_info, cam_name): w, x, y, z = cam_info['cams'][cam_name]['sensor2ego_rotation'] # sweep sensor to sweep ego sweepsensor2sweepego_rot = torch.Tensor( Quaternion(w, x, y, z).rotation_matrix) sweepsensor2sweepego_tran = torch.Tensor( cam_info['cams'][cam_name]['sensor2ego_translation']) sweepsensor2sweepego = sweepsensor2sweepego_rot.new_zeros( (4, 4)) sweepsensor2sweepego[3, 3] = 1 sweepsensor2sweepego[:3, :3] = sweepsensor2sweepego_rot sweepsensor2sweepego[:3, -1] = sweepsensor2sweepego_tran # sweep ego to global w, x, y, z = cam_info['cams'][cam_name]['ego2global_rotation'] sweepego2global_rot = torch.Tensor( Quaternion(w, x, y, z).rotation_matrix) sweepego2global_tran = torch.Tensor( cam_info['cams'][cam_name]['ego2global_translation']) sweepego2global = sweepego2global_rot.new_zeros((4, 4)) sweepego2global[3, 3] = 1 sweepego2global[:3, :3] = sweepego2global_rot sweepego2global[:3, -1] = sweepego2global_tran # global sensor to cur ego w, x, y, z = key_info['cams'][cam_name]['ego2global_rotation'] keyego2global_rot = torch.Tensor( Quaternion(w, x, y, z).rotation_matrix) keyego2global_tran = torch.Tensor( key_info['cams'][cam_name]['ego2global_translation']) keyego2global = keyego2global_rot.new_zeros((4, 4)) keyego2global[3, 3] = 1 keyego2global[:3, :3] = keyego2global_rot keyego2global[:3, -1] = keyego2global_tran global2keyego = keyego2global.inverse() # cur ego to sensor w, x, y, z = key_info['cams'][cam_name]['sensor2ego_rotation'] keysensor2keyego_rot = torch.Tensor( Quaternion(w, x, y, z).rotation_matrix) keysensor2keyego_tran = torch.Tensor( key_info['cams'][cam_name]['sensor2ego_translation']) keysensor2keyego = keysensor2keyego_rot.new_zeros((4, 4)) keysensor2keyego[3, 3] = 1 keysensor2keyego[:3, :3] = keysensor2keyego_rot keysensor2keyego[:3, -1] = keysensor2keyego_tran keyego2keysensor = keysensor2keyego.inverse() keysensor2sweepsensor = ( keyego2keysensor @ global2keyego @ sweepego2global @ sweepsensor2sweepego).inverse() sweepsensor2keyego = global2keyego @ sweepego2global @ \ sweepsensor2sweepego return sweepsensor2keyego, keysensor2sweepsensor def get_sensor2lidar_transformation(self, cam_info, cam_name, sample_info): w, x, y, z = cam_info['cams'][cam_name]['sensor2ego_rotation'] # sweep sensor to sweep ego sweepsensor2sweepego_rot = torch.Tensor( Quaternion(w, x, y, z).rotation_matrix) sweepsensor2sweepego_tran = torch.Tensor( cam_info['cams'][cam_name]['sensor2ego_translation']) sweepsensor2sweepego = sweepsensor2sweepego_rot.new_zeros( (4, 4)) sweepsensor2sweepego[3, 3] = 1 sweepsensor2sweepego[:3, :3] = sweepsensor2sweepego_rot sweepsensor2sweepego[:3, -1] = sweepsensor2sweepego_tran # sweep ego to global w, x, y, z = cam_info['cams'][cam_name]['ego2global_rotation'] sweepego2global_rot = torch.Tensor( Quaternion(w, x, y, z).rotation_matrix) sweepego2global_tran = 
torch.Tensor( cam_info['cams'][cam_name]['ego2global_translation']) sweepego2global = sweepego2global_rot.new_zeros((4, 4)) sweepego2global[3, 3] = 1 sweepego2global[:3, :3] = sweepego2global_rot sweepego2global[:3, -1] = sweepego2global_tran # global to lidar ego w, x, y, z = sample_info['ego2global_rotation'] lidarego2global_rot = torch.Tensor( Quaternion(w, x, y, z).rotation_matrix) lidarego2global_tran = torch.Tensor(sample_info['ego2global_translation']) lidarego2global = lidarego2global_rot.new_zeros((4, 4)) lidarego2global[3, 3] = 1 lidarego2global[:3, :3] = lidarego2global_rot lidarego2global[:3, -1] = lidarego2global_tran global2lidarego = lidarego2global.inverse() # lidar ego to lidar w, x, y, z = sample_info['lidar2ego_rotation'] lidar2ego_rot = torch.Tensor( Quaternion(w, x, y, z).rotation_matrix) lidar2ego_tran = torch.Tensor(sample_info['lidar2ego_translation']) lidar2ego = lidar2ego_rot.new_zeros((4, 4)) lidar2ego[3, 3] = 1 lidar2ego[:3, :3] = lidar2ego_rot lidar2ego[:3, -1] = lidar2ego_tran ego2lidar = lidar2ego.inverse() # camera to lidar sweepsensor2lidar = ego2lidar @ global2lidarego @ sweepego2global @ sweepsensor2sweepego return sweepsensor2lidar def get_seq_inputs(self, results, flip=None, scale=None): cam_names = self.choose_cams() results['cam_names'] = cam_names if self.use_lyft: filename = results['input_dict'][0]['img_filename'][0] else: cam_data = results['input_dict'][0]['curr']['cams'][cam_names[0]] filename = cam_data['data_path'] filename = os.path.join(self.data_root, filename.split('/')[-3], filename.split('/')[-2], filename.split('/')[-1]) img = Image.open(filename) img_augs = self.sample_augmentation(H=img.height, W=img.width, flip=flip, scale=scale) resize, resize_dims, crop, flip, rotate = img_augs sequence_length = results['sequence_length'] imgs_seq = [] rots_seq = [] trans_seq = [] intrins_seq = [] post_rots_seq = [] post_trans_seq = [] gt_depths_seq = list() canvas_seq = [] sensor2sensors_seq = [] for counter in range(sequence_length): input_dict_curr = results['input_dict'][counter] imgs = [] rots = [] trans = [] intrins = [] post_rots = [] post_trans = [] gt_depths = list() canvas = [] sensor2sensors = [] for cam_idx, cam_name in enumerate(cam_names): if self.use_lyft: cam_data = None filename = input_dict_curr['img_filename'][cam_idx] else: cam_data = input_dict_curr['curr']['cams'][cam_name] filename = cam_data['data_path'] filename = os.path.join(self.data_root, filename.split('/')[-3], filename.split('/')[-2], filename.split('/')[-1]) img = Image.open(filename) post_rot = torch.eye(2) post_tran = torch.zeros(2) if self.use_lyft: intrin = torch.Tensor(input_dict_curr['cam_intrinsics'][cam_idx]) else: intrin = torch.Tensor(cam_data['cam_intrinsic']) # from camera to lidar sensor2lidar = torch.tensor(input_dict_curr['lidar2cam_dic'][cam_name]).inverse().float() rot = sensor2lidar[:3, :3] tran = sensor2lidar[:3, 3] img, post_rot2, post_tran2 = \ self.img_transform(img, post_rot, post_tran, resize=resize, resize_dims=resize_dims, crop=crop, flip=flip, rotate=rotate) # for convenience, make augmentation matrices 3x3 post_tran = torch.zeros(3) post_rot = torch.eye(3) post_tran[:2] = post_tran2 post_rot[:2, :2] = post_rot2 # TODO: open source depth enhancement gt_depths.append(torch.zeros(1)) canvas.append(np.array(img)) if self.colorjitter and self.is_train: img = self.pipeline_colorjitter(img) imgs.append(self.normalize_img(img, img_norm_cfg=self.img_norm_cfg)) intrins.append(intrin) rots.append(rot) trans.append(tran) post_rots.append(post_rot) 
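# --------------------------------------------------------------------------
# Illustrative sketch (not part of the original pipeline): the post_rot /
# post_tran pair built in this loop records the image-space augmentation so
# that a pixel in the raw image maps into the resized-and-cropped network
# input. Since flip and rotation are disabled in this codebase, the transform
# reduces to a scale plus a shift; the numbers below are hypothetical.
def _toy_post_transform(resize, crop):
    """Return the 2x2 scale and 2D shift used by img_transform() above (illustration only)."""
    import torch
    post_rot = torch.eye(2) * resize                            # post_rot *= resize
    post_tran = -torch.tensor(crop[:2], dtype=torch.float32)    # post_tran -= crop[:2]
    return post_rot, post_tran
# Example: post_rot, post_tran = _toy_post_transform(0.44, (0, 140, 704, 396))
# post_rot @ torch.tensor([800., 450.]) + post_tran  -> tensor([352., 58.])
# --------------------------------------------------------------------------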
post_trans.append(post_tran) sensor2sensors.append(sensor2lidar) imgs = torch.stack(imgs) rots = torch.stack(rots) trans = torch.stack(trans) intrins = torch.stack(intrins) post_rots = torch.stack(post_rots) post_trans = torch.stack(post_trans) gt_depths = torch.stack(gt_depths) sensor2sensors = torch.stack(sensor2sensors) imgs_seq.append(imgs) rots_seq.append(rots) trans_seq.append(trans) intrins_seq.append(intrins) post_rots_seq.append(post_rots) post_trans_seq.append(post_trans) gt_depths_seq.append(gt_depths) canvas_seq.append(canvas) sensor2sensors_seq.append(sensor2sensors) imgs_seq = torch.stack(imgs_seq) rots_seq = torch.stack(rots_seq) trans_seq = torch.stack(trans_seq) intrins_seq = torch.stack(intrins_seq) post_rots_seq = torch.stack(post_rots_seq) post_trans_seq = torch.stack(post_trans_seq) gt_depths_seq = torch.stack(gt_depths_seq) sensor2sensors_seq = torch.stack(sensor2sensors_seq) results['canvas'] = canvas return imgs_seq, rots_seq, trans_seq, intrins_seq, post_rots_seq, post_trans_seq, gt_depths_seq, sensor2sensors_seq def __call__(self, results): results['img_inputs_seq'] = self.get_seq_inputs(results) return results def bev_transform(rotate_angle, scale_ratio, flip_dx, flip_dy): rotate_angle = torch.tensor(rotate_angle / 180 * np.pi) rot_sin = torch.sin(rotate_angle) rot_cos = torch.cos(rotate_angle) rot_mat = torch.Tensor([[rot_cos, -rot_sin, 0], [rot_sin, rot_cos, 0], [0, 0, 1]]) scale_mat = torch.Tensor([[scale_ratio, 0, 0], [0, scale_ratio, 0], [0, 0, scale_ratio]]) flip_mat = torch.Tensor([[1, 0, 0], [0, 1, 0], [0, 0, 1]]) if flip_dx: flip_mat = flip_mat @ torch.Tensor([[-1, 0, 0], [0, 1, 0], [0, 0, 1]]) if flip_dy: flip_mat = flip_mat @ torch.Tensor([[1, 0, 0], [0, -1, 0], [0, 0, 1]]) rot_mat = flip_mat @ (scale_mat @ rot_mat) return rot_mat @PIPELINES.register_module() class LoadAnnotationsBEVDepth(): def __init__(self, bda_aug_conf, classes, is_train=True, input_modality=None): self.bda_aug_conf = bda_aug_conf self.is_train = is_train self.classes = classes if input_modality == None: input_modality = dict( use_lidar=True, use_camera=True, use_radar=False, use_map=False, use_external=False) self.input_modality = input_modality def sample_bda_augmentation(self): """Generate bda augmentation values based on bda_config.""" if self.is_train: rotate_bda = np.random.uniform(*self.bda_aug_conf['rot_lim']) scale_bda = np.random.uniform(*self.bda_aug_conf['scale_lim']) flip_dx = np.random.uniform() < self.bda_aug_conf['flip_dx_ratio'] flip_dy = np.random.uniform() < self.bda_aug_conf['flip_dy_ratio'] else: rotate_bda = 0 scale_bda = 1.0 flip_dx = False flip_dy = False return rotate_bda, scale_bda, flip_dx, flip_dy def __call__(self, results): rotate_bda, scale_bda, flip_dx, flip_dy = self.sample_bda_augmentation() bda_mat = torch.zeros(4, 4) bda_mat[3, 3] = 1 bda_rot = bev_transform(rotate_bda, scale_bda, flip_dx, flip_dy) bda_mat[:3, :3] = bda_rot results['bda_mat'] = bda_rot if 'points' in results.keys(): results['points'].rotate(bda_rot) if self.input_modality['use_camera']: assert len(results['img_inputs']) == 8 imgs, rots, trans, intrins, post_rots, post_trans, gt_depths, sensor2sensors = results['img_inputs'] results['img_inputs'] = (imgs, rots, trans, intrins, post_rots, post_trans, bda_rot, imgs.shape[-2:], gt_depths, sensor2sensors) return results class PhotoMetricDistortionMultiViewImage(object): """Apply photometric distortion to image sequentially, every transformation is applied with a probability of 0.5. 
The position of random contrast is in second or second to last. 1. random brightness 2. random contrast (mode 0) 3. convert color from BGR to HSV 4. random saturation 5. random hue 6. convert color from HSV to BGR 7. random contrast (mode 1) 8. randomly swap channels Args: brightness_delta (int): delta of brightness. contrast_range (tuple): range of contrast. saturation_range (tuple): range of saturation. hue_delta (int): delta of hue. """ def __init__(self, brightness_delta=32, contrast_range=(0.5, 1.5), saturation_range=(0.5, 1.5), hue_delta=18): self.brightness_delta = brightness_delta self.contrast_lower, self.contrast_upper = contrast_range self.saturation_lower, self.saturation_upper = saturation_range self.hue_delta = hue_delta def __call__(self, img): """Call function to perform photometric distortion on images. Args: results (dict): Result dict from loading pipeline. Returns: dict: Result dict with images distorted. """ # convert PIL Image to Ndarray float32 img = np.array(img, dtype=np.float32) assert img.dtype == np.float32, \ 'PhotoMetricDistortion needs the input image of dtype np.float32,'\ ' please set "to_float32=True" in "LoadImageFromFile" pipeline' # random brightness if random.randint(2): delta = random.uniform(-self.brightness_delta, self.brightness_delta) img += delta # mode == 0 --> do random contrast first # mode == 1 --> do random contrast last mode = random.randint(2) if mode == 1: if random.randint(2): alpha = random.uniform(self.contrast_lower, self.contrast_upper) img *= alpha # convert color from BGR to HSV img = mmcv.bgr2hsv(img) # random saturation if random.randint(2): img[..., 1] *= random.uniform(self.saturation_lower, self.saturation_upper) # random hue if random.randint(2): img[..., 0] += random.uniform(-self.hue_delta, self.hue_delta) img[..., 0][img[..., 0] > 360] -= 360 img[..., 0][img[..., 0] < 0] += 360 # convert color from HSV to BGR img = mmcv.hsv2bgr(img) # random contrast if mode == 0: if random.randint(2): alpha = random.uniform(self.contrast_lower, self.contrast_upper) img *= alpha # randomly swap channels if random.randint(2): img = img[..., random.permutation(3)] img = Image.fromarray(img.astype(np.uint8)) return img ================================================ FILE: projects/occ_plugin/datasets/pipelines/loading_instance.py ================================================ # Developed by Junyi Ma based on the codebase of PowerBEV # Cam4DOcc: Benchmark for Camera-Only 4D Occupancy Forecasting in Autonomous Driving Applications # https://github.com/haomo-ai/Cam4DOcc import numpy as np from mmdet.datasets.builder import PIPELINES import os import torch from pyquaternion import Quaternion from nuscenes.utils.data_classes import Box import time @PIPELINES.register_module() class LoadInstanceWithFlow(object): def __init__(self, cam4docc_dataset_path, grid_size=[512, 512, 40], pc_range=[-51.2, -51.2, -5.0, 51.2, 51.2, 3.0], background=0, use_flow=True, use_separate_classes=False, use_lyft=False): ''' Loading sequential occupancy labels and instance flows for training and testing cam4docc_dataset_path: data path of Cam4DOcc dataset, including 'segmentation', 'instance', and 'flow' grid_size: number of grids along H W L, default: [512, 512, 40] pc_range: predefined ranges along H W L, default: [-51.2, -51.2, -5.0, 51.2, 51.2, 3.0] background: background pixel value for segmentation/instance/flow maps, default: 0 use_flow: whether use flow for training schemes, default: True ''' self.cam4docc_dataset_path = cam4docc_dataset_path self.pc_range = 
pc_range self.resolution = [(self.pc_range[3+i] - self.pc_range[i])/grid_size[i] for i in range(len(self.pc_range[:3]))] self.start_position = [self.pc_range[i] + self.resolution[i] / 2.0 for i in range(len(self.pc_range[:3]))] self.dimension = grid_size self.pc_range = np.array(self.pc_range) self.resolution = np.array(self.resolution) self.start_position = np.array(self.start_position) self.dimension = np.array(self.dimension) self.background = background self.use_flow = use_flow self.use_separate_classes = use_separate_classes self.use_lyft = use_lyft def get_poly_region(self, instance_annotation, present_egopose, present_ego2lidar): """ Obtain the bounding box polygon of the instance """ present_ego_translation, present_ego_rotation = present_egopose present_ego2lidar_translation, present_ego2lidar_rotation = present_ego2lidar box = Box( instance_annotation['translation'], instance_annotation['size'], Quaternion(instance_annotation['rotation']) ) box.translate(present_ego_translation) box.rotate(present_ego_rotation) box.translate(present_ego2lidar_translation) box.rotate(present_ego2lidar_rotation) pts=box.corners().T X_min_box = pts.min(axis=0)[0] X_max_box = pts.max(axis=0)[0] Y_min_box = pts.min(axis=0)[1] Y_max_box = pts.max(axis=0)[1] Z_min_box = pts.min(axis=0)[2] Z_max_box = pts.max(axis=0)[2] if self.pc_range[0] <= X_min_box and X_max_box <= self.pc_range[3] \ and self.pc_range[1] <= Y_min_box and Y_max_box <= self.pc_range[4] \ and self.pc_range[2] <= Z_min_box and Z_max_box <= self.pc_range[5]: pts = np.round((pts - self.start_position[:3] + self.resolution[:3] / 2.0) / self.resolution[:3]).astype(np.int32) return pts else: return None def fill_occupancy(self, occ_instance, occ_segmentation, occ_attribute_label, instance_fill_info): x_grid = torch.linspace(0, self.dimension[0]-1, self.dimension[0], dtype=torch.float) x_grid = x_grid.view(self.dimension[0], 1, 1).expand(self.dimension[0], self.dimension[1], self.dimension[2]) y_grid = torch.linspace(0, self.dimension[1]-1, self.dimension[1], dtype=torch.float) y_grid = y_grid.view(1, self.dimension[1], 1).expand(self.dimension[0], self.dimension[1], self.dimension[2]) z_grid = torch.linspace(0, self.dimension[2]-1, self.dimension[2], dtype=torch.float) z_grid = z_grid.view(1, 1, self.dimension[2]).expand(self.dimension[0], self.dimension[1], self.dimension[2]) mesh_grid_3d = torch.stack((x_grid, y_grid, z_grid), -1) mesh_grid_3d = mesh_grid_3d.view(-1, 3) occ_instance = torch.from_numpy(occ_instance).view(-1, 1) occ_segmentation = torch.from_numpy(occ_segmentation).view(-1, 1) occ_attribute_label = torch.from_numpy(occ_attribute_label).view(-1, 1) for instance_info in instance_fill_info: poly_region_pts = instance_info['poly_region'] semantic_id = instance_info['semantic_id'] instance_id = instance_info['instance_id'] attribute_label=instance_info['attribute_label'] X_min_box = poly_region_pts.min(axis=0)[0] X_max_box = poly_region_pts.max(axis=0)[0] Y_min_box = poly_region_pts.min(axis=0)[1] Y_max_box = poly_region_pts.max(axis=0)[1] Z_min_box = poly_region_pts.min(axis=0)[2] Z_max_box = poly_region_pts.max(axis=0)[2] mask_cur_instance = (mesh_grid_3d[:,0] >= X_min_box) & (X_max_box >= mesh_grid_3d[:,0]) \ & (mesh_grid_3d[:,1] >= Y_min_box) & (Y_max_box >= mesh_grid_3d[:,1]) \ & (mesh_grid_3d[:,2] >= Z_min_box) & (Z_max_box >= mesh_grid_3d[:,2]) occ_instance[mask_cur_instance] = instance_id occ_segmentation[mask_cur_instance] = semantic_id occ_attribute_label[mask_cur_instance] = attribute_label occ_instance = 
occ_instance.view(self.dimension[0], self.dimension[1], self.dimension[2]).long() occ_segmentation = occ_segmentation.view(self.dimension[0], self.dimension[1], self.dimension[2]).long() occ_attribute_label = occ_attribute_label.view(self.dimension[0], self.dimension[1], self.dimension[2]).long() return occ_instance, occ_segmentation, occ_attribute_label def get_label(self, input_seq_data): """ Generate labels for semantic segmentation, instance segmentation, z position, attribute from the raw data of nuScenes """ timestep = self.counter # Background is ID 0 segmentation = np.ones((self.dimension[0], self.dimension[1], self.dimension[2])) * self.background instance = np.ones((self.dimension[0], self.dimension[1], self.dimension[2])) * self.background attribute_label = np.ones((self.dimension[0], self.dimension[1], self.dimension[2])) * self.background instance_dict = input_seq_data['instance_dict'] egopose_list = input_seq_data['egopose_list'] ego2lidar_list = input_seq_data['ego2lidar_list'] time_receptive_field = input_seq_data['time_receptive_field'] instance_fill_info = [] for instance_token, instance_annotation in instance_dict.items(): if timestep not in instance_annotation['timestep']: continue pointer = instance_annotation['timestep'].index(timestep) annotation = { 'translation': instance_annotation['translation'][pointer], 'rotation': instance_annotation['rotation'][pointer], 'size': instance_annotation['size'], } poly_region = self.get_poly_region(annotation, egopose_list[time_receptive_field - 1], ego2lidar_list[time_receptive_field - 1]) if isinstance(poly_region, np.ndarray): if self.counter >= time_receptive_field and instance_token not in self.visible_instance_set: continue self.visible_instance_set.add(instance_token) prepare_for_fill = dict( poly_region=poly_region, instance_id=instance_annotation['instance_id'], attribute_label=instance_annotation['attribute_label'][pointer], semantic_id=instance_annotation['semantic_id'], ) instance_fill_info.append(prepare_for_fill) instance, segmentation, attribute_label = self.fill_occupancy(instance, segmentation, attribute_label, instance_fill_info) segmentation = segmentation.unsqueeze(0) instance = instance.unsqueeze(0) attribute_label = attribute_label.unsqueeze(0).unsqueeze(0) return segmentation, instance, attribute_label @staticmethod def generate_flow(flow, occ_instance_seq, instance, instance_id): """ Generate ground truth for the flow of each instance based on instance segmentation """ seg_len, wx, wy, wz = occ_instance_seq.shape ratio = 4 occ_instance_seq = occ_instance_seq.reshape(seg_len, wx//ratio, ratio, wy//ratio, ratio, wz//ratio, ratio).permute(0,1,3,5,2,4,6).reshape(seg_len, wx//ratio, wy//ratio, wz//ratio, ratio**3) empty_mask = occ_instance_seq.sum(-1) == 0 occ_instance_seq = occ_instance_seq.to(torch.int64) occ_space = occ_instance_seq[~empty_mask] occ_space[occ_space==0] = -torch.arange(len(occ_space[occ_space==0])).to(occ_space.device) - 1 occ_instance_seq[~empty_mask] = occ_space occ_instance_seq = torch.mode(occ_instance_seq, dim=-1)[0] occ_instance_seq[occ_instance_seq<0] = 0 occ_instance_seq = occ_instance_seq.long() _, wx, wy, wz = occ_instance_seq.shape x, y, z = torch.meshgrid(torch.arange(wx, dtype=torch.float), torch.arange(wy, dtype=torch.float), torch.arange(wz, dtype=torch.float)) grid = torch.stack((x, y, z), dim=0) # Set the first frame init_pointer = instance['timestep'][0] instance_mask = (occ_instance_seq[init_pointer] == instance_id) flow[init_pointer, 0, instance_mask] = grid[0, 
instance_mask].mean(dim=0, keepdim=True).round() - grid[0, instance_mask] flow[init_pointer, 1, instance_mask] = grid[1, instance_mask].mean(dim=0, keepdim=True).round() - grid[1, instance_mask] flow[init_pointer, 2, instance_mask] = grid[2, instance_mask].mean(dim=0, keepdim=True).round() - grid[2, instance_mask] for i, timestep in enumerate(instance['timestep']): if i == 0: continue instance_mask = (occ_instance_seq[timestep] == instance_id) prev_instance_mask = (occ_instance_seq[timestep-1] == instance_id) if instance_mask.sum() == 0 or prev_instance_mask.sum() == 0: continue flow[timestep, 0, instance_mask] = grid[0, prev_instance_mask].mean(dim=0, keepdim=True).round() - grid[0, instance_mask] flow[timestep, 1, instance_mask] = grid[1, prev_instance_mask].mean(dim=0, keepdim=True).round() - grid[1, instance_mask] flow[timestep, 2, instance_mask] = grid[2, prev_instance_mask].mean(dim=0, keepdim=True).round() - grid[2, instance_mask] return flow def get_flow_label(self, input_seq_data, ignore_index=255): """ Generate the global map of the flow ground truth """ occ_instance = input_seq_data['instance'] instance_dict = input_seq_data['instance_dict'] instance_map = input_seq_data['instance_map'] seq_len, wx, wy, wz = occ_instance.shape ratio = 4 flow = ignore_index * torch.ones(seq_len, 3, wx//ratio, wy//ratio, wz//ratio) # ignore flow generation for faster pipelines if not self.use_flow: return flow for token, instance in instance_dict.items(): flow = self.generate_flow(flow, occ_instance, instance, instance_map[token]) return flow.float() # set ignore index to 0 for vis @staticmethod def convert_instance_mask_to_center_and_offset_label(input_seq_data, ignore_index=255, sigma=3): occ_instance = input_seq_data['instance'] num_instances=len(input_seq_data['instance_map']) seq_len, wx, wy, wz = occ_instance.shape center_label = torch.zeros(seq_len, 1, wx, wy, wz) offset_label = ignore_index * torch.ones(seq_len, 3, wx, wy, wz) # x is vertical displacement, y is horizontal displacement x, y, z = torch.meshgrid(torch.arange(wx, dtype=torch.float), torch.arange(wy, dtype=torch.float), torch.arange(wz, dtype=torch.float)) # Ignore id 0 which is the background for instance_id in range(1, num_instances+1): for t in range(seq_len): instance_mask = (occ_instance[t] == instance_id) xc = x[instance_mask].mean().round().long() yc = y[instance_mask].mean().round().long() zc = z[instance_mask].mean().round().long() off_x = xc - x off_y = yc - y off_z = zc - z g = torch.exp(-(off_x ** 2 + off_y ** 2 + off_z ** 2) / sigma ** 2) center_label[t, 0] = torch.maximum(center_label[t, 0], g) offset_label[t, 0, instance_mask] = off_x[instance_mask] offset_label[t, 1, instance_mask] = off_y[instance_mask] offset_label[t, 2, instance_mask] = off_z[instance_mask] return center_label, offset_label def __call__(self, results): assert 'segmentation' not in results.keys() assert 'instance' not in results.keys() assert 'attribute_label' not in results.keys() time_receptive_field = results['time_receptive_field'] prefix = "MMO" if self.use_separate_classes else "GMO" if self.use_lyft: prefix = prefix + "_lyft" seg_label_dir = os.path.join(self.cam4docc_dataset_path, prefix, "segmentation") if not os.path.exists(seg_label_dir): os.mkdir(seg_label_dir) seg_label_path = os.path.join(seg_label_dir, \ results['input_dict'][time_receptive_field-1]['scene_token']+"_"+results['input_dict'][time_receptive_field-1]['lidar_token']) instance_label_dir = os.path.join(self.cam4docc_dataset_path, prefix, "instance") if not 
os.path.exists(instance_label_dir): os.mkdir(instance_label_dir) instance_label_path = os.path.join(instance_label_dir, \ results['input_dict'][time_receptive_field-1]['scene_token']+"_"+results['input_dict'][time_receptive_field-1]['lidar_token']) flow_label_dir = os.path.join(self.cam4docc_dataset_path, prefix, "flow") if not os.path.exists(flow_label_dir): os.mkdir(flow_label_dir) flow_label_path = os.path.join(flow_label_dir, \ results['input_dict'][time_receptive_field-1]['scene_token']+"_"+results['input_dict'][time_receptive_field-1]['lidar_token']) segmentation_list = [] if os.path.exists(seg_label_path+".npz"): gt_segmentation_arr = np.load(seg_label_path+".npz",allow_pickle=True)['arr_0'] for j in range(len(gt_segmentation_arr)): segmentation = np.zeros((self.dimension[0], self.dimension[1], self.dimension[2])) * self.background gt_segmentation = gt_segmentation_arr[j] gt_segmentation = torch.from_numpy(gt_segmentation) # for i in range(gt_segmentation.shape[0]): # cur_ind = gt_segmentation[i, :3].long() # cur_label = gt_segmentation[i, -1] # segmentation[cur_ind[0],cur_ind[1],cur_ind[2]] = cur_label segmentation[gt_segmentation[:, 0].long(), gt_segmentation[:, 1].long(), gt_segmentation[:, 2].long()] = gt_segmentation[:, -1] segmentation = torch.from_numpy(segmentation).unsqueeze(0) segmentation_list.append(segmentation) instance_list = [] if os.path.exists(instance_label_path+".npz"): gt_instance_arr = np.load(instance_label_path+".npz",allow_pickle=True)['arr_0'] for j in range(len(gt_instance_arr)): instance = np.ones((self.dimension[0], self.dimension[1], self.dimension[2])) * self.background gt_instance = gt_instance_arr[j] gt_instance = torch.from_numpy(gt_instance) # for i in range(gt_instance.shape[0]): # cur_ind = gt_instance[i, :3].long() # cur_label = gt_instance[i, -1] # instance[cur_ind[0],cur_ind[1],cur_ind[2]] = cur_label instance[gt_instance[:, 0].long(), gt_instance[:, 1].long(), gt_instance[:, 2].long()] = gt_instance[:, -1] instance = torch.from_numpy(instance).unsqueeze(0) instance_list.append(instance) flow_list = [] if os.path.exists(flow_label_path+".npz"): gt_flow_arr = np.load(flow_label_path+".npz",allow_pickle=True)['arr_0'] for j in range(len(gt_flow_arr)): flow = np.ones((3, self.dimension[0]//4, self.dimension[1]//4, self.dimension[2]//4)) * 255 gt_flow = gt_flow_arr[j] gt_flow = torch.from_numpy(gt_flow) # for i in range(gt_flow.shape[0]): # cur_ind = gt_flow[i, :3].long() # cur_label = gt_flow[i, 3:] # flow[0, cur_ind[0],cur_ind[1],cur_ind[2]] = cur_label[0] # flow[1, cur_ind[0],cur_ind[1],cur_ind[2]] = cur_label[1] # flow[2, cur_ind[0],cur_ind[1],cur_ind[2]] = cur_label[2] flow[:, gt_flow[:, 0].long(), gt_flow[:, 1].long(), gt_flow[:, 2].long()] = gt_flow[:, 3:].permute(1, 0) flow = torch.from_numpy(flow).unsqueeze(0) flow_list.append(flow) if os.path.exists(seg_label_path+".npz") and os.path.exists(instance_label_path+".npz") and os.path.exists(flow_label_path+".npz"): results['segmentation'] = torch.cat(segmentation_list, dim=0) results['instance'] = torch.cat(instance_list, dim=0) results['attribute_label'] = torch.from_numpy(np.zeros((self.dimension[0], self.dimension[1], self.dimension[2]))).unsqueeze(0) results['flow'] = torch.cat(flow_list, dim=0).float() for key, value in results.items(): if key in ['sample_token', 'centerness', 'offset', 'flow', 'time_receptive_field', "indices", \ 'segmentation','instance','attribute_label','sequence_length', 'instance_dict', 'instance_map', 'input_dict', 'egopose_list','ego2lidar_list','scene_token']: 
continue results[key] = torch.cat(value, dim=0) return results else: results['segmentation'] = [] results['instance'] = [] results['attribute_label'] = [] segmentation_saved_list = [] instance_saved_list = [] sequence_length = results['sequence_length'] self.visible_instance_set = set() for self.counter in range(sequence_length): segmentation, instance, attribute_label = self.get_label(results) results['segmentation'].append(segmentation) results['instance'].append(instance) results['attribute_label'].append(attribute_label) x_grid = torch.linspace(0, self.dimension[0]-1, self.dimension[0], dtype=torch.long) x_grid = x_grid.view(self.dimension[0], 1, 1).expand(self.dimension[0], self.dimension[1], self.dimension[2]) y_grid = torch.linspace(0, self.dimension[1]-1, self.dimension[1], dtype=torch.long) y_grid = y_grid.view(1, self.dimension[1], 1).expand(self.dimension[0], self.dimension[1], self.dimension[2]) z_grid = torch.linspace(0, self.dimension[2]-1, self.dimension[2], dtype=torch.long) z_grid = z_grid.view(1, 1, self.dimension[2]).expand(self.dimension[0], self.dimension[1], self.dimension[2]) segmentation_for_save = torch.stack((x_grid, y_grid, z_grid), -1) segmentation_for_save = segmentation_for_save.view(-1, 3) segmentation_label = segmentation.squeeze(0).view(-1,1) segmentation_for_save = torch.cat((segmentation_for_save, segmentation_label), dim=-1) kept = segmentation_for_save[:,-1]!=0 segmentation_for_save= segmentation_for_save[kept] segmentation_saved_list.append(segmentation_for_save) x_grid = torch.linspace(0, self.dimension[0]-1, self.dimension[0], dtype=torch.long) x_grid = x_grid.view(self.dimension[0], 1, 1).expand(self.dimension[0], self.dimension[1], self.dimension[2]) y_grid = torch.linspace(0, self.dimension[1]-1, self.dimension[1], dtype=torch.long) y_grid = y_grid.view(1, self.dimension[1], 1).expand(self.dimension[0], self.dimension[1], self.dimension[2]) z_grid = torch.linspace(0, self.dimension[2]-1, self.dimension[2], dtype=torch.long) z_grid = z_grid.view(1, 1, self.dimension[2]).expand(self.dimension[0], self.dimension[1], self.dimension[2]) instance_for_save = torch.stack((x_grid, y_grid, z_grid), -1) instance_for_save = instance_for_save.view(-1, 3) instance_label = instance.squeeze(0).view(-1,1) instance_for_save = torch.cat((instance_for_save, instance_label), dim=-1) kept = instance_for_save[:,-1]!=0 instance_for_save= instance_for_save[kept] instance_saved_list.append(instance_for_save) segmentation_saved_list2 = [item.cpu().detach().numpy() for item in segmentation_saved_list] instance_saved_list2 = [item.cpu().detach().numpy() for item in instance_saved_list] np.savez(seg_label_path, segmentation_saved_list2) np.savez(instance_label_path, instance_saved_list2) results['segmentation'] = torch.cat(results['segmentation'], dim=0) results['instance'] = torch.cat(results['instance'], dim=0) results['attribute_label'] = torch.from_numpy(np.zeros((self.dimension[0], self.dimension[1], self.dimension[2]))).unsqueeze(0) results['flow'] = self.get_flow_label(results, ignore_index=255) flow_saved_list = [] sequence_length = results['sequence_length'] d0 = self.dimension[0]//4 d1 = self.dimension[1]//4 d2 = self.dimension[2]//4 for cnt in range(sequence_length): flow = results['flow'][cnt, ...] 
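# --------------------------------------------------------------------------
# Illustrative sketch (not part of the original pipeline): the loop above
# stores flow labels sparsely as (x, y, z, fx, fy, fz) rows, keeping only
# voxels whose flow components differ from the ignore value (255). The helper
# below mirrors the loading branch earlier in this __call__, which scatters
# such rows back into a dense [3, X, Y, Z] volume; the toy grid size is
# hypothetical.
def _toy_densify_flow(sparse_rows, dims, ignore_index=255):
    """Scatter sparse flow rows into a dense volume (illustration only)."""
    import torch
    dense = ignore_index * torch.ones(3, *dims)
    idx = sparse_rows[:, :3].long()
    dense[:, idx[:, 0], idx[:, 1], idx[:, 2]] = sparse_rows[:, 3:].t()
    return dense
# Example with a hypothetical 4x4x2 grid and two labelled voxels:
# rows = torch.tensor([[0., 1., 0., 0.5, -1.0, 0.0], [2., 3., 1., 1.0, 0.0, 0.0]])
# _toy_densify_flow(rows, (4, 4, 2)).shape -> torch.Size([3, 4, 4, 2])
# --------------------------------------------------------------------------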
x_grid = torch.linspace(0, d0-1, d0, dtype=torch.long) x_grid = x_grid.view(d0, 1, 1).expand(d0, d1, d2) y_grid = torch.linspace(0, d1-1, d1, dtype=torch.long) y_grid = y_grid.view(1, d1, 1).expand(d0, d1, d2) z_grid = torch.linspace(0, d2-1, d2, dtype=torch.long) z_grid = z_grid.view(1, 1, d2).expand(d0, d1, d2) flow_for_save = torch.stack((x_grid, y_grid, z_grid), -1) flow_for_save = flow_for_save.view(-1, 3) flow_label = flow.permute(1,2,3,0).view(-1,3) flow_for_save = torch.cat((flow_for_save, flow_label), dim=-1) kept = (flow_for_save[:,-1]!=255) & (flow_for_save[:,-2]!=255) & (flow_for_save[:,-3]!=255) flow_for_save= flow_for_save[kept] flow_saved_list.append(flow_for_save) flow_saved_list2 = [item.cpu().detach().numpy() for item in flow_saved_list] np.savez(flow_label_path, flow_saved_list2) for key, value in results.items(): if key in ['sample_token', 'centerness', 'offset', 'flow', 'time_receptive_field', "indices", \ 'segmentation','instance','attribute_label','sequence_length', 'instance_dict', 'instance_map', 'input_dict', 'egopose_list','ego2lidar_list','scene_token']: continue results[key] = torch.cat(value, dim=0) return results ================================================ FILE: projects/occ_plugin/datasets/pipelines/loading_occupancy.py ================================================ # Developed by Junyi Ma based on the codebase of OpenOccupancy # Cam4DOcc: Benchmark for Camera-Only 4D Occupancy Forecasting in Autonomous Driving Applications # https://github.com/haomo-ai/Cam4DOcc import numpy as np import numba as nb from mmdet.datasets.builder import PIPELINES import yaml, os import torch import torch.nn.functional as F import copy @PIPELINES.register_module() class LoadOccupancy(object): def __init__(self, to_float32=True, occ_path=None, grid_size=[512, 512, 40], pc_range=[-51.2, -51.2, -5.0, 51.2, 51.2, 3.0], unoccupied=0, gt_resize_ratio=1, use_fine_occ=False, test_mode=False): ''' Read sequential fine-grained occupancy labels from nuScenes-Occupancy if use_fine_occ=True ''' self.to_float32 = to_float32 self.occ_path = occ_path self.grid_size = np.array(grid_size) self.unoccupied = unoccupied self.pc_range = np.array(pc_range) self.voxel_size = (self.pc_range[3:] - self.pc_range[:3]) / self.grid_size self.gt_resize_ratio = gt_resize_ratio self.use_fine_occ = use_fine_occ self.test_mode = test_mode def get_seq_pseudo_occ(self, results): sequence_length = results['sequence_length'] gt_occ_seq = [] for count in range(sequence_length): processed_label = np.ones(self.grid_size, dtype=np.uint8) * self.unoccupied processed_label = torch.from_numpy(processed_label) gt_occ_seq.append(processed_label) gt_occ_seq = torch.stack(gt_occ_seq) return gt_occ_seq def get_seq_occ(self, results): sequence_length = results['sequence_length'] gt_occ_seq = [] for count in range(sequence_length): scene_token_cur = results['input_dict'][count]['scene_token'] lidar_token_cur = results['input_dict'][count]['lidar_token'] rel_path = 'scene_{0}/occupancy/{1}.npy'.format(scene_token_cur, lidar_token_cur) # [z y x cls] or [z y x vx vy vz cls] pcd = np.load(os.path.join(self.occ_path, rel_path)) pcd_label = pcd[..., -1:] pcd_label[pcd_label==0] = 255 pcd_np_cor = self.voxel2world(pcd[..., [2,1,0]] + 0.5) untransformed_occ = copy.deepcopy(pcd_np_cor) egopose_list = results['egopose_list'] ego2lidar_list = results['ego2lidar_list'] time_receptive_field = results['time_receptive_field'] present_global2ego = egopose_list[time_receptive_field - 1] present_ego2lidar = 
ego2lidar_list[time_receptive_field - 1] cur_global2ego = egopose_list[count] cur_ego2lidar = ego2lidar_list[count] pcd_np_cor = np.dot(cur_ego2lidar[1].inverse.rotation_matrix, pcd_np_cor.T) pcd_np_cor = pcd_np_cor.T pcd_np_cor = pcd_np_cor - cur_ego2lidar[0] # trans # cur_ego -> global pcd_np_cor = np.dot(cur_global2ego[1].inverse.rotation_matrix, pcd_np_cor.T) # rot pcd_np_cor = pcd_np_cor.T pcd_np_cor = pcd_np_cor - cur_global2ego[0] # trans # global -> present_ego pcd_np_cor = pcd_np_cor + present_global2ego[0] # trans pcd_np_cor = np.dot(present_global2ego[1].rotation_matrix, pcd_np_cor.T) pcd_np_cor = pcd_np_cor.T # present_ego -> present_lidar pcd_np_cor = pcd_np_cor + present_ego2lidar[0] # trans pcd_np_cor = np.dot(present_ego2lidar[1].rotation_matrix, pcd_np_cor.T) # rot pcd_np_cor = pcd_np_cor.T pcd_np_cor = self.world2voxel(pcd_np_cor) # make sure the point is in the grid pcd_np_cor = np.clip(pcd_np_cor, np.array([0,0,0]), self.grid_size - 1) transformed_occ = copy.deepcopy(pcd_np_cor) pcd_np = np.concatenate([pcd_np_cor, pcd_label], axis=-1) # 255: noise, 1-16 normal classes, 0 unoccupied pcd_np = pcd_np[np.lexsort((pcd_np_cor[:, 0], pcd_np_cor[:, 1], pcd_np_cor[:, 2])), :] pcd_np = pcd_np.astype(np.int64) processed_label = np.ones(self.grid_size, dtype=np.uint8) * self.unoccupied processed_label = nb_process_label(processed_label, pcd_np) processed_label = torch.from_numpy(processed_label) # TODO: hard coding for otheridx in [0,1,7,8,11,12,13,14,15,16,17,18,255]: processed_label[processed_label==otheridx] = 0 for vehidx in [2,3,4,5,6,9,10]: processed_label[processed_label==vehidx] = 1 gt_occ_seq.append(processed_label) gt_occ_seq = torch.stack(gt_occ_seq) return gt_occ_seq def __call__(self, results): if self.use_fine_occ: results['gt_occ'] = self.get_seq_occ(results) else: results['gt_occ'] = self.get_seq_pseudo_occ(results) return results def voxel2world(self, voxel): """ voxel: [N, 3] """ return voxel * self.voxel_size[None, :] + self.pc_range[:3][None, :] def world2voxel(self, world): """ world: [N, 3] """ return (world - self.pc_range[:3][None, :]) / self.voxel_size[None, :] def __repr__(self): """str: Return a string that describes the module.""" repr_str = self.__class__.__name__ repr_str += f'(to_float32={self.to_float32}' return repr_str def project_points(self, points, rots, trans, intrins, post_rots, post_trans): # from lidar to camera points = points.reshape(-1, 1, 3) points = points - trans.reshape(1, -1, 3) inv_rots = rots.inverse().unsqueeze(0) points = (inv_rots @ points.unsqueeze(-1)) # from camera to raw pixel points = (intrins.unsqueeze(0) @ points).squeeze(-1) points_d = points[..., 2:3] points_uv = points[..., :2] / points_d # from raw pixel to transformed pixel points_uv = post_rots[:, :2, :2].unsqueeze(0) @ points_uv.unsqueeze(-1) points_uv = points_uv.squeeze(-1) + post_trans[..., :2].unsqueeze(0) points_uvd = torch.cat((points_uv, points_d), dim=2) return points_uvd # b1:boolean, u1: uint8, i2: int16, u2: uint16 @nb.jit('b1[:](i2[:,:],u2[:,:],b1[:])', nopython=True, cache=True, parallel=False) def nb_process_img_points(basic_valid_occ, depth_canva, nb_valid_mask): # basic_valid_occ M 3 # depth_canva H W # label_size = M # for original occ, small: 2w mid: ~8w base: ~30w canva_idx = -1 * np.ones_like(depth_canva, dtype=np.int16) for i in range(basic_valid_occ.shape[0]): occ = basic_valid_occ[i] if occ[2] < depth_canva[occ[1], occ[0]]: if canva_idx[occ[1], occ[0]] != -1: nb_valid_mask[canva_idx[occ[1], occ[0]]] = False canva_idx[occ[1], occ[0]] = i 
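# z-buffer update: this point is now the closest hit for its pixel, so any point previously
# kept at the same pixel has just had its valid flag cleared via canva_idx; the depth buffer
# and the valid mask for the current index are refreshed below.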
depth_canva[occ[1], occ[0]] = occ[2] nb_valid_mask[i] = True return nb_valid_mask # u1: uint8, u8: uint16, i8: int64 @nb.jit('u1[:,:,:](u1[:,:,:],i8[:,:])', nopython=True, cache=True, parallel=False) def nb_process_label_withvel(processed_label, sorted_label_voxel_pair): label_size = 256 counter = np.zeros((label_size,), dtype=np.uint16) counter[sorted_label_voxel_pair[0, 3]] = 1 cur_sear_ind = sorted_label_voxel_pair[0, :3] for i in range(1, sorted_label_voxel_pair.shape[0]): cur_ind = sorted_label_voxel_pair[i, :3] if not np.all(np.equal(cur_ind, cur_sear_ind)): processed_label[cur_sear_ind[0], cur_sear_ind[1], cur_sear_ind[2]] = np.argmax(counter) counter = np.zeros((label_size,), dtype=np.uint16) cur_sear_ind = cur_ind counter[sorted_label_voxel_pair[i, 3]] += 1 processed_label[cur_sear_ind[0], cur_sear_ind[1], cur_sear_ind[2]] = np.argmax(counter) return processed_label # u1: uint8, u8: uint16, i8: int64 @nb.jit('u1[:,:,:](u1[:,:,:],i8[:,:])', nopython=True, cache=True, parallel=False) def nb_process_label(processed_label, sorted_label_voxel_pair): label_size = 256 counter = np.zeros((label_size,), dtype=np.uint16) counter[sorted_label_voxel_pair[0, 3]] = 1 cur_sear_ind = sorted_label_voxel_pair[0, :3] for i in range(1, sorted_label_voxel_pair.shape[0]): cur_ind = sorted_label_voxel_pair[i, :3] if not np.all(np.equal(cur_ind, cur_sear_ind)): processed_label[cur_sear_ind[0], cur_sear_ind[1], cur_sear_ind[2]] = np.argmax(counter) counter = np.zeros((label_size,), dtype=np.uint16) cur_sear_ind = cur_ind counter[sorted_label_voxel_pair[i, 3]] += 1 processed_label[cur_sear_ind[0], cur_sear_ind[1], cur_sear_ind[2]] = np.argmax(counter) return processed_label ================================================ FILE: projects/occ_plugin/datasets/pipelines/transform_3d.py ================================================ import numpy as np from numpy import random import mmcv from mmdet.datasets.builder import PIPELINES from mmcv.parallel import DataContainer as DC @PIPELINES.register_module() class PadMultiViewImage(object): """Pad the multi-view image. There are two padding modes: (1) pad to a fixed size and (2) pad to the minimum size that is divisible by some number. Added keys are "pad_shape", "pad_fixed_size", "pad_size_divisor", Args: size (tuple, optional): Fixed padding size. size_divisor (int, optional): The divisor of padded size. pad_val (float, optional): Padding value, 0 by default. """ def __init__(self, size=None, size_divisor=None, pad_val=0): self.size = size self.size_divisor = size_divisor self.pad_val = pad_val # only one of size and size_divisor should be valid assert size is not None or size_divisor is not None assert size is None or size_divisor is None def _pad_img(self, results): """Pad images according to ``self.size``.""" if self.size is not None: padded_img = [mmcv.impad( img, shape=self.size, pad_val=self.pad_val) for img in results['img']] elif self.size_divisor is not None: padded_img = [mmcv.impad_to_multiple( img, self.size_divisor, pad_val=self.pad_val) for img in results['img']] results['ori_shape'] = [img.shape for img in results['img']] results['img'] = padded_img results['img_shape'] = [img.shape for img in padded_img] results['pad_shape'] = [img.shape for img in padded_img] results['pad_fixed_size'] = self.size results['pad_size_divisor'] = self.size_divisor def __call__(self, results): """Call function to pad images, masks, semantic segmentation maps. Args: results (dict): Result dict from loading pipeline. Returns: dict: Updated result dict. 
""" self._pad_img(results) return results def __repr__(self): repr_str = self.__class__.__name__ repr_str += f'(size={self.size}, ' repr_str += f'size_divisor={self.size_divisor}, ' repr_str += f'pad_val={self.pad_val})' return repr_str @PIPELINES.register_module() class NormalizeMultiviewImage(object): """Normalize the image. Added key is "img_norm_cfg". Args: mean (sequence): Mean values of 3 channels. std (sequence): Std values of 3 channels. to_rgb (bool): Whether to convert the image from BGR to RGB, default is true. """ def __init__(self, mean, std, to_rgb=True): self.mean = np.array(mean, dtype=np.float32) self.std = np.array(std, dtype=np.float32) self.to_rgb = to_rgb def __call__(self, results): """Call function to normalize images. Args: results (dict): Result dict from loading pipeline. Returns: dict: Normalized results, 'img_norm_cfg' key is added into result dict. """ results['img'] = [mmcv.imnormalize(img, self.mean, self.std, self.to_rgb) for img in results['img']] results['img_norm_cfg'] = dict( mean=self.mean, std=self.std, to_rgb=self.to_rgb) return results def __repr__(self): repr_str = self.__class__.__name__ repr_str += f'(mean={self.mean}, std={self.std}, to_rgb={self.to_rgb})' return repr_str @PIPELINES.register_module() class PhotoMetricDistortionMultiViewImage: """Apply photometric distortion to image sequentially, every transformation is applied with a probability of 0.5. The position of random contrast is in second or second to last. 1. random brightness 2. random contrast (mode 0) 3. convert color from BGR to HSV 4. random saturation 5. random hue 6. convert color from HSV to BGR 7. random contrast (mode 1) 8. randomly swap channels Args: brightness_delta (int): delta of brightness. contrast_range (tuple): range of contrast. saturation_range (tuple): range of saturation. hue_delta (int): delta of hue. """ def __init__(self, brightness_delta=32, contrast_range=(0.5, 1.5), saturation_range=(0.5, 1.5), hue_delta=18): self.brightness_delta = brightness_delta self.contrast_lower, self.contrast_upper = contrast_range self.saturation_lower, self.saturation_upper = saturation_range self.hue_delta = hue_delta def __call__(self, results): """Call function to perform photometric distortion on images. Args: results (dict): Result dict from loading pipeline. Returns: dict: Result dict with images distorted. 
""" imgs = results['img'] new_imgs = [] for img in imgs: assert img.dtype == np.float32, \ 'PhotoMetricDistortion needs the input image of dtype np.float32,'\ ' please set "to_float32=True" in "LoadImageFromFile" pipeline' # random brightness if random.randint(2): delta = random.uniform(-self.brightness_delta, self.brightness_delta) img += delta # mode == 0 --> do random contrast first # mode == 1 --> do random contrast last mode = random.randint(2) if mode == 1: if random.randint(2): alpha = random.uniform(self.contrast_lower, self.contrast_upper) img *= alpha # convert color from BGR to HSV img = mmcv.bgr2hsv(img) # random saturation if random.randint(2): img[..., 1] *= random.uniform(self.saturation_lower, self.saturation_upper) # random hue if random.randint(2): img[..., 0] += random.uniform(-self.hue_delta, self.hue_delta) img[..., 0][img[..., 0] > 360] -= 360 img[..., 0][img[..., 0] < 0] += 360 # convert color from HSV to BGR img = mmcv.hsv2bgr(img) # random contrast if mode == 0: if random.randint(2): alpha = random.uniform(self.contrast_lower, self.contrast_upper) img *= alpha # randomly swap channels if random.randint(2): img = img[..., random.permutation(3)] new_imgs.append(img) results['img'] = new_imgs return results def __repr__(self): repr_str = self.__class__.__name__ repr_str += f'(\nbrightness_delta={self.brightness_delta},\n' repr_str += 'contrast_range=' repr_str += f'{(self.contrast_lower, self.contrast_upper)},\n' repr_str += 'saturation_range=' repr_str += f'{(self.saturation_lower, self.saturation_upper)},\n' repr_str += f'hue_delta={self.hue_delta})' return repr_str @PIPELINES.register_module() class CustomCollect3D(object): """Collect data from the loader relevant to the specific task. This is usually the last stage of the data loader pipeline. Typically keys is set to some subset of "img", "proposals", "gt_bboxes", "gt_bboxes_ignore", "gt_labels", and/or "gt_masks". The "img_meta" item is always populated. The contents of the "img_meta" dictionary depends on "meta_keys". By default this includes: - 'img_shape': shape of the image input to the network as a tuple \ (h, w, c). Note that images may be zero padded on the \ bottom/right if the batch tensor is larger than this shape. - 'scale_factor': a float indicating the preprocessing scale - 'flip': a boolean indicating if image flip transform was used - 'filename': path to the image file - 'ori_shape': original shape of the image as a tuple (h, w, c) - 'pad_shape': image shape after padding - 'lidar2img': transform from lidar to image - 'depth2img': transform from depth to image - 'cam2img': transform from camera to image - 'pcd_horizontal_flip': a boolean indicating if point cloud is \ flipped horizontally - 'pcd_vertical_flip': a boolean indicating if point cloud is \ flipped vertically - 'box_mode_3d': 3D box mode - 'box_type_3d': 3D box type - 'img_norm_cfg': a dict of normalization information: - mean: per channel mean subtraction - std: per channel std divisor - to_rgb: bool indicating if bgr was converted to rgb - 'pcd_trans': point cloud transformations - 'sample_idx': sample index - 'pcd_scale_factor': point cloud scale factor - 'pcd_rotation': rotation applied to point cloud - 'pts_filename': path to point cloud file. Args: keys (Sequence[str]): Keys of results to be collected in ``data``. meta_keys (Sequence[str], optional): Meta keys to be converted to ``mmcv.DataContainer`` and collected in ``data[img_metas]``. 
Default: ('filename', 'ori_shape', 'img_shape', 'lidar2img', 'depth2img', 'cam2img', 'pad_shape', 'scale_factor', 'flip', 'pcd_horizontal_flip', 'pcd_vertical_flip', 'box_mode_3d', 'box_type_3d', 'img_norm_cfg', 'pcd_trans', 'sample_idx', 'pcd_scale_factor', 'pcd_rotation', 'pts_filename') """ def __init__(self, keys, meta_keys=('filename', 'ori_shape', 'img_shape', 'lidar2img', 'depth2img', 'cam2img', 'pad_shape', 'scale_factor', 'flip', 'pcd_horizontal_flip', 'pcd_vertical_flip', 'box_mode_3d', 'box_type_3d', 'img_norm_cfg', 'pcd_trans', 'sample_idx', 'prev_idx', 'next_idx', 'pcd_scale_factor', 'pcd_rotation', 'pts_filename', 'transformation_3d_flow', 'scene_token', 'can_bus' )): self.keys = keys self.meta_keys = meta_keys def __call__(self, results): """Call function to collect keys in results. The keys in ``meta_keys`` will be converted to :obj:`mmcv.DataContainer`. Args: results (dict): Result dict contains the data to collect. Returns: dict: The result dict contains the following keys - keys in ``self.keys`` - ``img_metas`` """ data = {} img_metas = {} for key in self.meta_keys: if key in results: img_metas[key] = results[key] data['img_metas'] = DC(img_metas, cpu_only=True) for key in self.keys: data[key] = results[key] return data def __repr__(self): """str: Return a string that describes the module.""" return self.__class__.__name__ + \ f'(keys={self.keys}, meta_keys={self.meta_keys})' @PIPELINES.register_module() class CustomOccCollect3D(object): """Collect data from the loader relevant to the specific task. This is usually the last stage of the data loader pipeline. Typically keys is set to some subset of "img", "proposals", "gt_bboxes", "gt_bboxes_ignore", "gt_labels", and/or "gt_masks". The "img_meta" item is always populated. The contents of the "img_meta" dictionary depends on "meta_keys". By default this includes: - 'img_shape': shape of the image input to the network as a tuple \ (h, w, c). Note that images may be zero padded on the \ bottom/right if the batch tensor is larger than this shape. - 'scale_factor': a float indicating the preprocessing scale - 'flip': a boolean indicating if image flip transform was used - 'filename': path to the image file - 'ori_shape': original shape of the image as a tuple (h, w, c) - 'pad_shape': image shape after padding - 'lidar2img': transform from lidar to image - 'depth2img': transform from depth to image - 'cam2img': transform from camera to image - 'pcd_horizontal_flip': a boolean indicating if point cloud is \ flipped horizontally - 'pcd_vertical_flip': a boolean indicating if point cloud is \ flipped vertically - 'box_mode_3d': 3D box mode - 'box_type_3d': 3D box type - 'img_norm_cfg': a dict of normalization information: - mean: per channel mean subtraction - std: per channel std divisor - to_rgb: bool indicating if bgr was converted to rgb - 'pcd_trans': point cloud transformations - 'sample_idx': sample index - 'pcd_scale_factor': point cloud scale factor - 'pcd_rotation': rotation applied to point cloud - 'pts_filename': path to point cloud file. Args: keys (Sequence[str]): Keys of results to be collected in ``data``. meta_keys (Sequence[str], optional): Meta keys to be converted to ``mmcv.DataContainer`` and collected in ``data[img_metas]``. 
Default: ('filename', 'ori_shape', 'img_shape', 'lidar2img', 'depth2img', 'cam2img', 'pad_shape', 'scale_factor', 'flip', 'pcd_horizontal_flip', 'pcd_vertical_flip', 'box_mode_3d', 'box_type_3d', 'img_norm_cfg', 'pcd_trans', 'sample_idx', 'pcd_scale_factor', 'pcd_rotation', 'pts_filename') """ def __init__(self, keys, meta_keys=('filename', 'ori_shape', 'img_shape', 'lidar2img', 'depth2img', 'cam2img', 'pad_shape', 'scale_factor', 'flip', 'pcd_horizontal_flip', 'pcd_vertical_flip', 'box_mode_3d', 'box_type_3d', 'img_norm_cfg', 'pcd_trans', 'sample_idx', 'prev_idx', 'next_idx', 'pcd_scale_factor', 'pcd_rotation', 'pts_filename', 'transformation_3d_flow', 'scene_token', 'can_bus', 'pc_range', 'occ_size', 'lidar_token' )): self.keys = keys self.meta_keys = meta_keys def __call__(self, results): """Call function to collect keys in results. The keys in ``meta_keys`` will be converted to :obj:`mmcv.DataContainer`. Args: results (dict): Result dict contains the data to collect. Returns: dict: The result dict contains the following keys - keys in ``self.keys`` - ``img_metas`` """ data = {} img_metas = {} for key in self.meta_keys: if key in results: img_metas[key] = results[key] data['img_metas'] = DC(img_metas, cpu_only=True) for key in self.keys: if key in results.keys(): data[key] = results[key] print("self.keys", self.keys) # if 'gt_occ' in results.keys(): # data['gt_occ'] = results['gt_occ'] return data def __repr__(self): """str: Return a string that describes the module.""" return self.__class__.__name__ + \ f'(keys={self.keys}, meta_keys={self.meta_keys})' @PIPELINES.register_module() class RandomScaleImageMultiViewImage(object): """Random scale the image Args: scales """ def __init__(self, scales=[]): self.scales = scales assert len(self.scales)==1 def __call__(self, results): """Call function to pad images, masks, semantic segmentation maps. Args: results (dict): Result dict from loading pipeline. Returns: dict: Updated result dict. 
""" rand_ind = np.random.permutation(range(len(self.scales)))[0] rand_scale = self.scales[rand_ind] y_size = [int(img.shape[0] * rand_scale) for img in results['img']] x_size = [int(img.shape[1] * rand_scale) for img in results['img']] scale_factor = np.eye(4) scale_factor[0, 0] *= rand_scale scale_factor[1, 1] *= rand_scale results['img'] = [mmcv.imresize(img, (x_size[idx], y_size[idx]), return_scale=False) for idx, img in enumerate(results['img'])] lidar2img = [scale_factor @ l2i for l2i in results['lidar2img']] results['lidar2img'] = lidar2img results['img_shape'] = [img.shape for img in results['img']] results['ori_shape'] = [img.shape for img in results['img']] return results def __repr__(self): repr_str = self.__class__.__name__ repr_str += f'(size={self.scales}, ' return repr_str ================================================ FILE: projects/occ_plugin/datasets/samplers/__init__.py ================================================ from .group_sampler import DistributedGroupSampler from .distributed_sampler import DistributedSampler from .sampler import SAMPLER, build_sampler ================================================ FILE: projects/occ_plugin/datasets/samplers/distributed_sampler.py ================================================ import math import torch from torch.utils.data import DistributedSampler as _DistributedSampler from .sampler import SAMPLER @SAMPLER.register_module() class DistributedSampler(_DistributedSampler): def __init__(self, dataset=None, num_replicas=None, rank=None, shuffle=True, seed=0): super().__init__( dataset, num_replicas=num_replicas, rank=rank, shuffle=shuffle) # for the compatibility from PyTorch 1.3+ self.seed = seed if seed is not None else 0 def __iter__(self): # deterministically shuffle based on epoch if self.shuffle: assert False else: indices = torch.arange(len(self.dataset)).tolist() # add extra samples to make it evenly divisible # in case that indices is shorter than half of total_size indices = (indices * math.ceil(self.total_size / len(indices)))[:self.total_size] assert len(indices) == self.total_size # subsample per_replicas = self.total_size//self.num_replicas # indices = indices[self.rank:self.total_size:self.num_replicas] indices = indices[self.rank*per_replicas:(self.rank+1)*per_replicas] assert len(indices) == self.num_samples return iter(indices) ================================================ FILE: projects/occ_plugin/datasets/samplers/group_sampler.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import math import numpy as np import torch from mmcv.runner import get_dist_info from torch.utils.data import Sampler from .sampler import SAMPLER import random from IPython import embed @SAMPLER.register_module() class DistributedGroupSampler(Sampler): """Sampler that restricts data loading to a subset of the dataset. It is especially useful in conjunction with :class:`torch.nn.parallel.DistributedDataParallel`. In such case, each process can pass a DistributedSampler instance as a DataLoader sampler, and load a subset of the original dataset that is exclusive to it. .. note:: Dataset is assumed to be of constant size. Arguments: dataset: Dataset used for sampling. num_replicas (optional): Number of processes participating in distributed training. rank (optional): Rank of the current process within num_replicas. seed (int, optional): random seed used to shuffle the sampler if ``shuffle=True``. This number should be identical across all processes in the distributed group. 
Default: 0. """ def __init__(self, dataset, samples_per_gpu=1, num_replicas=None, rank=None, seed=0): _rank, _num_replicas = get_dist_info() if num_replicas is None: num_replicas = _num_replicas if rank is None: rank = _rank self.dataset = dataset self.samples_per_gpu = samples_per_gpu self.num_replicas = num_replicas self.rank = rank self.epoch = 0 self.seed = seed if seed is not None else 0 assert hasattr(self.dataset, 'flag') self.flag = self.dataset.flag self.group_sizes = np.bincount(self.flag) self.num_samples = 0 for i, j in enumerate(self.group_sizes): self.num_samples += int( math.ceil(self.group_sizes[i] * 1.0 / self.samples_per_gpu / self.num_replicas)) * self.samples_per_gpu self.total_size = self.num_samples * self.num_replicas def __iter__(self): # deterministically shuffle based on epoch g = torch.Generator() g.manual_seed(self.epoch + self.seed) indices = [] for i, size in enumerate(self.group_sizes): if size > 0: indice = np.where(self.flag == i)[0] assert len(indice) == size # add .numpy() to avoid bug when selecting indice in parrots. # TODO: check whether torch.randperm() can be replaced by # numpy.random.permutation(). indice = indice[list( torch.randperm(int(size), generator=g).numpy())].tolist() extra = int( math.ceil( size * 1.0 / self.samples_per_gpu / self.num_replicas) ) * self.samples_per_gpu * self.num_replicas - len(indice) # pad indice tmp = indice.copy() for _ in range(extra // size): indice.extend(tmp) indice.extend(tmp[:extra % size]) indices.extend(indice) assert len(indices) == self.total_size indices = [ indices[j] for i in list( torch.randperm( len(indices) // self.samples_per_gpu, generator=g)) for j in range(i * self.samples_per_gpu, (i + 1) * self.samples_per_gpu) ] # subsample offset = self.num_samples * self.rank indices = indices[offset:offset + self.num_samples] assert len(indices) == self.num_samples return iter(indices) def __len__(self): return self.num_samples def set_epoch(self, epoch): self.epoch = epoch ================================================ FILE: projects/occ_plugin/datasets/samplers/sampler.py ================================================ from mmcv.utils.registry import Registry, build_from_cfg SAMPLER = Registry('sampler') def build_sampler(cfg, default_args): return build_from_cfg(cfg, SAMPLER, default_args) ================================================ FILE: projects/occ_plugin/occupancy/__init__.py ================================================ from .dense_heads import * from .detectors import * from .backbones import * from .image2bev import * from .voxel_encoder import * from .necks import * from .fuser import * ================================================ FILE: projects/occ_plugin/occupancy/apis/__init__.py ================================================ from .train import custom_train_model from .mmdet_train import custom_train_detector # from .test import custom_multi_gpu_test ================================================ FILE: projects/occ_plugin/occupancy/apis/mmdet_train.py ================================================ # --------------------------------------------- # Copyright (c) OpenMMLab. All rights reserved. 
# --------------------------------------------- # Modified by Junyi Ma, following OpenOccupancy of Xiaofeng Wang # --------------------------------------------- import random import warnings import numpy as np import torch import torch.distributed as dist from mmcv.parallel import MMDataParallel, MMDistributedDataParallel from mmcv.runner import (HOOKS, DistSamplerSeedHook, EpochBasedRunner, Fp16OptimizerHook, OptimizerHook, build_optimizer, build_runner, get_dist_info) from mmcv.utils import build_from_cfg from mmdet.core import EvalHook from mmdet.datasets import (build_dataset, replace_ImageToTensor) from mmdet.utils import get_root_logger import time import os.path as osp from projects.occ_plugin.datasets.builder import build_dataloader from projects.occ_plugin.core.evaluation.eval_hooks import OccDistEvalHook, OccEvalHook from projects.occ_plugin.datasets import custom_build_dataset def custom_train_detector(model, dataset, cfg, distributed=False, validate=False, timestamp=None, meta=None): logger = get_root_logger(cfg.log_level) dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset] data_loaders = [ build_dataloader( ds, cfg.data.samples_per_gpu, cfg.data.workers_per_gpu, # cfg.gpus will be ignored if distributed len(cfg.gpu_ids), dist=distributed, seed=cfg.seed, shuffler_sampler=cfg.data.shuffler_sampler, nonshuffler_sampler=cfg.data.nonshuffler_sampler, ) for ds in dataset ] # torch.distributed.init_process_group(backend='nccl') if distributed: find_unused_parameters = cfg.get('find_unused_parameters', False) model = MMDistributedDataParallel( model.cuda(), device_ids=[torch.cuda.current_device()], broadcast_buffers=False, find_unused_parameters=find_unused_parameters) else: model = MMDataParallel( model.cuda(cfg.gpu_ids[0]), device_ids=cfg.gpu_ids) # build runner optimizer = build_optimizer(model, cfg.optimizer) assert 'runner' in cfg runner = build_runner( cfg.runner, default_args=dict( model=model, optimizer=optimizer, work_dir=cfg.work_dir, logger=logger, meta=meta)) # an ugly workaround to make .log and .log.json filenames the same runner.timestamp = timestamp # fp16 setting TODO fp16_cfg = cfg.get('fp16', None) if fp16_cfg is not None: optimizer_config = Fp16OptimizerHook( **cfg.optimizer_config, **fp16_cfg, distributed=distributed) elif distributed and 'type' not in cfg.optimizer_config: optimizer_config = OptimizerHook(**cfg.optimizer_config) else: optimizer_config = cfg.optimizer_config # register hooks runner.register_training_hooks(cfg.lr_config, optimizer_config, cfg.checkpoint_config, cfg.log_config, cfg.get('momentum_config', None)) if distributed: if isinstance(runner, EpochBasedRunner): runner.register_hook(DistSamplerSeedHook()) rank, world_size = get_dist_info() if cfg.resume_from: if rank == 0: print("-------------") print("resume from " + cfg.resume_from) print("-------------") runner.resume(cfg.resume_from) elif cfg.load_from: if rank == 0: print("-------------") print("load from " + cfg.load_from) print("-------------") runner.load_checkpoint(cfg.load_from) runner.run(data_loaders, cfg.workflow) ================================================ FILE: projects/occ_plugin/occupancy/apis/test.py ================================================ # --------------------------------------------- # Copyright (c) OpenMMLab. All rights reserved. 
# --------------------------------------------- # Modified by Zhiqi Li # --------------------------------------------- import os.path as osp import pickle import shutil import tempfile import time import mmcv import torch import torch.distributed as dist from mmcv.image import tensor2imgs from mmcv.runner import get_dist_info from mmdet.utils import get_root_logger from mmdet.core import encode_mask_results import numpy as np import pycocotools.mask as mask_util from fvcore.nn import FlopCountAnalysis, parameter_count_table def custom_encode_mask_results(mask_results): """Encode bitmap mask to RLE code. Semantic Masks only Args: mask_results (list | tuple[list]): bitmap mask results. In mask scoring rcnn, mask_results is a tuple of (segm_results, segm_cls_score). Returns: list | tuple: RLE encoded mask. """ cls_segms = mask_results num_classes = len(cls_segms) encoded_mask_results = [] for i in range(len(cls_segms)): encoded_mask_results.append( mask_util.encode( np.array( cls_segms[i][:, :, np.newaxis], order='F', dtype='uint8'))[0]) # encoded with RLE return [encoded_mask_results] def custom_single_gpu_test(model, data_loader, show=False, out_dir=None, show_score_thr=0.3): model.eval() iou_metric = 0 vpq_metric = 0 dataset = data_loader.dataset prog_bar = mmcv.ProgressBar(len(dataset)) logger = get_root_logger() logger.info(parameter_count_table(model)) for i, data in enumerate(data_loader): with torch.no_grad(): result = model(return_loss=False, rescale=True, **data) if 'hist_for_iou' in result.keys(): iou_metric += result['hist_for_iou'] vpq_metric += result['vpq'] prog_bar.update() res = { 'hist_for_iou': iou_metric, 'vpq_len': len(dataset), 'vpq_metric': vpq_metric, } return res def custom_multi_gpu_test(model, data_loader, tmpdir=None, gpu_collect=False, show=False, out_dir=None): """Test model with multiple gpus. This method tests model with multiple gpus and collects the results under two different modes: gpu and cpu modes. By setting 'gpu_collect=True' it encodes results to gpu tensors and use gpu communication for results collection. On cpu mode it saves the results on different gpus to 'tmpdir' and collects them by the rank 0 worker. Args: model (nn.Module): Model to be tested. data_loader (nn.Dataloader): Pytorch data loader. tmpdir (str): Path of directory to save the temporary results from different gpus under cpu mode. gpu_collect (bool): Option to use either gpu or cpu to collect results. Returns: list: The prediction results. """ model.eval() # init predictions iou_metric = [] vpq_metric = [] dataset = data_loader.dataset rank, world_size = get_dist_info() if rank == 0: prog_bar = mmcv.ProgressBar(len(dataset)) time.sleep(2) # This line can prevent deadlock problem in some cases. 
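# Each rank below accumulates its own IoU histogram and VPQ sum over its shard of the dataset;
# after the loop the per-rank partials are reduced to a single element and gathered on rank 0
# by collect_results_cpu through a shared temporary directory (or the given tmpdir).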
logger = get_root_logger() logger.info(parameter_count_table(model)) for i, data in enumerate(data_loader): with torch.no_grad(): result = model(return_loss=False, rescale=True, **data) if 'hist_for_iou' in result.keys(): iou_metric.append(result['hist_for_iou']) if 'vpq' in result.keys(): vpq_metric.append(result['vpq']) batch_size = 1 if rank == 0: for _ in range(batch_size * world_size): prog_bar.update() # collect lists from multi-GPUs res = {} if 'hist_for_iou' in result.keys(): iou_metric = [sum(iou_metric)] iou_metric = collect_results_cpu(iou_metric, len(dataset), tmpdir) res['hist_for_iou'] = iou_metric if 'vpq' in result.keys(): res['vpq_len'] = len(dataset) vpq_metric = [sum(vpq_metric)] vpq_metric = collect_results_cpu(vpq_metric, len(dataset), tmpdir) res['vpq_metric'] = vpq_metric return res def collect_results_cpu(result_part, size, tmpdir=None, type='list'): rank, world_size = get_dist_info() # create a tmp dir if it is not specified if tmpdir is None: MAX_LEN = 512 # 32 is whitespace dir_tensor = torch.full((MAX_LEN,), 32, dtype=torch.uint8, device='cuda') if rank == 0: mmcv.mkdir_or_exist('.dist_test') tmpdir = tempfile.mkdtemp(dir='.dist_test') tmpdir = torch.tensor( bytearray(tmpdir.encode()), dtype=torch.uint8, device='cuda') dir_tensor[:len(tmpdir)] = tmpdir dist.broadcast(dir_tensor, 0) tmpdir = dir_tensor.cpu().numpy().tobytes().decode().rstrip() else: mmcv.mkdir_or_exist(tmpdir) # dump the part result to the dir mmcv.dump(result_part, osp.join(tmpdir, f'part_{rank}.pkl')) dist.barrier() # collect all parts if rank == 0: # load results of all parts from tmp dir part_list = [] for i in range(world_size): part_file = osp.join(tmpdir, f'part_{i}.pkl') part_list.append(mmcv.load(part_file)) # sort the results if type == 'list': ordered_results = [] for res in part_list: ordered_results.extend(list(res)) # the dataloader may pad some samples ordered_results = ordered_results[:size] else: raise NotImplementedError # remove tmp dir shutil.rmtree(tmpdir) dist.barrier() if rank != 0: return None return ordered_results ================================================ FILE: projects/occ_plugin/occupancy/apis/train.py ================================================ from .mmdet_train import custom_train_detector from mmseg.apis import train_segmentor from mmdet.apis import train_detector def custom_train_model(model, dataset, cfg, distributed=False, validate=False, timestamp=None, meta=None): """A function wrapper for launching model training according to cfg. Because we need different eval_hook in runner. Should be deprecated in the future. """ if cfg.model.type in ['EncoderDecoder3D']: assert False else: custom_train_detector( model, dataset, cfg, distributed=distributed, validate=validate, timestamp=timestamp, meta=meta) def train_model(model, dataset, cfg, distributed=False, validate=False, timestamp=None, meta=None): """A function wrapper for launching model training according to cfg. Because we need different eval_hook in runner. Should be deprecated in the future. 
""" if cfg.model.type in ['EncoderDecoder3D']: train_segmentor( model, dataset, cfg, distributed=distributed, validate=validate, timestamp=timestamp, meta=meta) else: train_detector( model, dataset, cfg, distributed=distributed, validate=validate, timestamp=timestamp, meta=meta) ================================================ FILE: projects/occ_plugin/occupancy/backbones/__init__.py ================================================ from .resnet3d import CustomResNet3D from .pred_block import Predictor ================================================ FILE: projects/occ_plugin/occupancy/backbones/pred_block.py ================================================ # Developed by Junyi Ma # Cam4DOcc: Benchmark for Camera-Only 4D Occupancy Forecasting in Autonomous Driving Applications # https://github.com/haomo-ai/Cam4DOcc import copy import numpy as np import torch import torch.nn as nn import torch.nn.functional as F from mmdet3d.models.builder import BACKBONES from collections import OrderedDict from mmcv.cnn import build_norm_layer class Residual(nn.Module): def __init__( self, in_channels, out_channels, kernel_size=(3,3,1), dilation=1, norm_cfg=None ): super().__init__() out_channels = out_channels or in_channels # padding_size = ((kernel_size - 1) * dilation + 1) // 2 padding_size = [0,0,0] if dilation!=0: padding_size[0] = ((kernel_size[0] - 1) * dilation + 1) // 2 padding_size[1] = ((kernel_size[1] - 1) * dilation + 1) // 2 padding_size[2] = ((kernel_size[2] - 1) * dilation + 1) // 2 padding_size = tuple(padding_size) conv = nn.Conv3d(in_channels, out_channels, kernel_size=kernel_size, bias=False, dilation=dilation, padding=padding_size) self.layers = nn.Sequential(conv, build_norm_layer(norm_cfg, out_channels)[1], nn.LeakyReLU(inplace=True)) if out_channels == in_channels : self.projection = None else: projection = OrderedDict() projection.update( { 'conv_skip_proj': nn.Conv3d(in_channels, out_channels, kernel_size=1, bias=False), 'bn_skip_proj': build_norm_layer(norm_cfg, out_channels)[1], } ) self.projection = nn.Sequential(projection) def forward(self, x): x_residual = self.layers(x) if self.projection is not None: x_projected = self.projection(x) return x_residual + x_projected return x_residual + x @BACKBONES.register_module() class Predictor(nn.Module): def __init__( self, n_input_channels=None, in_timesteps=None, out_timesteps=None, norm_cfg=None, ): super(Predictor, self).__init__() self.predictor = nn.ModuleList() for nf in n_input_channels: self.predictor.append(nn.Sequential( Residual(nf * in_timesteps, nf * in_timesteps, norm_cfg=norm_cfg), Residual(nf * in_timesteps, nf * in_timesteps, norm_cfg=norm_cfg), Residual(nf * in_timesteps, nf * out_timesteps, norm_cfg=norm_cfg), Residual(nf * out_timesteps, nf * out_timesteps, norm_cfg=norm_cfg), Residual(nf * out_timesteps, nf * out_timesteps, norm_cfg=norm_cfg), )) def forward(self, x): assert len(x) == len(self.predictor), f'The number of input feature tensors ({len(x)}) must be the same as the number of STPredictor blocks {len(self.predictor)}.' 
y = [] for i in range(len(x)): b, c, _, _, _ = x[i].shape y.append(self.predictor[i](x[i])) return y ================================================ FILE: projects/occ_plugin/occupancy/backbones/resnet3d.py ================================================ import math from functools import partial from mmdet3d.models.builder import BACKBONES from mmcv.cnn import build_conv_layer, build_norm_layer, build_upsample_layer from mmcv.runner import BaseModule import torch import torch.nn as nn import torch.nn.functional as F import pdb def get_inplanes(): return [64, 128, 256, 512] def conv3x3x3(in_planes, out_planes, stride=1): return nn.Conv3d(in_planes, out_planes, kernel_size=3, stride=stride, padding=1, bias=False) def conv1x1x1(in_planes, out_planes, stride=1): return nn.Conv3d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False) class BasicBlock(nn.Module): expansion = 1 def __init__(self, in_planes, planes, stride=1, downsample=None, norm_cfg=None): super().__init__() self.conv1 = conv3x3x3(in_planes, planes, stride) self.bn1 = build_norm_layer(norm_cfg, planes)[1] self.relu = nn.ReLU(inplace=True) self.conv2 = conv3x3x3(planes, planes) self.bn2 = build_norm_layer(norm_cfg, planes)[1] self.downsample = downsample self.stride = stride def forward(self, x): residual = x out = self.conv1(x) out = self.bn1(out) out = self.relu(out) out = self.conv2(out) out = self.bn2(out) if self.downsample is not None: residual = self.downsample(x) out += residual out = self.relu(out) return out class Bottleneck(nn.Module): expansion = 4 def __init__(self, in_planes, planes, stride=1, downsample=None, norm_cfg=None): super().__init__() self.conv1 = conv1x1x1(in_planes, planes) self.bn1 = build_norm_layer(norm_cfg, planes)[1] self.conv2 = conv3x3x3(planes, planes, stride) self.bn2 = build_norm_layer(norm_cfg, planes)[1] self.conv3 = conv1x1x1(planes, planes * self.expansion) self.bn3 = build_norm_layer(norm_cfg, planes * self.expansion)[1] self.relu = nn.ReLU(inplace=True) self.downsample = downsample self.stride = stride def forward(self, x): residual = x out = self.conv1(x) out = self.bn1(out) out = self.relu(out) out = self.conv2(out) out = self.bn2(out) out = self.relu(out) out = self.conv3(out) out = self.bn3(out) if self.downsample is not None: residual = self.downsample(x) out += residual out = self.relu(out) return out @BACKBONES.register_module() class CustomResNet3D(BaseModule): def __init__(self, depth, block_inplanes=[64, 128, 256, 512], block_strides=[1, 2, 2, 2], out_indices=(0, 1, 2, 3), n_input_channels=3, shortcut_type='B', norm_cfg=dict(type='BN3d', requires_grad=True), widen_factor=1.0): super().__init__() layer_metas = { 10: [1, 1, 1, 1], 18: [2, 2, 2, 2], 34: [3, 4, 6, 3], 50: [3, 4, 6, 3], 101: [3, 4, 23, 3], } if depth in [10, 18, 34]: block = BasicBlock else: assert depth in [50, 101] block = Bottleneck layers = layer_metas[depth] block_inplanes = [int(x * widen_factor) for x in block_inplanes] self.in_planes = block_inplanes[0] self.out_indices = out_indices # replace the first several downsampling layers with the channel-squeeze layers self.input_proj = nn.Sequential( nn.Conv3d(n_input_channels, self.in_planes, kernel_size=(1, 1, 1), stride=(1, 1, 1), bias=False), build_norm_layer(norm_cfg, self.in_planes)[1], nn.ReLU(inplace=True), ) self.layers = nn.ModuleList() for i in range(len(block_inplanes)): self.layers.append(self._make_layer(block, block_inplanes[i], layers[i], shortcut_type, block_strides[i], norm_cfg=norm_cfg)) for m in self.modules(): if isinstance(m, 
nn.Conv3d): nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') elif isinstance(m, nn.BatchNorm3d): nn.init.constant_(m.weight, 1) nn.init.constant_(m.bias, 0) def _downsample_basic_block(self, x, planes, stride): out = F.avg_pool3d(x, kernel_size=1, stride=stride) zero_pads = torch.zeros(out.size(0), planes - out.size(1), out.size(2), out.size(3), out.size(4)) if isinstance(out.data, torch.cuda.FloatTensor): zero_pads = zero_pads.cuda() out = torch.cat([out.data, zero_pads], dim=1) return out def _make_layer(self, block, planes, blocks, shortcut_type, stride=1, norm_cfg=None): downsample = None if stride != 1 or self.in_planes != planes * block.expansion: if shortcut_type == 'A': downsample = partial(self._downsample_basic_block, planes=planes * block.expansion, stride=stride) else: downsample = nn.Sequential( conv1x1x1(self.in_planes, planes * block.expansion, stride), build_norm_layer(norm_cfg, planes * block.expansion)[1]) layers = [] layers.append( block(in_planes=self.in_planes, planes=planes, stride=stride, downsample=downsample, norm_cfg=norm_cfg)) self.in_planes = planes * block.expansion for i in range(1, blocks): layers.append(block(self.in_planes, planes, norm_cfg=norm_cfg)) return nn.Sequential(*layers) def forward(self, x): x = self.input_proj(x) res = [] for index, layer in enumerate(self.layers): x = layer(x) if index in self.out_indices: res.append(x) return res def generate_model(model_depth, **kwargs): assert model_depth in [10, 18, 34, 50, 101, 152, 200] if model_depth == 10: model = ResNet(BasicBlock, [1, 1, 1, 1], get_inplanes(), **kwargs) elif model_depth == 18: model = ResNet(BasicBlock, [2, 2, 2, 2], get_inplanes(), **kwargs) elif model_depth == 34: model = ResNet(BasicBlock, [3, 4, 6, 3], get_inplanes(), **kwargs) elif model_depth == 50: model = ResNet(Bottleneck, [3, 4, 6, 3], get_inplanes(), **kwargs) elif model_depth == 101: model = ResNet(Bottleneck, [3, 4, 23, 3], get_inplanes(), **kwargs) elif model_depth == 152: model = ResNet(Bottleneck, [3, 8, 36, 3], get_inplanes(), **kwargs) elif model_depth == 200: model = ResNet(Bottleneck, [3, 24, 36, 3], get_inplanes(), **kwargs) return model ================================================ FILE: projects/occ_plugin/occupancy/dense_heads/__init__.py ================================================ from .occ_head import OccHead from .flow_head import FlowHead ================================================ FILE: projects/occ_plugin/occupancy/dense_heads/flow_head.py ================================================ # Developed by Junyi Ma # Cam4DOcc: Benchmark for Camera-Only 4D Occupancy Forecasting in Autonomous Driving Applications # https://github.com/haomo-ai/Cam4DOcc import copy import numpy as np import torch import torch.nn as nn import torch.nn.functional as F from mmdet.core import reduce_mean from mmdet.models import HEADS from mmcv.cnn import build_conv_layer, build_norm_layer from .lovasz_softmax import lovasz_softmax from projects.occ_plugin.utils.nusc_param import nusc_class_names from projects.occ_plugin.utils.semkitti import Smooth_L1_loss @HEADS.register_module() class FlowHead(nn.Module): def __init__( self, in_channels, out_channel, num_level=1, num_img_level=1, soft_weights=False, loss_weight_cfg=None, conv_cfg=dict(type='Conv3d', bias=False), norm_cfg=dict(type='GN', num_groups=32, requires_grad=True), fine_topk=20000, point_cloud_range=[-51.2, -51.2, -5.0, 51.2, 51.2, 3.0], final_occ_size=[256, 256, 20], empty_idx=0, visible_loss=False, balance_cls_weight=True, 
train_cfg=None, test_cfg=None, ): super(FlowHead, self).__init__() if type(in_channels) is not list: in_channels = [in_channels] self.in_channels = in_channels self.out_channel = out_channel self.num_level = num_level self.fine_topk = fine_topk self.point_cloud_range = torch.tensor(np.array(point_cloud_range)).float() self.final_occ_size = final_occ_size self.visible_loss = visible_loss # voxel-level prediction self.occ_convs = nn.ModuleList() for i in range(self.num_level): mid_channel = self.in_channels[i] occ_conv = nn.Sequential( build_conv_layer(conv_cfg, in_channels=self.in_channels[i], out_channels=mid_channel, kernel_size=3, stride=1, padding=1), build_norm_layer(norm_cfg, mid_channel)[1], nn.ReLU(inplace=True)) self.occ_convs.append(occ_conv) self.occ_pred_conv = nn.Sequential( build_conv_layer(conv_cfg, in_channels=mid_channel, out_channels=mid_channel//2, kernel_size=1, stride=1, padding=0), build_norm_layer(norm_cfg, mid_channel//2)[1], nn.ReLU(inplace=True),) self.last_conv = build_conv_layer(conv_cfg, in_channels=mid_channel//2, out_channels=out_channel, kernel_size=1, stride=1, padding=0) self.last_conv.bias = nn.parameter.Parameter(torch.tensor([0.0, 0.0, 0.0], requires_grad=True)) self.soft_weights = soft_weights self.num_img_level = num_img_level self.num_point_sampling_feat = self.num_level if self.soft_weights: soft_in_channel = mid_channel self.voxel_soft_weights = nn.Sequential( build_conv_layer(conv_cfg, in_channels=soft_in_channel, out_channels=soft_in_channel//2, kernel_size=1, stride=1, padding=0), build_norm_layer(norm_cfg, soft_in_channel//2)[1], nn.ReLU(inplace=True), build_conv_layer(conv_cfg, in_channels=soft_in_channel//2, out_channels=self.num_point_sampling_feat, kernel_size=1, stride=1, padding=0)) self.class_names = nusc_class_names self.empty_idx = empty_idx def forward_coarse_voxel(self, voxel_feats): output_occs = [] output = {} for feats, occ_conv in zip(voxel_feats, self.occ_convs): output_occs.append(occ_conv(feats)) if self.soft_weights: voxel_soft_weights = self.voxel_soft_weights(output_occs[0]) voxel_soft_weights = torch.softmax(voxel_soft_weights, dim=1) else: voxel_soft_weights = torch.ones([output_occs[0].shape[0], self.num_point_sampling_feat, 1, 1, 1],).to(output_occs[0].device) / self.num_point_sampling_feat out_voxel_feats = 0 _, _, H, W, D= output_occs[0].shape for feats, weights in zip(output_occs, torch.unbind(voxel_soft_weights, dim=1)): feats = F.interpolate(feats, size=[H, W, D], mode='trilinear', align_corners=False).contiguous() out_voxel_feats += feats * weights.unsqueeze(1) output['out_voxel_feats'] = [out_voxel_feats] out_voxel = self.occ_pred_conv(out_voxel_feats) out_voxel = self.last_conv(out_voxel) output['occ'] = [out_voxel] return output def forward(self, voxel_feats, img_feats=None, transform=None, **kwargs): assert type(voxel_feats) is list and len(voxel_feats) == self.num_level # forward voxel output = self.forward_coarse_voxel(voxel_feats) res = { 'output_voxels': output['occ'], } return res def loss_voxel(self, output_voxels, target_voxels, tag): B, C, H, W, D = output_voxels.shape tB, tC, tF, tH, tW, tD = target_voxels.shape target_voxels = target_voxels.view(tB*tC, tF, tH, tW, tD) assert torch.isnan(output_voxels).sum().item() == 0 output_voxels = output_voxels.permute(0,2,3,4,1) target_voxels = target_voxels.permute(0,2,3,4,1) loss_dict = {} loss_dict['loss_flow_l1_{}'.format(tag)] = (0.5) * (0.1) * Smooth_L1_loss(output_voxels, target_voxels, ignore_index=255) return loss_dict def loss_point(self, fine_coord, 
fine_output, target_voxels, tag): selected_gt = target_voxels[:, fine_coord[0,:], fine_coord[1,:], fine_coord[2,:]].long()[0] assert torch.isnan(selected_gt).sum().item() == 0, torch.isnan(selected_gt).sum().item() assert torch.isnan(fine_output).sum().item() == 0, torch.isnan(fine_output).sum().item() loss_dict = {} # igore 255 = ignore noise. we keep the loss bascward for the label=0 (free voxels) loss_dict['loss_voxel_ce_{}'.format(tag)] = self.loss_voxel_ce_weight * CE_ssc_loss(fine_output, selected_gt, ignore_index=255) loss_dict['loss_voxel_sem_scal_{}'.format(tag)] = self.loss_voxel_sem_scal_weight * sem_scal_loss(fine_output, selected_gt, ignore_index=255) loss_dict['loss_voxel_geo_scal_{}'.format(tag)] = self.loss_voxel_geo_scal_weight * geo_scal_loss(fine_output, selected_gt, ignore_index=255, non_empty_idx=self.empty_idx) loss_dict['loss_voxel_lovasz_{}'.format(tag)] = self.loss_voxel_lovasz_weight * lovasz_softmax(torch.softmax(fine_output, dim=1), selected_gt, ignore=255) return loss_dict def loss(self, output_voxels=None, output_coords_fine=None, output_voxels_fine=None, target_voxels=None, **kwargs): loss_dict = {} for index, output_voxel in enumerate(output_voxels): loss_dict.update(self.loss_voxel(output_voxel, target_voxels, tag='c_{}'.format(index))) return loss_dict ================================================ FILE: projects/occ_plugin/occupancy/dense_heads/lovasz_softmax.py ================================================ # -*- coding:utf-8 -*- # author: Xinge """ Lovasz-Softmax and Jaccard hinge loss in PyTorch Maxim Berman 2018 ESAT-PSI KU Leuven (MIT License) """ from __future__ import print_function, division import torch from torch.autograd import Variable import torch.nn.functional as F import numpy as np try: from itertools import ifilterfalse except ImportError: # py3k from itertools import filterfalse as ifilterfalse def lovasz_grad(gt_sorted): """ Computes gradient of the Lovasz extension w.r.t sorted errors See Alg. 1 in paper """ p = len(gt_sorted) gts = gt_sorted.sum() intersection = gts - gt_sorted.float().cumsum(0) union = gts + (1 - gt_sorted).float().cumsum(0) jaccard = 1. 
- intersection / union if p > 1: # cover 1-pixel case jaccard[1:p] = jaccard[1:p] - jaccard[0:-1] return jaccard def iou_binary(preds, labels, EMPTY=1., ignore=None, per_image=True): """ IoU for foreground class binary: 1 foreground, 0 background """ if not per_image: preds, labels = (preds,), (labels,) ious = [] for pred, label in zip(preds, labels): intersection = ((label == 1) & (pred == 1)).sum() union = ((label == 1) | ((pred == 1) & (label != ignore))).sum() if not union: iou = EMPTY else: iou = float(intersection) / float(union) ious.append(iou) iou = mean(ious) # mean accross images if per_image return 100 * iou def iou(preds, labels, C, EMPTY=1., ignore=None, per_image=False): """ Array of IoU for each (non ignored) class """ if not per_image: preds, labels = (preds,), (labels,) ious = [] for pred, label in zip(preds, labels): iou = [] for i in range(C): if i != ignore: # The ignored label is sometimes among predicted classes (ENet - CityScapes) intersection = ((label == i) & (pred == i)).sum() union = ((label == i) | ((pred == i) & (label != ignore))).sum() if not union: iou.append(EMPTY) else: iou.append(float(intersection) / float(union)) ious.append(iou) ious = [mean(iou) for iou in zip(*ious)] # mean accross images if per_image return 100 * np.array(ious) # --------------------------- BINARY LOSSES --------------------------- def lovasz_hinge(logits, labels, per_image=True, ignore=None): """ Binary Lovasz hinge loss logits: [B, H, W] Variable, logits at each pixel (between -\infty and +\infty) labels: [B, H, W] Tensor, binary ground truth masks (0 or 1) per_image: compute the loss per image instead of per batch ignore: void class id """ if per_image: loss = mean(lovasz_hinge_flat(*flatten_binary_scores(log.unsqueeze(0), lab.unsqueeze(0), ignore)) for log, lab in zip(logits, labels)) else: loss = lovasz_hinge_flat(*flatten_binary_scores(logits, labels, ignore)) return loss def lovasz_hinge_flat(logits, labels): """ Binary Lovasz hinge loss logits: [P] Variable, logits at each prediction (between -\infty and +\infty) labels: [P] Tensor, binary ground truth labels (0 or 1) ignore: label to ignore """ if len(labels) == 0: # only void pixels, the gradients should be 0 return logits.sum() * 0. signs = 2. * labels.float() - 1. errors = (1. 
- logits * Variable(signs)) errors_sorted, perm = torch.sort(errors, dim=0, descending=True) perm = perm.data gt_sorted = labels[perm] grad = lovasz_grad(gt_sorted) loss = torch.dot(F.relu(errors_sorted), Variable(grad)) return loss def flatten_binary_scores(scores, labels, ignore=None): """ Flattens predictions in the batch (binary case) Remove labels equal to 'ignore' """ scores = scores.view(-1) labels = labels.view(-1) if ignore is None: return scores, labels valid = (labels != ignore) vscores = scores[valid] vlabels = labels[valid] return vscores, vlabels class StableBCELoss(torch.nn.modules.Module): def __init__(self): super(StableBCELoss, self).__init__() def forward(self, input, target): neg_abs = - input.abs() loss = input.clamp(min=0) - input * target + (1 + neg_abs.exp()).log() return loss.mean() def binary_xloss(logits, labels, ignore=None): """ Binary Cross entropy loss logits: [B, H, W] Variable, logits at each pixel (between -\infty and +\infty) labels: [B, H, W] Tensor, binary ground truth masks (0 or 1) ignore: void class id """ logits, labels = flatten_binary_scores(logits, labels, ignore) loss = StableBCELoss()(logits, Variable(labels.float())) return loss # --------------------------- MULTICLASS LOSSES --------------------------- def lovasz_softmax(probas, labels, classes='present', per_image=False, ignore=None): """ Multi-class Lovasz-Softmax loss probas: [B, C, H, W] Variable, class probabilities at each prediction (between 0 and 1). Interpreted as binary (sigmoid) output with outputs of size [B, H, W]. labels: [B, H, W] Tensor, ground truth labels (between 0 and C - 1) classes: 'all' for all, 'present' for classes present in labels, or a list of classes to average. per_image: compute the loss per image instead of per batch ignore: void class labels """ if per_image: loss = mean(lovasz_softmax_flat(*flatten_probas(prob.unsqueeze(0), lab.unsqueeze(0), ignore), classes=classes) for prob, lab in zip(probas, labels)) else: loss = lovasz_softmax_flat(*flatten_probas(probas, labels, ignore), classes=classes) return loss def lovasz_softmax_flat(probas, labels, classes='present'): """ Multi-class Lovasz-Softmax loss probas: [P, C] Variable, class probabilities at each prediction (between 0 and 1) labels: [P] Tensor, ground truth labels (between 0 and C - 1) classes: 'all' for all, 'present' for classes present in labels, or a list of classes to average. """ if probas.numel() == 0: # only void pixels, the gradients should be 0 return probas * 0. 
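# For every class c present in the labels, the loop below sorts the absolute errors |fg - p_c|
# in descending order and takes their dot product with the Lovasz gradient of the sorted ground
# truth, a convex surrogate of the per-class Jaccard loss; the class losses are then averaged.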
C = probas.size(1) losses = [] class_to_sum = list(range(C)) if classes in ['all', 'present'] else classes for c in class_to_sum: fg = (labels == c).float() # foreground for class c if (classes is 'present' and fg.sum() == 0): continue if C == 1: if len(classes) > 1: raise ValueError('Sigmoid output possible only with 1 class') class_pred = probas[:, 0] else: class_pred = probas[:, c] errors = (Variable(fg) - class_pred).abs() errors_sorted, perm = torch.sort(errors, 0, descending=True) perm = perm.data fg_sorted = fg[perm] losses.append(torch.dot(errors_sorted, Variable(lovasz_grad(fg_sorted)))) return mean(losses) def flatten_probas(probas, labels, ignore=None): """ Flattens predictions in the batch """ if probas.dim() == 2: if ignore is not None: valid = (labels != ignore) probas = probas[valid] labels = labels[valid] return probas, labels elif probas.dim() == 3: # assumes output of a sigmoid layer B, H, W = probas.size() probas = probas.view(B, 1, H, W) elif probas.dim() == 5: #3D segmentation B, C, L, H, W = probas.size() probas = probas.contiguous().view(B, C, L, H*W) B, C, H, W = probas.size() probas = probas.permute(0, 2, 3, 1).contiguous().view(-1, C) # B * H * W, C = P, C labels = labels.view(-1) if ignore is None: return probas, labels valid = (labels != ignore) vprobas = probas[valid.nonzero().squeeze()] vlabels = labels[valid] return vprobas, vlabels def xloss(logits, labels, ignore=None): """ Cross entropy loss """ return F.cross_entropy(logits, Variable(labels), ignore_index=255) def jaccard_loss(probas, labels,ignore=None, smooth = 100, bk_class = None): """ Something wrong with this loss Multi-class Lovasz-Softmax loss probas: [B, C, H, W] Variable, class probabilities at each prediction (between 0 and 1). Interpreted as binary (sigmoid) output with outputs of size [B, H, W]. labels: [B, H, W] Tensor, ground truth labels (between 0 and C - 1) classes: 'all' for all, 'present' for classes present in labels, or a list of classes to average. per_image: compute the loss per image instead of per batch ignore: void class labels """ vprobas, vlabels = flatten_probas(probas, labels, ignore) true_1_hot = torch.eye(vprobas.shape[1])[vlabels] if bk_class: one_hot_assignment = torch.ones_like(vlabels) one_hot_assignment[vlabels == bk_class] = 0 one_hot_assignment = one_hot_assignment.float().unsqueeze(1) true_1_hot = true_1_hot*one_hot_assignment true_1_hot = true_1_hot.to(vprobas.device) intersection = torch.sum(vprobas * true_1_hot) cardinality = torch.sum(vprobas + true_1_hot) loss = (intersection + smooth / (cardinality - intersection + smooth)).mean() return (1-loss)*smooth def hinge_jaccard_loss(probas, labels,ignore=None, classes = 'present', hinge = 0.1, smooth =100): """ Multi-class Hinge Jaccard loss probas: [B, C, H, W] Variable, class probabilities at each prediction (between 0 and 1). Interpreted as binary (sigmoid) output with outputs of size [B, H, W]. labels: [B, H, W] Tensor, ground truth labels (between 0 and C - 1) classes: 'all' for all, 'present' for classes present in labels, or a list of classes to average. 
ignore: void class labels """ vprobas, vlabels = flatten_probas(probas, labels, ignore) C = vprobas.size(1) losses = [] class_to_sum = list(range(C)) if classes in ['all', 'present'] else classes for c in class_to_sum: if c in vlabels: c_sample_ind = vlabels == c cprobas = vprobas[c_sample_ind,:] non_c_ind =np.array([a for a in class_to_sum if a != c]) class_pred = cprobas[:,c] max_non_class_pred = torch.max(cprobas[:,non_c_ind],dim = 1)[0] TP = torch.sum(torch.clamp(class_pred - max_non_class_pred, max = hinge)+1.) + smooth FN = torch.sum(torch.clamp(max_non_class_pred - class_pred, min = -hinge)+hinge) if (~c_sample_ind).sum() == 0: FP = 0 else: nonc_probas = vprobas[~c_sample_ind,:] class_pred = nonc_probas[:,c] max_non_class_pred = torch.max(nonc_probas[:,non_c_ind],dim = 1)[0] FP = torch.sum(torch.clamp(class_pred - max_non_class_pred, max = hinge)+1.) losses.append(1 - TP/(TP+FP+FN)) if len(losses) == 0: return 0 return mean(losses) # --------------------------- HELPER FUNCTIONS --------------------------- def isnan(x): return x != x def mean(l, ignore_nan=False, empty=0): """ nanmean compatible with generators. """ l = iter(l) if ignore_nan: l = ifilterfalse(isnan, l) try: n = 1 acc = next(l) except StopIteration: if empty == 'raise': raise ValueError('Empty mean') return empty for n, v in enumerate(l, 2): acc += v if n == 1: return acc return acc / n ================================================ FILE: projects/occ_plugin/occupancy/dense_heads/occ_head.py ================================================ # Developed by Junyi Ma based on the codebase of OpenOccupancy # Cam4DOcc: Benchmark for Camera-Only 4D Occupancy Forecasting in Autonomous Driving Applications # https://github.com/haomo-ai/Cam4DOcc import numpy as np import torch import torch.nn as nn import torch.nn.functional as F from mmdet.core import reduce_mean from mmdet.models import HEADS from mmcv.cnn import build_conv_layer, build_norm_layer from .lovasz_softmax import lovasz_softmax from projects.occ_plugin.utils.nusc_param import nusc_class_names from projects.occ_plugin.utils.semkitti import geo_scal_loss, sem_scal_loss, CE_ssc_loss @HEADS.register_module() class OccHead(nn.Module): def __init__( self, in_channels, out_channel, num_level=1, num_img_level=1, soft_weights=False, loss_weight_cfg=None, conv_cfg=dict(type='Conv3d', bias=False), norm_cfg=dict(type='GN', num_groups=32, requires_grad=True), fine_topk=20000, point_cloud_range=[-51.2, -51.2, -5.0, 51.2, 51.2, 3.0], final_occ_size=[256, 256, 20], empty_idx=0, visible_loss=False, balance_cls_weight=True, train_cfg=None, test_cfg=None, ): super(OccHead, self).__init__() if type(in_channels) is not list: in_channels = [in_channels] self.in_channels = in_channels self.out_channel = out_channel self.num_level = num_level self.fine_topk = fine_topk self.point_cloud_range = torch.tensor(np.array(point_cloud_range)).float() self.final_occ_size = final_occ_size self.visible_loss = visible_loss if loss_weight_cfg is None: self.loss_weight_cfg = { "loss_voxel_ce_weight": 1.0, "loss_voxel_sem_scal_weight": 1.0, "loss_voxel_geo_scal_weight": 1.0, "loss_voxel_lovasz_weight": 1.0, } else: self.loss_weight_cfg = loss_weight_cfg # voxel losses self.loss_voxel_ce_weight = self.loss_weight_cfg.get('loss_voxel_ce_weight', 1.0) self.loss_voxel_sem_scal_weight = self.loss_weight_cfg.get('loss_voxel_sem_scal_weight', 1.0) self.loss_voxel_geo_scal_weight = self.loss_weight_cfg.get('loss_voxel_geo_scal_weight', 1.0) self.loss_voxel_lovasz_weight = 
self.loss_weight_cfg.get('loss_voxel_lovasz_weight', 1.0) # voxel-level prediction self.occ_convs = nn.ModuleList() for i in range(self.num_level): mid_channel = self.in_channels[i] occ_conv = nn.Sequential( build_conv_layer(conv_cfg, in_channels=self.in_channels[i], out_channels=mid_channel, kernel_size=3, stride=1, padding=1), build_norm_layer(norm_cfg, mid_channel)[1], nn.ReLU(inplace=True)) self.occ_convs.append(occ_conv) self.occ_pred_conv = nn.Sequential( build_conv_layer(conv_cfg, in_channels=mid_channel, out_channels=mid_channel//2, kernel_size=1, stride=1, padding=0), build_norm_layer(norm_cfg, mid_channel//2)[1], nn.ReLU(inplace=True), build_conv_layer(conv_cfg, in_channels=mid_channel//2, out_channels=out_channel, kernel_size=1, stride=1, padding=0)) self.soft_weights = soft_weights self.num_img_level = num_img_level self.num_point_sampling_feat = self.num_level if self.soft_weights: soft_in_channel = mid_channel self.voxel_soft_weights = nn.Sequential( build_conv_layer(conv_cfg, in_channels=soft_in_channel, out_channels=soft_in_channel//2, kernel_size=1, stride=1, padding=0), build_norm_layer(norm_cfg, soft_in_channel//2)[1], nn.ReLU(inplace=True), build_conv_layer(conv_cfg, in_channels=soft_in_channel//2, out_channels=self.num_point_sampling_feat, kernel_size=1, stride=1, padding=0)) # num_point_sampling_feat=4 if balance_cls_weight: # out_channel self.class_weights = np.ones((out_channel,)) self.class_weights[1:] = 5 self.class_weights = torch.from_numpy(self.class_weights) else: self.class_weights = np.ones((out_channel,)) self.class_names = nusc_class_names self.empty_idx = empty_idx def forward_coarse_voxel(self, voxel_feats): output_occs = [] output = {} for feats, occ_conv in zip(voxel_feats, self.occ_convs): output_occs.append(occ_conv(feats)) if self.soft_weights: voxel_soft_weights = self.voxel_soft_weights(output_occs[0]) voxel_soft_weights = torch.softmax(voxel_soft_weights, dim=1) else: voxel_soft_weights = torch.ones([output_occs[0].shape[0], self.num_point_sampling_feat, 1, 1, 1],).to(output_occs[0].device) / self.num_point_sampling_feat out_voxel_feats = 0 _, _, H, W, D= output_occs[0].shape for feats, weights in zip(output_occs, torch.unbind(voxel_soft_weights, dim=1)): feats = F.interpolate(feats, size=[H, W, D], mode='trilinear', align_corners=False).contiguous() out_voxel_feats += feats * weights.unsqueeze(1) output['out_voxel_feats'] = [out_voxel_feats] out_voxel = self.occ_pred_conv(out_voxel_feats) output['occ'] = [out_voxel] return output def forward(self, voxel_feats, img_feats=None, transform=None, **kwargs): assert type(voxel_feats) is list and len(voxel_feats) == self.num_level # forward voxel output = self.forward_coarse_voxel(voxel_feats) res = { 'output_voxels': output['occ'], } return res def loss_voxel(self, output_voxels, target_voxels, tag): B, C, H, W, D = output_voxels.shape tB, tC, tH, tW, tD = target_voxels.shape target_voxels = target_voxels.view(tB*tC, tH, tW, tD) ratio = target_voxels.shape[2] // H if ratio != 1: target_voxels = target_voxels.reshape(B, H, ratio, W, ratio, D, ratio).permute(0,1,3,5,2,4,6).reshape(B, H, W, D, ratio**3) empty_mask = target_voxels.sum(-1) == self.empty_idx target_voxels = target_voxels.to(torch.int64) occ_space = target_voxels[~empty_mask] occ_space[occ_space==0] = -torch.arange(len(occ_space[occ_space==0])).to(occ_space.device) - 1 target_voxels[~empty_mask] = occ_space target_voxels = torch.mode(target_voxels, dim=-1)[0] target_voxels[target_voxels<0] = 255 target_voxels = target_voxels.long() assert 
torch.isnan(output_voxels).sum().item() == 0 assert torch.isnan(target_voxels).sum().item() == 0 loss_dict = {} loss_dict['loss_voxel_ce_{}'.format(tag)] = (0.5) * CE_ssc_loss(output_voxels, target_voxels, self.class_weights.type_as(output_voxels), ignore_index=255) return loss_dict def loss_point(self, fine_coord, fine_output, target_voxels, tag): selected_gt = target_voxels[:, fine_coord[0,:], fine_coord[1,:], fine_coord[2,:]].long()[0] assert torch.isnan(selected_gt).sum().item() == 0, torch.isnan(selected_gt).sum().item() assert torch.isnan(fine_output).sum().item() == 0, torch.isnan(fine_output).sum().item() loss_dict = {} # igore 255 = ignore noise. we keep the loss bascward for the label=0 (free voxels) loss_dict['loss_voxel_ce_{}'.format(tag)] = self.loss_voxel_ce_weight * CE_ssc_loss(fine_output, selected_gt, ignore_index=255) loss_dict['loss_voxel_sem_scal_{}'.format(tag)] = self.loss_voxel_sem_scal_weight * sem_scal_loss(fine_output, selected_gt, ignore_index=255) loss_dict['loss_voxel_geo_scal_{}'.format(tag)] = self.loss_voxel_geo_scal_weight * geo_scal_loss(fine_output, selected_gt, ignore_index=255, non_empty_idx=self.empty_idx) loss_dict['loss_voxel_lovasz_{}'.format(tag)] = self.loss_voxel_lovasz_weight * lovasz_softmax(torch.softmax(fine_output, dim=1), selected_gt, ignore=255) return loss_dict def loss(self, output_voxels=None, output_coords_fine=None, output_voxels_fine=None, target_voxels=None, **kwargs): loss_dict = {} for index, output_voxel in enumerate(output_voxels): loss_dict.update(self.loss_voxel(output_voxel, target_voxels, tag='c_{}'.format(index))) return loss_dict ================================================ FILE: projects/occ_plugin/occupancy/dense_heads/utils.py ================================================ # borrowed from https://github.com/GuoPingPan/RPVNet/blob/main/core/models/utils/utils.py import time import numpy as np import torch from torch.nn.functional import grid_sample import torchsparse.nn.functional as F from torchsparse import PointTensor, SparseTensor from torchsparse.nn.utils import get_kernel_offsets __all__ = ['initial_voxelize', 'point_to_voxel', 'voxel_to_point', 'range_to_point','point_to_range'] def initial_voxelize(z: PointTensor, after_res) -> SparseTensor: new_float_coord = torch.cat( [z.C[:, :3] / after_res, z.C[:, -1].view(-1, 1)], 1) pc_hash = F.sphash(torch.round(new_float_coord).int()) sparse_hash = torch.unique(pc_hash) idx_query = F.sphashquery(pc_hash, sparse_hash) counts = F.spcount(idx_query.int(), len(sparse_hash)) inserted_coords = F.spvoxelize(torch.round(new_float_coord), idx_query,counts) inserted_coords = torch.round(inserted_coords).int() inserted_feat = F.spvoxelize(z.F, idx_query, counts) new_tensor = SparseTensor(inserted_feat, inserted_coords, 1) new_tensor.cmaps.setdefault((1,1,1), new_tensor.coords) z.additional_features['idx_query'][(1,1,1)] = idx_query z.additional_features['counts'][(1,1,1)] = counts return new_tensor.to(z.F.device) def point_to_voxel(x: SparseTensor, z: PointTensor) -> SparseTensor: if z.additional_features is None or z.additional_features['idx_query'] is None \ or z.additional_features['idx_query'].get(x.s) is None: pc_hash = F.sphash( torch.cat([ torch.round(z.C[:, :3] / x.s[0]).int(), z.C[:, -1].int().view(-1, 1) ], 1)) sparse_hash = F.sphash(x.C) idx_query = F.sphashquery(pc_hash, sparse_hash) counts = F.spcount(idx_query.int(), x.C.shape[0]) else: idx_query = z.additional_features['idx_query'][x.s] counts = z.additional_features['counts'][x.s] inserted_feat = 
F.spvoxelize(z.F, idx_query, counts) new_tensor = SparseTensor(inserted_feat, x.C, x.s) new_tensor.cmaps = x.cmaps new_tensor.kmaps = x.kmaps return new_tensor def voxel_to_point(x: SparseTensor, z: PointTensor, nearest=False) -> torch.Tensor: if z.idx_query is None or z.weights is None or z.idx_query.get(x.s) is None \ or z.weights.get(x.s) is None: off = get_kernel_offsets(2, x.s, 1, device=z.F.device) old_hash = F.sphash( torch.cat([ torch.round(z.C[:, :3] / x.s[0]).int(), z.C[:, -1].int().view(-1, 1) ], 1), off) pc_hash = F.sphash(x.C.to(z.F.device)) idx_query = F.sphashquery(old_hash, pc_hash) weights = F.calc_ti_weights(z.C, idx_query, scale=x.s[0]).transpose(0, 1).contiguous() idx_query = idx_query.transpose(0, 1).contiguous() if nearest: weights[:, 1:] = 0. idx_query[:, 1:] = -1 new_feat = F.spdevoxelize(x.F, idx_query, weights) if x.s == (1,1,1): z.idx_query[x.s] = idx_query z.weights[x.s] = weights else: new_feat = F.spdevoxelize(x.F, z.idx_query.get(x.s), z.weights.get(x.s)) return new_feat def range_to_point(x,px,py): r2p = [] for batch,(p_x,p_y) in enumerate(zip(px,py)): pypx = torch.stack([p_x,p_y],dim=2).to(px[0].device) resampled = grid_sample(x[batch].unsqueeze(0),pypx.unsqueeze(0)) r2p.append(resampled.squeeze().permute(1,0)) return torch.concat(r2p,dim=0) def point_to_range(range_shape,pF,px,py): H, W = range_shape cnt = 0 r = [] # t1 = time.time() for batch,(p_x,p_y) in enumerate(zip(px,py)): image = torch.zeros(size=(H,W,pF.shape[1])).to(px[0].device) image_cumsum = torch.zeros(size=(H,W,pF.shape[1])) + 1e-5 p_x = torch.floor((p_x/2. + 0.5) * W).long() p_y = torch.floor((p_y/2. + 0.5) * H).long() ''' v1: directly assign ''' # image[p_y,p_x] = pF[cnt:cnt+p_x.shape[1]] ''' v2: use average ''' image[p_y,p_x] += pF[cnt:cnt+p_x.shape[1]] image_cumsum[p_y,p_x] += torch.ones(pF.shape[1]) image = image/image_cumsum.to(px[0].device) r.append(image.permute(2,0,1)) cnt += p_x.shape[1] return torch.stack(r,dim=0).to(px[0].device) ================================================ FILE: projects/occ_plugin/occupancy/detectors/__init__.py ================================================ from .ocfnet import OCFNet ================================================ FILE: projects/occ_plugin/occupancy/detectors/bevdepth.py ================================================ # Copyright (c) Phigent Robotics. All rights reserved. 
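# Descriptive note: this file defines the BEVDet-family detectors (BEVDet, BEVDet4D, BEVDepth, BEVDepth4D, BEVStereo) used as the image-to-BEV stack; OCFNet in ocfnet.py inherits from the BEVDepth detector defined here.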
import math import torch from mmcv.runner import force_fp32 import torch.nn.functional as F from mmdet.models import DETECTORS from mmdet3d.models import builder from torch.utils.checkpoint import checkpoint from mmdet3d.models.detectors import CenterPoint import pdb @DETECTORS.register_module() class BEVDet(CenterPoint): def __init__(self, img_view_transformer=None, img_bev_encoder_backbone=None, img_bev_encoder_neck=None, **kwargs): super(BEVDet, self).__init__(**kwargs) if img_view_transformer is not None: self.img_view_transformer = builder.build_neck(img_view_transformer) else: self.img_view_transformer = None if img_bev_encoder_backbone is not None: self.img_bev_encoder_backbone = builder.build_backbone(img_bev_encoder_backbone) else: self.img_bev_encoder_backbone = torch.nn.Identity() if img_bev_encoder_neck is not None: self.img_bev_encoder_neck = builder.build_neck(img_bev_encoder_neck) else: self.img_bev_encoder_neck = torch.nn.Identity() def image_encoder(self, img): imgs = img B, N, C, imH, imW = imgs.shape imgs = imgs.view(B * N, C, imH, imW) x = self.img_backbone(imgs) if self.with_img_neck: x = self.img_neck(x) if type(x) in [list, tuple]: x = x[0] _, output_dim, ouput_H, output_W = x.shape x = x.view(B, N, output_dim, ouput_H, output_W) return x @force_fp32() def bev_encoder(self, x): x = self.img_bev_encoder_backbone(x) x = self.img_bev_encoder_neck(x) if type(x) in [list, tuple]: x = x[0] return x def extract_img_feat(self, img, img_metas): """Extract features of images.""" x = self.image_encoder(img[0]) x = self.img_view_transformer([x] + img[1:7]) x = self.bev_encoder(x) return [x] def extract_feat(self, points, img, img_metas): """Extract features from images and points.""" img_feats = self.extract_img_feat(img, img_metas) pts_feats = None return (img_feats, pts_feats) def forward_train(self, points=None, img_metas=None, gt_bboxes_3d=None, gt_labels_3d=None, gt_labels=None, gt_bboxes=None, img_inputs=None, proposals=None, gt_bboxes_ignore=None): """Forward training function. Args: points (list[torch.Tensor], optional): Points of each sample. Defaults to None. img_metas (list[dict], optional): Meta information of each sample. Defaults to None. gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`], optional): Ground truth 3D boxes. Defaults to None. gt_labels_3d (list[torch.Tensor], optional): Ground truth labels of 3D boxes. Defaults to None. gt_labels (list[torch.Tensor], optional): Ground truth labels of 2D boxes in images. Defaults to None. gt_bboxes (list[torch.Tensor], optional): Ground truth 2D boxes in images. Defaults to None. img (torch.Tensor optional): Images of each sample with shape (N, C, H, W). Defaults to None. proposals ([list[torch.Tensor], optional): Predicted proposals used for training Fast RCNN. Defaults to None. gt_bboxes_ignore (list[torch.Tensor], optional): Ground truth 2D boxes in images to be ignored. Defaults to None. Returns: dict: Losses of different branches. """ img_feats, pts_feats = self.extract_feat( points, img=img_inputs, img_metas=img_metas) assert self.with_pts_bbox losses = dict() losses_pts = self.forward_pts_train(img_feats, gt_bboxes_3d, gt_labels_3d, img_metas, gt_bboxes_ignore) losses.update(losses_pts) return losses def forward_test(self, points=None, img_metas=None, img_inputs=None, **kwargs): """ Args: points (list[torch.Tensor]): the outer list indicates test-time augmentations and inner torch.Tensor should have a shape NxC, which contains all points in the batch. 
img_metas (list[list[dict]]): the outer list indicates test-time augs (multiscale, flip, etc.) and the inner list indicates images in a batch img (list[torch.Tensor], optional): the outer list indicates test-time augmentations and inner torch.Tensor should have a shape NxCxHxW, which contains all images in the batch. Defaults to None. """ for var, name in [(img_inputs, 'img_inputs'), (img_metas, 'img_metas')]: if not isinstance(var, list): raise TypeError('{} must be a list, but got {}'.format( name, type(var))) num_augs = len(img_inputs) if num_augs != len(img_metas): raise ValueError( 'num of augmentations ({}) != num of image meta ({})'.format( len(img_inputs), len(img_metas))) if not isinstance(img_inputs[0][0],list): img_inputs = [img_inputs] if img_inputs is None else img_inputs points = [points] if points is None else points return self.simple_test(points[0], img_metas[0], img_inputs[0], **kwargs) else: return self.aug_test(None, img_metas[0], img_inputs[0], **kwargs) def aug_test(self, points, img_metas, img=None, rescale=False): """Test function without augmentaiton.""" combine_type = self.test_cfg.get('combine_type','output') if combine_type=='output': return self.aug_test_combine_output(points, img_metas, img, rescale) elif combine_type=='feature': return self.aug_test_combine_feature(points, img_metas, img, rescale) else: assert False def simple_test(self, points, img_metas, img=None, rescale=False): """Test function without augmentaiton.""" img_feats, _ = self.extract_feat(points, img=img, img_metas=img_metas) bbox_list = [dict() for _ in range(len(img_metas))] bbox_pts = self.simple_test_pts(img_feats, img_metas, rescale=rescale) for result_dict, pts_bbox in zip(bbox_list, bbox_pts): result_dict['pts_bbox'] = pts_bbox return bbox_list def forward_dummy(self, points=None, img_metas=None, img_inputs=None, **kwargs): img_feats, _ = self.extract_feat(points, img=img_inputs, img_metas=img_metas) from mmdet3d.core.bbox.structures.box_3d_mode import LiDARInstance3DBoxes img_metas=[dict(box_type_3d=LiDARInstance3DBoxes)] bbox_list = [dict() for _ in range(1)] assert self.with_pts_bbox bbox_pts = self.simple_test_pts( img_feats, img_metas, rescale=False) for result_dict, pts_bbox in zip(bbox_list, bbox_pts): result_dict['pts_bbox'] = pts_bbox return bbox_list @DETECTORS.register_module() class BEVDet4D(BEVDet): def __init__(self, pre_process=None, align_after_view_transfromation=False, detach=True, detach_pre_process=False, **kwargs): super(BEVDet4D, self).__init__(**kwargs) self.pre_process = pre_process is not None if self.pre_process: self.pre_process_net = builder.build_backbone(pre_process) self.align_after_view_transfromation = align_after_view_transfromation self.detach = detach self.detach_pre_process = detach_pre_process @force_fp32() def shift_feature(self, input, trans, rots): n, c, h, w = input.shape _, v, _ = trans[0].shape # generate grid xs = torch.linspace(0, w - 1, w, dtype=input.dtype, device=input.device).view(1, w).expand(h, w) ys = torch.linspace(0, h - 1, h, dtype=input.dtype, device=input.device).view(h, 1).expand(h, w) grid = torch.stack((xs, ys, torch.ones_like(xs)), -1) grid = grid.view(1, h, w, 3).expand(n,h,w,3).view(n, h, w, 3, 1) # get transformation from current lidar frame to adjacent lidar frame # transformation from current camera frame to current lidar frame c02l0 = torch.zeros((n, v, 4, 4), dtype=grid.dtype).to(grid) c02l0[:, :, :3, :3] = rots[0] c02l0[:, :, :3, 3] = trans[0] c02l0[:, :, 3, 3] = 1 # transformation from adjacent camera frame to 
current lidar frame c12l0 = torch.zeros((n, v, 4, 4), dtype=grid.dtype).to(grid) c12l0[:, :, :3, :3] = rots[1] c12l0[:, :, :3, 3] = trans[1] c12l0[:, :, 3, 3] = 1 # transformation from current lidar frame to adjacent lidar frame l02l1 = c02l0.matmul(torch.inverse(c12l0))[:, 0, :, :].view(n, 1, 1, 4, 4) ''' c02l0 * inv(c12l0) = c02l0 * inv(l12l0 * c12l1) = c02l0 * inv(c12l1) * inv(l12l0) = l02l1 # c02l0==c12l1 ''' l02l1 = l02l1[:, :, :, [True, True, False, True], :][:, :, :, :, [True, True, False, True]] feat2bev = torch.zeros((3, 3), dtype=grid.dtype).to(grid) feat2bev[0, 0] = self.img_view_transformer.dx[0] feat2bev[1, 1] = self.img_view_transformer.dx[1] feat2bev[0, 2] = self.img_view_transformer.bx[0] - \ self.img_view_transformer.dx[0] / 2. feat2bev[1, 2] = self.img_view_transformer.bx[1] - \ self.img_view_transformer.dx[1] / 2. feat2bev[2, 2] = 1 feat2bev = feat2bev.view(1, 3, 3) tf = torch.inverse(feat2bev).matmul(l02l1).matmul(feat2bev) # transform and normalize grid = tf.matmul(grid) normalize_factor = torch.tensor([w - 1.0, h - 1.0], dtype=input.dtype, device=input.device) grid = grid[:, :, :, :2, 0] / normalize_factor.view(1, 1, 1, 2) * 2.0 - 1.0 output = F.grid_sample(input, grid.to(input.dtype), align_corners=True) return output def prepare_bev_feat(self, img, rot, tran, intrin, post_rot, post_tran, bda): x = self.image_encoder(img) bev_feat = self.img_view_transformer([x, rot, tran, intrin, post_rot, post_tran, bda]) if self.pre_process: bev_feat = self.pre_process_net(bev_feat)[0] return bev_feat def extract_img_feat(self, img, img_metas): inputs = img """Extract features of images.""" B, N, _, H, W = inputs[0].shape N = N//2 imgs = inputs[0].view(B,N,2,3,H,W) imgs = torch.split(imgs,1,2) imgs = [t.squeeze(2) for t in imgs] rots, trans, intrins, post_rots, post_trans, bda = inputs[1:7] extra = [rots.view(B,2,N,3,3), trans.view(B,2,N,3), intrins.view(B,2,N,3,3), post_rots.view(B,2,N,3,3), post_trans.view(B,2,N,3)] extra = [torch.split(t, 1, 1) for t in extra] extra = [[p.squeeze(1) for p in t] for t in extra] rots, trans, intrins, post_rots, post_trans = extra bev_feat_list = [] key_frame=True # back propagation for key frame only for img, rot, tran, intrin, post_rot, \ post_tran in zip(imgs, rots, trans, intrins, post_rots, post_trans): if self.align_after_view_transfromation: rot, tran = rots[0], trans[0] inputs_curr = (img, rot, tran, intrin, post_rot, post_tran, bda) if not key_frame and self.detach: with torch.no_grad(): bev_feat = self.prepare_bev_feat(*inputs_curr) else: bev_feat = self.prepare_bev_feat(*inputs_curr) bev_feat_list.append(bev_feat) key_frame = False if self.align_after_view_transfromation: bev_feat_list[1] = self.shift_feature(bev_feat_list[1], trans, rots) bev_feat = torch.cat(bev_feat_list, dim=1) x = self.bev_encoder(bev_feat) return [x] class BEVDepth_Base(object): def extract_feat(self, points, img, img_metas): """Extract features from images and points.""" img_feats, depth = self.extract_img_feat(img, img_metas) pts_feats = None return (img_feats, pts_feats, depth) def simple_test(self, points, img_metas, img=None, rescale=False): """Test function without augmentaiton.""" img_feats, _, _ = self.extract_feat(points, img=img, img_metas=img_metas) bbox_list = [dict() for _ in range(len(img_metas))] bbox_pts = self.simple_test_pts(img_feats, img_metas, rescale=rescale) for result_dict, pts_bbox in zip(bbox_list, bbox_pts): result_dict['pts_bbox'] = pts_bbox return bbox_list def forward_train(self, points=None, img_metas=None, gt_bboxes_3d=None, 
gt_labels_3d=None, gt_labels=None, gt_bboxes=None, img_inputs=None, proposals=None, gt_bboxes_ignore=None): """Forward training function. Args: points (list[torch.Tensor], optional): Points of each sample. Defaults to None. img_metas (list[dict], optional): Meta information of each sample. Defaults to None. gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`], optional): Ground truth 3D boxes. Defaults to None. gt_labels_3d (list[torch.Tensor], optional): Ground truth labels of 3D boxes. Defaults to None. gt_labels (list[torch.Tensor], optional): Ground truth labels of 2D boxes in images. Defaults to None. gt_bboxes (list[torch.Tensor], optional): Ground truth 2D boxes in images. Defaults to None. img (torch.Tensor optional): Images of each sample with shape (N, C, H, W). Defaults to None. proposals ([list[torch.Tensor], optional): Predicted proposals used for training Fast RCNN. Defaults to None. gt_bboxes_ignore (list[torch.Tensor], optional): Ground truth 2D boxes in images to be ignored. Defaults to None. Returns: dict: Losses of different branches. """ img_feats, pts_feats, depth = self.extract_feat( points, img=img_inputs, img_metas=img_metas) assert self.with_pts_bbox # assert len(img_inputs) == 8 depth_gt = img_inputs[7] loss_depth = self.img_view_transformer.get_depth_loss(depth_gt, depth) losses = dict(loss_depth=loss_depth) losses_pts = self.forward_pts_train(img_feats, gt_bboxes_3d, gt_labels_3d, img_metas, gt_bboxes_ignore) losses.update(losses_pts) # some modifications if hasattr(self.img_view_transformer, 'loss_depth_reg_weight') and self.img_view_transformer.loss_depth_reg_weight > 0: losses['loss_depth_reg'] = self.img_view_transformer.get_depth_reg_loss(depth_gt, depth) return losses @DETECTORS.register_module() class BEVDepth(BEVDepth_Base, BEVDet): def extract_img_feat(self, img, img_metas): """Extract features of images.""" x = self.image_encoder(img[0]) # img: imgs, rots, trans, intrins, post_rots, post_trans, gt_depths, sensor2sensors rots, trans, intrins, post_rots, post_trans, bda = img[1:7] mlp_input = self.img_view_transformer.get_mlp_input(rots, trans, intrins, post_rots, post_trans, bda) geo_inputs = [rots, trans, intrins, post_rots, post_trans, bda, mlp_input] x, depth = self.img_view_transformer([x] + geo_inputs) x = self.bev_encoder(x) return [x], depth @DETECTORS.register_module() class BEVDepth4D(BEVDepth_Base, BEVDet4D): def prepare_bev_feat(self, img, rot, tran, intrin, post_rot, post_tran, bda, mlp_input): x = self.image_encoder(img) bev_feat, depth = self.img_view_transformer([x, rot, tran, intrin, post_rot, post_tran, bda, mlp_input]) if self.detach_pre_process and self.pre_process: bev_feat = self.pre_process_net(bev_feat)[0] return bev_feat, depth def extract_img_feat(self, img, img_metas): inputs = img """Extract features of images.""" B, N, _, H, W = inputs[0].shape N = N//2 imgs = inputs[0].view(B,N,2,3,H,W) imgs = torch.split(imgs,1,2) imgs = [t.squeeze(2) for t in imgs] rots, trans, intrins, post_rots, post_trans, bda = inputs[1:7] extra = [rots.view(B,2,N,3,3), trans.view(B,2,N,3), intrins.view(B,2,N,3,3), post_rots.view(B,2,N,3,3), post_trans.view(B,2,N,3)] extra = [torch.split(t, 1, 1) for t in extra] extra = [[p.squeeze(1) for p in t] for t in extra] rots, trans, intrins, post_rots, post_trans = extra bev_feat_list = [] depth_list = [] key_frame=True # back propagation for key frame only for img, rot, tran, intrin, post_rot, \ post_tran in zip(imgs, rots, trans, intrins, post_rots, post_trans): if self.align_after_view_transfromation: rot, tran = 
rots[0], trans[0] mlp_input = self.img_view_transformer.get_mlp_input( rots[0], trans[0], intrin,post_rot, post_tran, bda) inputs_curr = (img, rot, tran, intrin, post_rot, post_tran, bda, mlp_input) if not key_frame and self.detach: with torch.no_grad(): bev_feat, depth = self.prepare_bev_feat(*inputs_curr) else: bev_feat, depth = self.prepare_bev_feat(*inputs_curr) if not self.detach_pre_process and self.pre_process: bev_feat = self.pre_process_net(bev_feat)[0] bev_feat_list.append(bev_feat) depth_list.append(depth) key_frame = False if self.align_after_view_transfromation: bev_feat_list[1] = self.shift_feature(bev_feat_list[1], trans, rots) bev_feat = torch.cat(bev_feat_list, dim=1) x = self.bev_encoder(bev_feat) return [x], depth_list[0] @DETECTORS.register_module() class BEVStereo(BEVDepth4D): def __init__(self, bevdet_model=False, **kwargs): super(BEVStereo, self).__init__(**kwargs) self.bevdet_model = bevdet_model def image_encoder(self, img): imgs = img B, N, C, imH, imW = imgs.shape imgs = imgs.view(B * N, C, imH, imW) x = self.img_backbone(imgs) stereo_feat = x[0].detach() # if isinstance(self.img_backbone, CustomSwin): # stereo_feat = stereo_feat.permute(0,2,3,1) # stereo_feat = self.img_backbone.norm0(stereo_feat) # stereo_feat = stereo_feat.permute(0,3,1,2) if self.bevdet_model: x = x[-2:] if self.with_img_neck: x = self.img_neck(x) if type(x) in [list, tuple]: x = x[0] _, output_dim, ouput_H, output_W = x.shape x = x.view(B, N, output_dim, ouput_H, output_W) return x, stereo_feat def extract_img_feat(self, img, img_metas): inputs = img """Extract features of images.""" B, N, _, H, W = inputs[0].shape N = N//2 imgs = inputs[0].view(B,N,2,3,H,W) imgs = torch.split(imgs,1,2) imgs = [t.squeeze(2) for t in imgs] rots, trans, intrins, post_rots, post_trans, bda, _, sensor2sensors = inputs[1:9] extra = [rots.view(B,2,N,3,3), trans.view(B,2,N,3), intrins.view(B,2,N,3,3), post_rots.view(B,2,N,3,3), post_trans.view(B,2,N,3), sensor2sensors.view(B,2,N,4,4)] sensor2ego_mats = torch.eye(4).view(1,1,1,4,4).repeat(B,2,N,1,1).to(rots) sensor2ego_mats[:,:,:,:3,:3] = extra[0] sensor2ego_mats[:,:,:,:3,3] = extra[1] intrin_mats = torch.eye(4).view(1,1,1,4,4).repeat(B,2,N,1,1).to(rots) intrin_mats[:,:,:,:3,:3] = extra[2] ida_mats = torch.eye(4).view(1,1,1,4,4).repeat(B,2,N,1,1).to(rots) ida_mats[:,:,:,:3,:3] = extra[3] ida_mats[:,:,:,:3,3] = extra[4] mats_dict = dict(sensor2ego_mats=sensor2ego_mats, intrin_mats=intrin_mats, ida_mats=ida_mats, sensor2sensor_mats=extra[5], bda_mat=bda) extra = [torch.split(t, 1, 1) for t in extra] extra = [[p.squeeze(1) for p in t] for t in extra] rots, trans, intrins, post_rots, post_trans, sensor2sensors = extra # forward stereo depth context_all_sweeps = list() depth_feat_all_sweeps = list() img_feats_all_sweeps = list() stereo_feats_all_sweeps = list() mu_all_sweeps = list() sigma_all_sweeps = list() mono_depth_all_sweeps = list() range_score_all_sweeps = list() key_frame=True # back propagation for key frame only for img, rot, tran, intrin, post_rot, post_tran in zip(imgs, rots, trans, intrins, post_rots, post_trans): if not key_frame: with torch.no_grad(): img_feats, stereo_feats = self.image_encoder(img) img_feats = img_feats.view(B * N, *img_feats.shape[2:]) mlp_input = \ self.img_view_transformer.get_mlp_input(rots[0], trans[0], intrin, post_rot, post_tran, bda) depth_feat, context, mu, sigma, range_score, mono_depth = \ self.img_view_transformer.depth_net(img_feats, mlp_input) context = self.img_view_transformer.context_downsample_net( context) else: 
img_feats, stereo_feats = self.image_encoder(img) img_feats = img_feats.view(B * N, *img_feats.shape[2:]) mlp_input = \ self.img_view_transformer.get_mlp_input(rots[0], trans[0], intrin, post_rot, post_tran, bda) depth_feat, context, mu, sigma, range_score, mono_depth = \ self.img_view_transformer.depth_net(img_feats, mlp_input) context = self.img_view_transformer.context_downsample_net( context) img_feats_all_sweeps.append(img_feats) stereo_feats_all_sweeps.append(stereo_feats) depth_feat_all_sweeps.append(depth_feat) context_all_sweeps.append(context) mu_all_sweeps.append(mu) sigma_all_sweeps.append(sigma) mono_depth_all_sweeps.append(mono_depth) range_score_all_sweeps.append(range_score) key_frame = False depth_score_all_sweeps = list() num_sweeps = 2 for ref_idx in range(num_sweeps): sensor2sensor_mats = list() for src_idx in range(num_sweeps): ref2keysensor_mats = sensor2sensors[ref_idx].inverse() key2srcsensor_mats = sensor2sensors[src_idx] ref2srcsensor_mats = key2srcsensor_mats @ ref2keysensor_mats sensor2sensor_mats.append(ref2srcsensor_mats) if ref_idx == 0: # last iteration on stage 1 does not have propagation # (photometric consistency filtering) if self.img_view_transformer.use_mask: stereo_depth, mask = self.img_view_transformer._forward_stereo( ref_idx, stereo_feats_all_sweeps, mono_depth_all_sweeps, mats_dict, sensor2sensor_mats, mu_all_sweeps, sigma_all_sweeps, range_score_all_sweeps, depth_feat_all_sweeps, ) else: stereo_depth = self.img_view_transformer._forward_stereo( ref_idx, stereo_feats_all_sweeps, mono_depth_all_sweeps, mats_dict, sensor2sensor_mats, mu_all_sweeps, sigma_all_sweeps, range_score_all_sweeps, depth_feat_all_sweeps, ) else: with torch.no_grad(): # last iteration on stage 1 does not have # propagation (photometric consistency filtering) if self.img_view_transformer.use_mask: stereo_depth, mask = self.img_view_transformer._forward_stereo( ref_idx, stereo_feats_all_sweeps, mono_depth_all_sweeps, mats_dict, sensor2sensor_mats, mu_all_sweeps, sigma_all_sweeps, range_score_all_sweeps, depth_feat_all_sweeps, ) else: stereo_depth = self.img_view_transformer._forward_stereo( ref_idx, stereo_feats_all_sweeps, mono_depth_all_sweeps, mats_dict, sensor2sensor_mats, mu_all_sweeps, sigma_all_sweeps, range_score_all_sweeps, depth_feat_all_sweeps, ) if self.img_view_transformer.use_mask: depth_score = ( mono_depth_all_sweeps[ref_idx] + self.img_view_transformer.depth_downsample_net( stereo_depth) * mask).softmax(1) else: depth_score = ( mono_depth_all_sweeps[ref_idx] + self.img_view_transformer.depth_downsample_net(stereo_depth)).softmax(1) depth_score_all_sweeps.append(depth_score) # forward view transformation bev_feat_list = [] key_frame=True # back propagation for key frame only for image_feat, depth_prob, rot, tran, intrin, post_rot, post_tran in \ zip(context_all_sweeps, depth_score_all_sweeps, rots, trans, intrins, post_rots, post_trans): if not key_frame: with torch.no_grad(): input_curr = (image_feat.view(B,N,*image_feat.shape[1:]), depth_prob, rot, tran, intrin, post_rot, post_tran, bda) bev_feat = self.img_view_transformer(input_curr) else: input_curr = (image_feat.view(B,N,*image_feat.shape[1:]), depth_prob, rot, tran, intrin, post_rot, post_tran, bda) bev_feat = self.img_view_transformer(input_curr) if self.pre_process: bev_feat = self.pre_process_net(bev_feat)[0] bev_feat_list.append(bev_feat) key_frame = False bev_feat = torch.cat(bev_feat_list, dim=1) x = self.bev_encoder(bev_feat) return [x], depth_score_all_sweeps[0] 
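# --------------------------------------------------------------------------
# Illustrative sketch (assumption-laden, not part of the original file): the
# alignment in BEVDet4D.shift_feature above maps feature-grid indices to
# metric BEV coordinates with a "feat2bev" matrix, applies the relative ego
# transform between the adjacent and current frames, maps back to grid
# indices, and normalizes to [-1, 1] for F.grid_sample. The helper below
# condenses that idea; its name, the 2D homogeneous pose input, and the
# default dx/bx values are illustrative assumptions only.
import torch
import torch.nn.functional as F

def align_adjacent_bev_feature(feat, rel_pose_2d, dx=(0.8, 0.8), bx=(-51.2, -51.2)):
    # feat: [N, C, H, W] BEV feature of the adjacent frame
    # rel_pose_2d: [N, 3, 3] homogeneous 2D transform (adjacent -> current frame)
    n, _, h, w = feat.shape
    xs = torch.linspace(0, w - 1, w).view(1, w).expand(h, w)
    ys = torch.linspace(0, h - 1, h).view(h, 1).expand(h, w)
    grid = torch.stack((xs, ys, torch.ones_like(xs)), -1).view(1, h, w, 3, 1)
    # grid index -> metric BEV coordinate (cell size dx, grid origin bx)
    feat2bev = torch.tensor([[dx[0], 0.0, bx[0] - dx[0] / 2.0],
                             [0.0, dx[1], bx[1] - dx[1] / 2.0],
                             [0.0, 0.0, 1.0]])
    tf = torch.inverse(feat2bev) @ rel_pose_2d @ feat2bev      # [N, 3, 3]
    grid = tf.view(n, 1, 1, 3, 3) @ grid                       # [N, H, W, 3, 1]
    # normalize sampling locations to [-1, 1] as expected by grid_sample
    norm = torch.tensor([w - 1.0, h - 1.0])
    grid = grid[..., :2, 0] / norm * 2.0 - 1.0
    return F.grid_sample(feat, grid.to(feat.dtype), align_corners=True)
# --------------------------------------------------------------------------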
================================================ FILE: projects/occ_plugin/occupancy/detectors/ocfnet.py ================================================ # Developed by Junyi Ma based on the codebase of OpenOccupancy and PowerBEV # Cam4DOcc: Benchmark for Camera-Only 4D Occupancy Forecasting in Autonomous Driving Applications # https://github.com/haomo-ai/Cam4DOcc from sys import api_version import torch import collections import torch.nn.functional as F import os from mmdet.models import DETECTORS from mmcv.runner import auto_fp16, force_fp32 from .bevdepth import BEVDepth from mmdet3d.models import builder import numpy as np import time import copy from typing import Tuple @DETECTORS.register_module() class OCFNet(BEVDepth): def __init__(self, loss_cfg=None, only_generate_dataset=False, disable_loss_depth=False, test_present=False, empty_idx=0, max_label=2, occ_encoder_backbone=None, occ_predictor=None, occ_encoder_neck=None, flow_encoder_backbone=None, flow_predictor=None, flow_encoder_neck=None, flow_head=None, loss_norm=False, point_cloud_range=None, time_receptive_field=None, n_future_frames=None, n_future_frames_plus=None, iou_thresh_for_vpq=None, record_time=False, save_pred=False, save_path=None, **kwargs): ''' OCFNet is our end-to-end baseline for 4D camera-only occupancy forecasting there are two streams for the forecasting task with aggregated voxel features as inputs: 1. occ_encoder_backbone -> occ_predictor -> occ_encoder_neck -> pts_bbox_head 2. flow_encoder_backbone -> flow_predictor -> flow_encoder_neck -> flow_head time_receptive_field: number of historical frames used for forecasting (including the present one), default: 3 n_future_frames: number of forecasted future frames, default: 4 n_future_frames_plus: number of estimated frames (> n_future_frames), default: 6 (if only forecasting occupancy states rather than instances, n_future_frames=n_future_frames_plus can be set) iou_thresh_for_vpq: iou threshold to associate instances in 3D instance prediction, default: 0.2 (adjusted by occupancy forecasting performance) ''' super().__init__(**kwargs) self.loss_cfg = loss_cfg self.disable_loss_depth = disable_loss_depth self.only_generate_dataset = only_generate_dataset self.loss_norm = loss_norm self.time_receptive_field = time_receptive_field self.n_future_frames = n_future_frames self.n_future_frames_plus = n_future_frames_plus self.eval_start_moment = self.n_future_frames_plus - self.n_future_frames - 1 self.iou_thresh_for_vpq = iou_thresh_for_vpq self.record_time = record_time self.time_stats = collections.defaultdict(list) self.empty_idx = empty_idx self.max_label = max_label self.occ_encoder_backbone = builder.build_backbone(occ_encoder_backbone) self.occ_predictor = builder.build_neck(occ_predictor) self.occ_encoder_neck = builder.build_neck(occ_encoder_neck) self.flow_encoder_backbone = builder.build_backbone(flow_encoder_backbone) self.flow_encoder_neck = builder.build_neck(flow_encoder_neck) self.flow_predictor = builder.build_neck(flow_predictor) self.flow_head = builder.build_head(flow_head) self.point_cloud_range = point_cloud_range self.spatial_extent3d = (self.point_cloud_range[3]-self.point_cloud_range[0], \ self.point_cloud_range[4]-self.point_cloud_range[1], \ self.point_cloud_range[5]-self.point_cloud_range[2]) self.ego_center_shift_proportion_x = abs(self.point_cloud_range[0])/(self.point_cloud_range[3]-self.point_cloud_range[0]) self.ego_center_shift_proportion_y = abs(self.point_cloud_range[1])/(self.point_cloud_range[4]-self.point_cloud_range[1]) 
self.ego_center_shift_proportion_z = abs(self.point_cloud_range[2])/(self.point_cloud_range[5]-self.point_cloud_range[2]) self.n_cam = 6 self.fine_grained = False self.vehicles_id = 1 self.test_present = test_present self.save_pred = save_pred self.save_path = save_path def image_encoder(self, img): imgs = img B, N, C, imH, imW = imgs.shape imgs = imgs.view(B * N, C, imH, imW) backbone_feats = self.img_backbone(imgs) if self.with_img_neck: x = self.img_neck(backbone_feats) if type(x) in [list, tuple]: x = x[0] else: x = backbone_feats _, output_dim, ouput_H, output_W = x.shape x = x.view(B, N, output_dim, ouput_H, output_W) return {'x': x, 'img_feats': [x.clone()]} @force_fp32() def occ_encoder(self, x): b, t, _, _, _, _ = x.shape x = x.reshape(b, -1, *x.shape[3:]) x = self.occ_encoder_backbone(x) x = self.occ_predictor(x) x = self.occ_encoder_neck(x) return x @force_fp32() def flow_encoder(self, x): b, t, _, _, _, _ = x.shape x = x.reshape(b, -1, *x.shape[3:]) x = self.flow_encoder_backbone(x) x = self.flow_predictor(x) x = self.flow_encoder_neck(x) return x def mat2pose_vec(self, matrix: torch.Tensor): """ Converts a 4x4 pose matrix into a 6-dof pose vector Args: matrix (ndarray): 4x4 pose matrix Returns: vector (ndarray): 6-dof pose vector comprising translation components (tx, ty, tz) and rotation components (rx, ry, rz) """ # M[1, 2] = -sinx*cosy, M[2, 2] = +cosx*cosy rotx = torch.atan2(-matrix[..., 1, 2], matrix[..., 2, 2]) # M[0, 2] = +siny, M[1, 2] = -sinx*cosy, M[2, 2] = +cosx*cosy cosy = torch.sqrt(matrix[..., 1, 2] ** 2 + matrix[..., 2, 2] ** 2) roty = torch.atan2(matrix[..., 0, 2], cosy) # M[0, 0] = +cosy*cosz, M[0, 1] = -cosy*sinz rotz = torch.atan2(-matrix[..., 0, 1], matrix[..., 0, 0]) rotation = torch.stack((rotx, roty, rotz), dim=-1) # Extract translation params translation = matrix[..., :3, 3] return torch.cat((translation, rotation), dim=-1) def pack_dbatch_and_dtime(self, x): b = x.shape[0] s = x.shape[1] x = x.view(b*s, *x.shape[2:]) return x def unpack_dbatch_and_dtime(self, x, b, s): assert (b*s) == x.shape[0] x = x.view(b, s, *x.shape[1:]) return x def extract_img_feat(self, img_inputs_seq, img_metas): ''' Extract features of sequential input images ''' if self.record_time: torch.cuda.synchronize() t0 = time.time() imgs_seq, rots_seq, trans_seq, intrins_seq, post_rots_seq, post_trans_seq, gt_depths_seq, sensor2sensors_seq = img_inputs_seq self.batch_size = imgs_seq.shape[0] self.sequence_length = imgs_seq.shape[1] imgs_seq = imgs_seq[:,0:self.time_receptive_field,...].contiguous() rots_seq = rots_seq[:,0:self.time_receptive_field,...].contiguous() trans_seq = trans_seq[:,0:self.time_receptive_field,...].contiguous() intrins_seq = intrins_seq[:,0:self.time_receptive_field,...].contiguous() post_rots_seq = post_rots_seq[:,0:self.time_receptive_field,...].contiguous() post_trans_seq = post_trans_seq[:,0:self.time_receptive_field,...].contiguous() gt_depths_seq = gt_depths_seq[:,0:self.time_receptive_field,...].contiguous() sensor2sensors_seq = sensor2sensors_seq[:,0:self.time_receptive_field,...].contiguous() imgs_seq = self.pack_dbatch_and_dtime(imgs_seq) rots_seq = self.pack_dbatch_and_dtime(rots_seq) trans_seq = self.pack_dbatch_and_dtime(trans_seq) intrins_seq = self.pack_dbatch_and_dtime(intrins_seq) post_rots_seq = self.pack_dbatch_and_dtime(post_rots_seq) post_trans_seq = self.pack_dbatch_and_dtime(post_trans_seq) gt_depths_seq = self.pack_dbatch_and_dtime(gt_depths_seq) sensor2sensors_seq = self.pack_dbatch_and_dtime(sensor2sensors_seq) self.n_cam = 
imgs_seq.shape[1] img_enc_feats = self.image_encoder(imgs_seq) x = img_enc_feats['x'] img_feats = img_enc_feats['img_feats'] if self.record_time: torch.cuda.synchronize() t1 = time.time() self.time_stats['img_encoder'].append(t1 - t0) mlp_input_seq = self.img_view_transformer.get_mlp_input(rots_seq, trans_seq, intrins_seq, post_rots_seq, post_trans_seq) geo_inputs = [rots_seq, trans_seq, intrins_seq, post_rots_seq, post_trans_seq, None, mlp_input_seq] x, depth = self.img_view_transformer([x] + geo_inputs) if self.record_time: torch.cuda.synchronize() t2 = time.time() self.time_stats['view_transformer'].append(t2 - t1) return x, depth, img_feats def warp_features(self, x, flow, tseq): ''' Warp features by motion flow ''' if flow is None: return x b, dc, dx, dy, dz = x.shape # normalize 3D motion flow flow[:,0,-1] =flow[:,0,-1]*dx/self.spatial_extent3d[0] flow[:,1,-1] =flow[:,1,-1]*dy/self.spatial_extent3d[1] flow[:,2,-1] =flow[:,2,-1]*dz/self.spatial_extent3d[2] nx, ny, nz = torch.meshgrid(torch.arange(dx, dtype=torch.float, device=x.device), \ torch.arange(dy, dtype=torch.float, device=x.device), \ torch.arange(dz, dtype=torch.float, device=x.device)) tmp = torch.ones((dx, dy, dz), device=x.device) grid = torch.stack((nx, ny, nz, tmp), dim=-1) # centralize shift shift_x = self.ego_center_shift_proportion_x * dx shift_y = self.ego_center_shift_proportion_y * dy shift_z = self.ego_center_shift_proportion_z * dz grid[:, :, :, 0] = grid[:, :, :, 0] - shift_x grid[:, :, :, 1] = grid[:, :, :, 1] - shift_y grid[:, :, :, 2] = grid[:, :, :, 2] - shift_z grid = grid.view(dx*dy*dz, grid.shape[-1]).unsqueeze(-1) #[N,4,1] transformation = flow.unsqueeze(1) # [bs, 1, 4, 4] transformed_grid = transformation @ grid # [bs, N, 4, 1] transformed_grid = transformed_grid.squeeze(-1) # [bs, N, 4] transformed_grid = transformed_grid.view(-1, 4) # de-centralize transformed_grid[:, 0] = (transformed_grid[:, 0] + shift_x) transformed_grid[:, 1] = (transformed_grid[:, 1] + shift_y) transformed_grid[:, 2] = (transformed_grid[:, 2] + shift_z) transformed_grid = transformed_grid.round().long() # de-normalize grid = grid.squeeze(-1) grid = grid.view(-1, 4) grid[:, 0] = (grid[:, 0] + shift_x) grid[:, 1] = (grid[:, 1] + shift_y) grid[:, 2] = (grid[:, 2] + shift_z) grid = grid.round().long() batch_ix = torch.cat([torch.full([transformed_grid.shape[0] // b, 1], ix, device=x.device, dtype=torch.long) for ix in range(b)]) kept = (transformed_grid[:,0] >= 0) & (transformed_grid[:,0] < dx) & (transformed_grid[:,1] >= 0) & (transformed_grid[:,1] < dy) & (transformed_grid[:,2] >= 0) & (transformed_grid[:,2] < dz) transformed_grid = transformed_grid[kept] batch_ix = batch_ix[kept] grid = grid[kept] warped_x = torch.zeros_like(x, device=x.device) # hard coding for reducing memory usage # erratum for new version split_num = 32 gap = transformed_grid.shape[0]//split_num for tt in range(split_num-1): start_idx_tt = int(tt*gap) end_idx_tt = int((tt+1)*gap) current_batch = batch_ix[start_idx_tt:end_idx_tt] ixx = transformed_grid[start_idx_tt:end_idx_tt, 0] ixy = transformed_grid[start_idx_tt:end_idx_tt, 1] ixz = transformed_grid[start_idx_tt:end_idx_tt, 2] ixx_ori = grid[start_idx_tt:end_idx_tt, 0] ixy_ori = grid[start_idx_tt:end_idx_tt, 1] ixz_ori = grid[start_idx_tt:end_idx_tt, 2] warped_x[current_batch, :, ixx, ixy, ixz] = x[current_batch, :, ixx_ori, ixy_ori, ixz_ori] # for i in range(transformed_grid.shape[0]): # current_batch = batch_ix[i] # ixx = transformed_grid[i, 0] # ixy = transformed_grid[i, 1] # ixz = transformed_grid[i, 2] # ixx_ori = grid[i, 0] # ixy_ori = grid[i, 1] # ixz_ori = 
grid[i, 2] # warped_x[current_batch, :, ixx, ixy, ixz] = x[current_batch, :, ixx_ori, ixy_ori, ixz_ori] return warped_x def cumulative_warp_occ(self, lifted_feature_seq, future_egomotion, mode='bilinear'): ''' Warp sequential voxel features to the present frame by ego pose updates ''' future_egomotion = future_egomotion[:, :self.time_receptive_field, ...].contiguous() out = [lifted_feature_seq[:, -1]] cum_future_egomotion = future_egomotion[:, -2] for t in reversed(range(self.time_receptive_field - 1)): out.append(self.warp_features(lifted_feature_seq[:, t], cum_future_egomotion, t)) cum_future_egomotion = cum_future_egomotion @ future_egomotion[:, t - 1] return torch.stack(out[::-1], 1) def extract_feat(self, img_inputs_seq, img_metas, future_egomotion): ''' Extract voxel features from input sequential images ''' voxel_feats = None depth, img_feats = None, None if img_inputs_seq is not None: voxel_feats, depth, img_feats = self.extract_img_feat(img_inputs_seq, img_metas) if self.record_time: torch.cuda.synchronize() t0 = time.time() voxel_feats = self.unpack_dbatch_and_dtime(voxel_feats, self.batch_size, self.time_receptive_field) voxel_feats = self.cumulative_warp_occ(voxel_feats.clone(), future_egomotion, mode='bilinear') if self.record_time: torch.cuda.synchronize() t1 = time.time() self.time_stats['feature warping'].append(t1 - t0) # egomotion-aware future_egomotion_vec = self.mat2pose_vec(future_egomotion) batch_size, sequence_length, nbr_pose_channels = future_egomotion_vec.shape dx, dy, dz = voxel_feats.shape[-3:] future_egomotions_spatial = future_egomotion_vec.view(batch_size, sequence_length, nbr_pose_channels, 1, 1, 1).expand(batch_size, sequence_length, nbr_pose_channels, dx, dy, dz) # at time 0, no egomotion so feed zero vector future_egomotions_spatial = torch.cat([torch.zeros_like(future_egomotions_spatial[:, :1]), future_egomotions_spatial[:, :(self.time_receptive_field-1)]], dim=1) voxel_feats = torch.cat([voxel_feats, future_egomotions_spatial], dim=-4) voxel_feats_enc = self.occ_encoder(voxel_feats) if type(voxel_feats_enc) is not list: voxel_feats_enc = [voxel_feats_enc] if self.record_time: torch.cuda.synchronize() t2 = time.time() self.time_stats['occ_encoder'].append(t2 - t1) flow_feats_enc = self.flow_encoder(voxel_feats) if type(flow_feats_enc) is not list: flow_feats_enc = [flow_feats_enc] if self.record_time: torch.cuda.synchronize() t3 = time.time() self.time_stats['flow_encoder'].append(t3 - t2) depth = depth.view(-1, self.n_cam, *depth.shape[-3:]) return (voxel_feats_enc, flow_feats_enc, img_feats, depth) @force_fp32(apply_to=('voxel_feats')) def forward_pts_train( self, voxel_feats, gt_occ=None, points_occ=None, img_metas=None, transform=None, img_feats=None, ): if self.record_time: torch.cuda.synchronize() t0 = time.time() outs = self.pts_bbox_head( voxel_feats=voxel_feats, points=points_occ, img_metas=img_metas, img_feats=img_feats, transform=transform, ) if self.record_time: torch.cuda.synchronize() t1 = time.time() self.time_stats['occ_head'].append(t1 - t0) losses = self.pts_bbox_head.loss( output_voxels=outs['output_voxels'], target_voxels=gt_occ, target_points=points_occ, img_metas=img_metas, ) if self.record_time: torch.cuda.synchronize() t2 = time.time() self.time_stats['loss_occ'].append(t2 - t1) return losses @force_fp32(apply_to=('voxel_feats')) def forward_flow_train( self, voxel_feats, gt_occ=None, points_occ=None, img_metas=None, transform=None, img_feats=None, ): if self.record_time: torch.cuda.synchronize() t0 = time.time() outs = 
self.flow_head( voxel_feats=voxel_feats, points=points_occ, img_metas=img_metas, img_feats=img_feats, transform=transform, ) if self.record_time: torch.cuda.synchronize() t1 = time.time() self.time_stats['flow_head'].append(t1 - t0) losses = self.flow_head.loss( output_voxels=outs['output_voxels'], target_voxels=gt_occ, target_points=points_occ, img_metas=img_metas, ) if self.record_time: torch.cuda.synchronize() t2 = time.time() self.time_stats['loss_flow'].append(t2 - t1) return losses def forward_train(self, img_inputs_seq=None, segmentation=None, instance=None, attribute_label=None, flow=None, future_egomotion=None, gt_occ=None, img_metas=None, points_occ=None, **kwargs, ): ''' Train OCFNet using bbox-wise occupancy labels if self.fine_grained=False, else using voxel-wise labels from nuScenes-Occupancy ''' # manually stop forward if self.only_generate_dataset: return {"pseudo_loss": torch.tensor(0.0, device=segmentation.device, requires_grad=True)} if not self.fine_grained: gt_occ = segmentation voxel_feats, flow_feats, img_feats, depth = self.extract_feat( img_inputs_seq=img_inputs_seq, img_metas=img_metas, future_egomotion=future_egomotion) # training losses losses = dict() if self.record_time: torch.cuda.synchronize() t0 = time.time() # TODO: we will release the version with depth fine-tuning in the future if not self.disable_loss_depth and depth is not None: depth_gt = img_inputs_seq[-2][:,0:self.time_receptive_field,...].contiguous() depth_gt = depth_gt.view(depth_gt.shape[0]*depth_gt.shape[1],*depth_gt.shape[2:]) depth = depth.view(-1, *depth.shape[2:]) losses['loss_depth'] = self.img_view_transformer.get_depth_loss(depth_gt, depth) if self.record_time: torch.cuda.synchronize() t1 = time.time() self.time_stats['loss_depth'].append(t1 - t0) transform = img_inputs_seq[1:8] if img_inputs_seq is not None else None voxel_feats_seq = [] for voxel_feats_stage in voxel_feats: bs, sfeatures = voxel_feats_stage.shape[:2] voxel_feats_stage_ = voxel_feats_stage.view(bs*self.n_future_frames_plus, sfeatures//self.n_future_frames_plus, *voxel_feats_stage.shape[2:]) voxel_feats_seq.append(voxel_feats_stage_) gt_occ = gt_occ[:, -self.n_future_frames_plus:, ...] flow = flow[:, -self.n_future_frames_plus:, ...] 
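# Descriptive note: only the last n_future_frames_plus frames are supervised here; the time dimension of the encoded voxel/flow features was folded into the batch dimension above, so the occupancy and flow heads below predict all future frames in parallel.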
losses_occupancy = self.forward_pts_train(voxel_feats_seq, gt_occ, points_occ, img_metas, img_feats=img_feats, transform=transform) losses.update(losses_occupancy) flow_feats_seq = [] for flow_feats_stage in flow_feats: bs, sfeatures = flow_feats_stage.shape[:2] flow_feats_stage_ = flow_feats_stage.view(bs*self.n_future_frames_plus, sfeatures//self.n_future_frames_plus, *flow_feats_stage.shape[2:]) flow_feats_seq.append(flow_feats_stage_) losses_flow = self.forward_flow_train(flow_feats_seq, flow, points_occ, img_metas, img_feats=img_feats, transform=transform) losses.update(losses_flow) if self.loss_norm: for loss_key in losses.keys(): if loss_key.startswith('loss'): losses[loss_key] = losses[loss_key] / (losses[loss_key].detach() + 1e-9) def logging_latencies(): # logging latencies avg_time = {key: sum(val) / len(val) for key, val in self.time_stats.items()} sum_time = sum(list(avg_time.values())) out_res = '' for key, val in avg_time.items(): out_res += '{}: {:.4f}, {:.1f}, '.format(key, val, val / sum_time) print(out_res) if self.record_time: logging_latencies() return losses def forward_test(self, img_inputs_seq=None, segmentation=None, instance=None, attribute_label=None, flow=None, future_egomotion=None, gt_occ=None, img_metas=None, points_occ=None, **kwargs, ): ''' Test OCFNet using IOU and VPQ metrics ''' # let batch size equals 1 while testing assert segmentation.shape[0] == 1 return self.simple_test(img_metas, img_inputs_seq, gt_occ=gt_occ, gt_flow=flow, segmentation=segmentation, instance=instance, future_egomotion=future_egomotion, **kwargs) def simple_test(self, img_metas, img_inputs_seq=None, rescale=False, points_occ=None, gt_occ=None, gt_flow=None, segmentation=None, instance=None, future_egomotion=None): # manually stop forward if self.only_generate_dataset: return {'hist_for_iou': 0, 'pred_c': 0, 'vpq':0} if not self.fine_grained: gt_occ = segmentation voxel_feats, flow_feats, img_feats, depth = self.extract_feat( img_inputs_seq=img_inputs_seq, img_metas=img_metas, future_egomotion=future_egomotion) transform = img_inputs_seq[1:8] if img_inputs_seq is not None else None voxel_feats_seq = [] for voxel_feats_stage in voxel_feats: bs, sfeatures = voxel_feats_stage.shape[:2] voxel_feats_stage_ = voxel_feats_stage.view(bs*self.n_future_frames_plus, sfeatures//self.n_future_frames_plus, *voxel_feats_stage.shape[2:]) voxel_feats_seq.append(voxel_feats_stage_) gt_occ = gt_occ[:, -self.n_future_frames_plus:, ...].contiguous() gt_occ = gt_occ.view(gt_occ.shape[0]*gt_occ.shape[1], *gt_occ.shape[2:]) instance = instance[:, -self.n_future_frames_plus:, ...].contiguous() instance = instance.view(instance.shape[0]*instance.shape[1], *instance.shape[2:]) output = self.pts_bbox_head( voxel_feats=voxel_feats_seq, points=points_occ, img_metas=img_metas, img_feats=img_feats, transform=transform, ) pred_c = output['output_voxels'][0] flow_feats_seq = [] for flow_feats_stage in flow_feats: bs, sfeatures = flow_feats_stage.shape[:2] flow_feats_stage_ = flow_feats_stage.view(bs*self.n_future_frames_plus, sfeatures//self.n_future_frames_plus, *flow_feats_stage.shape[2:]) flow_feats_seq.append(flow_feats_stage_) output_flow = self.flow_head( voxel_feats=flow_feats_seq, points=points_occ, img_metas=img_metas, img_feats=img_feats, transform=transform, ) gt_flow = gt_flow[:, -self.n_future_frames_plus:, ...].contiguous() gt_flow = gt_flow.view(gt_flow.shape[0]*gt_flow.shape[1], *gt_flow.shape[2:]) # pred_flow = output_flow['output_voxels'][0] # vpq = self.evaluate_instance_prediction(pred_c, 
pred_flow, gt_occ, instance) vpq = 0.1 if self.test_present: pred_c = pred_c[self.eval_start_moment:(self.eval_start_moment+1), ...] gt_occ = gt_occ[self.eval_start_moment:(self.eval_start_moment+1), ...] else: pred_c = pred_c[self.eval_start_moment+1:, ...] gt_occ = gt_occ[self.eval_start_moment+1:, ...] hist_for_iou = self.evaluate_occupancy_forecasting(pred_c, gt_occ, img_metas=img_metas, save_pred=self.save_pred, save_path=self.save_path) test_output = { 'hist_for_iou': hist_for_iou, 'pred_c': pred_c, 'vpq': vpq, } return test_output def evaluate_occupancy_forecasting(self, pred, gt, img_metas=None, save_pred=False, save_path=None): B, H, W, D = gt.shape pred = F.interpolate(pred, size=[H, W, D], mode='trilinear', align_corners=False).contiguous() hist_all = 0 iou_per_pred_list = [] pred_list = [] gt_list = [] for i in range(B): pred_cur = pred[i,...] pred_cur = torch.argmax(pred_cur, dim=0).cpu().numpy() gt_cur = gt[i, ...].cpu().numpy() gt_cur = gt_cur.astype(np.int) pred_list.append(pred_cur) gt_list.append(gt_cur) # ignore noise noise_mask = gt_cur != 255 # GMO and others for max_label=2 # multiple movable objects for max_label=9 hist_cur, iou_per_pred = fast_hist(pred_cur[noise_mask], gt_cur[noise_mask], max_label=self.max_label) hist_all = hist_all + hist_cur iou_per_pred_list.append(iou_per_pred) # whether save prediction results if save_pred: if not os.path.exists(save_path): os.mkdir(save_path) pred_for_save_list = [] for k in range(B): pred_for_save = torch.argmax(pred[k], dim=0).cpu() x_grid = torch.linspace(0, H-1, H, dtype=torch.long) x_grid = x_grid.view(H, 1, 1).expand(H, W, D) y_grid = torch.linspace(0, W-1, W, dtype=torch.long) y_grid = y_grid.view(1, W, 1).expand(H, W, D) z_grid = torch.linspace(0, D-1, D, dtype=torch.long) z_grid = z_grid.view(1, 1, D).expand(H, W, D) segmentation_for_save = torch.stack((x_grid, y_grid, z_grid), -1) segmentation_for_save = segmentation_for_save.view(-1, 3) segmentation_label = pred_for_save.squeeze(0).view(-1,1) segmentation_for_save = torch.cat((segmentation_for_save, segmentation_label), dim=-1) # N,4 kept = segmentation_for_save[:,-1]!=0 segmentation_for_save= segmentation_for_save[kept].cpu().numpy() pred_for_save_list.append(segmentation_for_save) np.savez(os.path.join(save_path, img_metas[0]["scene_token"]), pred_for_save_list) return hist_all def find_instance_centers(self, center_prediction: torch.Tensor, conf_threshold: float = 0.1, nms_kernel_size: float = 3): assert len(center_prediction.shape) == 4 center_prediction = F.threshold(center_prediction, threshold=conf_threshold, value=-1) nms_padding = (nms_kernel_size - 1) // 2 maxpooled_center_prediction = F.max_pool3d( center_prediction, kernel_size=nms_kernel_size, stride=1, padding=nms_padding ) # Filter all elements that are not the maximum (i.e. 
the center of the heatmap instance) center_prediction[center_prediction != maxpooled_center_prediction] = -1 return torch.nonzero(center_prediction > 0)[:, 1:] def group_pixels(self, centers: torch.Tensor, offset_predictions: torch.Tensor) -> torch.Tensor: dx, dy, dz = offset_predictions.shape[-3:] x_grid = ( torch.arange(dx, dtype=offset_predictions.dtype, device=offset_predictions.device) .view(1, dx, 1, 1) .repeat(1, 1, dy, dz) ) y_grid = ( torch.arange(dy, dtype=offset_predictions.dtype, device=offset_predictions.device) .view(1, 1, dy, 1) .repeat(1, dx, 1, dz) ) z_grid = ( torch.arange(dz, dtype=offset_predictions.dtype, device=offset_predictions.device) .view(1, 1, 1, dz) .repeat(1, dx, dy, 1) ) pixel_grid = torch.cat((x_grid, y_grid, z_grid), dim=0) center_locations = (pixel_grid + offset_predictions).view(3, dx*dy*dz, 1).permute(2, 1, 0) centers = centers.view(-1, 1, 3) distances = torch.norm(centers - center_locations, dim=-1) instance_id = torch.argmin(distances, dim=0).reshape(1, dx, dy, dz) + 1 return instance_id def update_instance_ids(self, instance_seg, old_ids, new_ids): indices = torch.arange(old_ids.max() + 1, device=instance_seg.device) for old_id, new_id in zip(old_ids, new_ids): indices[old_id] = new_id return indices[instance_seg].long() def make_instance_seg_consecutive(self, instance_seg): # Make the indices of instance_seg consecutive unique_ids = torch.unique(instance_seg) new_ids = torch.arange(len(unique_ids), device=instance_seg.device) instance_seg = self.update_instance_ids(instance_seg, unique_ids, new_ids) return instance_seg def get_instance_segmentation_and_centers(self, center_predictions: torch.Tensor, offset_predictions: torch.Tensor, foreground_mask: torch.Tensor, conf_threshold: float = 0.1, nms_kernel_size: float = 5, max_n_instance_centers: int = 100, ) -> Tuple[torch.Tensor, torch.Tensor]: dx, dy, dz = offset_predictions.shape[-3:] center_predictions = center_predictions.view(1, dx, dy, dz) offset_predictions = offset_predictions.view(3, dx, dy, dz) foreground_mask = foreground_mask.view(1, dx, dy, dz) centers = self.find_instance_centers(center_predictions, conf_threshold=conf_threshold, nms_kernel_size=nms_kernel_size) if not len(centers): return torch.zeros(center_predictions.shape, dtype=torch.int64, device=center_predictions.device) if len(centers) > max_n_instance_centers: centers = centers[:max_n_instance_centers].clone() instance_ids = self.group_pixels(centers, offset_predictions * foreground_mask.float()) instance_seg = (instance_ids * foreground_mask.float()).long() # Make the indices of instance_seg consecutive instance_seg = self.make_instance_seg_consecutive(instance_seg) return instance_seg.long() def flow_warp(self, occupancy, flow, mode='nearest', padding_mode='zeros'): ''' Warp ground-truth flow-origin occupancies according to predicted flows ''' _, num_waypoints, _, grid_dx_cells, grid_dy_cells, grid_dz_cells = occupancy.size() dx = torch.linspace(-1, 1, steps=grid_dx_cells) dy = torch.linspace(-1, 1, steps=grid_dy_cells) dz = torch.linspace(-1, 1, steps=grid_dz_cells) x_idx, y_idx, z_idx = torch.meshgrid(dx, dy, dz) identity_indices = torch.stack((x_idx, y_idx, z_idx), dim=0).to(device=occupancy.device) warped_occupancy = [] for k in range(num_waypoints): # 1 flow_origin_occupancy = occupancy[:, k] # B T 1 dx dy dz -> B 1 dx dy dz pred_flow = flow[:, k] # B T 3 dx dy dz -> B 3 dx dy dz # Normalize along the width and height direction normalize_pred_flow = torch.stack( (2.0 * pred_flow[:, 0] / (grid_dx_cells - 1), 2.0 * 
pred_flow[:, 1] / (grid_dy_cells - 1), 2.0 * pred_flow[:, 2] / (grid_dz_cells - 1),), dim=1, ) warped_indices = identity_indices + normalize_pred_flow warped_indices = warped_indices.permute(0, 2, 3, 4, 1) flow_origin_occupancy = flow_origin_occupancy.permute(0, 1, 4, 3, 2) sampled_occupancy = F.grid_sample( input=flow_origin_occupancy, grid=warped_indices, mode=mode, padding_mode='zeros', align_corners=True, ) warped_occupancy.append(sampled_occupancy) return warped_occupancy[0] def make_instance_id_temporally_consecutive(self, pred_inst, preds, backward_flow, ignore_index=255.0): assert pred_inst.shape[0] == 1, 'Assumes batch size = 1' # Initialise instance segmentations with prediction corresponding to the present consistent_instance_seg = [pred_inst.unsqueeze(0)] backward_flow = backward_flow.clone().detach() backward_flow[backward_flow == ignore_index] = 0.0 seq_len, _, dx, dy, dz = preds.shape for t in range(1, seq_len): init_warped_instance_seg = self.flow_warp(consistent_instance_seg[-1].unsqueeze(0).float(), backward_flow[t:t+1].unsqueeze(0)).int() warped_instance_seg = init_warped_instance_seg * preds[t:t+1, 0] consistent_instance_seg.append(warped_instance_seg) consistent_instance_seg = torch.cat(consistent_instance_seg, dim=1) return consistent_instance_seg def predict_instance_segmentation(self, pred_seg, pred_flow): pred_seg_sm = pred_seg.detach() pred_seg_sm = torch.argmax(pred_seg_sm, dim=1, keepdims=True) foreground_masks = pred_seg_sm.squeeze(1) == self.vehicles_id pred_inst_batch = self.get_instance_segmentation_and_centers( torch.softmax(pred_seg, dim=1)[0:1, self.vehicles_id].detach(), pred_flow[1:2].detach(), foreground_masks[1:2].detach(), nms_kernel_size=7, ) consistent_instance_seg = self.make_instance_id_temporally_consecutive( pred_inst_batch, pred_seg_sm[1:], pred_flow[1:].detach(), ) consistent_instance_seg = torch.cat([torch.zeros_like(pred_inst_batch.unsqueeze(0)), consistent_instance_seg], dim=1) return consistent_instance_seg.permute(1, 0, 2, 3, 4).long() # [1, 6, 512, 512, 40] def combine_mask(self, segmentation: torch.Tensor, instance: torch.Tensor, n_classes: int, n_all_things: int): ''' Shift all things ids by num_classes and combine things and stuff into a single mask ''' instance = instance.view(-1) instance_mask = instance > 0 instance = instance - 1 + n_classes segmentation = segmentation.clone().view(-1) segmentation_mask = segmentation < n_classes # Build an index from instance id to class id. 
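# The lookup built below pairs every shifted instance id (instance - 1 + n_classes) with the
# semantic class of the voxels it occupies; the first n_classes entries map stuff classes to
# themselves, and ids that never appear keep the sentinel value -1 so they are ignored when
# predictions are matched to targets.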
instance_id_to_class_tuples = torch.cat( ( instance[instance_mask & segmentation_mask].unsqueeze(1), segmentation[instance_mask & segmentation_mask].unsqueeze(1), ), dim=1, ) instance_id_to_class = -instance_id_to_class_tuples.new_ones((n_all_things,)) instance_id_to_class[instance_id_to_class_tuples[:, 0]] = instance_id_to_class_tuples[:, 1] instance_id_to_class[torch.arange(n_classes, device=segmentation.device)] = torch.arange( n_classes, device=segmentation.device ) segmentation[instance_mask] = instance[instance_mask] segmentation += 1 segmentation[~segmentation_mask] = 0 return segmentation, instance_id_to_class def panoptic_metrics(self, pred_segmentation, pred_instance, gt_segmentation, gt_instance, unique_id_mapping): # GMO and others n_classes = 2 self.keys = ['iou', 'true_positive', 'false_positive', 'false_negative'] # hard coding result = {key: torch.zeros(n_classes, dtype=torch.float32, device=gt_instance.device) for key in self.keys} assert pred_segmentation.dim() == 3 assert pred_segmentation.shape == pred_instance.shape == gt_segmentation.shape == gt_instance.shape n_instances = int(torch.cat([pred_instance, gt_instance]).max().item()) n_all_things = n_instances + n_classes # Classes + instances. n_things_and_void = n_all_things + 1 pred_segmentation = pred_segmentation.long().detach().cpu() pred_instance = pred_instance.long().detach().cpu() gt_segmentation = gt_segmentation.long().detach().cpu() gt_instance = gt_instance.long().detach().cpu() prediction, pred_to_cls = self.combine_mask(pred_segmentation, pred_instance, n_classes, n_all_things) target, target_to_cls = self.combine_mask(gt_segmentation, gt_instance, n_classes, n_all_things) # Compute ious between all stuff and things # hack for bincounting 2 arrays together x = prediction + n_things_and_void * target bincount_2d = torch.bincount(x.long(), minlength=n_things_and_void ** 2) if bincount_2d.shape[0] != n_things_and_void ** 2: raise ValueError('Incorrect bincount size.') conf = bincount_2d.reshape((n_things_and_void, n_things_and_void)) # Drop void class conf = conf[1:, 1:] # Confusion matrix contains intersections between all combinations of classes union = conf.sum(0).unsqueeze(0) + conf.sum(1).unsqueeze(1) - conf iou = torch.where(union > 0, (conf.float() + 1e-9) / (union.float() + 1e-9), torch.zeros_like(union).float()) mapping = (iou > self.iou_thresh_for_vpq).nonzero(as_tuple=False) # Check that classes match. is_matching = pred_to_cls[mapping[:, 1]] == target_to_cls[mapping[:, 0]] mapping = mapping[is_matching.detach().cpu().numpy()] tp_mask = torch.zeros_like(conf, dtype=torch.bool) tp_mask[mapping[:, 0], mapping[:, 1]] = True # First ids correspond to "stuff" i.e. semantic seg. # Instance ids are offset accordingly for target_id, pred_id in mapping: cls_id = pred_to_cls[pred_id] self.temporally_consistent = True # hard coding ! if self.temporally_consistent and cls_id == self.vehicles_id: if target_id.item() in unique_id_mapping and unique_id_mapping[target_id.item()] != pred_id.item(): # Not temporally consistent result['false_negative'][target_to_cls[target_id]] += 1 result['false_positive'][pred_to_cls[pred_id]] += 1 unique_id_mapping[target_id.item()] = pred_id.item() continue result['true_positive'][cls_id] += 1 result['iou'][cls_id] += iou[target_id][pred_id] unique_id_mapping[target_id.item()] = pred_id.item() for target_id in range(n_classes, n_all_things): # If this is a true positive do nothing. 
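# Standard panoptic-quality bookkeeping follows: ground-truth instances that matched no
# prediction are counted as false negatives, and predicted instances that matched nothing but
# still overlap some voxels are counted as false positives.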
if tp_mask[target_id, n_classes:].any(): continue # If this target instance didn't match with any predictions and was present set it as false negative. if target_to_cls[target_id] != -1: result['false_negative'][target_to_cls[target_id]] += 1 for pred_id in range(n_classes, n_all_things): # If this is a true positive do nothing. if tp_mask[n_classes:, pred_id].any(): continue # If this predicted instance didn't match with any prediction, set that predictions as false positive. if pred_to_cls[pred_id] != -1 and (conf[:, pred_id] > 0).any(): result['false_positive'][pred_to_cls[pred_id]] += 1 return result def evaluate_instance_prediction(self, pred_seg, pred_flow, gt_seg, gt_instance): B, H, W, D = gt_seg.shape pred_consistent_instance_seg = self.predict_instance_segmentation(pred_seg, pred_flow) # add one feature dimension for interpolate pred_consistent_instance_seg = F.interpolate(pred_consistent_instance_seg.float(), size=[H, W, D], mode='nearest').contiguous() pred_consistent_instance_seg = pred_consistent_instance_seg.squeeze(1) # [6,512,512,40] iou = 0 true_positive = 0 false_positive = 0 false_negative = 0 # starting from the present frame pred_instance = pred_consistent_instance_seg[self.eval_start_moment:] gt_instance = gt_instance[self.eval_start_moment:].long() assert gt_instance.min() == 0, 'ID 0 of gt_instance must be background' pred_segmentation = (pred_instance > 0).long() gt_segmentation = (gt_instance > 0).long() unique_id_mapping = {} for t in range(pred_segmentation.shape[0]): result = self.panoptic_metrics( pred_segmentation[t].detach(), pred_instance[t].detach(), gt_segmentation[t], gt_instance[t], unique_id_mapping, ) iou += result['iou'] true_positive += result['true_positive'] false_positive += result['false_positive'] false_negative += result['false_negative'] denominator = torch.maximum( (true_positive + false_positive / 2 + false_negative / 2), torch.ones_like(true_positive) ) pq = iou / denominator return pq.cpu().numpy() def forward_dummy(self, points=None, img_metas=None, img_inputs=None, points_occ=None, **kwargs, ): voxel_feats, flow_feats, img_feats, depth = self.extract_feat(img=img_inputs, img_metas=img_metas) transform = img_inputs[1:8] if img_inputs is not None else None output = self.pts_bbox_head( voxel_feats=voxel_feats, points=points_occ, img_metas=img_metas, img_feats=img_feats, transform=transform, ) return output def fast_hist(pred, label, max_label=18): pred = copy.deepcopy(pred.flatten()) label = copy.deepcopy(label.flatten()) bin_count = np.bincount(max_label * label.astype(int) + pred, minlength=max_label ** 2) iou_per_pred = (bin_count[-1]/(bin_count[-1]+bin_count[1]+bin_count[2])) return bin_count[:max_label ** 2].reshape(max_label, max_label),iou_per_pred ================================================ FILE: projects/occ_plugin/occupancy/fuser/__init__.py ================================================ from .addfuse import AddFuser from .visfuse import VisFuser from .convfuse import ConvFuser ================================================ FILE: projects/occ_plugin/occupancy/fuser/addfuse.py ================================================ import random from typing import List import torch from torch import nn from mmdet3d.models.builder import FUSION_LAYERS @FUSION_LAYERS.register_module() class AddFuser(nn.Module): def __init__(self, in_channels, out_channels, dropout, input_modality=None) -> None: super().__init__() self.in_channels = in_channels self.out_channels = out_channels self.dropout = dropout if input_modality == None: 
input_modality = dict( use_lidar=True, use_camera=True, use_radar=False, use_map=False, use_external=False) self.use_lidar = input_modality['use_lidar'] self.use_img = input_modality['use_camera'] if self.use_img: self.img_enc = nn.Sequential( nn.Conv3d(in_channels, out_channels, 3, padding=1, bias=False), nn.BatchNorm3d(out_channels), nn.ReLU(True), ) if self.use_lidar: self.pts_enc = nn.Sequential( nn.Conv3d(in_channels, out_channels, 3, padding=1, bias=False), nn.BatchNorm3d(out_channels), nn.ReLU(True), ) def forward(self, img_voxel_feats, pts_voxel_feats): features = [] if self.use_img: img_voxel_feats = self.img_enc(img_voxel_feats) features.append(img_voxel_feats) if self.use_lidar: pts_voxel_feats = self.pts_enc(pts_voxel_feats) features.append(pts_voxel_feats) weights = [1] * len(features) if self.training and random.random() < self.dropout: index = random.randint(0, len(features) - 1) weights[index] = 0 return sum(w * f for w, f in zip(weights, features)) / sum(weights) ================================================ FILE: projects/occ_plugin/occupancy/fuser/convfuse.py ================================================ import random from typing import List import torch from torch import nn from mmdet3d.models.builder import FUSION_LAYERS @FUSION_LAYERS.register_module() class ConvFuser(nn.Module): def __init__(self, in_channels, out_channels) -> None: super().__init__() self.in_channels = in_channels self.out_channels = out_channels self.occ_enc = nn.Sequential( nn.Conv3d(in_channels*2, out_channels, 3, padding=1, bias=False), nn.BatchNorm3d(out_channels), nn.ReLU(True), ) def forward(self, img_voxel_feats, pts_voxel_feats): return self.occ_enc(torch.cat([img_voxel_feats, pts_voxel_feats], dim=1)) ================================================ FILE: projects/occ_plugin/occupancy/fuser/visfuse.py ================================================ import random from typing import List import torch from torch import nn import torch.nn.functional as F from mmdet3d.models.builder import FUSION_LAYERS from mmcv.cnn import build_norm_layer @FUSION_LAYERS.register_module() class VisFuser(nn.Module): def __init__(self, in_channels, out_channels, norm_cfg=None) -> None: super().__init__() self.in_channels = in_channels self.out_channels = out_channels if norm_cfg is None: norm_cfg = dict(type='BN3d', eps=1e-3, momentum=0.01) self.img_enc = nn.Sequential( nn.Conv3d(in_channels, out_channels, 7, padding=3, bias=False), build_norm_layer(norm_cfg, out_channels)[1], # nn.BatchNorm3d(out_channels), nn.ReLU(True), ) self.pts_enc = nn.Sequential( nn.Conv3d(in_channels, out_channels, 7, padding=3, bias=False), build_norm_layer(norm_cfg, out_channels)[1], # nn.BatchNorm3d(out_channels), nn.ReLU(True), ) self.vis_enc = nn.Sequential( nn.Conv3d(2*out_channels, 16, 3, padding=1, bias=False), build_norm_layer(norm_cfg, 16)[1], # nn.BatchNorm3d(16), nn.ReLU(True), nn.Conv3d(16, 1, 1, padding=0, bias=False), nn.Sigmoid(), ) def forward(self, img_voxel_feats, pts_voxel_feats): img_voxel_feats = self.img_enc(img_voxel_feats) pts_voxel_feats = self.pts_enc(pts_voxel_feats) vis_weight = self.vis_enc(torch.cat([img_voxel_feats, pts_voxel_feats], dim=1)) voxel_feats = vis_weight * img_voxel_feats + (1 - vis_weight) * pts_voxel_feats return voxel_feats ================================================ FILE: projects/occ_plugin/occupancy/image2bev/ViewTransformerLSSBEVDepth.py ================================================ # Copyright (c) Phigent Robotics. All rights reserved. 
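# This file implements the Lift-Splat-Shoot style view transformation used by OCFNet:
# a per-pixel categorical depth distribution is predicted, multiplied with the image context
# features ("lift"), the resulting frustum points are mapped into the ego frame via
# get_geometry, and finally pooled into a regular voxel/BEV grid ("splat").
#
# Illustrative example (values follow the default grid_config below; not a guaranteed
# configuration): with xbound = ybound = [-51.2, 51.2, 0.8] and zbound = [-10.0, 10.0, 20.0],
# gen_dx_bx returns
#   dx = (0.8, 0.8, 20.0)    voxel size per axis
#   bx = (-50.8, -50.8, 0.0) centre of the first voxel
#   nx = (128, 128, 1)       number of voxels per axis
# and dbound = [1.0, 60.0, 1.0] yields D = 59 discrete depth bins in the frustum.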
import math import torch import torch.nn as nn from mmcv.runner import BaseModule from mmdet3d.models.builder import NECKS from projects.occ_plugin.ops.occ_pooling import occ_pool from mmcv.cnn import build_conv_layer, build_norm_layer from mmcv.runner import force_fp32 from torch.cuda.amp.autocast_mode import autocast from mmdet.models.backbones.resnet import BasicBlock import torch.nn.functional as F from torch.utils.checkpoint import checkpoint from scipy.special import erf from scipy.stats import norm import numpy as np import copy import pdb def gen_dx_bx(xbound, ybound, zbound): dx = torch.Tensor([row[2] for row in [xbound, ybound, zbound]]) bx = torch.Tensor([row[0] + row[2]/2.0 for row in [xbound, ybound, zbound]]) nx = torch.Tensor([(row[1] - row[0]) / row[2] for row in [xbound, ybound, zbound]]) return dx, bx, nx def cumsum_trick(x, geom_feats, ranks): x = x.cumsum(0) kept = torch.ones(x.shape[0], device=x.device, dtype=torch.bool) kept[:-1] = (ranks[1:] != ranks[:-1]) x, geom_feats = x[kept], geom_feats[kept] x = torch.cat((x[:1], x[1:] - x[:-1])) return x, geom_feats class QuickCumsum(torch.autograd.Function): @staticmethod def forward(ctx, x, geom_feats, ranks): x = x.cumsum(0) kept = torch.ones(x.shape[0], device=x.device, dtype=torch.bool) kept[:-1] = (ranks[1:] != ranks[:-1]) x, geom_feats = x[kept], geom_feats[kept] x = torch.cat((x[:1], x[1:] - x[:-1])) # save kept for backward ctx.save_for_backward(kept) # no gradient for geom_feats ctx.mark_non_differentiable(geom_feats) return x, geom_feats @staticmethod def backward(ctx, gradx, gradgeom): kept, = ctx.saved_tensors back = torch.cumsum(kept, 0) back[kept] -= 1 val = gradx[back] return val, None, None class ViewTransformerLiftSplatShoot(BaseModule): def __init__(self, grid_config=None, data_config=None, numC_input=512, numC_Trans=64, downsample=16, accelerate=False, use_bev_pool=True, vp_megvii=False, vp_stero=False, **kwargs): super(ViewTransformerLiftSplatShoot, self).__init__() if grid_config is None: grid_config = { 'xbound': [-51.2, 51.2, 0.8], 'ybound': [-51.2, 51.2, 0.8], 'zbound': [-10.0, 10.0, 20.0], 'dbound': [1.0, 60.0, 1.0],} self.grid_config = grid_config dx, bx, nx = gen_dx_bx(self.grid_config['xbound'], self.grid_config['ybound'], self.grid_config['zbound'], ) self.dx = nn.Parameter(dx, requires_grad=False) self.bx = nn.Parameter(bx, requires_grad=False) self.nx = nn.Parameter(nx, requires_grad=False) if data_config is None: data_config = {'input_size': (256, 704)} self.data_config = data_config self.downsample = downsample self.frustum = self.create_frustum() # D x H x W x 3 self.D, _, _, _ = self.frustum.shape self.numC_input = numC_input self.numC_Trans = numC_Trans self.depth_net = nn.Conv2d(self.numC_input, self.D + self.numC_Trans, kernel_size=1, padding=0) self.geom_feats = None self.accelerate = accelerate self.use_bev_pool = use_bev_pool self.vp_megvii = vp_megvii self.vp_stereo = vp_stero def get_depth_dist(self, x): return x.softmax(dim=1) def create_frustum(self): # make grid in image plane ogfH, ogfW = self.data_config['input_size'] fH, fW = ogfH // self.downsample, ogfW // self.downsample ds = torch.arange(*self.grid_config['dbound'], dtype=torch.float).view(-1, 1, 1).expand(-1, fH, fW) # dbound=[2.0, 58.0, 0.5] D, _, _ = ds.shape xs = torch.linspace(0, ogfW - 1, fW, dtype=torch.float).view(1, 1, fW).expand(D, fH, fW) ys = torch.linspace(0, ogfH - 1, fH, dtype=torch.float).view(1, fH, 1).expand(D, fH, fW) # D x H x W x 3 frustum = torch.stack((xs, ys, ds), -1) return nn.Parameter(frustum, 
requires_grad=False) def get_geometry(self, rots_seq, trans_seq, intrins_seq, post_rots_seq, post_trans_seq, bda): """Determine the (x,y,z) locations (in the ego frame) of the points in the point cloud. Returns B x N x D x H/downsample x W/downsample x 3 """ B, N, _ = trans_seq.shape # undo post-transformation # B x N x D x H x W x 3 points = self.frustum - post_trans_seq.view(B, N, 1, 1, 1, 3) points = torch.inverse(post_rots_seq).view(B, N, 1, 1, 1, 3, 3).matmul(points.unsqueeze(-1)) # cam_to_ego points = torch.cat((points[:, :, :, :, :, :2] * points[:, :, :, :, :, 2:3], points[:, :, :, :, :, 2:3] ), 5) if intrins_seq.shape[3] == 4: shift = intrins_seq[:, :, :3, 3] points = points - shift.view(B, N, 1, 1, 1, 3, 1) intrins_seq = intrins_seq[:, :, :3, :3] combine = rots_seq.matmul(torch.inverse(intrins_seq)) points = combine.view(B, N, 1, 1, 1, 3, 3).matmul(points).squeeze(-1) points += trans_seq.view(B, N, 1, 1, 1, 3) return points def voxel_pooling(self, geom_feats, x): B, N, D, H, W, C = x.shape Nprime = B * N * D * H * W nx = self.nx.to(torch.long) # flatten x x = x.reshape(Nprime, C) # flatten indices geom_feats = ((geom_feats - (self.bx - self.dx / 2.)) / self.dx).long() geom_feats = geom_feats.view(Nprime, 3) batch_ix = torch.cat([torch.full([Nprime // B, 1], ix, device=x.device, dtype=torch.long) for ix in range(B)]) geom_feats = torch.cat((geom_feats, batch_ix), 1) # filter out points that are outside box kept = (geom_feats[:, 0] >= 0) & (geom_feats[:, 0] < self.nx[0]) \ & (geom_feats[:, 1] >= 0) & (geom_feats[:, 1] < self.nx[1]) \ & (geom_feats[:, 2] >= 0) & (geom_feats[:, 2] < self.nx[2]) x = x[kept] geom_feats = geom_feats[kept] if self.use_bev_pool: final = occ_pool(x, geom_feats, B, self.nx[2], self.nx[0], self.nx[1]) final = final.transpose(dim0=-2, dim1=-1) else: # get tensors from the same voxel next to each other ranks = geom_feats[:, 0] * (self.nx[1] * self.nx[2] * B) \ + geom_feats[:, 1] * (self.nx[2] * B) \ + geom_feats[:, 2] * B \ + geom_feats[:, 3] sorts = ranks.argsort() x, geom_feats, ranks = x[sorts], geom_feats[sorts], ranks[sorts] # cumsum trick x, geom_feats = QuickCumsum.apply(x, geom_feats, ranks) # griddify (B x C x Z x X x Y) final = torch.zeros((B, C, nx[2], nx[1], nx[0]), device=x.device) final[geom_feats[:, 3], :, geom_feats[:, 2], geom_feats[:, 1], geom_feats[:, 0]] = x # collapse Z final = torch.cat(final.unbind(dim=2), 1) return final def voxel_pooling_accelerated(self, rots, trans, intrins, post_rots, post_trans, bda, x): B, N, D, H, W, C = x.shape Nprime = B * N * D * H * W nx = self.nx.to(torch.long) # flatten x x = x.reshape(Nprime, C) max = 300 # flatten indices if self.geom_feats is None: geom_feats = self.get_geometry(rots, trans, intrins, post_rots, post_trans, bda) geom_feats = ((geom_feats - (self.bx - self.dx / 2.)) / self.dx).long() geom_feats = geom_feats.view(Nprime, 3) batch_ix = torch.cat([torch.full([Nprime // B, 1], ix, device=x.device, dtype=torch.long) for ix in range(B)]) geom_feats = torch.cat((geom_feats, batch_ix), 1) # filter out points that are outside box kept1 = (geom_feats[:, 0] >= 0) & (geom_feats[:, 0] < self.nx[0]) \ & (geom_feats[:, 1] >= 0) & (geom_feats[:, 1] < self.nx[1]) \ & (geom_feats[:, 2] >= 0) & (geom_feats[:, 2] < self.nx[2]) idx = torch.range(0, x.shape[0] - 1, dtype=torch.long) x = x[kept1] idx = idx[kept1] geom_feats = geom_feats[kept1] # get tensors from the same voxel next to each other ranks = geom_feats[:, 0] * (self.nx[1] * self.nx[2] * B) \ + geom_feats[:, 1] * (self.nx[2] * B) \ + geom_feats[:, 2] * 
B \ + geom_feats[:, 3] sorts = ranks.argsort() x, geom_feats, ranks, idx = x[sorts], geom_feats[sorts], ranks[sorts], idx[sorts] repeat_id = torch.ones(geom_feats.shape[0], device=geom_feats.device, dtype=geom_feats.dtype) curr = 0 repeat_id[0] = 0 curr_rank = ranks[0] for i in range(1, ranks.shape[0]): if curr_rank == ranks[i]: curr += 1 repeat_id[i] = curr else: curr_rank = ranks[i] curr = 0 repeat_id[i] = curr kept2 = repeat_id < max repeat_id, geom_feats, x, idx = repeat_id[kept2], geom_feats[kept2], x[kept2], idx[kept2] geom_feats = torch.cat([geom_feats, repeat_id.unsqueeze(-1)], dim=-1) self.geom_feats = geom_feats self.idx = idx else: geom_feats = self.geom_feats idx = self.idx x = x[idx] # griddify (B x C x Z x X x Y) final = torch.zeros((B, C, nx[2], nx[1], nx[0], max), device=x.device) final[geom_feats[:, 3], :, geom_feats[:, 2], geom_feats[:, 1], geom_feats[:, 0], geom_feats[:, 4]] = x final = final.sum(-1) # collapse Z final = torch.cat(final.unbind(dim=2), 1) return final def voxel_pooling_bevdepth(self, geom_feats, x): nx = self.nx.to(torch.long) geom_feats = ((geom_feats - (self.bx - self.dx / 2.)) / self.dx).int() # FIXME # final = voxel_pooling(geom_feats, x.contiguous(), nx) final = self.voxel_pooling(geom_feats, x.contiguous(), nx) return final def forward(self, input): x, rots, trans, intrins, post_rots, post_trans, bda = input B, N, C, H, W = x.shape x = x.view(B * N, C, H, W) x = self.depth_net(x) depth = self.get_depth_dist(x[:, :self.D]) img_feat = x[:, self.D:(self.D + self.numC_Trans)] # Lift volume = depth.unsqueeze(1) * img_feat.unsqueeze(2) volume = volume.view(B, N, self.numC_Trans, self.D, H, W) volume = volume.permute(0, 1, 3, 4, 5, 2) # Splat if self.accelerate: bev_feat = self.voxel_pooling_accelerated(rots, trans, intrins, post_rots, post_trans, bda, volume) else: geom = self.get_geometry(rots, trans, intrins, post_rots, post_trans, bda) if self.vp_megvii: bev_feat = self.voxel_pooling_bevdepth(geom, volume) else: bev_feat = self.voxel_pooling(geom, volume) return bev_feat class _ASPPModule(nn.Module): def __init__(self, inplanes, planes, kernel_size, padding, dilation, BatchNorm): super(_ASPPModule, self).__init__() self.atrous_conv = nn.Conv2d(inplanes, planes, kernel_size=kernel_size, stride=1, padding=padding, dilation=dilation, bias=False) self.bn = BatchNorm self.relu = nn.ReLU() self._init_weight() def forward(self, x): x = self.atrous_conv(x) x = self.bn(x) return self.relu(x) def _init_weight(self): for m in self.modules(): if isinstance(m, nn.Conv2d): torch.nn.init.kaiming_normal_(m.weight) elif isinstance(m, nn.BatchNorm2d): m.weight.data.fill_(1) m.bias.data.zero_() class ASPP(nn.Module): def __init__(self, inplanes, mid_channels=256, norm_cfg=dict(type='BN2d')): super(ASPP, self).__init__() dilations = [1, 6, 12, 18] self.aspp1 = _ASPPModule(inplanes, mid_channels, 1, padding=0, dilation=dilations[0], BatchNorm=build_norm_layer(norm_cfg, mid_channels)[1]) self.aspp2 = _ASPPModule(inplanes, mid_channels, 3, padding=dilations[1], dilation=dilations[1], BatchNorm=build_norm_layer(norm_cfg, mid_channels)[1]) self.aspp3 = _ASPPModule(inplanes, mid_channels, 3, padding=dilations[2], dilation=dilations[2], BatchNorm=build_norm_layer(norm_cfg, mid_channels)[1]) self.aspp4 = _ASPPModule(inplanes, mid_channels, 3, padding=dilations[3], dilation=dilations[3], BatchNorm=build_norm_layer(norm_cfg, mid_channels)[1]) self.global_avg_pool = nn.Sequential( nn.AdaptiveAvgPool2d((1, 1)), nn.Conv2d(inplanes, mid_channels, 1, stride=1, bias=False), 
build_norm_layer(norm_cfg, mid_channels)[1], nn.ReLU(), ) self.conv1 = nn.Conv2d(int(mid_channels * 5), mid_channels, 1, bias=False) self.bn1 = build_norm_layer(norm_cfg, mid_channels)[1] self.relu = nn.ReLU() self.dropout = nn.Dropout(0.5) self._init_weight() def forward(self, x): x1 = self.aspp1(x) x2 = self.aspp2(x) x3 = self.aspp3(x) x4 = self.aspp4(x) x5 = self.global_avg_pool(x) x5 = F.interpolate(x5, size=x4.size()[2:], mode='bilinear', align_corners=True) x = torch.cat((x1, x2, x3, x4, x5), dim=1) x = self.conv1(x) x = self.bn1(x) x = self.relu(x) return self.dropout(x) def _init_weight(self): for m in self.modules(): if isinstance(m, nn.Conv2d): torch.nn.init.kaiming_normal_(m.weight) elif isinstance(m, nn.BatchNorm2d): m.weight.data.fill_(1) m.bias.data.zero_() class Mlp(nn.Module): def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.ReLU, drop=0.0): super().__init__() out_features = out_features or in_features hidden_features = hidden_features or in_features self.fc1 = nn.Linear(in_features, hidden_features) self.act = act_layer() self.drop1 = nn.Dropout(drop) self.fc2 = nn.Linear(hidden_features, out_features) self.drop2 = nn.Dropout(drop) def forward(self, x): x = self.fc1(x) x = self.act(x) x = self.drop1(x) x = self.fc2(x) x = self.drop2(x) return x class SELayer(nn.Module): def __init__(self, channels, act_layer=nn.ReLU, gate_layer=nn.Sigmoid): super().__init__() self.conv_reduce = nn.Conv2d(channels, channels, 1, bias=True) self.act1 = act_layer() self.conv_expand = nn.Conv2d(channels, channels, 1, bias=True) self.gate = gate_layer() def forward(self, x, x_se): x_se = self.conv_reduce(x_se) x_se = self.act1(x_se) x_se = self.conv_expand(x_se) return x * self.gate(x_se) class DepthNet(nn.Module): def __init__(self, in_channels, mid_channels, context_channels, depth_channels, cam_channels=27, norm_cfg=None): super(DepthNet, self).__init__() self.reduce_conv = nn.Sequential( nn.Conv2d(in_channels, mid_channels, kernel_size=3, stride=1, padding=1), build_norm_layer(norm_cfg, mid_channels)[1], nn.ReLU(inplace=True), ) self.context_conv = nn.Conv2d(mid_channels, context_channels, kernel_size=1, stride=1, padding=0) self.bn = build_norm_layer(dict(type='GN', num_groups=9, requires_grad=True), cam_channels)[1] self.depth_mlp = Mlp(cam_channels, mid_channels, mid_channels) self.depth_se = SELayer(mid_channels) # NOTE: add camera-aware self.context_mlp = Mlp(cam_channels, mid_channels, mid_channels) self.context_se = SELayer(mid_channels) # NOTE: add camera-aware self.depth_conv = nn.Sequential( BasicBlock(mid_channels, mid_channels, norm_cfg=norm_cfg), BasicBlock(mid_channels, mid_channels, norm_cfg=norm_cfg), BasicBlock(mid_channels, mid_channels, norm_cfg=norm_cfg), ASPP(mid_channels, mid_channels, norm_cfg=norm_cfg), build_conv_layer(cfg=dict( type='DCN', in_channels=mid_channels, out_channels=mid_channels, kernel_size=3, padding=1, groups=4, im2col_step=128, )), nn.Conv2d(mid_channels, depth_channels, kernel_size=1, stride=1, padding=0), ) def forward(self, x, mlp_input): mlp_input = self.bn(mlp_input.reshape(-1, mlp_input.shape[-1])) x = self.reduce_conv(x) context_se = self.context_mlp(mlp_input)[..., None, None] context = self.context_se(x, context_se) context = self.context_conv(context) depth_se = self.depth_mlp(mlp_input)[..., None, None] depth = self.depth_se(x, depth_se) depth = self.depth_conv(depth) return torch.cat([depth, context], dim=1) class DepthAggregation(nn.Module): """ pixel cloud feature extraction """ def __init__(self, 
in_channels, mid_channels, out_channels, norm_cfg): super(DepthAggregation, self).__init__() self.reduce_conv = nn.Sequential( nn.Conv2d(in_channels, mid_channels, kernel_size=3, stride=1, padding=1, bias=False), build_norm_layer(norm_cfg, mid_channels)[1], nn.ReLU(inplace=True), ) self.conv = nn.Sequential( nn.Conv2d(mid_channels, mid_channels, kernel_size=3, stride=1, padding=1, bias=False), build_norm_layer(norm_cfg, mid_channels)[1], nn.ReLU(inplace=True), nn.Conv2d(mid_channels, mid_channels, kernel_size=3, stride=1, padding=1, bias=False), build_norm_layer(norm_cfg, mid_channels)[1], nn.ReLU(inplace=True), ) self.out_conv = nn.Sequential( nn.Conv2d(mid_channels, out_channels, kernel_size=3, stride=1, padding=1, bias=True), # nn.BatchNorm3d(out_channels), # nn.ReLU(inplace=True), ) @autocast(False) def forward(self, x): x = checkpoint(self.reduce_conv, x) short_cut = x x = checkpoint(self.conv, x) x = short_cut + x x = self.out_conv(x) return x @NECKS.register_module() class ViewTransformerLSSBEVDepth(ViewTransformerLiftSplatShoot): def __init__(self, loss_depth_weight, cam_channels=27, loss_depth_reg_weight=0.0, use_voxel_net=False, norm_cfg=dict(type='BN2d', eps=1e-3, momentum=0.01), **kwargs): super(ViewTransformerLSSBEVDepth, self).__init__(**kwargs) self.loss_depth_weight = loss_depth_weight self.loss_depth_reg_weight = loss_depth_reg_weight self.cam_channels = cam_channels self.depth_net = DepthNet(self.numC_input, self.numC_input, self.numC_Trans, self.D, cam_channels=self.cam_channels, norm_cfg=norm_cfg) self.depth_aggregation_net = DepthAggregation(self.numC_Trans, self.numC_Trans, self.numC_Trans, norm_cfg=norm_cfg) if use_voxel_net else None def _forward_voxel_net(self, img_feat_with_depth): # BEVConv2D [n, c, d, h, w] -> [n, h, c, w, d] if self.depth_aggregation_net is None: return img_feat_with_depth img_feat_with_depth = img_feat_with_depth.permute( 0, 3, 1, 4, 2).contiguous() # [n, c, d, h, w] -> [n, h, c, w, d] n, h, c, w, d = img_feat_with_depth.shape img_feat_with_depth = img_feat_with_depth.view(-1, c, w, d) img_feat_with_depth = ( self.depth_aggregation_net(img_feat_with_depth).view( n, h, c, w, d).permute(0, 2, 4, 1, 3).contiguous().float()) return img_feat_with_depth def get_mlp_input(self, rot, tran, intrin, post_rot, post_tran, bda=None): B,N,_,_ = rot.shape if bda is None: bda = torch.eye(3).to(rot).view(1,3,3).repeat(B,1,1) bda = bda.view(B,1,3,3).repeat(1,N,1,1) if intrin.shape[-1] == 4: # for KITTI, the intrin matrix is 3x4 mlp_input = torch.stack([ intrin[:, :, 0, 0], intrin[:, :, 1, 1], intrin[:, :, 0, 2], intrin[:, :, 1, 2], intrin[:, :, 0, 3], intrin[:, :, 1, 3], intrin[:, :, 2, 3], post_rot[:, :, 0, 0], post_rot[:, :, 0, 1], post_tran[:, :, 0], post_rot[:, :, 1, 0], post_rot[:, :, 1, 1], post_tran[:, :, 1], bda[:, :, 0, 0], bda[:, :, 0, 1], bda[:, :, 1, 0], bda[:, :, 1, 1], bda[:, :, 2, 2], ], dim=-1) else: mlp_input = torch.stack([ intrin[:, :, 0, 0], intrin[:, :, 1, 1], intrin[:, :, 0, 2], intrin[:, :, 1, 2], post_rot[:, :, 0, 0], post_rot[:, :, 0, 1], post_tran[:, :, 0], post_rot[:, :, 1, 0], post_rot[:, :, 1, 1], post_tran[:, :, 1], bda[:, :, 0, 0], bda[:, :, 0, 1], bda[:, :, 1, 0], bda[:, :, 1, 1], bda[:, :, 2, 2], ], dim=-1) sensor2ego = torch.cat([rot, tran.reshape(B, N, 3, 1)], dim=-1).reshape(B, N, -1) mlp_input = torch.cat([mlp_input, sensor2ego], dim=-1) return mlp_input def get_downsampled_gt_depth(self, gt_depths): """ Input: gt_depths: [B, N, H, W] Output: gt_depths: [B*N*h*w, d] """ B, N, H, W = gt_depths.shape gt_depths = 
gt_depths.view(B * N, H // self.downsample, self.downsample, W // self.downsample, self.downsample, 1) gt_depths = gt_depths.permute(0, 1, 3, 5, 2, 4).contiguous() gt_depths = gt_depths.view(-1, self.downsample * self.downsample) gt_depths_tmp = torch.where(gt_depths == 0.0, 1e5 * torch.ones_like(gt_depths), gt_depths) gt_depths = torch.min(gt_depths_tmp, dim=-1).values gt_depths = gt_depths.view(B * N, H // self.downsample, W // self.downsample) # [min - step / 2, min + step / 2] creates min depth gt_depths = (gt_depths - (self.grid_config['dbound'][0] - self.grid_config['dbound'][2] / 2)) / self.grid_config['dbound'][2] gt_depths = torch.where((gt_depths < self.D + 1) & (gt_depths >= 0.0), gt_depths, torch.zeros_like(gt_depths)) gt_depths = F.one_hot(gt_depths.long(), num_classes=self.D + 1).view(-1, self.D + 1)[:, 1:] return gt_depths.float() def _prepare_depth_gt(self, gt_depths): """ Input: gt_depths: [B, N, H, W] Output: gt_depths: [B*N*H*W, d] """ gt_depths = (gt_depths - (self.grid_config['dbound'][0] - self.grid_config['dbound'][2])) / \ self.grid_config['dbound'][2] gt_depths = torch.where((gt_depths < self.D + 1) & (gt_depths >= 0.0), gt_depths, torch.zeros_like(gt_depths)) gt_depths = F.one_hot(gt_depths.long(), num_classes=self.D + 1).view(-1, self.D + 1)[:, 1:] return gt_depths.float() @force_fp32() def get_depth_reg_loss(self, depth_labels, depth_preds): depth_labels = self.get_downsampled_gt_depth(depth_labels) # depth_labels = self._prepare_depth_gt(depth_labels) depth_preds = depth_preds.permute(0, 2, 3, 1).contiguous().view(-1, self.D) # foreground predictions & labels fg_mask = torch.max(depth_labels, dim=1).values > 0.0 depth_labels = depth_labels[fg_mask] depth_preds = depth_preds[fg_mask] # cls_targets ==> reg_targets ds = torch.arange(*self.grid_config['dbound'], dtype=torch.float).view(1, -1).type_as(depth_preds) depth_reg_labels = torch.sum(depth_labels * ds, dim=1) depth_reg_preds = torch.sum(depth_preds * ds, dim=1) with autocast(enabled=False): loss_depth = F.smooth_l1_loss(depth_reg_preds, depth_reg_labels, reduction='mean') return self.loss_depth_reg_weight * loss_depth @force_fp32() def get_depth_loss(self, depth_labels, depth_preds): depth_labels = self.get_downsampled_gt_depth(depth_labels) # depth_labels = self._prepare_depth_gt(depth_labels) depth_preds = depth_preds.permute(0, 2, 3, 1).contiguous().view( -1, self.D) fg_mask = torch.max(depth_labels, dim=1).values > 0.0 depth_labels = depth_labels[fg_mask] depth_preds = depth_preds[fg_mask] with autocast(enabled=False): depth_loss = F.binary_cross_entropy( depth_preds, depth_labels, reduction='none', ).sum() / max(1.0, fg_mask.sum()) return self.loss_depth_weight * depth_loss def forward(self, input): (x, rots, trans, intrins, post_rots, post_trans, bda, mlp_input) = input[:8] B, N, C, H, W = x.shape x = x.view(B * N, C, H, W) x = self.depth_net(x, mlp_input) depth_digit = x[:, :self.D, ...] img_feat = x[:, self.D:self.D+self.numC_Trans, ...] 
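# depth_digit holds the D depth-bin logits and img_feat the numC_Trans context channels
# produced by DepthNet for every camera view. The softmax below turns the logits into a
# per-pixel depth distribution, and the outer product
# depth_prob.unsqueeze(1) * img_feat.unsqueeze(2) lifts the features into a
# (B*N, C, D, H, W) frustum volume that voxel_pooling later splats onto the grid.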
depth_prob = self.get_depth_dist(depth_digit) # Lift volume = depth_prob.unsqueeze(1) * img_feat.unsqueeze(2) volume = self._forward_voxel_net(volume) volume = volume.view(B, N, self.numC_Trans, self.D, H, W) volume = volume.permute(0, 1, 3, 4, 5, 2) # Splat if self.accelerate: bev_feat = self.voxel_pooling_accelerated(rots, trans, intrins, post_rots, post_trans, bda, volume) else: geom = self.get_geometry(rots, trans, intrins, post_rots, post_trans, bda) if self.vp_megvii: bev_feat = self.voxel_pooling_bevdepth(geom, volume) else: bev_feat = self.voxel_pooling(geom, volume) return bev_feat, depth_prob class ConvBnReLU3D(nn.Module): """Implements of 3d convolution + batch normalization + ReLU.""" def __init__( self, in_channels: int, out_channels: int, kernel_size: int = 3, stride: int = 1, pad: int = 1, dilation: int = 1, ) -> None: """initialization method for convolution3D + batch normalization + relu module Args: in_channels: input channel number of convolution layer out_channels: output channel number of convolution layer kernel_size: kernel size of convolution layer stride: stride of convolution layer pad: pad of convolution layer dilation: dilation of convolution layer """ super(ConvBnReLU3D, self).__init__() self.conv = nn.Conv3d(in_channels, out_channels, kernel_size, stride=stride, padding=pad, dilation=dilation, bias=False) self.bn = nn.BatchNorm3d(out_channels) def forward(self, x: torch.Tensor) -> torch.Tensor: """forward method""" return F.relu(self.bn(self.conv(x)), inplace=True) class DepthNetStereo(nn.Module): def __init__(self, in_channels, mid_channels, context_channels, depth_channels, d_bound, num_ranges=4, norm_cfg=dict(type='BN', requires_grad=True)): super(DepthNetStereo, self).__init__() self.reduce_conv = nn.Sequential( nn.Conv2d(in_channels, mid_channels, kernel_size=3, stride=1, padding=1), nn.BatchNorm2d(mid_channels), nn.ReLU(inplace=True), ) self.context_conv = nn.Conv2d(mid_channels, context_channels, kernel_size=1, stride=1, padding=0) self.bn = nn.BatchNorm1d(27) self.depth_mlp = Mlp(27, mid_channels, mid_channels) self.depth_se = SELayer(mid_channels) # NOTE: add camera-aware self.context_mlp = Mlp(27, mid_channels, mid_channels) self.context_se = SELayer(mid_channels) # NOTE: add camera-aware self.depth_feat_conv = nn.Sequential( BasicBlock(mid_channels, mid_channels, norm_cfg=norm_cfg), BasicBlock(mid_channels, mid_channels, norm_cfg=norm_cfg), ASPP(mid_channels, mid_channels, norm_cfg=norm_cfg), build_conv_layer(cfg=dict( type='DCN', in_channels=mid_channels, out_channels=mid_channels, kernel_size=3, padding=1, groups=4, im2col_step=128, )), ) self.mu_sigma_range_net = nn.Sequential( BasicBlock(mid_channels, mid_channels), nn.ConvTranspose2d(mid_channels, mid_channels, 3, stride=2, padding=1, output_padding=1), nn.BatchNorm2d(mid_channels), nn.ReLU(inplace=True), nn.ConvTranspose2d(mid_channels, mid_channels, 3, stride=2, padding=1, output_padding=1), nn.BatchNorm2d(mid_channels), nn.ReLU(inplace=True), nn.Conv2d(mid_channels, num_ranges * 3, kernel_size=1, stride=1, padding=0), ) self.mono_depth_net = nn.Sequential( BasicBlock(mid_channels, mid_channels), nn.Conv2d(mid_channels, depth_channels, kernel_size=1, stride=1, padding=0), ) self.d_bound = d_bound self.num_ranges = num_ranges # @autocast(False) def forward(self, x, mlp_input): B, _, H, W = x.shape mlp_input = self.bn(mlp_input.reshape(-1, mlp_input.shape[-1])) x = self.reduce_conv(x) context_se = self.context_mlp(mlp_input)[..., None, None] context = self.context_se(x, context_se) context = 
self.context_conv(context) depth_se = self.depth_mlp(mlp_input)[..., None, None] depth_feat = self.depth_se(x, depth_se) depth_feat = checkpoint(self.depth_feat_conv, depth_feat) mono_depth = checkpoint(self.mono_depth_net, depth_feat) mu_sigma_score = checkpoint(self.mu_sigma_range_net, depth_feat) mu = mu_sigma_score[:, 0:self.num_ranges, ...] sigma = mu_sigma_score[:, self.num_ranges:2 * self.num_ranges, ...] range_score = mu_sigma_score[:, 2 * self.num_ranges:3 * self.num_ranges, ...] sigma = F.elu(sigma) + 1.0 + 1e-10 return x, context, mu, sigma, range_score, mono_depth @NECKS.register_module() class ViewTransformerLSSBEVStereo(ViewTransformerLSSBEVDepth): def __init__(self, num_ranges=4, use_mask=True, em_iteration=3, range_list=[[2, 8], [8, 16], [16, 28], [28, 58]], sampling_range=3, num_samples=3, k_list=None, min_sigma=1.0, num_groups=8, stereo_downsample_factor=4, norm_cfg=dict(type='BN2d'), **kwargs): super(ViewTransformerLSSBEVStereo, self).__init__(**kwargs) self.num_ranges = num_ranges self.depth_net = DepthNetStereo(self.numC_input, self.numC_input, self.numC_Trans, self.D, self.grid_config['dbound'], self.num_ranges, norm_cfg=norm_cfg) self.context_downsample_net = nn.Identity() self.use_mask = use_mask self.stereo_downsample_factor = stereo_downsample_factor self.num_ranges = num_ranges self.min_sigma = min_sigma self.sampling_range = sampling_range self.num_samples = num_samples self.num_groups=num_groups self.similarity_net = nn.Sequential( ConvBnReLU3D(in_channels=num_groups, out_channels=16, kernel_size=1, stride=1, pad=0), ConvBnReLU3D(in_channels=16, out_channels=8, kernel_size=1, stride=1, pad=0), nn.Conv3d(in_channels=8, out_channels=1, kernel_size=1, stride=1, padding=0), ) self.depth_downsample_net = nn.Sequential( nn.Conv2d(self.D, 256, 3, 2, 1), nn.BatchNorm2d(256), nn.ReLU(), nn.Conv2d(256, 256, 3, 2, 1), nn.BatchNorm2d(256), nn.ReLU(), nn.Conv2d(256, self.D, 1, 1, 0), ) if range_list is None: range_length = (self.grid_config['dbound'][1] - self.grid_config['dbound'][0]) / num_ranges self.range_list = [[ self.grid_config['dbound'][0] + range_length * i, self.grid_config['dbound'][0] + range_length * (i + 1) ] for i in range(num_ranges)] else: assert len(range_list) == num_ranges self.range_list = range_list self.em_iteration = em_iteration if k_list is None: self.register_buffer('k_list', torch.Tensor(self.depth_sampling())) else: self.register_buffer('k_list', torch.Tensor(k_list)) if self.use_mask: self.mask_net = nn.Sequential( nn.Conv2d(self.D*2, 64, 3, 1, 1), nn.BatchNorm2d(64), nn.ReLU(inplace=True), BasicBlock(64, 64), BasicBlock(64, 64), nn.Conv2d(64, 1, 1, 1, 0), nn.Sigmoid(), ) def depth_sampling(self): """Generate sampling range of candidates. Returns: list[float]: List of all candidates. 
""" P_total = erf(self.sampling_range / np.sqrt(2)) # Probability covered by the sampling range idx_list = np.arange(0, self.num_samples + 1) p_list = (1 - P_total) / 2 + ((idx_list / self.num_samples) * P_total) k_list = norm.ppf(p_list) k_list = (k_list[1:] + k_list[:-1]) / 2 return list(k_list) def create_depth_sample_frustum(self, depth_sample, downsample_factor=16): """Generate frustum""" # make grid in image plane ogfH, ogfW = self.data_config['input_size'] fH, fW = ogfH // downsample_factor, ogfW // downsample_factor batch_size, num_depth, _, _ = depth_sample.shape x_coords = (torch.linspace(0, ogfW - 1, fW, dtype=torch.float, device=depth_sample.device).view( 1, 1, 1, fW).expand(batch_size, num_depth, fH, fW)) y_coords = (torch.linspace(0, ogfH - 1, fH, dtype=torch.float, device=depth_sample.device).view( 1, 1, fH, 1).expand(batch_size, num_depth, fH, fW)) paddings = torch.ones_like(depth_sample) # D x H x W x 3 frustum = torch.stack((x_coords, y_coords, depth_sample, paddings), -1) return frustum def homo_warping( self, stereo_feat, key_intrin_mats, sweep_intrin_mats, sensor2sensor_mats, key_ida_mats, sweep_ida_mats, depth_sample, frustum, ): """Used for mvs method to transfer sweep image feature to key image feature. Args: src_fea(Tensor): image features. key_intrin_mats(Tensor): Intrin matrix for key sensor. sweep_intrin_mats(Tensor): Intrin matrix for sweep sensor. sensor2sensor_mats(Tensor): Transformation matrix from key sensor to sweep sensor. key_ida_mats(Tensor): Ida matrix for key frame. sweep_ida_mats(Tensor): Ida matrix for sweep frame. depth_sample (Tensor): Depth map of all candidates. depth_sample_frustum (Tensor): Pre-generated frustum. """ batch_size_with_num_cams, channels = stereo_feat.shape[ 0], stereo_feat.shape[1] height, width = stereo_feat.shape[2], stereo_feat.shape[3] with torch.no_grad(): points = frustum points = points.reshape(points.shape[0], -1, points.shape[-1]) points[..., 2] = 1 # Undo ida for key frame. points = key_ida_mats.reshape(batch_size_with_num_cams, *key_ida_mats.shape[2:]).inverse( ).unsqueeze(1) @ points.unsqueeze(-1) # Convert points from pixel coord to key camera coord. points[..., :3, :] *= depth_sample.reshape( batch_size_with_num_cams, -1, 1, 1) num_depth = frustum.shape[1] points = (key_intrin_mats.reshape( batch_size_with_num_cams, *key_intrin_mats.shape[2:]).inverse().unsqueeze(1) @ points) points = (sensor2sensor_mats.reshape( batch_size_with_num_cams, *sensor2sensor_mats.shape[2:]).unsqueeze(1) @ points) # points in sweep sensor coord. points = (sweep_intrin_mats.reshape( batch_size_with_num_cams, *sweep_intrin_mats.shape[2:]).unsqueeze(1) @ points) # points in sweep pixel coord. 
points[..., :2, :] = points[..., :2, :] / points[ ..., 2:3, :] # [B, 2, Ndepth, H*W] points = (sweep_ida_mats.reshape( batch_size_with_num_cams, *sweep_ida_mats.shape[2:]).unsqueeze(1) @ points).squeeze(-1) neg_mask = points[..., 2] < 1e-3 points[..., 0][neg_mask] = width * self.stereo_downsample_factor points[..., 1][neg_mask] = height * self.stereo_downsample_factor points[..., 2][neg_mask] = 1 proj_x_normalized = points[..., 0] / ( (width * self.stereo_downsample_factor - 1) / 2) - 1 proj_y_normalized = points[..., 1] / ( (height * self.stereo_downsample_factor - 1) / 2) - 1 grid = torch.stack([proj_x_normalized, proj_y_normalized], dim=2) # [B, Ndepth, H*W, 2] warped_stereo_fea = F.grid_sample( stereo_feat, grid.view(batch_size_with_num_cams, num_depth * height, width, 2), mode='bilinear', padding_mode='zeros', ) warped_stereo_fea = warped_stereo_fea.view(batch_size_with_num_cams, channels, num_depth, height, width) return warped_stereo_fea def _forward_mask( self, sweep_index, mono_depth_all_sweeps, mats_dict, depth_sample, depth_sample_frustum, sensor2sensor_mats, ): """Forward function to generate mask. Args: sweep_index (int): Index of sweep. mono_depth_all_sweeps (list[Tensor]): List of mono_depth for all sweeps. mats_dict (dict): sensor2ego_mats (Tensor): Transformation matrix from camera to ego with shape of (B, num_sweeps, num_cameras, 4, 4). intrin_mats (Tensor): Intrinsic matrix with shape of (B, num_sweeps, num_cameras, 4, 4). ida_mats (Tensor): Transformation matrix for ida with shape of (B, num_sweeps, num_cameras, 4, 4). sensor2sensor_mats (Tensor): Transformation matrix from key frame camera to sweep frame camera with shape of (B, num_sweeps, num_cameras, 4, 4). bda_mat (Tensor): Rotation matrix for bda with shape of (B, 4, 4). depth_sample (Tensor): Depth map of all candidates. depth_sample_frustum (Tensor): Pre-generated frustum. sensor2sensor_mats (Tensor): Transformation matrix from reference sensor to source sensor. Returns: Tensor: Generated mask. """ num_sweeps = len(mono_depth_all_sweeps) mask_all_sweeps = list() for idx in range(num_sweeps): if idx == sweep_index: continue warped_mono_depth = self.homo_warping( mono_depth_all_sweeps[idx], mats_dict['intrin_mats'][:, sweep_index, ...], mats_dict['intrin_mats'][:, idx, ...], sensor2sensor_mats[idx], mats_dict['ida_mats'][:, sweep_index, ...], mats_dict['ida_mats'][:, idx, ...], depth_sample, depth_sample_frustum.type_as(mono_depth_all_sweeps[idx]), ) mask = self.mask_net( torch.cat([ mono_depth_all_sweeps[sweep_index].detach(), warped_mono_depth.mean(2).detach() ], 1)) mask_all_sweeps.append(mask) return torch.stack(mask_all_sweeps).mean(0) def _generate_cost_volume( self, sweep_index, stereo_feats_all_sweeps, mats_dict, depth_sample, depth_sample_frustum, sensor2sensor_mats, ): """Generate cost volume based on depth sample. Args: sweep_index (int): Index of sweep. stereo_feats_all_sweeps (list[Tensor]): Stereo feature of all sweeps. mats_dict (dict): sensor2ego_mats (Tensor): Transformation matrix from camera to ego with shape of (B, num_sweeps, num_cameras, 4, 4). intrin_mats (Tensor): Intrinsic matrix with shape of (B, num_sweeps, num_cameras, 4, 4). ida_mats (Tensor): Transformation matrix for ida with shape of (B, num_sweeps, num_cameras, 4, 4). sensor2sensor_mats (Tensor): Transformation matrix from key frame camera to sweep frame camera with shape of (B, num_sweeps, num_cameras, 4, 4). bda_mat (Tensor): Rotation matrix for bda with shape of (B, 4, 4). depth_sample (Tensor): Depth map of all candidates. 
depth_sample_frustum (Tensor): Pre-generated frustum. sensor2sensor_mats (Tensor): Transformation matrix from reference sensor to source sensor. Returns: Tensor: Depth score for all sweeps. """ batch_size, num_channels, height, width = stereo_feats_all_sweeps[ 0].shape # thres = int(self.mvs_weighting.split("CW")[1]) num_sweeps = len(stereo_feats_all_sweeps) depth_score_all_sweeps = list() for idx in range(num_sweeps): if idx == sweep_index: continue warped_stereo_fea = self.homo_warping( stereo_feats_all_sweeps[idx], mats_dict['intrin_mats'][:, sweep_index, ...], mats_dict['intrin_mats'][:, idx, ...], sensor2sensor_mats[idx], mats_dict['ida_mats'][:, sweep_index, ...], mats_dict['ida_mats'][:, idx, ...], depth_sample, depth_sample_frustum.type_as(stereo_feats_all_sweeps[idx]), ) warped_stereo_fea = warped_stereo_fea.reshape( batch_size, self.num_groups, num_channels // self.num_groups, self.num_samples, height, width) ref_stereo_feat = stereo_feats_all_sweeps[sweep_index].reshape( batch_size, self.num_groups, num_channels // self.num_groups, height, width) feat_cost = torch.mean( (ref_stereo_feat.unsqueeze(3) * warped_stereo_fea), axis=2) depth_score = self.similarity_net(feat_cost).squeeze(1) depth_score_all_sweeps.append(depth_score) return torch.stack(depth_score_all_sweeps).mean(0) def _forward_stereo( self, sweep_index, stereo_feats_all_sweeps, mono_depth_all_sweeps, mats_dict, sensor2sensor_mats, mu_all_sweeps, sigma_all_sweeps, range_score_all_sweeps, depth_feat_all_sweeps, ): """Forward function to generate stereo depth. Args: sweep_index (int): Index of sweep. stereo_feats_all_sweeps (list[Tensor]): Stereo feature of all sweeps. mono_depth_all_sweeps (list[Tensor]): mats_dict (dict): sensor2ego_mats (Tensor): Transformation matrix from camera to ego with shape of (B, num_sweeps, num_cameras, 4, 4). intrin_mats (Tensor): Intrinsic matrix with shape of (B, num_sweeps, num_cameras, 4, 4). ida_mats (Tensor): Transformation matrix for ida with shape of (B, num_sweeps, num_cameras, 4, 4). sensor2sensor_mats (Tensor): Transformation matrix from key frame camera to sweep frame camera with shape of (B, num_sweeps, num_cameras, 4, 4). bda_mat (Tensor): Rotation matrix for bda with shape of (B, 4, 4). sensor2sensor_mats(Tensor): Transformation matrix from key sensor to sweep sensor. mu_all_sweeps (list[Tensor]): List of mu for all sweeps. sigma_all_sweeps (list[Tensor]): List of sigma for all sweeps. range_score_all_sweeps (list[Tensor]): List of all range score for all sweeps. depth_feat_all_sweeps (list[Tensor]): List of all depth feat for all sweeps. Returns: Tensor: stereo_depth """ batch_size_with_cams, _, feat_height, feat_width = \ stereo_feats_all_sweeps[0].shape device = stereo_feats_all_sweeps[0].device d_coords = torch.arange(*self.grid_config['dbound'], dtype=torch.float, device=device).reshape(1, -1, 1, 1) d_coords = d_coords.repeat(batch_size_with_cams, 1, feat_height, feat_width) stereo_depth = stereo_feats_all_sweeps[0].new_zeros( batch_size_with_cams, self.D, feat_height, feat_width) mask_score = stereo_feats_all_sweeps[0].new_zeros( batch_size_with_cams, self.D, feat_height * self.stereo_downsample_factor // self.downsample, feat_width * self.stereo_downsample_factor // self.downsample, ) score_all_ranges = list() range_score = range_score_all_sweeps[sweep_index].softmax(1) for range_idx in range(self.num_ranges): # Map mu to the corresponding interval. 
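# For each depth range the network's raw mu is squashed with a sigmoid and rescaled into
# [range_start, range_end]. A few EM-style iterations then sample depth candidates
# mu + k * sigma (k taken from self.k_list), score them with a group-wise stereo cost volume,
# and re-estimate mu as the score-weighted mean, while sigma is rescaled from the confidence
# of the central sample.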
range_start = self.range_list[range_idx][0] mu_all_sweeps_single_range = [ mu[:, range_idx:range_idx + 1, ...].sigmoid() * (self.range_list[range_idx][1] - self.range_list[range_idx][0]) + range_start for mu in mu_all_sweeps ] sigma_all_sweeps_single_range = [ sigma[:, range_idx:range_idx + 1, ...] for sigma in sigma_all_sweeps ] batch_size_with_cams, _, feat_height, feat_width =\ stereo_feats_all_sweeps[0].shape mu = mu_all_sweeps_single_range[sweep_index] sigma = sigma_all_sweeps_single_range[sweep_index] for _ in range(self.em_iteration): depth_sample = torch.cat([mu + sigma * k for k in self.k_list], 1) depth_sample_frustum = self.create_depth_sample_frustum( depth_sample, self.stereo_downsample_factor) mu_score = self._generate_cost_volume( sweep_index, stereo_feats_all_sweeps, mats_dict, depth_sample, depth_sample_frustum, sensor2sensor_mats, ) mu_score = mu_score.softmax(1) scale_factor = torch.clamp( 0.5 / (1e-4 + mu_score[:, self.num_samples // 2:self.num_samples // 2 + 1, ...]), min=0.1, max=10) sigma = torch.clamp(sigma * scale_factor, min=0.1, max=10) mu = (depth_sample * mu_score).sum(1, keepdim=True) del depth_sample del depth_sample_frustum mu = torch.clamp(mu, max=self.range_list[range_idx][1], min=self.range_list[range_idx][0]) range_length = int( (self.range_list[range_idx][1] - self.range_list[range_idx][0]) // self.grid_config['dbound'][2]) if self.use_mask: depth_sample = F.avg_pool2d( mu, self.downsample // self.stereo_downsample_factor, self.downsample // self.stereo_downsample_factor, ) depth_sample_frustum = self.create_depth_sample_frustum( depth_sample, self.downsample) mask = self._forward_mask( sweep_index, mono_depth_all_sweeps, mats_dict, depth_sample, depth_sample_frustum, sensor2sensor_mats, ) mask_score[:, int((range_start - self.grid_config['dbound'][0]) // self.grid_config['dbound'][2]):range_length + int((range_start - self.grid_config['dbound'][0]) // self.grid_config['dbound'][2]), ..., ] += mask del depth_sample del depth_sample_frustum sigma = torch.clamp(sigma, self.min_sigma) mu_repeated = mu.repeat(1, range_length, 1, 1) eps = 1e-6 depth_score_single_range = (-1 / 2 * ( (d_coords[:, int((range_start - self.grid_config['dbound'][0]) // self.grid_config['dbound'][2]):range_length + int( (range_start - self.grid_config['dbound'][0]) // self.grid_config['dbound'][2]), ..., ] - mu_repeated) / torch.sqrt(sigma))**2) depth_score_single_range = depth_score_single_range.exp() score_all_ranges.append(mu_score.sum(1).unsqueeze(1)) depth_score_single_range = depth_score_single_range / ( sigma * math.sqrt(2 * math.pi) + eps) stereo_depth[:, int((range_start - self.grid_config['dbound'][0]) // self.grid_config['dbound'][2]):range_length + int((range_start - self.grid_config['dbound'][0]) // self.grid_config['dbound'][2]), ..., ] = ( depth_score_single_range * range_score[:, range_idx:range_idx + 1, ...]) # del range_score del depth_score_single_range del mu_repeated if self.use_mask: return stereo_depth, mask_score else: return stereo_depth def forward(self, input): img_feat, depth_prob, rots, trans, intrins, post_rots, post_trans, bda = input B, N, C, H, W = img_feat.shape img_feat = img_feat.view(B*N,C,H,W) # Lift volume = depth_prob.unsqueeze(1) * img_feat.unsqueeze(2) volume = self._forward_voxel_net(volume) volume = volume.view(B, N, self.numC_Trans, self.D, H, W) volume = volume.permute(0, 1, 3, 4, 5, 2) # Splat if self.accelerate: bev_feat = self.voxel_pooling_accelerated(rots, trans, intrins, post_rots, post_trans, bda, volume) else: geom = 
self.get_geometry(rots, trans, intrins, post_rots, post_trans, bda) if self.vp_megvii: bev_feat = self.voxel_pooling_bevdepth(geom, volume) else: bev_feat = self.voxel_pooling(geom, volume) return bev_feat ================================================ FILE: projects/occ_plugin/occupancy/image2bev/ViewTransformerLSSVoxel.py ================================================ # Copyright (c) Phigent Robotics. All rights reserved. import math import torch import torch.nn as nn from mmcv.runner import BaseModule from mmdet3d.models.builder import NECKS from projects.occ_plugin.ops.occ_pooling import occ_pool from mmcv.cnn import build_conv_layer from mmcv.runner import force_fp32 from torch.cuda.amp.autocast_mode import autocast from projects.occ_plugin.utils.gaussian import generate_guassian_depth_target import torch.nn.functional as F import numpy as np import pdb from .ViewTransformerLSSBEVDepth import * import torch.cuda as cuda def get_gpu_memory_usage(): allocated = cuda.memory_allocated() reserved = cuda.memory_reserved() return allocated, reserved @NECKS.register_module() class ViewTransformerLiftSplatShootVoxel(ViewTransformerLSSBEVDepth): def __init__(self, loss_depth_weight, loss_depth_type='bce', **kwargs): super(ViewTransformerLiftSplatShootVoxel, self).__init__(loss_depth_weight=loss_depth_weight, **kwargs) self.loss_depth_type = loss_depth_type self.cam_depth_range = self.grid_config['dbound'] self.constant_std = 0.5 def get_downsampled_gt_depth(self, gt_depths): """ Input: gt_depths: [B, N, H, W] Output: gt_depths: [B*N*h*w, d] """ B, N, H, W = gt_depths.shape gt_depths = gt_depths.view(B * N, H // self.downsample, self.downsample, W // self.downsample, self.downsample, 1) gt_depths = gt_depths.permute(0, 1, 3, 5, 2, 4).contiguous() gt_depths = gt_depths.view(-1, self.downsample * self.downsample) gt_depths_tmp = torch.where(gt_depths == 0.0, 1e5 * torch.ones_like(gt_depths), gt_depths) gt_depths = torch.min(gt_depths_tmp, dim=-1).values gt_depths = gt_depths.view(B * N, H // self.downsample, W // self.downsample) # [min - step / 2, min + step / 2] creates min depth gt_depths = (gt_depths - (self.grid_config['dbound'][0] - self.grid_config['dbound'][2] / 2)) / self.grid_config['dbound'][2] gt_depths_vals = gt_depths.clone() gt_depths = torch.where((gt_depths < self.D + 1) & (gt_depths >= 0.0), gt_depths, torch.zeros_like(gt_depths)) gt_depths = F.one_hot(gt_depths.long(), num_classes=self.D + 1).view(-1, self.D + 1)[:, 1:] return gt_depths_vals, gt_depths.float() @force_fp32() def get_bce_depth_loss(self, depth_labels, depth_preds): _, depth_labels = self.get_downsampled_gt_depth(depth_labels) depth_preds = depth_preds.permute(0, 2, 3, 1).contiguous().view(-1, self.D) fg_mask = torch.max(depth_labels, dim=1).values > 0.0 depth_labels = depth_labels[fg_mask] depth_preds = depth_preds[fg_mask] with autocast(enabled=False): depth_loss = F.binary_cross_entropy(depth_preds, depth_labels, reduction='none').sum() / max(1.0, fg_mask.sum()) return depth_loss @force_fp32() def get_klv_depth_loss(self, depth_labels, depth_preds): depth_gaussian_labels, depth_values = generate_guassian_depth_target(depth_labels, self.downsample, self.cam_depth_range, constant_std=self.constant_std) depth_values = depth_values.view(-1) fg_mask = (depth_values >= self.cam_depth_range[0]) & (depth_values <= (self.cam_depth_range[1] - self.cam_depth_range[2])) depth_gaussian_labels = depth_gaussian_labels.view(-1, self.D)[fg_mask] depth_preds = depth_preds.permute(0, 2, 3, 1).contiguous().view(-1, 
self.D)[fg_mask] depth_loss = F.kl_div(torch.log(depth_preds + 1e-4), depth_gaussian_labels, reduction='batchmean', log_target=False) return depth_loss @force_fp32() def get_depth_loss(self, depth_labels, depth_preds): if self.loss_depth_type == 'bce': depth_loss = self.get_bce_depth_loss(depth_labels, depth_preds) elif self.loss_depth_type == 'kld': depth_loss = self.get_klv_depth_loss(depth_labels, depth_preds) else: pdb.set_trace() return self.loss_depth_weight * depth_loss def voxel_pooling(self, geom_feats, x): B, N, D, H, W, C = x.shape Nprime = B * N * D * H * W x = x.contiguous().view(Nprime, C) # flatten indices geom_feats = ((geom_feats - (self.bx - self.dx / 2.)) / self.dx).long() geom_feats = geom_feats.view(Nprime, 3) batch_ix = torch.cat([torch.full([Nprime // B, 1], ix, device=x.device, dtype=torch.long) for ix in range(B)]) geom_feats = torch.cat((geom_feats, batch_ix), 1) # filter out points that are outside box kept = (geom_feats[:, 0] >= 0) & (geom_feats[:, 0] < self.nx[0]) \ & (geom_feats[:, 1] >= 0) & (geom_feats[:, 1] < self.nx[1]) \ & (geom_feats[:, 2] >= 0) & (geom_feats[:, 2] < self.nx[2]) x = x[kept] geom_feats = geom_feats[kept] final = occ_pool(x, geom_feats, B, self.nx[2], self.nx[0], self.nx[1]) final = final.permute(0, 1, 3, 4, 2) return final def forward(self, input): (x, rots_seq, trans_seq, intrins_seq, post_rots_seq, post_trans_seq, bda, mlp_input_seq) = input[:8] B, N, C, H, W = x.shape x = x.view(B * N, C, H, W) x = self.depth_net(x, mlp_input_seq) depth_digit = x[:, :self.D, ...] img_feat = x[:, self.D:self.D + self.numC_Trans, ...] depth_prob = self.get_depth_dist(depth_digit) volume = depth_prob.unsqueeze(1) * img_feat.unsqueeze(2) volume = volume.view(B, N, self.numC_Trans, self.D, H, W) volume = volume.permute(0, 1, 3, 4, 5, 2) geom = self.get_geometry(rots_seq, trans_seq, intrins_seq, post_rots_seq, post_trans_seq, bda) bev_feat = self.voxel_pooling(geom, volume) return bev_feat, depth_prob ================================================ FILE: projects/occ_plugin/occupancy/image2bev/__init__.py ================================================ from .ViewTransformerLSSBEVDepth import ViewTransformerLSSBEVDepth from .ViewTransformerLSSVoxel import ViewTransformerLiftSplatShootVoxel ================================================ FILE: projects/occ_plugin/occupancy/necks/__init__.py ================================================ from .second_fpn_3d import SECONDFPN3D from .fpn3d import FPN3D ================================================ FILE: projects/occ_plugin/occupancy/necks/fpn3d.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import numpy as np import torch from mmcv.cnn import build_conv_layer, build_norm_layer, build_upsample_layer from mmcv.runner import BaseModule, auto_fp16 from torch import nn as nn from mmcv.cnn import ConvModule from mmdet.models import NECKS import torch.nn.functional as F import pdb @NECKS.register_module() class FPN3D(BaseModule): """FPN used in SECOND/PointPillars/PartA2/MVXNet. Args: in_channels (list[int]): Input channels of multi-scale feature maps. out_channels (list[int]): Output channels of feature maps. upsample_strides (list[int]): Strides used to upsample the feature maps. norm_cfg (dict): Config dict of normalization layers. upsample_cfg (dict): Config dict of upsample layers. conv_cfg (dict): Config dict of conv layers. use_conv_for_no_stride (bool): Whether to use conv when stride is 1. 
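    Note:
        In ``forward``, deeper-level features are upsampled (trilinear by default,
        via ``upsample_cfg``) and added to the next shallower level in a top-down
        pass before each 3x3 fpn conv is applied.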
""" def __init__(self, in_channels=[80, 160, 320, 640], out_channels=256, norm_cfg=dict(type='GN', num_groups=32, requires_grad=True), conv_cfg=dict(type='Conv3d'), act_cfg=dict(type='ReLU'), with_cp=False, upsample_cfg=dict(mode='trilinear'), init_cfg=None): super(FPN3D, self).__init__(init_cfg=init_cfg) self.in_channels = in_channels self.out_channels = out_channels self.fp16_enabled = False self.upsample_cfg = upsample_cfg self.with_cp = with_cp self.num_out = len(self.in_channels) self.lateral_convs = nn.ModuleList() self.fpn_convs = nn.ModuleList() for i in range(self.num_out): l_conv = nn.Sequential( ConvModule(in_channels[i], out_channels, kernel_size=1, padding=0, conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=act_cfg, bias=False, inplace=True), ) fpn_conv = nn.Sequential( ConvModule(out_channels, out_channels, kernel_size=3, padding=1, conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=act_cfg, bias=False, inplace=True), ) self.lateral_convs.append(l_conv) self.fpn_convs.append(fpn_conv) @auto_fp16() def forward(self, inputs): """Forward function. Args: x (torch.Tensor): 4D Tensor in (N, C, H, W) shape. Returns: list[torch.Tensor]: Multi-level feature maps. """ assert len(inputs) == len(self.in_channels) # build laterals laterals = [] for i, lateral_conv in enumerate(self.lateral_convs): if self.with_cp: lateral_i = torch.utils.checkpoint.checkpoint(lateral_conv, inputs[i]) else: lateral_i = lateral_conv(inputs[i]) laterals.append(lateral_i) # build down-top path for i in range(self.num_out - 1, 0, -1): prev_shape = laterals[i - 1].shape[2:] laterals[i - 1] = laterals[i - 1] + F.interpolate(laterals[i], size=prev_shape, align_corners=False, **self.upsample_cfg) # outs = [ # self.fpn_convs[i](laterals[i]) for i in range(self.num_out) # ] outs = [] for i, fpn_conv in enumerate(self.fpn_convs): if self.with_cp: out_i = torch.utils.checkpoint.checkpoint(fpn_conv, laterals[i]) else: out_i = fpn_conv(laterals[i]) outs.append(out_i) return outs ================================================ FILE: projects/occ_plugin/occupancy/necks/second_fpn_3d.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import numpy as np import torch from mmcv.cnn import build_conv_layer, build_norm_layer, build_upsample_layer from mmcv.runner import BaseModule, auto_fp16 from torch import nn as nn from mmdet.models import NECKS import pdb @NECKS.register_module() class SECONDFPN3D(BaseModule): """FPN used in SECOND/PointPillars/PartA2/MVXNet. Args: in_channels (list[int]): Input channels of multi-scale feature maps. out_channels (list[int]): Output channels of feature maps. upsample_strides (list[int]): Strides used to upsample the feature maps. norm_cfg (dict): Config dict of normalization layers. upsample_cfg (dict): Config dict of upsample layers. conv_cfg (dict): Config dict of conv layers. use_conv_for_no_stride (bool): Whether to use conv when stride is 1. """ def __init__(self, in_channels=[128, 128, 256], out_channels=[256, 256, 256], upsample_strides=[1, 2, 4], norm_cfg=dict(type='GN', num_groups=32, requires_grad=True), upsample_cfg=dict(type='deconv3d', bias=False), conv_cfg=dict(type='Conv3d', bias=False), use_conv_for_no_stride=False, init_cfg=None): # replacing GN with BN3D, performance drops from 42.5 to 40.9. 
# the difference may be exaggerated because the performance can fluncate a lot super(SECONDFPN3D, self).__init__(init_cfg=init_cfg) assert len(out_channels) == len(upsample_strides) == len(in_channels) self.in_channels = in_channels self.out_channels = out_channels self.fp16_enabled = False deblocks = [] for i, out_channel in enumerate(out_channels): stride = upsample_strides[i] if stride > 1 or (stride == 1 and not use_conv_for_no_stride): upsample_layer = build_upsample_layer( upsample_cfg, in_channels=in_channels[i], out_channels=out_channel, kernel_size=upsample_strides[i], stride=upsample_strides[i]) else: stride = np.round(1 / stride).astype(np.int64) upsample_layer = build_conv_layer( conv_cfg, in_channels=in_channels[i], out_channels=out_channel, kernel_size=stride, stride=stride) deblock = nn.Sequential(upsample_layer, build_norm_layer(norm_cfg, out_channel)[1], nn.ReLU(inplace=True)) deblocks.append(deblock) self.deblocks = nn.ModuleList(deblocks) if init_cfg is None: self.init_cfg = [ dict(type='Kaiming', layer='ConvTranspose2d'), dict(type='Constant', layer='NaiveSyncBatchNorm2d', val=1.0) ] @auto_fp16() def forward(self, x): """Forward function. Args: x (torch.Tensor): 4D Tensor in (N, C, H, W) shape. Returns: list[torch.Tensor]: Multi-level feature maps. """ assert len(x) == len(self.in_channels) ups = [deblock(x[i]) for i, deblock in enumerate(self.deblocks)] if len(ups) > 1: out = torch.cat(ups, dim=1) else: out = ups[0] return [out] ================================================ FILE: projects/occ_plugin/occupancy/voxel_encoder/__init__.py ================================================ from .sparse_lidar_enc import SparseLiDAREnc4x, SparseLiDAREnc8x ================================================ FILE: projects/occ_plugin/occupancy/voxel_encoder/sparse_lidar_enc.py ================================================ import math from functools import partial from mmcv.cnn import build_conv_layer, build_norm_layer, build_upsample_layer from mmcv.runner import BaseModule import torch import torch.nn as nn import torch.nn.functional as F import spconv.pytorch as spconv from spconv.pytorch import functional as Fsp from mmdet3d.models.builder import MIDDLE_ENCODERS import copy def post_act_block(in_channels, out_channels, kernel_size, indice_key=None, stride=1, padding=0, conv_type='subm', norm_cfg=None): if conv_type == 'subm': conv = spconv.SubMConv3d(in_channels, out_channels, kernel_size, bias=False, indice_key=indice_key) elif conv_type == 'spconv': conv = spconv.SparseConv3d(in_channels, out_channels, kernel_size, stride=stride, padding=padding, bias=False, indice_key=indice_key) elif conv_type == 'inverseconv': conv = spconv.SparseInverseConv3d(in_channels, out_channels, kernel_size, indice_key=indice_key, bias=False) else: raise NotImplementedError m = spconv.SparseSequential( conv, build_norm_layer(norm_cfg, out_channels)[1], nn.ReLU(inplace=True), ) return m class SparseBasicBlock(spconv.SparseModule): def __init__(self, inplanes, planes, stride=1, norm_cfg=None, indice_key=None): super(SparseBasicBlock, self).__init__() self.net = spconv.SparseSequential( spconv.SubMConv3d(inplanes, planes, kernel_size=3, stride=stride, padding=1, bias=False, indice_key=indice_key), build_norm_layer(norm_cfg, planes)[1], nn.ReLU(inplace=True), spconv.SubMConv3d(planes, planes, kernel_size=3, stride=stride, padding=1, bias=False, indice_key=indice_key), build_norm_layer(norm_cfg, planes)[1], ) self.relu = nn.ReLU(inplace=True) def forward(self, x): identity = x out = self.net(x) out 
= out.replace_feature(out.features + identity.features) out = out.replace_feature(self.relu(out.features)) return out @MIDDLE_ENCODERS.register_module() class SparseLiDAREnc4x(nn.Module): def __init__(self, input_channel, norm_cfg, base_channel, out_channel, sparse_shape_xyz, **kwargs): super().__init__() block = post_act_block self.sparse_shape_xyz = sparse_shape_xyz self.conv_input = spconv.SparseSequential( spconv.SubMConv3d(input_channel, base_channel, 3), nn.GroupNorm(16, base_channel), nn.ReLU(inplace=True)) self.conv1 = spconv.SparseSequential( SparseBasicBlock(base_channel, base_channel, norm_cfg=norm_cfg, indice_key='res1'), SparseBasicBlock(base_channel, base_channel, norm_cfg=norm_cfg, indice_key='res1'), ) self.conv2 = spconv.SparseSequential( block(base_channel, base_channel*2, 3, norm_cfg=norm_cfg, stride=2, padding=1, indice_key='spconv2', conv_type='spconv'), SparseBasicBlock(base_channel*2, base_channel*2, norm_cfg=norm_cfg, indice_key='res2'), SparseBasicBlock(base_channel*2, base_channel*2, norm_cfg=norm_cfg, indice_key='res2'), ) self.conv3 = spconv.SparseSequential( block(base_channel*2, base_channel*4, 3, norm_cfg=norm_cfg, stride=2, padding=1, indice_key='spconv3', conv_type='spconv'), SparseBasicBlock(base_channel*4, base_channel*4, norm_cfg=norm_cfg, indice_key='res3'), SparseBasicBlock(base_channel*4, base_channel*4, norm_cfg=norm_cfg, indice_key='res3'), ) self.conv_out = spconv.SparseSequential( spconv.SubMConv3d(base_channel*4, out_channel, 3), nn.GroupNorm(16, out_channel), nn.ReLU(inplace=True)) def forward(self, voxel_features, coors, batch_size): # spconv encoding coors = coors.int() # FIXME bs=1 hardcode input_sp_tensor = spconv.SparseConvTensor(voxel_features, coors, self.sparse_shape_xyz[::-1], batch_size) x = self.conv_input(input_sp_tensor) x_conv1 = self.conv1(x) x_conv2 = self.conv2(x_conv1) x_conv3 = self.conv3(x_conv2) x = self.conv_out(x_conv3) return {'x': x.dense().permute(0,1,4,3,2), # B, C, W, H, D 'pts_feats': [x]} @MIDDLE_ENCODERS.register_module() class SparseLiDAREnc8x(nn.Module): def __init__(self, input_channel, norm_cfg, base_channel, out_channel, sparse_shape_xyz, **kwargs): super().__init__() block = post_act_block self.sparse_shape_xyz = sparse_shape_xyz self.conv_input = spconv.SparseSequential( spconv.SubMConv3d(input_channel, base_channel, 3), nn.GroupNorm(16, base_channel), nn.ReLU(inplace=True)) self.conv1 = spconv.SparseSequential( block(base_channel, base_channel*2, 3, norm_cfg=norm_cfg, stride=2, padding=1, indice_key='spconv1', conv_type='spconv'), SparseBasicBlock(base_channel*2, base_channel*2, norm_cfg=norm_cfg, indice_key='res1'), SparseBasicBlock(base_channel*2, base_channel*2, norm_cfg=norm_cfg, indice_key='res1'), ) self.conv2 = spconv.SparseSequential( block(base_channel*2, base_channel*4, 3, norm_cfg=norm_cfg, stride=2, padding=1, indice_key='spconv2', conv_type='spconv'), SparseBasicBlock(base_channel*4, base_channel*4, norm_cfg=norm_cfg, indice_key='res2'), SparseBasicBlock(base_channel*4, base_channel*4, norm_cfg=norm_cfg, indice_key='res2'), ) self.conv3 = spconv.SparseSequential( block(base_channel*4, base_channel*8, 3, norm_cfg=norm_cfg, stride=2, padding=1, indice_key='spconv3', conv_type='spconv'), SparseBasicBlock(base_channel*8, base_channel*8, norm_cfg=norm_cfg, indice_key='res3'), SparseBasicBlock(base_channel*8, base_channel*8, norm_cfg=norm_cfg, indice_key='res3'), ) self.conv_out = spconv.SparseSequential( spconv.SubMConv3d(base_channel*8, out_channel, 3), nn.GroupNorm(16, out_channel), 
nn.ReLU(inplace=True)) def forward(self, voxel_features, coors, batch_size): # spconv encoding coors = coors.int() # FIXME bs=1 hardcode input_sp_tensor = spconv.SparseConvTensor(voxel_features, coors, self.sparse_shape_xyz[::-1], batch_size) x = self.conv_input(input_sp_tensor) x_conv1 = self.conv1(x) x_conv2 = self.conv2(x_conv1) x_conv3 = self.conv3(x_conv2) x = self.conv_out(x_conv3) return {'x': x.dense().permute(0,1,4,3,2), # B, C, W, H, D 'pts_feats': [x]} ================================================ FILE: projects/occ_plugin/ops/__init__.py ================================================ from .occ_pooling import * ================================================ FILE: projects/occ_plugin/ops/occ_pooling/OCC_Pool.py ================================================ import torch from projects.occ_plugin.ops.occ_pooling import occ_pool_ext __all__ = ["occ_pool"] class QuickCumsum(torch.autograd.Function): @staticmethod def forward(ctx, x, geom_feats, ranks): x = x.cumsum(0) kept = torch.ones(x.shape[0], device=x.device, dtype=torch.bool) kept[:-1] = ranks[1:] != ranks[:-1] x, geom_feats = x[kept], geom_feats[kept] x = torch.cat((x[:1], x[1:] - x[:-1])) # save kept for backward ctx.save_for_backward(kept) # no gradient for geom_feats ctx.mark_non_differentiable(geom_feats) return x, geom_feats @staticmethod def backward(ctx, gradx, gradgeom): (kept,) = ctx.saved_tensors back = torch.cumsum(kept, 0) back[kept] -= 1 val = gradx[back] return val, None, None class QuickCumsumCuda(torch.autograd.Function): @staticmethod def forward(ctx, x, geom_feats, ranks, B, D, H, W): kept = torch.ones(x.shape[0], device=x.device, dtype=torch.bool) kept[1:] = ranks[1:] != ranks[:-1] interval_starts = torch.where(kept)[0].int() interval_lengths = torch.zeros_like(interval_starts) interval_lengths[:-1] = interval_starts[1:] - interval_starts[:-1] interval_lengths[-1] = x.shape[0] - interval_starts[-1] geom_feats = geom_feats.int() out = occ_pool_ext.occ_pool_forward( x, geom_feats, interval_lengths, interval_starts, B, D, H, W, ) ctx.save_for_backward(interval_starts, interval_lengths, geom_feats) ctx.saved_shapes = B, D, H, W return out @staticmethod def backward(ctx, out_grad): interval_starts, interval_lengths, geom_feats = ctx.saved_tensors B, D, H, W = ctx.saved_shapes out_grad = out_grad.contiguous() x_grad = occ_pool_ext.occ_pool_backward( out_grad, geom_feats, interval_lengths, interval_starts, B, D, H, W, ) return x_grad, None, None, None, None, None, None def occ_pool(feats, coords, B, D, H, W): assert feats.shape[0] == coords.shape[0] ranks = ( coords[:, 0] * (W * D * B) + coords[:, 1] * (D * B) + coords[:, 2] * B + coords[:, 3] ) indices = ranks.argsort() feats, coords, ranks = feats[indices], coords[indices], ranks[indices] x = QuickCumsumCuda.apply(feats, coords, ranks, B, D, H, W) x = x.permute(0, 4, 1, 2, 3).contiguous() return x ================================================ FILE: projects/occ_plugin/ops/occ_pooling/__init__.py ================================================ from .OCC_Pool import occ_pool ================================================ FILE: projects/occ_plugin/ops/occ_pooling/src/occ_pool.cpp ================================================ #include #include // CUDA function declarations void occ_pool(int b, int d, int h, int w, int n, int c, int n_intervals, const float* x, const int* geom_feats, const int* interval_starts, const int* interval_lengths, float* out); void occ_pool_grad(int b, int d, int h, int w, int n, int c, int n_intervals, const float* 
out_grad, const int* geom_feats, const int* interval_starts, const int* interval_lengths, float* x_grad); /* Function: pillar pooling (forward, cuda) Args: x : input features, FloatTensor[n, c] geom_feats : input coordinates, IntTensor[n, 4] interval_lengths : starting position for pooled point, IntTensor[n_intervals] interval_starts : how many points in each pooled point, IntTensor[n_intervals] Return: out : output features, FloatTensor[b, d, h, w, c] */ at::Tensor occ_pool_forward( const at::Tensor _x, const at::Tensor _geom_feats, const at::Tensor _interval_lengths, const at::Tensor _interval_starts, int b, int d, int h, int w ) { int n = _x.size(0); int c = _x.size(1); int n_intervals = _interval_lengths.size(0); const at::cuda::OptionalCUDAGuard device_guard(device_of(_x)); const float* x = _x.data_ptr(); const int* geom_feats = _geom_feats.data_ptr(); const int* interval_lengths = _interval_lengths.data_ptr(); const int* interval_starts = _interval_starts.data_ptr(); auto options = torch::TensorOptions().dtype(_x.dtype()).device(_x.device()); at::Tensor _out = torch::zeros({b, d, h, w, c}, options); float* out = _out.data_ptr(); occ_pool( b, d, h, w, n, c, n_intervals, x, geom_feats, interval_starts, interval_lengths, out ); return _out; } /* Function: pillar pooling (backward, cuda) Args: out_grad : input features, FloatTensor[b, d, h, w, c] geom_feats : input coordinates, IntTensor[n, 4] interval_lengths : starting position for pooled point, IntTensor[n_intervals] interval_starts : how many points in each pooled point, IntTensor[n_intervals] Return: x_grad : output features, FloatTensor[n, 4] */ at::Tensor occ_pool_backward( const at::Tensor _out_grad, const at::Tensor _geom_feats, const at::Tensor _interval_lengths, const at::Tensor _interval_starts, int b, int d, int h, int w ) { int n = _geom_feats.size(0); int c = _out_grad.size(4); int n_intervals = _interval_lengths.size(0); const at::cuda::OptionalCUDAGuard device_guard(device_of(_out_grad)); const float* out_grad = _out_grad.data_ptr(); const int* geom_feats = _geom_feats.data_ptr(); const int* interval_lengths = _interval_lengths.data_ptr(); const int* interval_starts = _interval_starts.data_ptr(); auto options = torch::TensorOptions().dtype(_out_grad.dtype()).device(_out_grad.device()); at::Tensor _x_grad = torch::zeros({n, c}, options); float* x_grad = _x_grad.data_ptr(); occ_pool_grad( b, d, h, w, n, c, n_intervals, out_grad, geom_feats, interval_starts, interval_lengths, x_grad ); return _x_grad; } PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { m.def("occ_pool_forward", &occ_pool_forward, "occ_pool_forward"); m.def("occ_pool_backward", &occ_pool_backward, "occ_pool_backward"); } ================================================ FILE: projects/occ_plugin/ops/occ_pooling/src/occ_pool_cuda.cu ================================================ #include #include /* Function: pillar pooling Args: b : batch size d : depth of the feature map h : height of pooled feature map w : width of pooled feature map n : number of input points c : number of channels n_intervals : number of unique points x : input features, FloatTensor[n, c] geom_feats : input coordinates, IntTensor[n, 4] interval_lengths : starting position for pooled point, IntTensor[n_intervals] interval_starts : how many points in each pooled point, IntTensor[n_intervals] out : output features, FloatTensor[b, d, h, w, c] */ __global__ void occ_pool_kernel(int b, int d, int h, int w, int n, int c, int n_intervals, const float *__restrict__ x, const int *__restrict__ 
geom_feats, const int *__restrict__ interval_starts, const int *__restrict__ interval_lengths, float* __restrict__ out) { int idx = blockIdx.x * blockDim.x + threadIdx.x; int index = idx / c; int cur_c = idx % c; if (index >= n_intervals) return; int interval_start = interval_starts[index]; int interval_length = interval_lengths[index]; const int* cur_geom_feats = geom_feats + interval_start * 4; const float* cur_x = x + interval_start * c + cur_c; float* cur_out = out + cur_geom_feats[3] * d * h * w * c + cur_geom_feats[2] * h * w * c + cur_geom_feats[0] * w * c + cur_geom_feats[1] * c + cur_c; float psum = 0; for(int i = 0; i < interval_length; i++){ psum += cur_x[i * c]; } *cur_out = psum; } /* Function: pillar pooling backward Args: b : batch size d : depth of the feature map h : height of pooled feature map w : width of pooled feature map n : number of input points c : number of channels n_intervals : number of unique points out_grad : gradient of the BEV fmap from top, FloatTensor[b, d, h, w, c] geom_feats : input coordinates, IntTensor[n, 4] interval_lengths : starting position for pooled point, IntTensor[n_intervals] interval_starts : how many points in each pooled point, IntTensor[n_intervals] x_grad : gradient of the image fmap, FloatTensor */ __global__ void occ_pool_grad_kernel(int b, int d, int h, int w, int n, int c, int n_intervals, const float *__restrict__ out_grad, const int *__restrict__ geom_feats, const int *__restrict__ interval_starts, const int *__restrict__ interval_lengths, float* __restrict__ x_grad) { int idx = blockIdx.x * blockDim.x + threadIdx.x; int index = idx / c; int cur_c = idx % c; if (index >= n_intervals) return; int interval_start = interval_starts[index]; int interval_length = interval_lengths[index]; const int* cur_geom_feats = geom_feats + interval_start * 4; float* cur_x_grad = x_grad + interval_start * c + cur_c; const float* cur_out_grad = out_grad + cur_geom_feats[3] * d * h * w * c + cur_geom_feats[2] * h * w * c + cur_geom_feats[0] * w * c + cur_geom_feats[1] * c + cur_c; for(int i = 0; i < interval_length; i++){ cur_x_grad[i * c] = *cur_out_grad; } } void occ_pool(int b, int d, int h, int w, int n, int c, int n_intervals, const float* x, const int* geom_feats, const int* interval_starts, const int* interval_lengths, float* out) { occ_pool_kernel<<<(int)ceil(((double)n_intervals * c / 256)), 256>>>( b, d, h, w, n, c, n_intervals, x, geom_feats, interval_starts, interval_lengths, out ); } void occ_pool_grad(int b, int d, int h, int w, int n, int c, int n_intervals, const float* out_grad, const int* geom_feats, const int* interval_starts, const int* interval_lengths, float* x_grad) { occ_pool_grad_kernel<<<(int)ceil(((double)n_intervals * c / 256)), 256>>>( b, d, h, w, n, c, n_intervals, out_grad, geom_feats, interval_starts, interval_lengths, x_grad ); } ================================================ FILE: projects/occ_plugin/utils/__init__.py ================================================ from .formating import cm_to_ious, format_results from .metric_util import per_class_iu, fast_hist_crop from .coordinate_transform import coarse_to_fine_coordinates, project_points_on_img from .geometry import convert_egopose_to_matrix_numpy, invert_matrix_egopose_numpy ================================================ FILE: projects/occ_plugin/utils/coordinate_transform.py ================================================ import torch def coarse_to_fine_coordinates(coarse_cor, ratio, topk=30000): """ Args: coarse_cor (torch.Tensor): [3, N]""" fine_cor = 
coarse_cor * ratio fine_cor = fine_cor[None].repeat(ratio**3, 1, 1) # [8, 3, N] device = fine_cor.device value = torch.meshgrid([torch.arange(ratio).to(device), torch.arange(ratio).to(device), torch.arange(ratio).to(device)]) value = torch.stack(value, dim=3).reshape(-1, 3) fine_cor = fine_cor + value[:,:,None] if fine_cor.shape[-1] < topk: return fine_cor.permute(1,0,2).reshape(3,-1) else: fine_cor = fine_cor[:,:,torch.randperm(fine_cor.shape[-1])[:topk]] return fine_cor.permute(1,0,2).reshape(3,-1) def project_points_on_img(points, rots, trans, intrins, post_rots, post_trans, bda_mat, pts_range, W_img, H_img, W_occ, H_occ, D_occ): with torch.no_grad(): voxel_size = ((pts_range[3:] - pts_range[:3]) / torch.tensor([W_occ-1, H_occ-1, D_occ-1])).to(points.device) points = points * voxel_size[None, None] + pts_range[:3][None, None].to(points.device) # project 3D point cloud (after bev-aug) onto multi-view images for corresponding 2D coordinates inv_bda = bda_mat.inverse() points = (inv_bda @ points.unsqueeze(-1)).squeeze(-1) # from lidar to camera points = points.view(-1, 1, 3) points = points - trans.view(1, -1, 3) inv_rots = rots.inverse().unsqueeze(0) points = (inv_rots @ points.unsqueeze(-1)) # from camera to raw pixel points = (intrins.unsqueeze(0) @ points).squeeze(-1) points_d = points[..., 2:3] points_uv = points[..., :2] / (points_d + 1e-5) # from raw pixel to transformed pixel points_uv = post_rots[..., :2, :2].unsqueeze(0) @ points_uv.unsqueeze(-1) points_uv = points_uv.squeeze(-1) + post_trans[..., :2].unsqueeze(0) points_uv[..., 0] = (points_uv[..., 0] / (W_img-1) - 0.5) * 2 points_uv[..., 1] = (points_uv[..., 1] / (H_img-1) - 0.5) * 2 mask = (points_d[..., 0] > 1e-5) \ & (points_uv[..., 0] > -1) & (points_uv[..., 0] < 1) \ & (points_uv[..., 1] > -1) & (points_uv[..., 1] < 1) return points_uv.permute(2,1,0,3), mask ================================================ FILE: projects/occ_plugin/utils/formating.py ================================================ from prettytable import PrettyTable import numpy as np def cm_to_ious(cm): # SC:[TN FP \n FN TP] mean_ious = [] cls_num = len(cm) for i in range(cls_num): tp = cm[i, i] p = cm[:, i].sum() g = cm[i, :].sum() union = p + g - tp mean_ious.append(tp / union) return mean_ious def format_results(mean_ious, return_dic=False): class_map = { 1: 'barrier', 2: 'bicycle', 3: 'bus', 4: 'car', 5: 'construction_vehicle', 6: 'motorcycle', 7: 'pedestrian', 8: 'traffic_cone', 9: 'trailer', 10: 'truck', 11: 'driveable_surface', 12: 'other_flat', 13: 'sidewalk', 14: 'terrain', 15: 'manmade', 16: 'vegetation', } x = PrettyTable() x.field_names = ['class', 'IoU'] class_names = list(class_map.values()) + ['mean'] class_ious = mean_ious + [sum(mean_ious) / len(mean_ious)] dic = {} for cls_name, cls_iou in zip(class_names, class_ious): dic[cls_name] = round(cls_iou, 3) x.add_row([cls_name, round(cls_iou, 3)]) if return_dic: return x, dic else: return x def format_iou_results(mean_ious, return_dic=False): if len(mean_ious) == 2: class_map = { 0: 'free', 1: 'movable objects', } else: class_map = { 0: 'free', 1: 'bicycle', 2: 'bus', 3: 'car', 4: 'construction', 5: 'motorcycle', 6: 'trailer', 7: 'truck', 8: 'pedestrian', } x = PrettyTable() x.field_names = ['class', 'IoU'] class_names = list(class_map.values()) class_ious = mean_ious dic = {} for cls_name, cls_iou in zip(class_names, class_ious): dic[cls_name] = np.round(cls_iou, 3) x.add_row([cls_name, np.round(cls_iou, 3)]) mean_ious = sum(mean_ious[1:]) / len(mean_ious[1:]) dic['mean'] = 
np.round(mean_ious, 3) x.add_row(['mean', np.round(mean_ious, 3)]) if return_dic: return x, dic else: return x def format_vel_results(mean_epe, return_dic=False): class_map = { 0: 'barrier', 1: 'bicycle', 2: 'bus', 3: 'car', 4: 'construction_vehicle', 5: 'motorcycle', 6: 'pedestrian', 7: 'traffic_cone', 8: 'trailer', 9: 'truck', } x = PrettyTable() x.field_names = ['class', 'EPE'] class_names = list(class_map.values()) class_epes = mean_epe dic = {} for cls_name, cls_iou in zip(class_names, class_epes): dic[cls_name] = np.round(cls_iou, 3) x.add_row([cls_name, np.round(cls_iou, 3)]) mean_all_epe = mean_epe.mean() dic['mean'] = np.round(mean_all_epe, 3) x.add_row(['mean', np.round(mean_all_epe, 3)]) if return_dic: return x, dic else: return x ================================================ FILE: projects/occ_plugin/utils/gaussian.py ================================================ import numpy as np import torch import torch.nn.functional as F from torch.distributions import Normal import pdb def gaussian_2d(shape, sigma=1): """Generate gaussian map. Args: shape (list[int]): Shape of the map. sigma (float): Sigma to generate gaussian map. Defaults to 1. Returns: np.ndarray: Generated gaussian map. """ m, n = [(ss - 1.) / 2. for ss in shape] y, x = np.ogrid[-m:m + 1, -n:n + 1] h = np.exp(-(x * x + y * y) / (2 * sigma * sigma)) h[h < np.finfo(h.dtype).eps * h.max()] = 0 return h def draw_heatmap_gaussian(heatmap, center, radius, k=1): """Get gaussian masked heatmap. Args: heatmap (torch.Tensor): Heatmap to be masked. center (torch.Tensor): Center coord of the heatmap. radius (int): Radius of gausian. K (int): Multiple of masked_gaussian. Defaults to 1. Returns: torch.Tensor: Masked heatmap. """ diameter = 2 * radius + 1 gaussian = gaussian_2d((diameter, diameter), sigma=diameter / 6) x, y = int(center[0]), int(center[1]) height, width = heatmap.shape[0:2] left, right = min(x, radius), min(width - x, radius + 1) top, bottom = min(y, radius), min(height - y, radius + 1) masked_heatmap = heatmap[y - top:y + bottom, x - left:x + right] masked_gaussian = torch.from_numpy( gaussian[radius - top:radius + bottom, radius - left:radius + right]).to(heatmap.device, torch.float32) if min(masked_gaussian.shape) > 0 and min(masked_heatmap.shape) > 0: torch.max(masked_heatmap, masked_gaussian * k, out=masked_heatmap) return heatmap def gaussian_radius(det_size, min_overlap=0.5): """Get radius of gaussian. Args: det_size (tuple[torch.Tensor]): Size of the detection result. min_overlap (float): Gaussian_overlap. Defaults to 0.5. Returns: torch.Tensor: Computed radius. 
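    Note:
        CenterNet-style heuristic: three quadratic cases bound the largest center
        offset for which the implied box still overlaps the ground truth by roughly
        ``min_overlap``; the smallest of the three radii is returned.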
""" height, width = det_size a1 = 1 b1 = (height + width) c1 = width * height * (1 - min_overlap) / (1 + min_overlap) sq1 = torch.sqrt(b1**2 - 4 * a1 * c1) r1 = (b1 + sq1) / 2 a2 = 4 b2 = 2 * (height + width) c2 = (1 - min_overlap) * width * height sq2 = torch.sqrt(b2**2 - 4 * a2 * c2) r2 = (b2 + sq2) / 2 a3 = 4 * min_overlap b3 = -2 * min_overlap * (height + width) c3 = (min_overlap - 1) * width * height sq3 = torch.sqrt(b3**2 - 4 * a3 * c3) r3 = (b3 + sq3) / 2 return min(r1, r2, r3) def generate_guassian_depth_target(depth, stride, cam_depth_range, constant_std=None): depth = depth.flatten(0, 1) # [bs*s, 6, 896, 1600] -> [bs*s*6, 896, 1600] B, tH, tW = depth.shape kernel_size = stride # [4,4,4] center_idx = kernel_size * kernel_size // 2 H = tH // stride # 896//4 = 248 W = tW // stride # 1600//4 = 400 unfold_depth = F.unfold(depth.unsqueeze(1), kernel_size, dilation=1, padding=0, stride=stride) #B, Cxkxk, HxW unfold_depth = unfold_depth.view(B, -1, H, W).permute(0, 2, 3, 1).contiguous() # B, H, W, kxk valid_mask = (unfold_depth != 0) # BN, H, W, kxk if constant_std is None: valid_mask_f = valid_mask.float() # BN, H, W, kxk valid_num = torch.sum(valid_mask_f, dim=-1) # BN, H, W valid_num[valid_num == 0] = 1e10 mean = torch.sum(unfold_depth, dim=-1) / valid_num var_sum = torch.sum(((unfold_depth - mean.unsqueeze(-1))**2) * valid_mask_f, dim=-1) # BN, H, W std_var = torch.sqrt(var_sum / valid_num) std_var[valid_num == 1] = 1 # set std_var to 1 when only one point in patch else: std_var = torch.ones((B, H, W)).type_as(depth).float() * constant_std unfold_depth[~valid_mask] = 1e10 min_depth = torch.min(unfold_depth, dim=-1)[0] #BN, H, W min_depth[min_depth == 1e10] = 0 # x in raw depth x = torch.arange(cam_depth_range[0] - cam_depth_range[2] / 2, cam_depth_range[1], cam_depth_range[2]) # normalized by intervals dist = Normal(min_depth / cam_depth_range[2], std_var / cam_depth_range[2]) # BN, H, W, D cdfs = [] for i in x: cdf = dist.cdf(i) cdfs.append(cdf) cdfs = torch.stack(cdfs, dim=-1) depth_dist = cdfs[..., 1:] - cdfs[...,:-1] return depth_dist, min_depth ================================================ FILE: projects/occ_plugin/utils/geometry.py ================================================ import numpy as np import PIL import torch import torch.nn.functional as F from pyquaternion import Quaternion def convert_egopose_to_matrix_numpy(trans, rot): transformation_matrix = np.zeros((4, 4), dtype=np.float32) rotation = Quaternion(rot).rotation_matrix translation = np.array(trans) transformation_matrix[:3, :3] = rotation transformation_matrix[:3, 3] = translation transformation_matrix[3, 3] = 1.0 return transformation_matrix def invert_matrix_egopose_numpy(egopose): """ Compute the inverse transformation of a 4x4 egopose numpy matrix.""" inverse_matrix = np.zeros((4, 4), dtype=np.float32) rotation = egopose[:3, :3] translation = egopose[:3, 3] inverse_matrix[:3, :3] = rotation.T inverse_matrix[:3, 3] = -np.dot(rotation.T, translation) inverse_matrix[3, 3] = 1.0 return inverse_matrix ================================================ FILE: projects/occ_plugin/utils/metric_util.py ================================================ # -*- coding:utf-8 -*- # author: Xinge # @file: metric_util.py import numpy as np def fast_hist(pred, label, n): k = (label >= 0) & (label < n) bin_count = np.bincount( n * label[k].astype(int) + pred[k], minlength=n ** 2) return bin_count[:n ** 2].reshape(n, n) def per_class_iu(hist): return np.diag(hist) / (hist.sum(1) + hist.sum(0) - np.diag(hist)) def 
fast_hist_crop(output, target, unique_label): hist = fast_hist(output.flatten(), target.flatten(), np.max(unique_label) + 2) hist = hist[unique_label + 1, :] hist = hist[:, unique_label + 1] return hist class SSCMetrics: def __init__(self, class_names, ignore_idx=255, empty_idx=None): self.class_names = class_names self.n_classes = len(class_names) self.ignore_idx = ignore_idx self.empty_idx = empty_idx self.reset() def hist_info(self, n_cl, pred, gt): assert pred.shape == gt.shape k = (gt >= 0) & (gt < n_cl) # exclude 255 labeled = np.sum(k) correct = np.sum((pred[k] == gt[k])) return ( np.bincount( n_cl * gt[k].astype(int) + pred[k].astype(int), minlength=n_cl ** 2 ).reshape(n_cl, n_cl), correct, labeled, ) @staticmethod def compute_score(hist, correct, labeled): iu = np.diag(hist) / (hist.sum(1) + hist.sum(0) - np.diag(hist)) mean_IU = np.nanmean(iu) mean_IU_no_back = np.nanmean(iu[1:]) freq = hist.sum(1) / hist.sum() freq_IU = (iu[freq > 0] * freq[freq > 0]).sum() mean_pixel_acc = correct / labeled if labeled != 0 else 0 return iu, mean_IU, mean_IU_no_back, mean_pixel_acc def add_batch(self, y_pred, y_true, nonsurface=None): self.count += 1 mask = y_true != self.ignore_idx if self.empty_idx is not None: mask = mask & (y_true != self.empty_idx) if nonsurface is not None: mask = mask & nonsurface tp, fp, fn = self.get_score_completion(y_pred, y_true, mask) self.completion_tp += tp self.completion_fp += fp self.completion_fn += fn mask = y_true != self.ignore_idx if self.empty_idx is not None: mask = mask & (y_true != self.empty_idx) tp_sum, fp_sum, fn_sum = self.get_score_semantic_and_completion( y_pred, y_true, mask ) self.tps += tp_sum self.fps += fp_sum self.fns += fn_sum def get_stats(self): if self.completion_tp != 0: precision = self.completion_tp / (self.completion_tp + self.completion_fp) recall = self.completion_tp / (self.completion_tp + self.completion_fn) iou = self.completion_tp / ( self.completion_tp + self.completion_fp + self.completion_fn ) else: precision, recall, iou = 0, 0, 0 iou_ssc = self.tps / (self.tps + self.fps + self.fns + 1e-5) return { "precision": precision, "recall": recall, "iou": iou, "iou_ssc": iou_ssc, "iou_ssc_mean": np.mean(iou_ssc[1:]), } def reset(self): self.completion_tp = 0 self.completion_fp = 0 self.completion_fn = 0 self.tps = np.zeros(self.n_classes) self.fps = np.zeros(self.n_classes) self.fns = np.zeros(self.n_classes) self.hist_ssc = np.zeros((self.n_classes, self.n_classes)) self.labeled_ssc = 0 self.correct_ssc = 0 self.precision = 0 self.recall = 0 self.iou = 0 self.count = 1e-8 self.iou_ssc = np.zeros(self.n_classes, dtype=np.float32) self.cnt_class = np.zeros(self.n_classes, dtype=np.float32) def get_score_completion(self, predict, target, nonempty=None): predict = np.copy(predict) target = np.copy(target) """for scene completion, treat the task as two-classes problem, just empty or occupancy""" _bs = predict.shape[0] # batch size # ---- ignore predict[target == self.ignore_idx] = 0 target[target == self.ignore_idx] = 0 # ---- flatten target = target.reshape(_bs, -1) # (_bs, 129600) predict = predict.reshape(_bs, -1) # (_bs, _C, 129600), 60*36*60=129600 # ---- treat all non-empty object class as one category, set them to label 1 b_pred = np.zeros(predict.shape) b_true = np.zeros(target.shape) b_pred[predict != self.empty_idx] = 1 b_true[target != self.empty_idx] = 1 p, r, iou = 0.0, 0.0, 0.0 tp_sum, fp_sum, fn_sum = 0, 0, 0 for idx in range(_bs): y_true = b_true[idx, :] # GT y_pred = b_pred[idx, :] if nonempty is not None: 
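# Restrict the completion metric to voxels flagged as valid (nonempty mask == 1);
# everything else is excluded from the TP/FP/FN counts accumulated below.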
nonempty_idx = nonempty[idx, :].reshape(-1) y_true = y_true[nonempty_idx == 1] y_pred = y_pred[nonempty_idx == 1] tp = np.array(np.where(np.logical_and(y_true == 1, y_pred == 1))).size fp = np.array(np.where(np.logical_and(y_true != 1, y_pred == 1))).size fn = np.array(np.where(np.logical_and(y_true == 1, y_pred != 1))).size tp_sum += tp fp_sum += fp fn_sum += fn return tp_sum, fp_sum, fn_sum def get_score_semantic_and_completion(self, predict, target, nonempty=None): target = np.copy(target) predict = np.copy(predict) _bs = predict.shape[0] # batch size _C = self.n_classes # _C = 12 # ---- ignore predict[target == self.ignore_idx] = 0 target[target == self.ignore_idx] = 0 # ---- flatten target = target.reshape(_bs, -1) # (_bs, 129600) predict = predict.reshape(_bs, -1) # (_bs, 129600), 60*36*60=129600 cnt_class = np.zeros(_C, dtype=np.int32) # count for each class iou_sum = np.zeros(_C, dtype=np.float32) # sum of iou for each class tp_sum = np.zeros(_C, dtype=np.int32) # tp fp_sum = np.zeros(_C, dtype=np.int32) # fp fn_sum = np.zeros(_C, dtype=np.int32) # fn for idx in range(_bs): y_true = target[idx, :] # GT y_pred = predict[idx, :] if nonempty is not None: nonempty_idx = nonempty[idx, :].reshape(-1) y_pred = y_pred[ np.where(np.logical_and(nonempty_idx == 1, y_true != self.ignore_idx)) ] y_true = y_true[ np.where(np.logical_and(nonempty_idx == 1, y_true != self.ignore_idx)) ] for j in range(_C): # for each class tp = np.array(np.where(np.logical_and(y_true == j, y_pred == j))).size fp = np.array(np.where(np.logical_and(y_true != j, y_pred == j))).size fn = np.array(np.where(np.logical_and(y_true == j, y_pred != j))).size tp_sum[j] += tp fp_sum[j] += fp fn_sum[j] += fn return tp_sum, fp_sum, fn_sum ================================================ FILE: projects/occ_plugin/utils/nusc_param.py ================================================ import torch import torch.nn as nn import torch.nn.functional as F import numpy as np nusc_class_frequencies = np.array([2242961742295, 25985376, 1561108, 28862014, 196106643, 15920504, 2158753, 26539491, 4004729, 34838681, 75173306, 2255027978, 50959399, 646022466, 869055679, 1446141335, 1724391378]) nusc_class_names = [ "empty", "barrier", "bicycle", "bus", "car", "construction", "motorcycle", "pedestrian", "trafficcone", "trailer", "truck", "driveable_surface", "other", "sidewalk", "terrain", "mannade", "vegetation", ] classname_to_color = { # RGB. # 0: (0, 0, 0), # Black. 
noise 1: (112, 128, 144), # Slategrey barrier 2: (220, 20, 60), # Crimson bicycle 3: (255, 127, 80), # Orangered bus 4: (255, 158, 0), # Orange car 5: (233, 150, 70), # Darksalmon construction 6: (255, 61, 99), # Red motorcycle 7: (0, 0, 230), # Blue pedestrian 8: (47, 79, 79), # Darkslategrey trafficcone 9: (255, 140, 0), # Darkorange trailer 10: (255, 99, 71), # Tomato truck 11: (0, 207, 191), # nuTonomy green driveable_surface 12: (175, 0, 75), # flat other 13: (75, 0, 75), # sidewalk 14: (112, 180, 60), # terrain 15: (222, 184, 135), # Burlywood mannade 16: (0, 175, 0), # Green vegetation } def KL_sep(p, target): """ KL divergence on nonzeros classes """ nonzeros = target != 0 nonzero_p = p[nonzeros] kl_term = F.kl_div(torch.log(nonzero_p), target[nonzeros], reduction="sum") return kl_term def geo_scal_loss(pred, ssc_target): # Get softmax probabilities pred = F.softmax(pred, dim=1) # Compute empty and nonempty probabilities empty_probs = pred[:, 0, :, :, :] nonempty_probs = 1 - empty_probs # Remove unknown voxels mask = ssc_target != 255 nonempty_target = ssc_target != 0 nonempty_target = nonempty_target[mask].float() nonempty_probs = nonempty_probs[mask] empty_probs = empty_probs[mask] intersection = (nonempty_target * nonempty_probs).sum() precision = intersection / nonempty_probs.sum() recall = intersection / nonempty_target.sum() spec = ((1 - nonempty_target) * (empty_probs)).sum() / (1 - nonempty_target).sum() return ( F.binary_cross_entropy(precision, torch.ones_like(precision)) + F.binary_cross_entropy(recall, torch.ones_like(recall)) + F.binary_cross_entropy(spec, torch.ones_like(spec)) ) def sem_scal_loss(pred, ssc_target): # Get softmax probabilities pred = F.softmax(pred, dim=1) loss = 0 count = 0 mask = ssc_target != 255 n_classes = pred.shape[1] for i in range(0, n_classes): # Get probability of class i p = pred[:, i, :, :, :] # Remove unknown voxels target_ori = ssc_target p = p[mask] target = ssc_target[mask] completion_target = torch.ones_like(target) completion_target[target != i] = 0 completion_target_ori = torch.ones_like(target_ori).float() completion_target_ori[target_ori != i] = 0 if torch.sum(completion_target) > 0: count += 1.0 nominator = torch.sum(p * completion_target) loss_class = 0 if torch.sum(p) > 0: precision = nominator / (torch.sum(p)) loss_precision = F.binary_cross_entropy( precision, torch.ones_like(precision) ) loss_class += loss_precision if torch.sum(completion_target) > 0: recall = nominator / (torch.sum(completion_target)) loss_recall = F.binary_cross_entropy(recall, torch.ones_like(recall)) loss_class += loss_recall if torch.sum(1 - completion_target) > 0: specificity = torch.sum((1 - p) * (1 - completion_target)) / ( torch.sum(1 - completion_target) ) loss_specificity = F.binary_cross_entropy( specificity, torch.ones_like(specificity) ) loss_class += loss_specificity loss += loss_class return loss / count def CE_ssc_loss(pred, target, class_weights): """ :param: prediction: the predicted tensor, must be [BS, C, H, W, D] """ criterion = nn.CrossEntropyLoss( weight=class_weights, ignore_index=255, reduction="mean" ) loss = criterion(pred, target.long()) return loss ================================================ FILE: projects/occ_plugin/utils/semkitti.py ================================================ import torch import torch.nn as nn import torch.nn.functional as F import numpy as np semantic_kitti_class_frequencies = np.array( [ 5.41773033e09, 1.57835390e07, 1.25136000e05, 1.18809000e05, 6.46799000e05, 8.21951000e05, 2.62978000e05, 
2.83696000e05, 2.04750000e05, 6.16887030e07, 4.50296100e06, 4.48836500e07, 2.26992300e06, 5.68402180e07, 1.57196520e07, 1.58442623e08, 2.06162300e06, 3.69705220e07, 1.15198800e06, 3.34146000e05, ] ) kitti_class_names = [ "empty", "car", "bicycle", "motorcycle", "truck", "other-vehicle", "person", "bicyclist", "motorcyclist", "road", "parking", "sidewalk", "other-ground", "building", "fence", "vegetation", "trunk", "terrain", "pole", "traffic-sign", ] def KL_sep(p, target): """ KL divergence on nonzeros classes """ nonzeros = target != 0 nonzero_p = p[nonzeros] kl_term = F.kl_div(torch.log(nonzero_p), target[nonzeros], reduction="sum") return kl_term def geo_scal_loss(pred, ssc_target, ignore_index=255, non_empty_idx=0): # Get softmax probabilities pred = F.softmax(pred, dim=1) # Compute empty and nonempty probabilities empty_probs = pred[:, non_empty_idx] nonempty_probs = 1 - empty_probs # Remove unknown voxels mask = ssc_target != ignore_index nonempty_target = ssc_target != non_empty_idx nonempty_target = nonempty_target[mask].float() nonempty_probs = nonempty_probs[mask] empty_probs = empty_probs[mask] eps = 1e-5 intersection = (nonempty_target * nonempty_probs).sum() precision = intersection / (nonempty_probs.sum()+eps) recall = intersection / (nonempty_target.sum()+eps) spec = ((1 - nonempty_target) * (empty_probs)).sum() / ((1 - nonempty_target).sum()+eps) return ( F.binary_cross_entropy(precision, torch.ones_like(precision)) + F.binary_cross_entropy(recall, torch.ones_like(recall)) + F.binary_cross_entropy(spec, torch.ones_like(spec)) ) def sem_scal_loss(pred, ssc_target, ignore_index=255): # Get softmax probabilities pred = F.softmax(pred, dim=1) loss = 0 count = 0 mask = ssc_target != ignore_index n_classes = pred.shape[1] for i in range(0, n_classes): # Get probability of class i p = pred[:, i] # Remove unknown voxels target_ori = ssc_target p = p[mask] target = ssc_target[mask] completion_target = torch.ones_like(target) completion_target[target != i] = 0 completion_target_ori = torch.ones_like(target_ori).float() completion_target_ori[target_ori != i] = 0 if torch.sum(completion_target) > 0: count += 1.0 nominator = torch.sum(p * completion_target) loss_class = 0 if torch.sum(p) > 0: precision = nominator / (torch.sum(p)) loss_precision = F.binary_cross_entropy( precision, torch.ones_like(precision) ) loss_class += loss_precision if torch.sum(completion_target) > 0: recall = nominator / (torch.sum(completion_target)) loss_recall = F.binary_cross_entropy(recall, torch.ones_like(recall)) loss_class += loss_recall if torch.sum(1 - completion_target) > 0: specificity = torch.sum((1 - p) * (1 - completion_target)) / ( torch.sum(1 - completion_target) ) loss_specificity = F.binary_cross_entropy( specificity, torch.ones_like(specificity) ) loss_class += loss_specificity loss += loss_class return loss / count def CE_ssc_loss(pred, target, class_weights=None, ignore_index=255): """ :param: prediction: the predicted tensor, must be [BS, C, ...] 
""" criterion = nn.CrossEntropyLoss( weight=class_weights, ignore_index=ignore_index, reduction="mean" ) loss = criterion(pred, target.long()) return loss def Smooth_L1_loss(pred, target, ignore_index=255): # pred/target B, H, W, D, 3 kept = (target[:, :, :, :, 0] != ignore_index) & (target[:, :, :, :, 1] != ignore_index) & (target[:, :, :, :, 2] != ignore_index) criterion = nn.SmoothL1Loss( reduction="mean" ) loss = criterion(pred[kept], target[kept]) if torch.isnan(loss): pred = pred * 0 target = target * 0 loss = criterion(pred, target) return loss return loss def vel_loss(pred, gt): return F.l1_loss(pred, gt) ================================================ FILE: projects/occ_plugin/utils/voxel_to_points.py ================================================ import open3d as o3d import numpy as np def query_points_from_voxels(pred, gt, img_metas): # pred, [tensor of shape (num_class, x, y, z)]: predicted classes # gt, [tensor of shape (batch, num_points)]: target points with semantic labels # logits to pred cls_id pred = np.argmax(pred.detach().cpu().numpy(), axis=0) gt_ = gt.detach().cpu().numpy() pred_fore_mask = pred > 0 if pred_fore_mask.sum() == 0: return None # select foreground 3d voxel vertex x = np.linspace(0, pred.shape[0] - 1, pred.shape[0]) y = np.linspace(0, pred.shape[1] - 1, pred.shape[1]) z = np.linspace(0, pred.shape[2] - 1, pred.shape[2]) X, Y, Z = np.meshgrid(x, y, z, indexing='ij') vv = np.stack([X, Y, Z], axis=-1) # foreground predictions & coordinates pred = pred[pred_fore_mask] vv = vv[pred_fore_mask] vv[:, 0] = (vv[:, 0] + 0.5) * (img_metas['pc_range'][3] - img_metas['pc_range'][0]) / img_metas['occ_size'][0] + img_metas['pc_range'][0] vv[:, 1] = (vv[:, 1] + 0.5) * (img_metas['pc_range'][4] - img_metas['pc_range'][1]) / img_metas['occ_size'][1] + img_metas['pc_range'][1] vv[:, 2] = (vv[:, 2] + 0.5) * (img_metas['pc_range'][5] - img_metas['pc_range'][2]) / img_metas['occ_size'][2] + img_metas['pc_range'][2] pcd = o3d.geometry.PointCloud() pcd.points = o3d.utility.Vector3dVector(vv) # for every lidar point, search its nearest *foreground* voxel vertex as the semantic prediction kdtree = o3d.geometry.KDTreeFlann(pcd) indices = [] for vert in gt_[:, :3]: _, inds, _ = kdtree.search_knn_vector_3d(vert, 1) indices.append(inds[0]) pred_valid = pred[np.array(indices)] return pred_valid ================================================ FILE: run.sh ================================================ echo "-------------" echo "load config from local path:" $1 if [ -f $1 ]; then config=$1 else echo "need a config file" exit fi bash tools/dist_train.sh $config $2 ${@:3} ================================================ FILE: run_eval.sh ================================================ echo "-------------" echo "load config from local path:" $1 if [ -f $1 ]; then config=$1 else echo "need a config file" exit fi export PYTHONPATH="." 
ckpt=$2 gpu=$3 bash tools/dist_test.sh $config $ckpt $gpu ${@:4} ================================================ FILE: setup.py ================================================ from setuptools import find_packages, setup import os import torch from os import path as osp from torch.utils.cpp_extension import (BuildExtension, CppExtension, CUDAExtension) def make_cuda_ext(name, module, sources, sources_cuda=[], extra_args=[], extra_include_path=[]): define_macros = [] extra_compile_args = {'cxx': [] + extra_args} if torch.cuda.is_available() or os.getenv('FORCE_CUDA', '0') == '1': define_macros += [('WITH_CUDA', None)] extension = CUDAExtension extra_compile_args['nvcc'] = extra_args + [ '-D__CUDA_NO_HALF_OPERATORS__', '-D__CUDA_NO_HALF_CONVERSIONS__', '-D__CUDA_NO_HALF2_OPERATORS__', ] sources += sources_cuda else: print('Compiling {} without CUDA'.format(name)) extension = CppExtension # raise EnvironmentError('CUDA is required to compile MMDetection!') return extension( name='{}.{}'.format(module, name), sources=[os.path.join(*module.split('.'), p) for p in sources], include_dirs=extra_include_path, define_macros=define_macros, extra_compile_args=extra_compile_args) if __name__ == '__main__': # add_mim_extention() setup( name='OpenOccupancy', version='0.0', description=("OpenOccupancy: A Large Scale Benchmark for Surrounding Semantic Occupancy Perception"), author='OpenOccupancy Contributors', author_email='wangxiaofeng2020@ia.ac.cn', keywords='Occupancy Perception', packages=find_packages(), include_package_data=True, package_data={'projects.occ_plugin.ops': ['*/*.so']}, classifiers=[ "Development Status :: 4 - Beta", "License :: OSI Approved :: Apache Software License", "Operating System :: OS Independent", "Programming Language :: Python :: 3", "Programming Language :: Python :: 3.6", "Programming Language :: Python :: 3.7", ], license="Apache License 2.0", ext_modules=[ make_cuda_ext( name="occ_pool_ext", module="projects.occ_plugin.ops.occ_pooling", sources=[ "src/occ_pool.cpp", "src/occ_pool_cuda.cu", ]), ], cmdclass={'build_ext': BuildExtension}) ================================================ FILE: tools/dist_test.sh ================================================ #!/usr/bin/env bash CONFIG=$1 CHECKPOINT=$2 GPUS=$3 PORT=${PORT:-29504} PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \ python -m torch.distributed.run --nproc_per_node=$GPUS --master_port=$PORT \ $(dirname "$0")/test.py $CONFIG $CHECKPOINT --launcher pytorch ${@:4} --deterministic --eval bbox ================================================ FILE: tools/dist_train.sh ================================================ #!/usr/bin/env bash CONFIG=$1 GPUS=$2 NNODES=${NNODES:-1} NODE_RANK=${NODE_RANK:-0} PORT=${PORT:-29501} MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"} PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \ python -m torch.distributed.run \ --nnodes=$NNODES \ --node_rank=$NODE_RANK \ --master_addr=$MASTER_ADDR \ --nproc_per_node=$GPUS \ --master_port=$PORT \ $(dirname "$0")/train.py \ $CONFIG \ --seed 2 \ --resume ./work_dirs/OCFNet_in_Cam4DOcc_V1.2/epoch_15.pth --launcher pytorch ${@:3} ================================================ FILE: tools/gen_data/gen_depth_gt.py ================================================ import os from multiprocessing import Pool import mmcv import numpy as np from nuscenes.utils.data_classes import LidarPointCloud from nuscenes.utils.geometry_utils import view_points from pyquaternion import Quaternion import copy # 
https://github.com/nutonomy/nuscenes-devkit/blob/master/python-sdk/nuscenes/nuscenes.py#L834 def map_pointcloud_to_image( pc, im, lidar2ego_translation, lidar2ego_rotation, ego2global_translation, ego2global_rotation, sensor2ego_translation, sensor2ego_rotation, cam_ego2global_translation, cam_ego2global_rotation, cam_intrinsic, min_dist: float = 0.0, ): # Points live in the point sensor frame. So they need to be # transformed via global to the image plane. # First step: transform the pointcloud to the ego vehicle # frame for the timestamp of the sweep. pc = LidarPointCloud(pc.T) pc.rotate(Quaternion(lidar2ego_rotation).rotation_matrix) pc.translate(np.array(lidar2ego_translation)) # Second step: transform from ego to the global frame. pc.rotate(Quaternion(ego2global_rotation).rotation_matrix) pc.translate(np.array(ego2global_translation)) # Third step: transform from global into the ego vehicle # frame for the timestamp of the image. pc.translate(-np.array(cam_ego2global_translation)) pc.rotate(Quaternion(cam_ego2global_rotation).rotation_matrix.T) # Fourth step: transform from ego into the camera. pc.translate(-np.array(sensor2ego_translation)) pc.rotate(Quaternion(sensor2ego_rotation).rotation_matrix.T) # Fifth step: actually take a "picture" of the point cloud. # Grab the depths (camera frame z axis points away from the camera). depths = pc.points[2, :] coloring = depths # Take the actual picture (matrix multiplication with camera-matrix # + renormalization). points = view_points(pc.points[:3, :], cam_intrinsic, normalize=True) # Remove points that are either outside or behind the camera. # Leave a margin of 1 pixel for aesthetic reasons. Also make # sure points are at least 1m in front of the camera to avoid # seeing the lidar points on the camera casing for non-keyframes # which are slightly out of sync. 
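    # After view_points, points[0, :] / points[1, :] are the projected pixel u / v
    # coordinates (checked against image width / height below), and `coloring`
    # carries the camera-frame depth z kept for each surviving pixel.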
mask = np.ones(depths.shape[0], dtype=bool) mask = np.logical_and(mask, depths > min_dist) mask = np.logical_and(mask, points[0, :] > 1) mask = np.logical_and(mask, points[0, :] < im.shape[1] - 1) mask = np.logical_and(mask, points[1, :] > 1) mask = np.logical_and(mask, points[1, :] < im.shape[0] - 1) points = points[:, mask] coloring = coloring[mask] return points, coloring data_root = './data/nuscenes' info_path_train = './data/nuscenes/nuscenes_occ_infos_train.pkl' info_path_val = './data/nuscenes/nuscenes_occ_infos_val.pkl' # data3d_nusc = NuscMVDetData() lidar_key = 'LIDAR_TOP' cam_keys = [ 'CAM_FRONT_LEFT', 'CAM_FRONT', 'CAM_FRONT_RIGHT', 'CAM_BACK_RIGHT', 'CAM_BACK', 'CAM_BACK_LEFT' ] def worker(info): lidar_path = info['lidar_path'] points = np.fromfile(lidar_path, dtype=np.float32, count=-1).reshape(-1, 5)[..., :4] lidar2ego_translation = info['lidar2ego_translation'] lidar2ego_rotation = info['lidar2ego_rotation'] ego2global_translation = info['ego2global_translation'] ego2global_rotation = info['ego2global_rotation'] for i, cam_key in enumerate(cam_keys): sensor2ego_translation = info['cams'][cam_key]['sensor2ego_translation'] sensor2ego_rotation = info['cams'][cam_key]['sensor2ego_rotation'] cam_ego2global_translation = info['cams'][cam_key]['ego2global_translation'] cam_ego2global_rotation = info['cams'][cam_key]['ego2global_rotation'] cam_intrinsic = info['cams'][cam_key]['cam_intrinsic'] img = mmcv.imread( os.path.join(info['cams'][cam_key]['data_path'])) pts_img, depth = map_pointcloud_to_image( points.copy(), img, copy.deepcopy(lidar2ego_translation), copy.deepcopy(lidar2ego_rotation), copy.deepcopy(ego2global_translation), copy.deepcopy(ego2global_rotation), copy.deepcopy(sensor2ego_translation), copy.deepcopy(sensor2ego_rotation), copy.deepcopy(cam_ego2global_translation), copy.deepcopy(cam_ego2global_rotation), copy.deepcopy(cam_intrinsic)) file_name = os.path.split(info['cams'][cam_key]['data_path'])[-1] np.concatenate([pts_img[:2, :].T, depth[:, None]], axis=1).astype(np.float32).flatten().tofile( os.path.join('./data', 'depth_gt', f'{file_name}.bin')) if __name__ == '__main__': po = Pool(12) mmcv.mkdir_or_exist(os.path.join('./data', 'depth_gt')) infos = mmcv.load(info_path_train)['infos'] for info in infos: po.apply_async(func=worker, args=(info, )) po.close() po.join() po2 = Pool(12) infos = mmcv.load(info_path_val)['infos'] for info in infos: po2.apply_async(func=worker, args=(info, )) po2.close() po2.join() ================================================ FILE: tools/misc/browse_dataset.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. 
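# Illustrative invocation (the config path is only an example; any config that defines
# data.train and eval_pipeline should work):
#   python tools/misc/browse_dataset.py projects/configs/_base_/datasets/custom_nus-3d.py \
#       --task det --output-dir ./browse_vis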
import argparse import numpy as np import warnings from mmcv import Config, DictAction, mkdir_or_exist, track_iter_progress from os import path as osp from mmdet3d.core.bbox import (Box3DMode, CameraInstance3DBoxes, Coord3DMode, DepthInstance3DBoxes, LiDARInstance3DBoxes) from mmdet3d.core.visualizer import (show_multi_modality_result, show_result, show_seg_result) from mmdet3d.datasets import build_dataset def parse_args(): parser = argparse.ArgumentParser(description='Browse a dataset') parser.add_argument('config', help='train config file path') parser.add_argument( '--skip-type', type=str, nargs='+', default=['Normalize'], help='skip some useless pipeline') parser.add_argument( '--output-dir', default=None, type=str, help='If there is no display interface, you can save it') parser.add_argument( '--task', type=str, choices=['det', 'seg', 'multi_modality-det', 'mono-det'], help='Determine the visualization method depending on the task.') parser.add_argument( '--online', action='store_true', help='Whether to perform online visualization. Note that you often ' 'need a monitor to do so.') parser.add_argument( '--cfg-options', nargs='+', action=DictAction, help='override some settings in the used config, the key-value pair ' 'in xxx=yyy format will be merged into config file. If the value to ' 'be overwritten is a list, it should be like key="[a,b]" or key=a,b ' 'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" ' 'Note that the quotation marks are necessary and that no white space ' 'is allowed.') args = parser.parse_args() return args def build_data_cfg(config_path, skip_type, cfg_options): """Build data config for loading visualization data.""" cfg = Config.fromfile(config_path) if cfg_options is not None: cfg.merge_from_dict(cfg_options) # import modules from string list. 
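# (custom_imports is the standard mmcv hook: if the config defines a custom_imports dict,
# the listed module paths are imported here so that any plugin datasets/pipelines they
# register become available before build_dataset is called.)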
if cfg.get('custom_imports', None): from mmcv.utils import import_modules_from_strings import_modules_from_strings(**cfg['custom_imports']) # extract inner dataset of `RepeatDataset` as `cfg.data.train` # so we don't need to worry about it later if cfg.data.train['type'] == 'RepeatDataset': cfg.data.train = cfg.data.train.dataset # use only first dataset for `ConcatDataset` if cfg.data.train['type'] == 'ConcatDataset': cfg.data.train = cfg.data.train.datasets[0] train_data_cfg = cfg.data.train # eval_pipeline purely consists of loading functions # use eval_pipeline for data loading train_data_cfg['pipeline'] = [ x for x in cfg.eval_pipeline if x['type'] not in skip_type ] return cfg def to_depth_mode(points, bboxes): """Convert points and bboxes to Depth Coord and Depth Box mode.""" if points is not None: points = Coord3DMode.convert_point(points.copy(), Coord3DMode.LIDAR, Coord3DMode.DEPTH) if bboxes is not None: bboxes = Box3DMode.convert(bboxes.clone(), Box3DMode.LIDAR, Box3DMode.DEPTH) return points, bboxes def show_det_data(idx, dataset, out_dir, filename, show=False): """Visualize 3D point cloud and 3D bboxes.""" example = dataset.prepare_train_data(idx) points = example['points']._data.numpy() gt_bboxes = dataset.get_ann_info(idx)['gt_bboxes_3d'].tensor if dataset.box_mode_3d != Box3DMode.DEPTH: points, gt_bboxes = to_depth_mode(points, gt_bboxes) show_result( points, gt_bboxes.clone(), None, out_dir, filename, show=show, snapshot=True) def show_seg_data(idx, dataset, out_dir, filename, show=False): """Visualize 3D point cloud and segmentation mask.""" example = dataset.prepare_train_data(idx) points = example['points']._data.numpy() gt_seg = example['pts_semantic_mask']._data.numpy() show_seg_result( points, gt_seg.copy(), None, out_dir, filename, np.array(dataset.PALETTE), dataset.ignore_index, show=show, snapshot=True) def show_proj_bbox_img(idx, dataset, out_dir, filename, show=False, is_nus_mono=False): """Visualize 3D bboxes on 2D image by projection.""" try: example = dataset.prepare_train_data(idx) except AttributeError: # for Mono-3D datasets example = dataset.prepare_train_img(idx) gt_bboxes = dataset.get_ann_info(idx)['gt_bboxes_3d'] img_metas = example['img_metas']._data img = example['img']._data.numpy() # need to transpose channel to first dim img = img.transpose(1, 2, 0) # no 3D gt bboxes, just show img if gt_bboxes.tensor.shape[0] == 0: gt_bboxes = None if isinstance(gt_bboxes, DepthInstance3DBoxes): show_multi_modality_result( img, gt_bboxes, None, None, out_dir, filename, box_mode='depth', img_metas=img_metas, show=show) elif isinstance(gt_bboxes, LiDARInstance3DBoxes): show_multi_modality_result( img, gt_bboxes, None, img_metas['lidar2img'], out_dir, filename, box_mode='lidar', img_metas=img_metas, show=show) elif isinstance(gt_bboxes, CameraInstance3DBoxes): show_multi_modality_result( img, gt_bboxes, None, img_metas['cam2img'], out_dir, filename, box_mode='camera', img_metas=img_metas, show=show) else: # can't project, just show img warnings.warn( f'unrecognized gt box type {type(gt_bboxes)}, only show image') show_multi_modality_result( img, None, None, None, out_dir, filename, show=show) def main(): args = parse_args() if args.output_dir is not None: mkdir_or_exist(args.output_dir) cfg = build_data_cfg(args.config, args.skip_type, args.cfg_options) try: dataset = build_dataset( cfg.data.train, default_args=dict(filter_empty_gt=False)) except TypeError: # seg dataset doesn't have `filter_empty_gt` key dataset = build_dataset(cfg.data.train) data_infos = 
dataset.data_infos dataset_type = cfg.dataset_type # configure visualization mode vis_task = args.task # 'det', 'seg', 'multi_modality-det', 'mono-det' for idx, data_info in enumerate(track_iter_progress(data_infos)): if dataset_type in ['KittiDataset', 'WaymoDataset']: data_path = data_info['point_cloud']['velodyne_path'] elif dataset_type in [ 'ScanNetDataset', 'SUNRGBDDataset', 'ScanNetSegDataset', 'S3DISSegDataset', 'S3DISDataset' ]: data_path = data_info['pts_path'] elif dataset_type in ['NuScenesDataset', 'LyftDataset']: data_path = data_info['lidar_path'] elif dataset_type in ['NuScenesMonoDataset']: data_path = data_info['file_name'] else: raise NotImplementedError( f'unsupported dataset type {dataset_type}') file_name = osp.splitext(osp.basename(data_path))[0] if vis_task in ['det', 'multi_modality-det']: # show 3D bboxes on 3D point clouds show_det_data( idx, dataset, args.output_dir, file_name, show=args.online) if vis_task in ['multi_modality-det', 'mono-det']: # project 3D bboxes to 2D image show_proj_bbox_img( idx, dataset, args.output_dir, file_name, show=args.online, is_nus_mono=(dataset_type == 'NuScenesMonoDataset')) elif vis_task in ['seg']: # show 3D segmentation mask on 3D point clouds show_seg_data( idx, dataset, args.output_dir, file_name, show=args.online) if __name__ == '__main__': main() ================================================ FILE: tools/misc/fuse_conv_bn.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import argparse import torch from mmcv.runner import save_checkpoint from torch import nn as nn from mmdet.apis import init_model def fuse_conv_bn(conv, bn): """During inference, the functionary of batch norm layers is turned off but only the mean and var alone channels are used, which exposes the chance to fuse it with the preceding conv layers to save computations and simplify network structures.""" conv_w = conv.weight conv_b = conv.bias if conv.bias is not None else torch.zeros_like( bn.running_mean) factor = bn.weight / torch.sqrt(bn.running_var + bn.eps) conv.weight = nn.Parameter(conv_w * factor.reshape([conv.out_channels, 1, 1, 1])) conv.bias = nn.Parameter((conv_b - bn.running_mean) * factor + bn.bias) return conv def fuse_module(m): last_conv = None last_conv_name = None for name, child in m.named_children(): if isinstance(child, (nn.BatchNorm2d, nn.SyncBatchNorm)): if last_conv is None: # only fuse BN that is after Conv continue fused_conv = fuse_conv_bn(last_conv, child) m._modules[last_conv_name] = fused_conv # To reduce changes, set BN as Identity instead of deleting it. m._modules[name] = nn.Identity() last_conv = None elif isinstance(child, nn.Conv2d): last_conv = child last_conv_name = name else: fuse_module(child) return m def parse_args(): parser = argparse.ArgumentParser( description='fuse Conv and BN layers in a model') parser.add_argument('config', help='config file path') parser.add_argument('checkpoint', help='checkpoint file path') parser.add_argument('out', help='output path of the converted model') args = parser.parse_args() return args def main(): args = parse_args() # build the model from a config file and a checkpoint file model = init_model(args.config, args.checkpoint) # fuse conv and bn layers of the model fused_model = fuse_module(model) save_checkpoint(fused_model, args.out) if __name__ == '__main__': main() ================================================ FILE: tools/misc/print_config.py ================================================ # Copyright (c) OpenMMLab. 
All rights reserved. import argparse from mmcv import Config, DictAction def parse_args(): parser = argparse.ArgumentParser(description='Print the whole config') parser.add_argument('config', help='config file path') parser.add_argument( '--options', nargs='+', action=DictAction, help='arguments in dict') args = parser.parse_args() return args def main(): args = parse_args() cfg = Config.fromfile(args.config) if args.options is not None: cfg.merge_from_dict(args.options) print(f'Config:\n{cfg.pretty_text}') if __name__ == '__main__': main() ================================================ FILE: tools/misc/visualize_results.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. import argparse import mmcv from mmcv import Config from mmdet3d.datasets import build_dataset def parse_args(): parser = argparse.ArgumentParser( description='MMDet3D visualize the results') parser.add_argument('config', help='test config file path') parser.add_argument('--result', help='results file in pickle format') parser.add_argument( '--show-dir', help='directory where visualize results will be saved') args = parser.parse_args() return args def main(): args = parse_args() if args.result is not None and \ not args.result.endswith(('.pkl', '.pickle')): raise ValueError('The results file must be a pkl file.') cfg = Config.fromfile(args.config) cfg.data.test.test_mode = True # build the dataset dataset = build_dataset(cfg.data.test) results = mmcv.load(args.result) if getattr(dataset, 'show', None) is not None: # data loading pipeline for showing eval_pipeline = cfg.get('eval_pipeline', {}) if eval_pipeline: dataset.show(results, args.show_dir, pipeline=eval_pipeline) else: dataset.show(results, args.show_dir) # use default pipeline else: raise NotImplementedError( 'Show is not implemented for dataset {}!'.format( type(dataset).__name__)) if __name__ == '__main__': main() ================================================ FILE: tools/test.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. # Modified by Junyi Ma, following OpenOccupancy of Zhiqi Li import argparse import mmcv import os import torch from mmcv import Config, DictAction from mmcv.cnn import fuse_conv_bn from mmcv.parallel import MMDataParallel, MMDistributedDataParallel from mmcv.runner import (get_dist_info, init_dist, load_checkpoint, wrap_fp16_model) from mmdet3d.apis import single_gpu_test from mmdet3d.datasets import build_dataset from projects.occ_plugin.datasets.builder import build_dataloader from mmdet3d.models import build_model from mmdet.apis import set_random_seed from projects.occ_plugin.occupancy.apis.test import custom_single_gpu_test, custom_multi_gpu_test from mmdet.datasets import replace_ImageToTensor import time import os.path as osp import warnings warnings.filterwarnings("ignore") warnings.simplefilter(action="ignore",category=FutureWarning) def parse_args(): parser = argparse.ArgumentParser( description='MMDet test (and eval) a model') parser.add_argument('config', help='test config file path') parser.add_argument('checkpoint', help='checkpoint file') parser.add_argument('--out', help='output result file in pickle format') parser.add_argument( '--fuse-conv-bn', action='store_true', help='Whether to fuse conv and bn, this will slightly increase' 'the inference speed') parser.add_argument( '--format-only', action='store_true', help='Format the output results without perform evaluation. 
It is' 'useful when you want to format the result to a specific format and ' 'submit it to the test server') parser.add_argument( '--eval', type=str, nargs='+', help='evaluation metrics, which depends on the dataset, e.g., "bbox",' ' "segm", "proposal" for COCO, and "mAP", "recall" for PASCAL VOC') parser.add_argument('--show', action='store_true', help='show results') parser.add_argument( '--show-dir', help='directory where results will be saved') parser.add_argument( '--gpu-collect', action='store_true', help='whether to use gpu to collect results.') parser.add_argument( '--tmpdir', help='tmp directory used for collecting results from multiple ' 'workers, available when gpu-collect is not specified') parser.add_argument('--seed', type=int, default=0, help='random seed') parser.add_argument( '--deterministic', action='store_true', help='whether to set deterministic options for CUDNN backend.') parser.add_argument( '--cfg-options', nargs='+', action=DictAction, help='override some settings in the used config, the key-value pair ' 'in xxx=yyy format will be merged into config file. If the value to ' 'be overwritten is a list, it should be like key="[a,b]" or key=a,b ' 'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" ' 'Note that the quotation marks are necessary and that no white space ' 'is allowed.') parser.add_argument( '--options', nargs='+', action=DictAction, help='custom options for evaluation, the key-value pair in xxx=yyy ' 'format will be kwargs for dataset.evaluate() function (deprecate), ' 'change to --eval-options instead.') parser.add_argument( '--eval-options', nargs='+', action=DictAction, help='custom options for evaluation, the key-value pair in xxx=yyy ' 'format will be kwargs for dataset.evaluate() function') parser.add_argument( '--launcher', choices=['none', 'pytorch', 'slurm', 'mpi'], default='none', help='job launcher') parser.add_argument('--local_rank', type=int, default=0) args = parser.parse_args() if 'LOCAL_RANK' not in os.environ: os.environ['LOCAL_RANK'] = str(args.local_rank) if args.options and args.eval_options: raise ValueError( '--options and --eval-options cannot be both specified, ' '--options is deprecated in favor of --eval-options') if args.options: warnings.warn('--options is deprecated in favor of --eval-options') args.eval_options = args.options return args def main(): args = parse_args() assert args.out or args.eval or args.format_only or args.show \ or args.show_dir, \ ('Please specify at least one operation (save/eval/format/show the ' 'results / save the results) with the argument "--out", "--eval"' ', "--format-only", "--show" or "--show-dir"') if args.eval and args.format_only: raise ValueError('--eval and --format_only cannot be both specified') if args.out is not None and not args.out.endswith(('.pkl', '.pickle')): raise ValueError('The output file must be a pkl file.') cfg = Config.fromfile(args.config) if args.cfg_options is not None: cfg.merge_from_dict(args.cfg_options) # import modules from string list. if cfg.get('custom_imports', None): from mmcv.utils import import_modules_from_strings import_modules_from_strings(**cfg['custom_imports']) # import modules from plguin/xx, registry will be updated if hasattr(cfg, 'plugin'): if cfg.plugin: import importlib if hasattr(cfg, 'plugin_dir'): plugin_dir = cfg.plugin_dir _module_dir = os.path.dirname(plugin_dir) _module_dir = _module_dir.split('/') _module_path = _module_dir[0] for m in _module_dir[1:]: _module_path = _module_path + '.' 
+ m # print(_module_path) plg_lib = importlib.import_module(_module_path) else: # import dir is the dirpath for the config file _module_dir = os.path.dirname(args.config) _module_dir = _module_dir.split('/') _module_path = _module_dir[0] for m in _module_dir[1:]: _module_path = _module_path + '.' + m # print(_module_path) plg_lib = importlib.import_module(_module_path) # set cudnn_benchmark if cfg.get('cudnn_benchmark', False): torch.backends.cudnn.benchmark = True cfg.model.pretrained = None # in case the test dataset is concatenated samples_per_gpu = 1 if isinstance(cfg.data.test, dict): cfg.data.test.test_mode = True samples_per_gpu = cfg.data.test.pop('samples_per_gpu', 1) if samples_per_gpu > 1: # Replace 'ImageToTensor' to 'DefaultFormatBundle' cfg.data.test.pipeline = replace_ImageToTensor( cfg.data.test.pipeline) elif isinstance(cfg.data.test, list): for ds_cfg in cfg.data.test: ds_cfg.test_mode = True samples_per_gpu = max( [ds_cfg.pop('samples_per_gpu', 1) for ds_cfg in cfg.data.test]) if samples_per_gpu > 1: for ds_cfg in cfg.data.test: ds_cfg.pipeline = replace_ImageToTensor(ds_cfg.pipeline) # init distributed env first, since logger depends on the dist info. # print("args.launcher", args.launcher) if args.launcher == 'none': distributed = False else: distributed = True init_dist(args.launcher, **cfg.dist_params) # set random seeds if args.seed is not None: set_random_seed(args.seed, deterministic=args.deterministic) # build the dataloader dataset = build_dataset(cfg.data.test) data_loader = build_dataloader( dataset, samples_per_gpu=samples_per_gpu, workers_per_gpu=cfg.data.workers_per_gpu, dist=distributed, shuffle=True, num_gpus=2, ) # build the model and load checkpoint cfg.model.train_cfg = None model = build_model(cfg.model, test_cfg=cfg.get('test_cfg')) fp16_cfg = cfg.get('fp16', None) if fp16_cfg is not None: wrap_fp16_model(model) checkpoint = load_checkpoint(model, args.checkpoint, map_location='cpu') if args.fuse_conv_bn: model = fuse_conv_bn(model) # old versions did not save class info in checkpoints, this walkaround is # for backward compatibility if 'CLASSES' in checkpoint.get('meta', {}): model.CLASSES = checkpoint['meta']['CLASSES'] else: model.CLASSES = dataset.CLASSES # palette for visualization in segmentation tasks if 'PALETTE' in checkpoint.get('meta', {}): model.PALETTE = checkpoint['meta']['PALETTE'] elif hasattr(dataset, 'PALETTE'): # segmentation dataset has `PALETTE` attribute model.PALETTE = dataset.PALETTE if args.show: if args.show_dir is None: args.show_dir = osp.join('./work_dirs', osp.splitext(osp.basename(args.config))[0], 'visualization') print('save dir: ', args.show_dir) os.makedirs(args.show_dir, exist_ok=True) if not distributed: model = MMDataParallel(model, device_ids=[0]) outputs = custom_single_gpu_test(model, data_loader, args.show, args.show_dir) else: model = MMDistributedDataParallel( model.cuda(), device_ids=[torch.cuda.current_device()], broadcast_buffers=False) outputs = custom_multi_gpu_test(model, data_loader, args.tmpdir, args.gpu_collect, args.show, args.show_dir) rank, _ = get_dist_info() if rank == 0 and distributed: kwargs = {} if args.eval_options is None else args.eval_options kwargs['jsonfile_prefix'] = osp.join('test', args.config.split( '/')[-1].split('.')[-2], time.ctime().replace(' ', '_').replace(':', '_')) if args.format_only: dataset.format_results(outputs, **kwargs) if args.eval: eval_kwargs = cfg.get('evaluation', {}).copy() # hard-code way to remove EvalHook args for key in [ 'interval', 'tmpdir', 'start', 
'gpu_collect', 'save_best', 'rule' ]: eval_kwargs.pop(key, None) eval_kwargs.update(dict(metric=args.eval, **kwargs)) print(dataset.evaluate(outputs, **eval_kwargs)) if __name__ == '__main__': main() ================================================ FILE: tools/train.py ================================================ # Copyright (c) OpenMMLab. All rights reserved. # Cam4DOcc refers to OpenOccupancy of Zhiqi Li from __future__ import division import argparse import copy import mmcv import os import time import torch import warnings from mmcv import Config, DictAction from mmcv.runner import get_dist_info, init_dist from os import path as osp from mmdet import __version__ as mmdet_version from mmdet3d import __version__ as mmdet3d_version from mmseg import __version__ as mmseg_version from mmdet3d.datasets import build_dataset from mmdet3d.models import build_model from mmdet3d.utils import collect_env, get_root_logger from mmdet.apis import set_random_seed from mmcv.utils import TORCH_VERSION, digit_version from projects.occ_plugin.occupancy.apis.train import custom_train_model import warnings warnings.filterwarnings("ignore") warnings.simplefilter(action="ignore",category=FutureWarning) def parse_args(): parser = argparse.ArgumentParser(description='Train a detector') parser.add_argument('config', help='train config file path') parser.add_argument('--work-dir', help='the dir to save logs and models') parser.add_argument( '--resume', help='the checkpoint file to resume from') parser.add_argument( '--no-validate', action='store_true', help='whether not to evaluate the checkpoint during training') group_gpus = parser.add_mutually_exclusive_group() group_gpus.add_argument( '--gpus', type=int, help='number of gpus to use ' '(only applicable to non-distributed training)') group_gpus.add_argument( '--gpu-ids', type=int, nargs='+', help='ids of gpus to use ' '(only applicable to non-distributed training)') parser.add_argument('--seed', type=int, default=0, help='random seed') parser.add_argument( '--deterministic', action='store_true', help='whether to set deterministic options for CUDNN backend.') parser.add_argument( '--options', nargs='+', action=DictAction, help='override some settings in the used config, the key-value pair ' 'in xxx=yyy format will be merged into config file (deprecate), ' 'change to --cfg-options instead.') parser.add_argument( '--cfg-options', nargs='+', action=DictAction, help='override some settings in the used config, the key-value pair ' 'in xxx=yyy format will be merged into config file. If the value to ' 'be overwritten is a list, it should be like key="[a,b]" or key=a,b ' 'It also allows nested list/tuple values, e.g. 
key="[(a,b),(c,d)]" ' 'Note that the quotation marks are necessary and that no white space ' 'is allowed.') parser.add_argument( '--launcher', choices=['none', 'pytorch', 'slurm', 'mpi'], default='pytorch', help='job launcher') parser.add_argument('--local_rank', type=int, default=0) parser.add_argument( '--autoscale-lr', action='store_true', help='automatically scale lr with the number of gpus') args = parser.parse_args() if 'LOCAL_RANK' not in os.environ: os.environ['LOCAL_RANK'] = str(args.local_rank) return args def main(): args = parse_args() cfg = Config.fromfile(args.config) if args.cfg_options is not None: cfg.merge_from_dict(args.cfg_options) # import modules from plguin/xx, registry will be updated if hasattr(cfg, 'plugin') and cfg.plugin: assert cfg.plugin_dir is not None import importlib plugin_dir = cfg.plugin_dir _module_dir = os.path.dirname(plugin_dir) _module_dir = _module_dir.split('/') _module_path = _module_dir[0] for m in _module_dir[1:]: _module_path = _module_path + '.' + m # print(_module_path) plg_lib = importlib.import_module(_module_path) # set cudnn_benchmark if cfg.get('cudnn_benchmark', False): torch.backends.cudnn.benchmark = True # work_dir is determined in this priority: CLI > segment in file > filename if args.work_dir is not None: cfg.work_dir = args.work_dir elif cfg.get('work_dir', None) is None: cfg.work_dir = osp.join('./work_dirs', osp.splitext(osp.basename(args.config))[0]) if args.resume is not None and osp.isfile(args.resume): cfg.resume_from = args.resume if args.gpu_ids is not None: cfg.gpu_ids = args.gpu_ids else: cfg.gpu_ids = range(1) if args.gpus is None else range(args.gpus) if args.autoscale_lr: cfg.optimizer['lr'] = cfg.optimizer['lr'] * len(cfg.gpu_ids) / 8 # init distributed env first, since logger depends on the dist info. 
if args.launcher == 'none': distributed = False else: distributed = True init_dist(args.launcher, **cfg.dist_params) # re-set gpu_ids with distributed training mode _, world_size = get_dist_info() cfg.gpu_ids = range(world_size) # create work_dir mmcv.mkdir_or_exist(osp.abspath(cfg.work_dir)) # dump config cfg.dump(osp.join(cfg.work_dir, osp.basename(args.config))) # init the logger before other steps timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime()) log_file = osp.join(cfg.work_dir, f'{timestamp}.log') logger = get_root_logger( log_file=log_file, log_level=cfg.log_level, name='mmdet') # init the meta dict to record some important information such as # environment info and seed, which will be logged meta = dict() # log env info env_info_dict = collect_env() env_info = '\n'.join([(f'{k}: {v}') for k, v in env_info_dict.items()]) dash_line = '-' * 60 + '\n' logger.info('Environment info:\n' + dash_line + env_info + '\n' + dash_line) meta['env_info'] = env_info meta['config'] = cfg.pretty_text # log some basic info logger.info(f'Distributed training: {distributed}') # set random seeds if args.seed is not None: logger.info(f'Set random seed to {args.seed}, ' f'deterministic: {args.deterministic}') set_random_seed(args.seed, deterministic=args.deterministic) cfg.seed = args.seed meta['seed'] = args.seed meta['exp_name'] = osp.basename(args.config) model = build_model( cfg.model, train_cfg=cfg.get('train_cfg'), test_cfg=cfg.get('test_cfg')) n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad) logger.info(f'Number of params: {n_parameters}') model.init_weights() datasets = [build_dataset(cfg.data.train)] # add an attribute for visualization convenience model.CLASSES = datasets[0].CLASSES custom_train_model( model, datasets, cfg, distributed=distributed, validate=(not args.no_validate), timestamp=timestamp, meta=meta) if __name__ == '__main__': main() ================================================ FILE: viz/viz_gt.py ================================================ # Developed by Junyi Ma # Cam4DOcc: Benchmark for Camera-Only 4D Occupancy Forecasting in Autonomous Driving Applications # https://github.com/haomo-ai/Cam4DOcc from tqdm import tqdm import pickle import numpy as np from mayavi import mlab from tqdm import trange import os from xvfbwrapper import Xvfb # export QT_QPA_PLATFORM='offscreen' mlab.options.offscreen = True def viz_occ(occ, occ_mo, file_name, voxel_size, show_occ, show_time_change): vdisplay = Xvfb(width=1, height=1) vdisplay.start() mlab.figure(size=(800,800), bgcolor=(1,1,1)) plt_plot_occ = mlab.points3d( occ[:, 0] * voxel_size, occ[:, 1] * voxel_size, occ[:, 2] * voxel_size, occ[:, 3], colormap="viridis", scale_factor=voxel_size - 0.05 * voxel_size, mode="cube", opacity=0.9, vmin=1, ) colors_occ = np.array( [ [152, 251, 152, 255], [152, 251, 152, 255], [152, 251, 152, 255], [152, 251, 152, 255], [152, 251, 152, 255], ] ).astype(np.uint8) plt_plot_occ.glyph.scale_mode = "scale_by_vector" plt_plot_occ.module_manager.scalar_lut_manager.lut.table = colors_occ plt_plot_mov = mlab.points3d( occ_mo[:, 0] * voxel_size, occ_mo[:, 1] * voxel_size, occ_mo[:, 2] * voxel_size, occ_mo[:, 3], colormap="viridis", scale_factor=voxel_size - 0.05 * voxel_size, mode="cube", opacity=0.9, vmin=1, ) if show_time_change: colors_occ_mo = np.array( [ [255, 70, 255, 255], [255, 110, 255, 255], [255, 150, 255, 255], [255, 190, 255, 255], [255, 250, 250, 255], ] ).astype(np.uint8) else: colors_occ_mo = np.array( [ [220, 20, 60, 255], [255, 127, 80, 255], [0, 0, 230, 
255], [255, 158, 0, 255], [233, 150, 70, 255], [47, 79, 79, 255], [255, 99, 71, 255], [175, 0, 75, 255], [255, 61, 99, 255], ] ).astype(np.uint8) plt_plot_mov.glyph.scale_mode = "scale_by_vector" plt_plot_mov.module_manager.scalar_lut_manager.lut.table = colors_occ_mo fig_dir = "./figs" if not os.path.exists(fig_dir): os.mkdir(fig_dir) mlab.savefig(os.path.join(fig_dir, file_name[:-4]+".png")) vdisplay.stop() def main(): show_time_change = True nuscocc_path = "../data/nuScenes-Occupancy/" cam4docc_path = "../data/cam4docc/GMO/segmentation/" segmentation_files = os.listdir(cam4docc_path) segmentation_files.sort(key=lambda x: (x.split("_")[1])) index = 0 for file_ in tqdm(segmentation_files): scene_token = file_.split("_")[0] lidar_token = file_.split("_")[1] gt_file = nuscocc_path+"scene_"+scene_token+"/occupancy/"+lidar_token[:-4]+".npy" gt_occ_semantic = np.load(gt_file,allow_pickle=True) gt_occ_semantic = gt_occ_semantic[gt_occ_semantic[:, -1]!=0] gt_occ_semantic = gt_occ_semantic[::2] gt_occ_semantic_refine = np.zeros_like(gt_occ_semantic) gt_occ_semantic_refine[:, 0] = gt_occ_semantic[:, 2] gt_occ_semantic_refine[:, 1] = gt_occ_semantic[:, 1] gt_occ_semantic_refine[:, 2] = gt_occ_semantic[:, 0] gt_occ_semantic_refine[:, 3] = 1 gt_mo_semantic = np.load(cam4docc_path+file_,allow_pickle=True)['arr_0'] gt_mo_semantic_to_draw=np.zeros((0,4)) for t in range(0,4): gt_mo_cur = gt_mo_semantic[t] gt_mo_cur = np.array(gt_mo_cur) gt_mo_cur = gt_mo_cur[::2] if show_time_change: gt_mo_cur[:, -1] = int(t+1) gt_mo_semantic_to_draw = np.concatenate((gt_mo_semantic_to_draw, gt_mo_cur)) viz_occ(gt_occ_semantic_refine, gt_mo_semantic_to_draw, file_, voxel_size=0.2, show_occ=True, show_time_change=show_time_change) index += 1 if __name__ == "__main__": main() ================================================ FILE: viz/viz_pred.py ================================================ from tqdm import tqdm import pickle import numpy as np from mayavi import mlab from tqdm import trange import os from xvfbwrapper import Xvfb mlab.options.offscreen = True def viz_occ(occ, occ_mo, file_name, voxel_size, show_occ, show_time_change): vdisplay = Xvfb(width=1, height=1) vdisplay.start() mlab.figure(size=(800,800), bgcolor=(1,1,1)) plt_plot_occ = mlab.points3d( occ[:, 0] * voxel_size, occ[:, 1] * voxel_size, occ[:, 2] * voxel_size, occ[:, 3], colormap="viridis", scale_factor=voxel_size - 0.05 * voxel_size, mode="cube", opacity=0.9, vmin=1, ) colors_occ = np.array( [ [152, 251, 152, 255], [152, 251, 152, 255], [152, 251, 152, 255], [152, 251, 152, 255], [152, 251, 152, 255], ] ).astype(np.uint8) plt_plot_occ.glyph.scale_mode = "scale_by_vector" plt_plot_occ.module_manager.scalar_lut_manager.lut.table = colors_occ plt_plot_mov = mlab.points3d( occ_mo[:, 0] * voxel_size, occ_mo[:, 1] * voxel_size, occ_mo[:, 2] * voxel_size, occ_mo[:, 3], colormap="viridis", scale_factor=voxel_size - 0.05 * voxel_size, mode="cube", opacity=0.9, vmin=1, ) if show_time_change: colors_occ_mo = np.array( [ [255, 70, 255, 255], [255, 110, 255, 255], [255, 150, 255, 255], [255, 190, 255, 255], [255, 250, 250, 255], ] ).astype(np.uint8) else: colors_occ_mo = np.array( [ [220, 20, 60, 255], [255, 127, 80, 255], [0, 0, 230, 255], [255, 158, 0, 255], [233, 150, 70, 255], [47, 79, 79, 255], [255, 99, 71, 255], [175, 0, 75, 255], [255, 61, 99, 255], ] ).astype(np.uint8) plt_plot_mov.glyph.scale_mode = "scale_by_vector" plt_plot_mov.module_manager.scalar_lut_manager.lut.table = colors_occ_mo fig_dir = "./figs" if not os.path.exists(fig_dir): 
os.mkdir(fig_dir) mlab.savefig(os.path.join(fig_dir, file_name[:-4]+".png")) vdisplay.stop() def main(): show_time_change = True nuscocc_path = "../data/nuScenes-Occupancy/" pred_path = "../data/cam4docc/results/" segmentation_files = os.listdir(pred_path) segmentation_files.sort(key=lambda x: (x.split("_")[1])) index = 0 segmentation_files = segmentation_files[::10] for file_ in tqdm(segmentation_files): scene_token = file_.split("_")[0] lidar_token = file_.split("_")[1] gt_file = nuscocc_path+"scene_"+scene_token+"/occupancy/"+lidar_token[:-4]+".npy" gt_occ_semantic = np.load(gt_file,allow_pickle=True) gt_occ_semantic = gt_occ_semantic[gt_occ_semantic[:, -1]!=0] gt_occ_semantic = gt_occ_semantic[::2] gt_occ_semantic_refine = np.zeros_like(gt_occ_semantic) gt_occ_semantic_refine[:, 0] = gt_occ_semantic[:, 2] gt_occ_semantic_refine[:, 1] = gt_occ_semantic[:, 1] gt_occ_semantic_refine[:, 2] = gt_occ_semantic[:, 0] gt_occ_semantic_refine[:, 3] = 1 pred_mo_semantic = np.load(pred_path+file_,allow_pickle=True)['arr_0'] pred_mo_semantic_to_draw=np.zeros((0,4)) for t in range(0,4): pred_mo_cur = pred_mo_semantic[t] pred_mo_cur = np.array(pred_mo_cur) pred_mo_cur = pred_mo_cur[::2] if show_time_change: pred_mo_cur[:, -1] = int(t+1) pred_mo_semantic_to_draw = np.concatenate((pred_mo_semantic_to_draw, pred_mo_cur)) viz_occ(gt_occ_semantic_refine, pred_mo_semantic_to_draw, file_, voxel_size=0.2, show_occ=True, show_time_change=show_time_change) index += 1 if __name__ == "__main__": main() # export QT_QPA_PLATFORM='offscreen'
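# Both viz scripts render headlessly: mlab.options.offscreen is enabled and each figure is
# wrapped in an Xvfb virtual display, with frames saved to ./figs as PNGs. An illustrative
# way to run them (assuming the GMO ground truth and saved results exist under ../data) is
# from the viz/ directory so the relative paths resolve:
#   cd viz && QT_QPA_PLATFORM=offscreen python viz_gt.py
#   cd viz && QT_QPA_PLATFORM=offscreen python viz_pred.py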